Question

Here I have written code using Python and Beautiful Soup to parse all the links on a page into a repository of links. Next, it fetches the contents of any URL from the repository just created, parses the links from this new content into the repository, and continues this process for all links in the repository until stopped or until a given number of links has been fetched.

But this code is very slow. How can I improve it by using asynchronous programming with gevent in Python?


Code

import urllib2
import itertools
import random

import BeautifulSoup


class Crawler(object):

    def __init__(self):
        self.soup = None                                        # Beautiful Soup object
        self.current_page   = "http://www.python.org/"          # Current page's address
        self.links          = set()                             # Queue with every link fetched
        self.visited_links  = set()

        self.counter = 0  # Simple counter for debug purposes

    def open(self):

        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every link
        self.soup = BeautifulSoup.BeautifulSoup(html_code)

        page_links = []
        try:
            page_links = itertools.ifilter(  # Only deal with absolute links
                lambda href: 'http://' in href,
                (a.get('href') for a in self.soup.findAll('a')))
        except Exception as e:  # Magnificent exception handling
            print 'Error: ', e

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):

        # Crawl 3 webpages (or stop once every url has been fetched)
        while len(self.visited_links) < 3 and self.visited_links != self.links:
            self.open()

        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()

Update 1


import gevent.monkey; gevent.monkey.patch_thread()
from bs4 import BeautifulSoup
import urllib2
import itertools
import random
import urlparse
import sys

import gevent.monkey; gevent.monkey.patch_all(thread=False)


class Crawler(object):

    def __init__(self):
        self.soup = None                                        # Beautiful Soup object
        self.current_page   = "http://www.python.org/"          # Current page's address
        self.links          = set()                             # Queue with every link fetched
        self.visited_links  = set()

        self.counter = 0  # Simple counter for debug purposes

    def open(self):

        # Open url
        print self.counter, ":", self.current_page
        res = urllib2.urlopen(self.current_page)
        html_code = res.read()
        self.visited_links.add(self.current_page)

        # Fetch every link
        self.soup = BeautifulSoup(html_code)

        page_links = []
        try:
            for link in [h.get('href') for h in self.soup.find_all('a')]:
                print "Found link: '" + link + "'"
                if link.startswith('http'):
                    print 'entered in if link: ', link
                    page_links.append(link)
                    print "Adding link " + link + "\n"
                elif link.startswith('/'):
                    print 'entered in elif link: ', link
                    parts = urlparse.urlparse(self.current_page)
                    page_links.append(parts.scheme + '://' + parts.netloc + link)
                    print "Adding link " + parts.scheme + '://' + parts.netloc + link + "\n"
                else:
                    print 'entered in else link: ', link
                    page_links.append(self.current_page + link)
                    print "Adding link " + self.current_page + link + "\n"

        except Exception, ex:  # Magnificent exception handling
            print ex

        # Update links
        self.links = self.links.union(set(page_links))

        # Choose a random url from the non-visited set
        self.current_page = random.sample(self.links.difference(self.visited_links), 1)[0]
        self.counter += 1

    def run(self):

        # Crawl 3 webpages (or stop if every url has been fetched)
        crawling_greenlets = []

        for i in range(3):
            crawling_greenlets.append(gevent.spawn(self.open))

        gevent.joinall(crawling_greenlets)

        #while len(self.visited_links) < 4 or (self.visited_links == self.links):
        #    self.open()

        for link in self.links:
            print link


if __name__ == '__main__':
    C = Crawler()
    C.run()

Solution

Import gevent and make sure monkey-patching is done, so that standard library calls become non-blocking and gevent-aware:

import gevent
from gevent import monkey; monkey.patch_all()

(you can selectively decide what has to be monkey-patched, but let's say it is not your problem at the moment)
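For instance, a minimal sketch of selective patching (an assumption here: only the networking calls used by urllib2 need to cooperate with gevent) could look like this:

import gevent
from gevent import monkey

# Patch only the pieces urllib2 relies on, leaving threads untouched
# (patch_all() remains the simpler default).
monkey.patch_socket()  # socket operations now yield to the gevent hub
monkey.patch_ssl()     # needed if any of the crawled urls are https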

In your run method, call your open function inside a greenlet. run can return the greenlet object, so you can wait for it whenever you need the results, for example with gevent.joinall. Something like this:

def run(self):
    return gevent.spawn(self.open)

c1 = Crawler()
c2 = Crawler()
c3 = Crawler()
crawling_tasks = [c.run() for c in (c1,c2,c3)]
gevent.joinall(crawling_tasks)

print [c.links for c in (c1, c2, c3)]
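If you also want each crawler to fetch more than one page, a minimal sketch (assuming the Crawler class from the question, and an arbitrary limit of 3 pages per crawler) is to put the loop itself inside the greenlet:

import gevent
from gevent import monkey; monkey.patch_all()

def crawl(crawler, pages=3):
    # Fetch several pages sequentially inside one greenlet; while this
    # greenlet waits on the network, the other greenlets get to run.
    for _ in range(pages):
        crawler.open()

crawlers = [Crawler() for _ in range(3)]
crawling_tasks = [gevent.spawn(crawl, c) for c in crawlers]
gevent.joinall(crawling_tasks)

for c in crawlers:
    print list(c.links)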
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow