Multithreaded lxml scraper executes without any error or output

https://stackoverflow.com/questions/23376141

12-07-2023
|

Question

Here is the code.

import requests
from lxml import html
from pprint import pprint
from urlparse import urljoin
from thready import threaded
import dataset
import os
from hashlib import sha1
import re
import math

# Inventory link
STARTING_URL = 'http://example.com/en/search/?h=3&k=&p=%d&sid=w'
BASE_URL = 'http://example.com'

# connect to our database
db = dataset.connect('mysql://root:@localhost/opencartdb')

# a directory for images
IMAGE_DIR = os.path.join(os.path.dirname(__file__), 'wimagepy')

def url_to_filename(image_url):
    """ Make a URL into a file name, using SHA1 hashes. """
    # use a sha1 hash to convert the url into a unique filename
    hash_file = sha1(image_url).hexdigest() + '.jpg'
    return os.path.join(IMAGE_DIR, hash_file)

def store_local(image_url, content):
    """ Save a local copy of the image file. """
    # If the image directory does not exist, make one.
    if not os.path.isdir(IMAGE_DIR):
        os.makedirs(IMAGE_DIR)
        # Save to disk.
    local_path = url_to_filename(image_url)
    with open(local_path, 'wb') as f:
        f.write(content)

def scrape_raku_inventory():
    """ Scrape all the inventory pages from a list """
    response = requests.get('http://example.com/en/search/?h=3&k=&p=1&sid=w')
    results_per_page = 60
    div = page.xpath("//div[contains(@class, 'b-tabs-utility')]")[0].text
    last_pg = math.ceil(int(div.split()[-2]) / results_per_page)
    for i in xrange(last_pg):
        response = requests.get(STARTING_URL % i)
        parsed_body = html.fromstring(response.content)
        urls = []
        links = [urlparse.urljoin(response.url, url) for url in parsed_body.xpath("//div[contains(@class, 'b-thumb-128px')]//a")]
        for link in links:
            url = urljoin(BASE_URL, link)
            # iteratively populate this list 
            urls.append(url)

    # download and parse inventory via multiple threads
    threaded(urls, scrape_inventory_page, num_threads=10)

def scrape_inventory_page(url):
    """Extract information from individual item page"""
    # log the url we're scraping
    print "scraping %s ..." % url
    # retrieve the inventory page with requests
    response = requests.get(url)
    # Parse the html of the inventory page
    parsed_body = html.fromstring(response.content)
    # Download images
    image_urls = re.sub(r'_ex=50x50\?', "", parsed_body.xpath("//img[contains(@src, '_ex50x50')]/@src"))
    for image_url in image_urls:
        store_local(image_url)
    data = {
    'scrape_url': url,
    'name': re.sub(r'\D\W\S', "", parsed_body.xpath("//h1[contains(@class, 'b-ttl-main')]").text)
    }
    # Upsert data into database
    db['raku'].upsert(data, ['scrape_url'])

if  __name__ == '__name__':
    scrape_raku_inventory()

As in the title, the code executes but there's no error and no output.

Solution

Try changing your last if statement to

if __name__ == '__main__'

instead of '__name__'

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow