Question

I have written a computer program in Python, but it runs a lot slower than I want it to.

Here is the code:

from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib
import urllib2
import cookielib

# Captures everything in a page title that precedes a trailing
# " (<1-10 digits><1-100 more characters>)" group.
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
# Captures the text after the opening parenthesis of a trailing
# "(<captured text>, <4-14 characters>, United States)" group.
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
# Matches locationary.com US place URLs of the form
# .../place/en/US/<segment>/<segment>/<name>.jsp
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b': # assume it's gzipped data
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements):
    """Return ``text`` with each character swapped per ``replacements``.

    ``replacements`` maps a single character to the string that should
    take its place; characters without an entry are kept unchanged.
    """
    pieces = []
    for char in text:
        pieces.append(replacements.get(char, char))
    return ''.join(pieces)

def handle_listing(listing_url):
    """Build the locationary proxy URL that attaches a yellowpages link.

    Downloads the listing at ``listing_url``, searches yellowpages.com for
    the same business, and compares the street address found there with the
    address parsed from the listing title.

    Returns the proxy "save" URL (a string) when the addresses are at least
    50% similar according to ``SequenceMatcher``; returns ``None`` when the
    listing already links to yellowpages or when any step cannot find the
    data it needs (unparseable title, no search result, no street address,
    no place id in ``listing_url``).

    Download errors raised by ``download`` are not caught here.
    """
    listing_document = BeautifulSoup(download(listing_url))

    # ignore pages that link to yellowpages
    yellowpages_link = re.compile(re.escape("http://www.yellowpages.com/") + ".*")
    if listing_document.find("a", href=yellowpages_link):
        return None

    title_tag = listing_document.title
    if title_tag is None:
        return None
    listing_title = title_tag.text

    # Both patterns must match; previously a title that matched TITLE_MATCH
    # but not ADDRESS_MATCH raised AttributeError and aborted the whole pool.
    title_match = TITLE_MATCH.match(listing_title)
    address_match = ADDRESS_MATCH.match(listing_title)
    if title_match is None or address_match is None:
        return None
    title, = title_match.groups()
    address, = address_match.groups()

    reps = {' ':'-', ',':'', '\'':'', '[':'', ']':''}
    yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
        replace_chars(address, reps),
        replace_chars(title, reps),
    )
    yellow_page = BeautifulSoup(download(yellow_page_url))

    # The first search result heading carries the link to the business page.
    result_heading = yellow_page.find("h3", {"class" : "business-name fn org"})
    if not result_heading:
        return None
    result_link = result_heading.a
    if result_link is None:
        return None
    page_url = result_link["href"]

    page = BeautifulSoup(download(page_url))
    yellow_page_address = page.find("span", {"class" : "street-address"})
    if not yellow_page_address:
        return None

    if SequenceMatcher(None, address, yellow_page_address.text).ratio() < 0.5:
        return None

    # The place id is the digit run in the "p<digits>.jsp" part of the URL.
    pid_match = re.search(r'p(\d{5,20})\.jsp', listing_url)
    if pid_match is None:
        return None
    pid = pid_match.group(1)

    page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
    return "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
        pid, page_escaped)

def log_in(final_url):
    """Log in to locationary.com, then request ``final_url`` in that session.

    A fresh cookie-aware opener is built on every call: the login form is
    POSTed first, then ``final_url`` is fetched with the same opener and its
    body is read to completion. Nothing is returned.
    """
    # NOTE(review): the account credentials and session cookie below are
    # hard-coded in source; consider loading them from configuration.
    data = urllib.urlencode({"inUserName":"jacob.grannis@gmail.com", "inUserPass":"secretword"})
    jar = cookielib.FileCookieJar("cookies")
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    # Assign the header list instead of appending to it: urllib2 applies
    # ``addheaders`` in order and skips any name that is already set, so an
    # appended 'User-agent' loses to the default "Python-urllib" entry.
    # The former second 'Cookie' entry (whose value wrongly began with
    # "Cookie: ") was shadowed by the first for the same reason and is dropped.
    opener.addheaders = [
        ('User-agent', 'Mozilla/4.0'),
        ('Referer', 'http://www.locationary.com/'),
        ('Cookie','site_version=REGULAR; __utma=47547066.912030359.1322003402.1324959960.1325009956.58; __utmz=47547066.1324655802.52.13.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=cache:dr23PN5fUj4J:www.locationary.com/%20locationary; nickname=jacob501; jforumUserId=1; PMS=1; locaCountry=1033; locaState=1786; locaCity=Vancouver; JSESSIONID=5CDDA2D527C20A6CDD04936115DE3FA2; PSESSIONID=c677beb4e6b8d58f1443d9b9585b225f579ef29a; Locacookie=enable; __utmb=47547066.1.10.1325009956; __utmc=47547066'),
    ]
    request = urllib2.Request("https://www.locationary.com/index.jsp?ACTION_TOKEN=tile_loginBar_jsp$JspView$LoginAction", data)
    # The login response body is not needed; close it to free the connection.
    opener.open(request).close()

    submission = opener.open(str(final_url))
    try:
        # Read the body so the request is fully completed before closing.
        submission.read()
    finally:
        submission.close()

# State names as they appear in locationary.com URLs (underscores instead
# of spaces). 'Alabama' and 'Alaska' are deliberately left out, as in the
# original listing where they were commented out.
States = [
    'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
    'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
    'Nevada', 'New_Hampshire', 'New_Jersey', 'New_Mexico', 'New_York',
    'North_Carolina', 'North_Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode_Island', 'South_Carolina', 'South_Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West_Virginia', 'Wisconsin', 'Wyoming',
]

# Shared list that find_cities() appends city names to.
Cities = list()

def find_cities(state):
    """Append the city names found on ``state``'s Locationary page to ``Cities``.

    The state page is downloaded and every ``<b>...</b>`` match is taken as
    a city name; spaces are turned into underscores before the name is added
    to the module-level ``Cities`` list. Nothing is returned.
    """
    page_source = download(str('http://www.locationary.com/place/en/US/' + str(state)))
    bold_text = re.compile('<b>(.*)</b>')

    for raw_name in bold_text.findall(page_source):
        Cities.append(str(raw_name.replace(' ', '_')))

# Sort-order codes as strings; main() labels order '0' as "Best Profiles"
# and any other value as "Worst Profiles".
bestworst = list('01')

def main():
    for state in States:
        find_cities(state)
        for city in Cities:
            for num in range(0,1):
                for pagenum in range(15,16):
                    print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
                    print '------------------------------------------------------------------------------------------------------------------------------------------------------------'
                    if str(num) == '0':
                        print str(state) + ', ' + str(city) + ', ' + 'Best Profiles' + ', ' + 'Page ' + str(pagenum)
                    else:
                        print str(state) + ', ' + str(city) + ', ' + 'Worst Profiles' + ', ' + 'Page ' + str(pagenum)
                    START_URL = 'http://www.locationary.com/place/en/US/' + str(state) + '/' + city + '-page' + str(pagenum) + '/?ACTION_TOKEN=NumericAction&order=' + str(num)
                    pool = eventlet.GreenPool()
                    listings_document = BeautifulSoup(download(START_URL))
                    listings = listings_document.findAll("a", href = LOCATION_LISTING)
                    listings = [listing['href'] for listing in listings]

                    count_listings = 0

                    for final_url in pool.imap(handle_listing, listings):
                        print final_url
                        if final_url is not None:
                            log_in(final_url)

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()

Is there a way to make it faster, or is it impossible? It has to download a lot of URLs from the internet, but I'm pretty sure I can't make my internet connection 10 to 50 times faster than it already is, and my computer isn't very slow. So, is there any way to make my program, say, 10 to 50 times faster? I know that might sound ridiculous, but how do professional programmers make their programs faster, then?

No correct solution

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top