Question

I would like to estimate the impact of news on Dow Jones quotes. For this, I wrote an HTML parser in Python using the BeautifulSoup library. I extract each article and store it in an XML file for further analysis with the NLTK library. How can I increase the parsing speed? The code below does the required task, but very slowly.

Here is the code of the html parser:

import urllib2
import re
import xml.etree.cElementTree as ET
import nltk
from bs4 import BeautifulSoup
from datetime import date
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords
from collections import defaultdict

def main_parser():
    #starting date
    a = date(2014, 3, 27)
    #ending date
    b = date(2014, 3, 27)
    articles = ET.Element("articles")
    f = open('~/Documents/test.xml', 'w')
    #loop through the links and per each link extract the text of the article, store the latter at xml file
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime("%d") + ".html"
        page = urllib2.urlopen(url)
        #use html5lib ??? possibility to use another parser
        soup = BeautifulSoup(page.read(), "html5lib")
        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for links in soup.find_all("div", "headlineMed"):
            anchor_tag = links.a
            if not 'video' in anchor_tag['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(links.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(anchor_tag.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(anchor_tag['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        #get text and remove all stop words
                        article_text.text = str(remove_stop_words(extract_article(anchor_tag['href']))).encode('ascii','ignore')
                    except Exception:
                        pass
                except Exception:
                    pass

    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml","utf-8")

#getting the article text from the specific url
def extract_article(url):
    plain_text = ""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html5lib")
    tag = soup.find_all("p")
    #replace all html tags
    plain_text = re.sub(r'<p>|</p>|[|]|<span class=.*</span>|<a href=.*</a>', "", str(tag))
    plain_text = plain_text.replace(", ,", "")
    return str(plain_text)

def remove_stop_words(text):
    text=nltk.word_tokenize(text)
    filtered_words = [w for w in text if not w in stopwords.words('english')]
    return ' '.join(filtered_words)

Solution

Several fixes can be applied (without changing the modules you are currently using):

  • use the lxml parser instead of html5lib - it is much, much (and 3 more muches) faster
  • parse only the relevant part of the document with SoupStrainer (note that html5lib doesn't support SoupStrainer - it will always parse the whole document, slowly)

Here's how the code would look after the changes. A brief performance test shows at least a 3x improvement:

import urllib2
import xml.etree.cElementTree as ET
from datetime import date

from bs4 import SoupStrainer, BeautifulSoup
import nltk
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords


def main_parser():
    a = b = date(2014, 3, 27)
    articles = ET.Element("articles")
    for dt in rrule(DAILY, dtstart=a, until=b):
        url = "http://www.reuters.com/resources/archive/us/" + dt.strftime("%Y") + dt.strftime("%m") + dt.strftime(
            "%d") + ".html"

        links = SoupStrainer("div", "headlineMed")
        soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=links)

        article_date = ET.SubElement(articles, "article_date")
        article_date.text = str(dt)
        for link in soup.find_all('a'):
            if 'video' not in link['href']:
                try:
                    article_time = ET.SubElement(article_date, "article_time")
                    article_time.text = str(link.text[-11:])

                    article_header = ET.SubElement(article_time, "article_name")
                    article_header.text = str(link.text)

                    article_link = ET.SubElement(article_time, "article_link")
                    article_link.text = str(link['href']).encode('utf-8')

                    try:
                        article_text = ET.SubElement(article_time, "article_text")
                        article_text.text = str(remove_stop_words(extract_article(link['href']))).encode('ascii', 'ignore')
                    except Exception:
                        pass
                except Exception:
                    pass

    tree = ET.ElementTree(articles)
    tree.write("~/Documents/test.xml", "utf-8")


def extract_article(url):
    paragraphs = SoupStrainer('p')
    soup = BeautifulSoup(urllib2.urlopen(url), "lxml", parse_only=paragraphs)
    return soup.text


def remove_stop_words(text):
    # build the stop word set once instead of re-reading the corpus for every word
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    return ' '.join(w for w in words if w not in stop_words)

Note that I've removed the regular expression processing from extract_article() - it looks like you can just get the whole text from the p tags.

I might have introduced some problems - please check if everything is correct.


Another solution would be to use lxml for everything, from parsing (replacing BeautifulSoup) to creating the XML (replacing xml.etree.ElementTree); a sketch of that approach follows.
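
This is only a minimal sketch, assuming the archive pages still wrap each headline in a div with class headlineMed (the function name and the XPath expression are mine, not from the original code):

import urllib2
from lxml import etree, html


def parse_archive_day(url):
    # parse the downloaded page with lxml.html instead of BeautifulSoup
    page = html.fromstring(urllib2.urlopen(url).read())
    articles = etree.Element("articles")
    # the XPath mirrors soup.find_all("div", "headlineMed") followed by .a
    for anchor in page.xpath('//div[@class="headlineMed"]/a'):
        href = anchor.get("href", "")
        if "video" in href:
            continue
        article = etree.SubElement(articles, "article")
        etree.SubElement(article, "article_name").text = anchor.text_content()
        etree.SubElement(article, "article_link").text = href
    return etree.tostring(articles, pretty_print=True, encoding="utf-8")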


Another solution (definitely the fastest) would be to switch to the Scrapy web-scraping framework. It is simple and very fast, and it comes with all kinds of batteries included: link extractors, XML exporters, database pipelines and so on. Worth looking into; a bare-bones spider is sketched below.
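
The class name, selectors and item fields here are my assumptions, not tested against the live site:

import scrapy


class ReutersArchiveSpider(scrapy.Spider):
    name = "reuters_archive"
    start_urls = ["http://www.reuters.com/resources/archive/us/20140327.html"]

    def parse(self, response):
        # same filtering as above: skip links that point to video pages
        for anchor in response.css("div.headlineMed a"):
            href = anchor.css("::attr(href)").extract_first()
            if href and "video" not in href:
                yield {
                    "article_name": anchor.css("::text").extract_first(),
                    "article_link": href,
                }

Running it with something like scrapy runspider reuters_spider.py -o articles.xml would use the built-in XML exporter mentioned above.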

Hope that helps.

OTHER TIPS

You want to pick the best parser; see the Python parser benchmark results in the article linked below.

We benchmarked most of the parsers and platforms while building http://serpapi.com

Here is a full article on Medium: https://medium.com/@vikoky/fastest-html-parser-available-now-f677a68b81dd
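
If you want to reproduce such a comparison on your own pages, a rough timing of the parsers BeautifulSoup can use might look like this (timings vary by page and machine; the archive URL is just an example):

import time
import urllib2

from bs4 import BeautifulSoup

html = urllib2.urlopen("http://www.reuters.com/resources/archive/us/20140327.html").read()

for parser in ("html.parser", "lxml", "html5lib"):
    start = time.time()
    BeautifulSoup(html, parser)  # parse the same document with each backend
    print parser, time.time() - start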

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow