How to separate rows of parsed html table with Python

https://stackoverflow.com/questions/23621476

21-07-2023
|

Вопрос

(Updated) I am trying to parse some HTML tables and am having trouble separating the rows and columns. I am trying to extract the tables from some HTML files, for example: (http://www.sec.gov/Archives/edgar/data/5094/000095012313004020/h30303def14a.htm)

So I get the HTML and then use Beautiful Soup to give me the tables: soup=BeautifulSoup(table). Then I have a function that I use to separate rows and columns: data=collapsetable(soup)

I use background color to separate rows but I am not sure how to separate the tables that do not have background color as their row separator.

def collapsetable(soup, combine_rows=True):
    """Flatten the ``<tr>`` rows of a parsed table into lists of cell text.

    Consecutive ``<tr>`` elements that share the same background colour are
    treated as one logical row and merged cell by cell (joined with a
    space) when ``combine_rows`` is true.

    Args:
        soup: Parsed table; calling ``soup('tr')`` must yield its rows.
        combine_rows: Merge adjacent rows with the same ``bgcolor`` when
            true; otherwise every ``<tr>`` becomes its own row.

    Returns:
        A list of rows, each a list of cell strings. Cells with a
        ``colspan`` are padded with empty strings so columns stay aligned.
        The list is passed through ``clean_rows`` before being returned
        (helper defined elsewhere -- presumably cleans in place; verify).
    """
    rows = []
    lastcolor = None
    for tr in soup('tr'):
        # A row without a bgcolor attribute counts as colour ''.
        try:
            color = tr['bgcolor']
        except KeyError:
            color = ''

        row = []
        for td in tr('th') + tr('td'):
            # A missing or non-numeric colspan means the cell spans one column.
            try:
                span = int(td['colspan'])
            except (KeyError, ValueError):
                span = 1

            # A cell-level bgcolor overrides the row-level colour.
            try:
                color = td['bgcolor']
            except KeyError:
                pass

            # getdeepcontent is defined elsewhere; it is expected to return
            # the text of one child node.
            datum = ''.join([getdeepcontent(t) for t in td.contents])
            row += [datum] + [''] * (span - 1)

        # Use colors to find the row split: same colour as the previous
        # physical row means "continuation of the same logical row".
        if color == lastcolor and combine_rows:
            previous = rows[-1]
            for i, cell in enumerate(row):
                if i >= len(previous):
                    previous.append(cell)
                else:
                    previous[i] += ' ' + cell
        else:
            rows.append(row)
            lastcolor = color
    clean_rows(rows)
    return rows

For example the html table I want in this file is the one with "independent trustees:" title. With my function I will get all the columns but don't know where to separate the rows.

For example here is the html portion of one of the tables:

<table border="0" width="100%" align="center" cellpadding="0" cellspacing="0" style="font-size: 8pt; font-family: 'Times New Roman', Times; color: #000000; background: transparent"><!-- Table Width Row BEGIN --><tr style="font-size: 1pt" valign="bottom"> <td width="25%">&nbsp;</td> <!-- colindex=01 type=maindata --> <td width="1%">&nbsp;</td> <!-- colindex=02 type=gutter --> <td width="6%">&nbsp;</td> <!-- colindex=02 type=maindata --> <td width="2%">&nbsp;</td> <!-- colindex=03 type=gutter --> <td width="9%">&nbsp;</td> <!-- colindex=03 type=maindata --> <td width="1%">&nbsp;</td> <!-- colindex=04 type=gutter --> <td width="23%">&nbsp;</td> <!-- colindex=04 type=maindata --> <td width="2%">&nbsp;</td> <!-- colindex=05 type=gutter --> <td width="6%">&nbsp;</td> <!-- colindex=05 type=maindata --> <td width="2%">&nbsp;</td> <!-- colindex=06 type=gutter --> <td width="23%">&nbsp;</td> <!-- colindex=06 type=maindata --></tr><!-- Table Width Row END --><!-- TableOutputHead --><tr style="font-size: 8pt" valign="bottom" align="center"><td nowrap="nowrap" align="left" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Number of<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td></tr><tr style="font-size: 8pt" valign="bottom" align="center"><td nowrap="nowrap" align="left" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Funds in<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" 
align="center" valign="bottom">&nbsp;</td></tr><tr style="font-size: 8pt" valign="bottom" align="center"><td nowrap="nowrap" align="left" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Fund<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td></tr><tr style="font-size: 8pt" valign="bottom" align="center"><td nowrap="nowrap" align="left" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Position(s)<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Term of Office<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom">&nbsp;</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Complex<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Other Directorships<br /> </b></td></tr><tr style="font-size: 8pt" valign="bottom" align="center"><td nowrap="nowrap" align="left" valign="bottom"> <b>Name and Year of<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Held with<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>and Length of<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Principal Occupation(s)<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Overseen<br /> </b></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"> <b>Held by Trustee<br /> </b></td></tr><tr style="font-size: 8pt" valign="bottom" align="center"><td nowrap="nowrap" align="left" valign="bottom"><div style="border-bottom: 1px solid #000000; width: 1%; padding-bottom: 1px"> 
<b>Birth of Trustee</b></div></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"><div style="border-bottom: 1px solid #000000; width: 1%; padding-bottom: 1px"> <b>Funds</b></div></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"><div style="border-bottom: 1px solid #000000; width: 1%; padding-bottom: 1px"> <b>Time Served</b></div></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"><div style="border-bottom: 1px solid #000000; width: 1%; padding-bottom: 1px"> <b>During the Past Five Years</b></div></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"><div style="border-bottom: 1px solid #000000; width: 1%; padding-bottom: 1px"> <b>by Trustee</b></div></td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="bottom"><div style="border-bottom: 1px solid #000000; width: 1%; padding-bottom: 1px"> <b>During the Past Five Years</b></div></td></tr><tr style="line-height: 3pt; font-size: 1pt"><td>&nbsp;</td></tr><!-- TableOutputBody --><tr valign="bottom"><td align="left" valign="top">    David C. Arch (1945)</td><td>&nbsp;</td><td nowrap="nowrap" align="left" valign="top">    Trustee</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="top">    &#134;</td><td>&nbsp;</td><td align="left" valign="top">    Chairman and Chief Executive Officer of Blistex Inc., a consumer    health care products manufacturer. <br />    Formerly: Member of the Heartland Alliance Advisory Board, a    nonprofit organization serving human needs based in Chicago.</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="top">    136</td><td>&nbsp;</td><td align="left" valign="top">    Trustee/Managing General Partner of funds in the Fund Complex.    Board member of the Illinois Manufacturers&#146; Association.    
Member of the Board of Visitors, Institute for the Humanities,    University of Michigan.</td></tr><tr valign="bottom" style="line-height: 6pt"><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td></tr><tr valign="bottom"><td align="left" valign="top">    Jerry D. Choate (1938)</td><td>&nbsp;</td><td nowrap="nowrap" align="left" valign="top">    Trustee</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="top">    &#134;</td><td>&nbsp;</td><td align="left" valign="top">    Retired. From 1995 to 1999, Chairman and Chief Executive Officer    of the Allstate Corporation (&#147;Allstate&#148;) and Allstate    Insurance Company. From 1994 to 1995, President and Chief    Executive Officer of Allstate. Prior to 1994, various management    positions at Allstate.</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="top">    13</td><td>&nbsp;</td><td align="left" valign="top">    Trustee/Managing General Partner of funds in the Fund Complex.    Director since 1998 and member of the governance and nominating    committee, executive committee, compensation and management    development committee and equity award committee, of Amgen Inc.,    a biotechnological company. Director since 1999 and member of    the nominating and governance committee and compensation and    executive committee, of Valero Energy Corporation, a crude oil    refining and marketing company. 
Previously, from 2006 to 2007,    Director and member of the compensation committee and audit    committee, of H&#038;R Block, a tax preparation services company.</td></tr><tr valign="bottom" style="line-height: 6pt"><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td><td>&nbsp;</td></tr><tr valign="bottom"><td align="left" valign="top">    Linda Hutton    Heagy<sup style="font-size: 85%; vertical-align: top">1</sup>    (1948)</td><td>&nbsp;</td><td nowrap="nowrap" align="left" valign="top">    Trustee</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="top">    &#134;</td><td>&nbsp;</td><td align="left" valign="top">    Retired. Prior to June 2008, Managing Partner of Heidrick &#038;    Struggles, the second largest global executive search firm, and    from 2001-2004, Regional Managing Director of U.S. operations at    Heidrick &#038; Struggles. Prior to 1997, Managing Partner of    Ray &#038; Berndtson, Inc., an executive recruiting firm. Prior    to 1995, Executive Vice President of ABN AMRO, N.A., a bank    holding company, with oversight for treasury management    operations including all non-credit product pricing. Prior to    1990, experience includes Executive Vice President of The    Exchange National Bank with oversight of treasury management    including capital markets operations, Vice President of Northern    Trust Company and a trainee at Price Waterhouse.</td><td>&nbsp;</td><td nowrap="nowrap" align="center" valign="top">    13</td><td>&nbsp;</td><td align="left" valign="top">    Trustee/Managing General Partner of funds in the Fund Complex.    Prior to 2010, Trustee on the University of Chicago Medical    Center Board, Vice Chair of the Board of the YMCA of    Metropolitan Chicago and a member of the Women&#146;s Board of    the University of Chicago.</td></tr></table>

Any help is much appreciated.

Решение

If you uncomment

print('{}: {}'.format(len(row), row))

in the code below, you'll see stuff like

11: ['', '', '', '', '', '', '', '', '', '', '']
11: ['', '', '', '', '', '', '', '', u'Number of', '', '']
11: ['', '', '', '', '', '', '', '', u'Funds in', '', '']
11: ['', '', '', '', '', '', '', '', u'Fund', '', '']
11: ['', '', u'Position(s)', '', u'Term of Office', '', '', '', u'Complex', '', u'Other Directorships']
11: [u'Name and Year of', '', u'Held with', '', u'and Length of', '', u'Principal Occupation(s)', '', u'Overseen', '', u'Held by Trustee']
11: [u'Birth of Trustee', '', u'Funds', '', u'Time Served', '', u'During the Past Five Years', '', u'by Trustee', '', u'During the Past Five Years']
1: ['']
11: [u'David C. Arch (1945)', '', u'Trustee', '', u'\x86', '', u'Chairman and Chief Executive Officer of Blistex Inc., a consumer\n    health care products manufacturer.Formerly: Member of the Heartland Alliance Advisory Board, a\n    nonprofit organization serving human needs based in Chicago.', '', u'136', '', u'Trustee/Managing General Partner of funds in the Fund Complex.\n    Board member of the Illinois Manufacturers\x92 Association.\n    Member of the Board of Visitors, Institute for the Humanities,\n    University of Michigan.']
11: ['', '', '', '', '', '', '', '', '', '', '']

This shows that the header is separated from the row data by a row of length 1:

1: ['']

So instead of using the bgcolor to identify rows to combine, perhaps you could use the length of the row as a signal that all previous rows need to be combined.

import bs4 as bs
import urllib2


def collapse(table):
    """Flatten a parsed table into rows, merging header rows into one.

    Rows are collected until a ``<tr>`` with exactly one cell is seen; that
    one-cell row is treated as a separator (its own text is discarded) and
    everything collected so far is merged into a single row by ``combine``.
    Rows remaining after the last separator are returned unmerged.

    Args:
        table: Parsed table; calling ``table('tr')`` must yield its rows,
            and each row must be callable the same way for ``'th'``/``'td'``.

    Returns:
        A list of rows, each a list of cell strings. Cells with a
        ``colspan`` are padded with empty strings so columns stay aligned;
        rows with no cells, or whose cells are all empty, are dropped.
    """
    result = []
    pending = []
    for tr in table('tr'):
        row = []
        for td in tr('th') + tr('td'):
            # A missing or non-numeric colspan means the cell spans one column.
            try:
                span = int(td['colspan'])
            except (KeyError, ValueError):
                span = 1
            datum = ''.join(td.stripped_strings)
            row.extend([datum] + [''] * (span - 1))
        if not row:
            continue
        # print('{}: {}'.format(len(row), row))
        if len(row) > 1:
            # Skip spacer rows whose cells are all empty.
            if any(row):
                pending.append(row)
        elif pending:
            # One-cell separator: merge what was collected. With nothing
            # pending there is nothing to merge, so no empty row is emitted.
            result.extend(combine(pending))
            pending = []
    result.extend(pending)
    return result


def combine(rows):
    """Merge several rows column by column into a single row.

    The non-empty cells of each column are joined with a single space;
    empty cells are skipped so the merged text has no leading, trailing or
    doubled spaces. Columns are paired with ``zip``, so the result is as
    wide as the shortest input row.

    Args:
        rows: A list of rows, each a list of cell strings.

    Returns:
        A list containing exactly one merged row (``[[]]`` when ``rows`` is
        empty), ready to be passed to ``list.extend``.
    """
    return [[' '.join(cell for cell in col if cell) for col in zip(*rows)]]

# url = 'http://www.sec.gov/Archives/edgar/data/5094/000095012313004020/h30303def14a.htm'
# soup = bs.BeautifulSoup(urllib2.urlopen(url))

# used for developing/debugging
# Parse a locally saved copy of the filing, then print each table's
# collapsed rows followed by a dashed divider line.
with open('/tmp/def14a.htm', 'r') as f:
    html = f.read()
soup = bs.BeautifulSoup(html)
divider = '-' * 80
for table in soup.find_all('table'):
    print(collapse(table))
    print(divider)

Лицензировано под: CC-BY-SA с атрибуция

Не связан с StackOverflow