If you uncomment
print('{}: {}'.format(len(row), row))
in the code below, you'll see stuff like
11: ['', '', '', '', '', '', '', '', '', '', '']
11: ['', '', '', '', '', '', '', '', u'Number of', '', '']
11: ['', '', '', '', '', '', '', '', u'Funds in', '', '']
11: ['', '', '', '', '', '', '', '', u'Fund', '', '']
11: ['', '', u'Position(s)', '', u'Term of Office', '', '', '', u'Complex', '', u'Other Directorships']
11: [u'Name and Year of', '', u'Held with', '', u'and Length of', '', u'Principal Occupation(s)', '', u'Overseen', '', u'Held by Trustee']
11: [u'Birth of Trustee', '', u'Funds', '', u'Time Served', '', u'During the Past Five Years', '', u'by Trustee', '', u'During the Past Five Years']
1: ['']
11: [u'David C. Arch (1945)', '', u'Trustee', '', u'\x86', '', u'Chairman and Chief Executive Officer of Blistex Inc., a consumer\n health care products manufacturer.Formerly: Member of the Heartland Alliance Advisory Board, a\n nonprofit organization serving human needs based in Chicago.', '', u'136', '', u'Trustee/Managing General Partner of funds in the Fund Complex.\n Board member of the Illinois Manufacturers\x92 Association.\n Member of the Board of Visitors, Institute for the Humanities,\n University of Michigan.']
11: ['', '', '', '', '', '', '', '', '', '', '']
This shows that the header is separated from the row data by a row of length 1:
1: ['']
So instead of using the bgcolor attribute
to identify rows to combine, you could use a row of length 1 as the signal that all previously accumulated rows need to be combined.
import bs4 as bs
import urllib2
def collapse(table):
    """Flatten a bs4 ``<table>`` into a list of text rows, merging split headers.

    Every ``<th>``/``<td>`` contributes one cell; a ``colspan`` of *n* pads
    the row with ``n - 1`` empty strings so all rows line up column-wise.
    A row of length 1 acts as a separator: all rows accumulated since the
    previous separator are merged column-by-column (cells joined with a
    space) into a single row.  Rows that are entirely empty are dropped.
    """
    merged_rows = []
    pending = []
    for tr in table('tr'):
        cells = []
        for cell in tr('th') + tr('td'):
            try:
                width = int(cell['colspan'])
            except KeyError:
                width = 1
            text = ''.join(cell.stripped_strings)
            cells.extend([text] + [''] * (width - 1))
        if not cells:
            continue
        if len(cells) > 1:
            if any(cells):
                pending.append(cells)
        else:
            # Length-1 row: flush the accumulated fragment rows as one
            # merged row (column-wise join, same result as combine()).
            merged_rows.append([' '.join(col) for col in zip(*pending)])
            pending = []
    if pending:
        merged_rows.extend(pending)
    return merged_rows
def combine(rows):
    """Merge several fragment rows into one row, column by column.

    The cells in each column are joined with a single space.  The merged
    row is wrapped in a one-element list so the caller can splice it into
    a row list with ``extend``.
    """
    merged = []
    for column in zip(*rows):
        merged.append(' '.join(column))
    return [merged]
# url = 'http://www.sec.gov/Archives/edgar/data/5094/000095012313004020/h30303def14a.htm'
# soup = bs.BeautifulSoup(urllib2.urlopen(url))

# Local copy of the filing, used while developing/debugging so we do not
# re-fetch it from the SEC server on every run.
with open('/tmp/def14a.htm', 'r') as handle:
    soup = bs.BeautifulSoup(handle.read())

# Collapse each table in the document and print it, separated by a rule.
for table in soup.find_all('table'):
    print(collapse(table))
    print('-' * 80)