Get every `strong` tag inside a `p`, then take its parent and walk the parent's following siblings until you reach another `p` tag with a `strong` tag inside, or until there are no more siblings left:
from lxml.html import fromstring
# Sample markup: a <p> wrapping a <strong> acts as a section heading, and the
# sibling elements that follow it (up to the next heading) are its description.
html_data = """<div class="TabItem">
<p><strong>Product Composition</strong></p>
<p>93% Polyamide 7% Elastane</p>
<p>Lining: 100% Polyester</p><p>Dress Length: 90 cm</p>
<p><strong>Product Attributes;</strong></p>
<p>: Boat Neck, Long Sleeve, Midi, Zip, Concealed, Laced, Side</p>
<p>Lining Type: Full Lining</p>
</div>"""

tree = fromstring(html_data)

# Maps each heading's text to the space-joined text of the elements below it.
data = {}
for strong in tree.xpath('//p/strong'):
    parent = strong.getparent()
    description = []
    next_p = parent.getnext()
    # Walk the heading's following siblings; stop at the next sibling that
    # contains a <strong> (the next heading) or when siblings run out.
    while next_p is not None and not next_p.xpath('.//strong'):
        # .text is None when the element is empty or starts with a child tag;
        # skip those so " ".join() below does not raise TypeError.
        if next_p.text is not None:
            description.append(next_p.text)
        next_p = next_p.getnext()
    data[strong.text] = " ".join(description)

# Parenthesized single-argument form prints the same on Python 2 and Python 3.
print(data)
Prints:
{'Product Composition': '93% Polyamide 7% Elastane Lining: 100% Polyester Dress Length: 90 cm',
'Product Attributes;': ': Boat Neck, Long Sleeve, Midi, Zip, Concealed, Laced, Side Lining Type: Full Lining'}