You can still use `unwrap()`; you just need to get a bit recursive.
from bs4 import Tag
def unwrapper(tags, keep=('ul', 'li')):
    """Strip every tag whose name is not in *keep*, leaving its contents.

    Walks *tags* depth-first and calls ``unwrap()`` on each ``Tag`` whose
    ``name`` is not listed in *keep*, so the tag's children take its place
    in the parent. The tree is modified in place; nothing is returned.

    Args:
        tags: An iterable of nodes to process, e.g. a ``BeautifulSoup``
            object, a ``Tag``, or a list of tags. Items that are not
            ``Tag`` instances are left untouched.
        keep: Tag names that must be preserved. The same collection is
            applied at every nesting depth.
    """
    # Iterate over a snapshot: unwrap() edits the parent's child list, and
    # removing an empty tag from the live list would make the iterator skip
    # the sibling that follows it.
    for el in list(tags):
        if not isinstance(el, Tag):
            continue
        # Recurse first, unwrap later, so descendants are already cleaned
        # up before they are hoisted into the parent. Forward `keep` so a
        # caller-supplied whitelist is honoured at every level.
        unwrapper(el, keep)
        if el.name not in keep:
            el.unwrap()
Demo (this also needs `from bs4 import BeautifulSoup`):
s = '''"Hello <a>this</a> is <ul><li>me</li><li><b>Dr</b> Pablov</li></ul>"'''
soup = BeautifulSoup(s, 'html.parser') # force html.parser to avoid lxml's auto-inclusion of <html><body>
unwrapper(soup)
soup
Out[63]: "Hello this is <ul><li>me</li><li>Dr Pablov</li></ul>"
This approach should work on arbitrarily nested tags, e.g.:
s = '''"<a><b><ul><c><li><d>Hello</d></li></c></ul></b></a>"'''
soup = BeautifulSoup(s, 'html.parser')
unwrapper(soup)
soup
Out[19]: "<ul><li>Hello</li></ul>"