lxml expects your HTML to have a tree structure, i.e. a single root node. If the input does not have one, lxml adds one.
How to use Cleaner, lxml.html without returning div tag?
Question
I have this code:
from lxml.html.clean import Cleaner

# Untrusted markup used as the example input: a script element plus
# bold and italic text.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"

# Only <p>, <br> and <b> are listed in allow_tags.
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)

# print(...) with a single argument runs on both Python 2 and Python 3;
# the bare `print x` statement is a syntax error on Python 3.
print(cleaner.clean_html(evil))
I expected to get this:
<b>bold text</b>italic text
But instead I'm getting this:
<div><b>bold text</b>italic text</div>
Is there an attribute to remove the wrapping div tag?
Solution
OTHER TIPS
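Cleaner always wraps the result in an element. A good solution is to parse the HTML manually and pass the resulting document object to the Cleaner; the result is then also a document object, and you can use text_content() to extract the text from the root. Note that text_content() returns plain text only, so the remaining markup (such as the b tag) is not part of the output.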
from lxml.html import document_fromstring
from lxml.html.clean import Cleaner

# Untrusted markup used as the example input. The literal must stay on
# one line: a plain "..." string cannot contain a raw line break.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"

# Parse the markup explicitly so the Cleaner is given a document object
# rather than a string.
doc = document_fromstring(evil)
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)

# text_content() extracts the text from the cleaned document's root.
# print(...) with a single argument runs on both Python 2 and Python 3.
print(cleaner.clean_html(doc).text_content())
This can also be done as a one-liner.
Here is what I've come up with.
import lxml
from lxml.html.clean import Cleaner
def clean_html(html):
    """Sanitise *html* with a default ``Cleaner`` and drop an added root div.

    Before cleaning, ``check_is_wrap_in_div`` records whether the input
    already had a ``<div>`` root. If it did, the cleaned markup is returned
    as is; otherwise the cleaned markup is passed through
    ``remove_root_div`` first.

    Falsy input (``None``, ``''``) is returned unchanged without being
    cleaned.
    """
    if not html:
        return html

    had_root_div = check_is_wrap_in_div(html)
    sanitized = Cleaner().clean_html(html)
    if had_root_div:
        # The root div belongs to the caller's markup, so keep it.
        return sanitized
    return remove_root_div(sanitized)
def check_is_wrap_in_div(html):
    """Return True when *html* parses as XML whose root element is ``div``.

    The markup is parsed with ``lxml.etree.fromstring``. Input that the
    parser rejects with ``XMLSyntaxError`` is reported as not wrapped
    (``False``) instead of raising.

    NOTE(review): this is the XML parser, so HTML that is not well-formed
    XML (for example an unclosed ``<br>``) presumably lands in the
    ``XMLSyntaxError`` branch even when it does have a root div - confirm
    whether callers rely on that.
    """
    try:
        root = lxml.etree.fromstring(html)
    except lxml.etree.XMLSyntaxError:
        return False
    return root.tag == 'div'
def remove_root_div(html):
    """Strip one enclosing ``<div ...>`` ... ``</div>`` pair from *html*.

    The pattern only matches when, after optional leading whitespace, the
    string starts with ``<div``. The opening tag (up to its first ``>``)
    and the last ``</div>`` in the string, together with anything that
    follows it, are removed; what lay between them is returned. A string
    that does not start with a div is returned unchanged.

    Args:
        html: markup as a string.

    Returns:
        The markup without its outer div, or *html* itself when the
        pattern does not match.
    """
    # Imported locally because the imports that accompany this helper
    # only bring in lxml; without this the call below raises NameError.
    import re

    root_div_regex = r'^(\s*<div[\s\S]*?>)([\s\S]*)(<\/div>[\s\S]*?)$'
    # Group 2 is everything between the opening tag and the last </div>.
    return re.sub(root_div_regex, r'\2', html)
# Usage: pass the markup to sanitise to clean_html(). `evil_html` stands
# for the caller's own input string; it is not defined in this snippet.
cleaned_html = clean_html(evil_html)
And the unit tests:
# HTML fixtures shared by the tests below. The multi-line literals start at
# column 0 on purpose: their content is compared verbatim by the tests.

# Fragments whose single root element is a <div>.
_PLAIN_DIV = """
<div>
<p>Hello</p>
<p>Test</p>
</div>
"""

_DIV_INLINE_ATTRS = """
<div class="test" style="color: blue;">
<p>Hello</p>
<p>Test</p>
</div>
"""

_DIV_BRACKET_ON_OWN_LINE = """
<div
class="test"
style="color: blue;"
>
<p>Hello</p>
<p>Test</p>
</div>
"""

_DIV_MULTILINE_ATTRS = """
<div
class="test"
style="color: blue;">
<p>Hello</p>
<p>Test</p>
</div>
"""

_DIV_WITH_NESTED_SIBLINGS = """
<div
class="test"
style="color: blue;">
<div>
<div>
<p>Hello</p>
<p>Test</p>
</div>
</div>
<div>
<p>Hi</p>
</div>
</div>
"""

_DIV_AROUND_DIV = """
<div
class="test"
style="color: blue;">
<div>
<p>Hello</p>
<p>Test</p>
</div>
</div>
"""

# Same as _DIV_AROUND_DIV but without the newline before the opening tag.
_DIV_AROUND_DIV_NO_LEADING_NEWLINE = """<div
class="test"
style="color: blue;">
<div>
<p>Hello</p>
<p>Test</p>
</div>
</div>
"""

_TWO_PARAGRAPHS = """
<p>Hello</p>
<p>Test</p>
"""

_DIV_ROOTED = (
    '<div></div>',
    _PLAIN_DIV,
    _DIV_INLINE_ATTRS,
    _DIV_BRACKET_ON_OWN_LINE,
    _DIV_MULTILINE_ATTRS,
    _DIV_WITH_NESTED_SIBLINGS,
    _DIV_AROUND_DIV,
)

# (input, expected) pairs for remove_root_div; both sides are compared
# after .strip().
_ROOT_DIV_REMOVALS = (
    (_PLAIN_DIV, _TWO_PARAGRAPHS),
    (_DIV_INLINE_ATTRS, _TWO_PARAGRAPHS),
    (_DIV_BRACKET_ON_OWN_LINE, _TWO_PARAGRAPHS),
    (_DIV_MULTILINE_ATTRS, _TWO_PARAGRAPHS),
    (_DIV_AROUND_DIV, _PLAIN_DIV),
    (_DIV_AROUND_DIV_NO_LEADING_NEWLINE, _PLAIN_DIV),
)

# Fragments that do not start with a <div>: not div-wrapped, and
# remove_root_div must hand them back unchanged.
_NOT_DIV_ROOTED = (
    """
<body>
<div
class="test"
style="color: blue;">
<p>hello</p>
</div>
</body>
""",
    """
<p>HELLO</p>
<p>TEST</p>
""",
    """
<section>
<div>
<p>hello</p>
</div>
</section>
""",
    '<p>HELLO</p><p>TEST</p>',
    """
<body>
<div class="HELO">
<p>hello</p>
</div>
</body>
""",
    """
<body>
<div
class="test"
style="color: blue;"
>
<p>hello</p>
</div>
</body>
""",
)

# Fragments with several top-level elements, at least one of them a <div>;
# only used for the "not wrapped" check.
_SEVERAL_ROOTS = (
    """
<p>Hello</p>
<p>World</p>
<div class="testing">
Hello
</div>
""",
    """
<div>
<p>Hello</p>
<p>World</p>
<p>Hello</p>
<p>World</p>
</div>
<div> </div>
""",
    """
<div>
<div>
<p>Hello</p>
<p>World</p>
<p>Hello</p>
<p>World</p>
</div>
</div>
<span>
<div> </div>
</span>
""",
)


class TestBase(unittest.TestCase):
    """Tests for ``utils.check_is_wrap_in_div`` and ``utils.remove_root_div``.

    Every fixture runs in its own ``subTest`` so that one failing input is
    reported with its markup and does not hide the remaining cases.

    Assumes the enclosing test module imports ``unittest`` and the
    ``utils`` module under test; neither import is visible in this snippet.
    """

    def test_check_is_wrap_in_div(self):
        """Div-rooted fragments are detected; everything else is not."""
        for html in _DIV_ROOTED:
            with self.subTest('test html wrap in div', html=html):
                self.assertTrue(utils.check_is_wrap_in_div(html))

        for html in _NOT_DIV_ROOTED + _SEVERAL_ROOTS:
            with self.subTest('test html not wrap in div', html=html):
                self.assertFalse(utils.check_is_wrap_in_div(html))

    def test_remove_root_div(self):
        """The outer div is stripped; markup without one is left untouched."""
        # An empty div collapses to the empty string exactly (no strip()).
        with self.subTest('test remove root html', html='<div></div>'):
            self.assertEqual(utils.remove_root_div('<div></div>'), '')

        for html, expected in _ROOT_DIV_REMOVALS:
            with self.subTest('test remove root html', html=html):
                self.assertEqual(
                    utils.remove_root_div(html).strip(),
                    expected.strip(),
                )

        for html in _NOT_DIV_ROOTED:
            with self.subTest('test not root html', html=html):
                self.assertEqual(utils.remove_root_div(html), html)
Wrap the HTML string in a <div>…</div> and use a regex to strip the leading <div> and the trailing </div></body></html>\n that Cleaner appends:
#!/usr/bin/python3
"""Clean a fragment by wrapping it in a div and stripping the wrapper again."""
import re

from lxml.html.clean import Cleaner

evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"
# Add a root div ourselves so the wrapper to strip afterwards is known.
evil = "<div>{}</div>".format(evil)

cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'])
htmlstr = cleaner.clean_html(evil)

# Drop the leading <div>, then the trailing closing tags and newline.
for wrapper_pattern in (r'^<div>', r'</div></body></html>\n$'):
    htmlstr = re.sub(wrapper_pattern, '', htmlstr)

print(htmlstr)
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow