Question

I have this code:

# Sample markup: a <script> element followed by bold and italic text.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"
# Cleaner is presumably lxml.html.clean.Cleaner (the import is not shown in
# this snippet); only <p>, <br> and <b> are listed in allow_tags.
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)
# Python 2 print statement: output the cleaned markup.
print cleaner.clean_html(evil)

I expected to get this:

<b>bold text</b>italic text

But instead I'm getting this:

<div><b>bold text</b>italic text</div>

Is there an attribute to remove the div tag wrapper?

Was it helpful?

Solution

lxml expects your HTML to have a tree structure, i.e. a single root node. If the input does not have one, lxml adds a wrapper element to act as the root.

OTHER TIPS

Cleaner always wraps the result in an element. A good solution is to parse the HTML manually and pass the resulting document object to the cleaner; the result is then also a document object, and you can call text_content() on it to extract the text from the root.

from lxml.html import document_fromstring
from lxml.html.clean import Cleaner

# Keep the literal on a single line: a plain quoted string cannot contain a
# raw line break, so splitting it is a syntax error.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"

# Parse first so the cleaner is handed a document object instead of a string.
doc = document_fromstring(evil)
cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'],
                  page_structure=True)

# text_content() yields the text of the cleaned tree without any markup.
# The parenthesised form is valid both as a Python 2 print statement and as
# a Python 3 print() call.
print(cleaner.clean_html(doc).text_content())

This can also be done as a one liner

Here is what I've come up with.

import re

import lxml
from lxml.html.clean import Cleaner

def clean_html(html):
    """Clean *html* with a default ``Cleaner`` and drop a root ``<div>`` the input did not have.

    Falsy input (``None``, ``''``) is returned untouched. Otherwise the
    markup is run through ``Cleaner().clean_html``; the outermost ``<div>``
    of the cleaned result is then stripped unless ``check_is_wrap_in_div``
    reported that the original input was already wrapped in one.
    """
    if not html:
        return html

    # Must be decided before cleaning, while the original markup is still
    # available for inspection.
    had_root_div = check_is_wrap_in_div(html)
    cleaned = Cleaner().clean_html(html)
    return cleaned if had_root_div else remove_root_div(cleaned)


def check_is_wrap_in_div(html):
    """Return True when *html* parses as XML and its root element is a ``<div>``.

    The markup is parsed with ``lxml.etree.fromstring``; input that the
    parser rejects with ``XMLSyntaxError`` is reported as not wrapped
    (``False``) rather than raising.
    """
    try:
        root = lxml.etree.fromstring(html)
    except lxml.etree.XMLSyntaxError:
        # Not well-formed XML, so it cannot be a single root <div>.
        return False
    return root.tag == 'div'


def remove_root_div(html):
    """Strip the outermost ``<div>`` wrapper from *html* and return its contents.

    The string must start (after optional whitespace) with an opening
    ``<div ...>`` tag. Everything between that tag and the last ``</div>``
    is returned; anything following the last ``</div>`` is discarded. When
    the string does not start with a ``<div>`` tag, or contains no closing
    ``</div>``, it is returned unchanged.
    """
    root_div_regex = (
        # Opening tag. The lookahead requires whitespace or '>' right after
        # the name so that tags merely starting with "div" (e.g. <divider>)
        # are not mistaken for a <div>.
        r'\A\s*<div(?=[\s>])[^>]*>'
        # Contents: greedy, so it extends to the LAST closing tag.
        r'([\s\S]*)'
        # Closing tag plus any trailing text. \Z anchors at the absolute end
        # of the string; '$' would also match just before a final newline
        # and leave that newline behind in the result.
        r'</div>[\s\S]*\Z'
    )
    match = re.match(root_div_regex, html)
    return match.group(1) if match else html

# Example usage. evil_html stands for the caller's input markup; it is not
# defined anywhere in this snippet.
cleaned_html = clean_html(evil_html)

And the unit tests:

class TestBase(unittest.TestCase):
    """Unit tests for ``utils.check_is_wrap_in_div`` and ``utils.remove_root_div``.

    Each fixture runs inside its own ``subTest`` so that a failing fragment
    is reported individually (with the offending markup) and does not stop
    the remaining fixtures from being checked.
    """

    # Fragments that do not consist of a single root <div>. The wrap check
    # must reject them, and remove_root_div must return them unchanged.
    NOT_DIV_ROOTED = [
        """
            <body>
                <div
                    class="test"
                    style="color: blue;">
                    <p>hello</p>
                </div>
            </body>
        """,
        """
            <p>HELLO</p>
            <p>TEST</p>
        """,
        """
            <section>
                <div>
                    <p>hello</p>
                </div>
            </section>
        """,
        '<p>HELLO</p><p>TEST</p>',
        """
            <body>
                <div class="HELO">
                    <p>hello</p>
                </div>
            </body>
        """,
        """
            <body>
                <div
                    class="test"
                    style="color: blue;"
                >
                    <p>hello</p>
                </div>
            </body>
        """,
    ]

    def test_check_is_wrap_in_div(self):
        """The check is true only for markup whose single root is a <div>."""
        wrapped = [
            '<div></div>',
            """
                <div>
                    <p>Hello</p>
                    <p>Test</p>
                </div>
            """,
            """
                <div class="test" style="color: blue;">
                    <p>Hello</p>
                    <p>Test</p>
                </div>
            """,
            """
                <div
                    class="test"
                    style="color: blue;"
                >
                    <p>Hello</p>
                    <p>Test</p>
                </div>
            """,
            """
                <div
                    class="test"
                    style="color: blue;">
                    <p>Hello</p>
                    <p>Test</p>
                </div>
            """,
            """
                <div
                    class="test"
                    style="color: blue;">
                    <div>
                        <div>
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                    </div>
                    <div>
                        <p>Hi</p>
                    </div>
                </div>
            """,
            """
                <div
                    class="test"
                    style="color: blue;">
                    <div>
                        <p>Hello</p>
                        <p>Test</p>
                    </div>
                </div>
            """,
        ]
        for html in wrapped:
            with self.subTest('test html wrap in div', html=html):
                self.assertTrue(utils.check_is_wrap_in_div(html))

        # Besides the shared fixtures, cover fragments that contain a <div>
        # but have more than one top-level element.
        not_wrapped = self.NOT_DIV_ROOTED + [
            """
                <p>Hello</p>
                <p>World</p>
                <div class="testing">
                    Hello
                </div>
            """,
            """
                <div>
                    <p>Hello</p>
                    <p>World</p>
                    <p>Hello</p>
                    <p>World</p>
                </div>
                <div> </div>
            """,
            """
                <div>
                    <div>
                        <p>Hello</p>
                        <p>World</p>
                        <p>Hello</p>
                        <p>World</p>
                    </div>
                </div>
                <span>
                    <div> </div>
                </span>
            """,
        ]
        for html in not_wrapped:
            with self.subTest('test html not wrap in div', html=html):
                self.assertFalse(utils.check_is_wrap_in_div(html))

    def test_remove_root_div(self):
        """Only the outermost <div> is removed; other markup is left as is."""
        # Exact comparison (no strip): an empty root <div> yields ''.
        with self.subTest('test remove root html', html='<div></div>'):
            self.assertEqual(utils.remove_root_div('<div></div>'), '')

        # Expected contents. Inner-line indentation must match the inputs
        # below, because only the outer ends are stripped before comparing.
        paragraphs = """
                            <p>Hello</p>
                            <p>Test</p>
        """
        inner_div = """
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
        """
        cases = [
            (
                """
                        <div>
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                """,
                paragraphs,
            ),
            (
                """
                        <div class="test" style="color: blue;">
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                """,
                paragraphs,
            ),
            (
                """
                        <div
                            class="test"
                            style="color: blue;"
                        >
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                """,
                paragraphs,
            ),
            (
                """
                        <div
                            class="test"
                            style="color: blue;">
                            <p>Hello</p>
                            <p>Test</p>
                        </div>
                """,
                paragraphs,
            ),
            (
                """
                        <div
                            class="test"
                            style="color: blue;">
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
                        </div>
                """,
                inner_div,
            ),
            (
                # No whitespace before the opening tag.
                """<div
                        class="test"
                        style="color: blue;">
                            <div>
                                <p>Hello</p>
                                <p>Test</p>
                            </div>
                        </div>
                """,
                inner_div,
            ),
        ]
        for html, expected in cases:
            with self.subTest('test remove root html', html=html):
                self.assertEqual(
                    utils.remove_root_div(html).strip(),
                    expected.strip(),
                )

        for html in self.NOT_DIV_ROOTED:
            with self.subTest('test not root html', html=html):
                self.assertEqual(utils.remove_root_div(html), html)

Wrap the HTML string in a <div></div> and use regex to strip the leading <div> and trailing </div></body></html>\n that Cleaner appends:

#!/usr/bin/python3

from lxml.html.clean import Cleaner
import re

# Wrap the fragment in an explicit <div> up front so the shape of the
# cleaner's wrapper is known and can be stripped again afterwards.
evil = "<script>malignus script</script><b>bold text</b><i>italic text</i>"
evil = "<div>" + evil + "</div>"

cleaner = Cleaner(remove_unknown_tags=False, allow_tags=['p', 'br', 'b'])
htmlstr = cleaner.clean_html(evil)

# Strip the leading wrapper first, then the trailing one; each pattern is
# anchored, so it removes at most that one occurrence.
for wrapper_pattern in (r'^<div>', r'</div></body></html>\n$'):
    htmlstr = re.sub(wrapper_pattern, '', htmlstr)

print(htmlstr)
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top