تحويل كيانات XML/HTML إلى سلسلة Unicode في Python [نسخة مكررة]

https://stackoverflow.com/questions/57708

09-06-2019
|

سؤال

هذا السؤال لديه بالفعل إجابة هنا:

فك تشفير كيانات HTML في سلسلة بايثون؟ 5 إجابات

أقوم ببعض عمليات تجريف الويب، وكثيرًا ما تستخدم المواقع كيانات HTML لتمثيل أحرف غير ASCII. هل لدى Python أداة مساعدة تأخذ سلسلة تحتوي على كيانات HTML وتُرجع سلسلة من نوع Unicode؟

على سبيل المثال:

أحصل على:

&#x01ce;

والذي يمثل الحرف "ǎ" مع علامة النغمة. في النظام الثنائي، يتم تمثيل ذلك بالقيمة 01ce ذات 16 بت. أريد تحويل كيان HTML إلى القيمة u'\u01ce'

الحل

يحتوي HTMLParser الخاص بالمكتبة القياسية على وظيفة غير موثقة unescape() والتي تفعل بالضبط ما تعتقد أنها تفعله:

import HTMLParser
# Demonstrates the parser object's unescape() method on a named entity and on
# a decimal character reference; both calls yield the same Unicode string.
# NOTE(review): a top-level `HTMLParser` module and the u'...' results shown
# below indicate Python 2 -- confirm the target interpreter before reuse.
h = HTMLParser.HTMLParser()
h.unescape('&copy; 2010') # u'\xa9 2010'
h.unescape('&#169; 2010') # u'\xa9 2010'

نصائح أخرى

لدى بايثون الوحدة النمطية htmlentitydefs، لكنها لا تتضمن دالة لإلغاء تهريب كيانات HTML.

مطور بايثون فريدريك لونده (مؤلف ElementTree، من بين أمور أخرى) لديه مثل هذه الدالة على موقعه على الإنترنت، وهي تعمل مع الكيانات العشرية والست عشرية والمسماة:

import re, htmlentitydefs

##
# Removes HTML or XML character references and entities from a text string.
#
# Decimal ("&#462;") and hexadecimal ("&#x1ce;" / "&#X1CE;") character
# references and the named entities listed in htmlentitydefs.name2codepoint
# ("&copy;") are replaced by the character they denote.  Anything that cannot
# be resolved (unknown name, malformed number, code point that unichr()
# rejects) is left in the text unchanged.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.

def unescape(text):
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[2:3] in ("x", "X"):
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: code point outside the range unichr() accepts.
                # OverflowError: number too large for the C integer that
                # unichr() takes (e.g. "&#99999999999999999999;").
                pass
        else:
            # named entity
            try:
                return unichr(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is
    # Numeric references are restricted to real ASCII digits so that int()
    # never sees text it would interpret leniently (or reject).
    return re.sub(r"&(?:#[xX][0-9a-fA-F]+|#[0-9]+|\w+);", fixup, text)

استخدم المدمج unichr - BeautifulSoup ليس ضروريًا:

>>> entity = '&#x01ce'
>>> unichr(int(entity[3:],16))
u'\u01ce'

بديل إذا كان لديك lxml:

>>> import lxml.html
>>> lxml.html.fromstring('&#x01ce').text
u'\u01ce'

إذا كنت تستخدم Python 3.4 أو أحدث، فيمكنك ببساطة استخدام html.unescape:

import html

# Replace the character references in `s` with the characters they stand for,
# using the standard library's html.unescape() (Python 3.4+ per the text above).
s = html.unescape(s)

يمكنك العثور على إجابة هنا -- الحصول على أحرف دولية من صفحة ويب؟

تعديل: يبدو أن BeautifulSoup لا يحوّل الكيانات المكتوبة بالصيغة الست عشرية. يمكن إصلاح ذلك:

import copy, re
from BeautifulSoup import BeautifulSoup

# Work on a shallow copy of BeautifulSoup.MARKUP_MASSAGE so the extra rule
# added below does not modify the list stored on the class.
hexentityMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
# replace hexadecimal character reference by decimal one
# Only hexadecimal digits are captured, so int(..., 16) cannot raise
# ValueError on text that merely looks like "&#x...;".
hexentityMassage += [(re.compile(r'&#x([0-9a-fA-F]+);'),
                     lambda m: '&#%d;' % int(m.group(1), 16))]

def convert(html):
    """Parse *html* and return the ``string`` of the first top-level node.

    The document is parsed with HTML entity conversion enabled and with the
    ``hexentityMassage`` rules applied to the markup.
    """
    soup = BeautifulSoup(
        html,
        convertEntities=BeautifulSoup.HTML_ENTITIES,
        markupMassage=hexentityMassage)
    first_node = soup.contents[0]
    return first_node.string

# The same code point written twice: once as a hexadecimal reference
# (0x01ce) and once as a decimal reference (462).
html = '<html>&#x01ce;&#462;</html>'
print repr(convert(html))
# u'\u01ce\u01ce'

تعديل:

قد تكون الدالة unescape() التي ذكرها @dF، والتي تستخدم الوحدة القياسية htmlentitydefs و unichr()، أكثر ملاءمة في هذه الحالة.

هذه هي الوظيفة التي من المفترض أن تساعدك على القيام بذلك بشكل صحيح وتحويل الكيانات مرة أخرى إلى أحرف utf-8.

def unescape(text):
    """Removes HTML or XML character references
       and entities from a text string.

    Numeric references (decimal and hexadecimal) and named entities found in
    ``htmlentitydefs.name2codepoint`` are replaced by the character they
    denote, except for ``&amp;``, ``&gt;`` and ``&lt;``, which are rewritten
    to the fixed replacement strings below.  References that cannot be
    resolved are returned unchanged.  Nothing is written to stdout.

    @param text The HTML (or XML) source text.
    @return The plain text, as a Unicode string, if necessary.
    from Fredrik Lundh
    2008-01-03: input only unicode characters string.
    http://effbot.org/zone/re-sub.htm#unescape-html
    """
    # NOTE(review): the replacement strings are reproduced exactly as found;
    # "&amp;" becoming "&amp;amp;" looks like double escaping -- confirm that
    # this is what downstream consumers expect.
    reserved = {
        "amp": "&amp;amp;",
        "gt": "&amp;gt;",
        "lt": "&amp;lt;",
    }

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[2:3] in ("x", "X"):
                    return unichr(int(ref[3:-1], 16))
                return unichr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Out-of-range or oversized code point: keep the reference.
                pass
        else:
            # named entity
            name = ref[1:-1]
            # reescape the reserved characters.
            if name in reserved:
                return reserved[name]
            try:
                return unichr(htmlentitydefs.name2codepoint[name])
            except KeyError:
                pass
        return ref  # leave as is
    # Numeric references are restricted to real ASCII digits so that int()
    # never sees text it would interpret leniently (or reject).
    return re.sub(r"&(?:#[xX][0-9a-fA-F]+|#[0-9]+|\w+);", fixup, text)

لست متأكدًا من سبب عدم تضمين موضوع Stack Overflow للفاصلة المنقوطة ';' في البحث/الاستبدال (أي lambda m: '&#%d*;*'). إذا لم تفعل ذلك، فقد يتعثر BeautifulSoup لأن الحرف المجاور يمكن تفسيره كجزء من كود HTML (أي: &#39B في &#39Blackout).

كان هذا أفضل بالنسبة لي:

import re
from BeautifulSoup import BeautifulSoup

# Sample markup: a hexadecimal reference (&#x27;) immediately followed by text.
html_string = '<a href="/cgi-bin/article.cgi?f=/c/a/2010/12/13/BA3V1GQ1CI.DTL"title="">&#x27;Blackout in a can; on some shelves despite ban</a>'

# Rewrite each hexadecimal character reference as the equivalent decimal one,
# keeping the terminating ';'.  Only hexadecimal digits are captured, so
# int(..., 16) cannot raise ValueError on text that merely looks like "&#x...;".
hexentityMassage = [(re.compile(r'&#x([0-9a-fA-F]+);'),
                     lambda m: '&#%d;' % int(m.group(1), 16))]

# Parse with entity conversion enabled, applying the rule above to the markup.
# NOTE(review): this list is passed on its own rather than appended to
# BeautifulSoup.MARKUP_MASSAGE -- confirm the default rules are not needed.
soup = BeautifulSoup(html_string,
                     convertEntities=BeautifulSoup.HTML_ENTITIES,
                     markupMassage=hexentityMassage)

يقوم int(m.group(1), 16) بتحويل الرقم (المحدد بتنسيق الأساس 16) مرة أخرى إلى عدد صحيح.
تقوم m.group(0) بإرجاع المطابقة بأكملها، بينما تقوم m.group(1) بإرجاع مجموعة الالتقاط regexp
استخدام markupMassage هو في الأساس نفس ما يلي:
html_string = re.sub('&#x([^;]+);', lambda m: '&#%d;' % int(m.group(1), 16), html_string)

الحل الآخر هو المكتبة المضمنة xml.sax.saxutils (لكل من html وxml).ومع ذلك، سيتم تحويل &gt و&amp و&lt فقط.

from xml.sax.saxutils import unescape

# Call xml.sax.saxutils.unescape() on the input; per the note above, only
# &gt;, &amp; and &lt; are converted by this call.
# NOTE: despite the names, `escaped_text` receives the *unescaped* result and
# `text_to_escape` is the text being unescaped.
escaped_text = unescape(text_to_escape)

هنا هو الإصدار بايثون 3 من إجابة dF:

import re
import html.entities

def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    Decimal (``&#462;``) and hexadecimal (``&#x1ce;`` / ``&#X1CE;``) character
    references and the named entities listed in
    ``html.entities.name2codepoint`` (``&copy;``) are replaced by the
    character they denote.  References that cannot be resolved -- unknown
    names, malformed numbers, code points that ``chr()`` rejects -- are left
    in the text unchanged.

    :param text:    The HTML (or XML) source text.
    :return:        The plain text, as a Unicode string, if necessary.
    """
    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[2:3] in ("x", "X"):
                    return chr(int(ref[3:-1], 16))
                return chr(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: code point outside the range chr() accepts.
                # OverflowError: number too large for the C integer that
                # chr() takes (e.g. "&#99999999999999999999;").
                pass
        else:
            # named entity
            try:
                return chr(html.entities.name2codepoint[ref[1:-1]])
            except KeyError:
                pass
        return ref  # leave as is

    # Numeric references are restricted to plain ASCII digits: int() would
    # otherwise accept "_" separators, a "0x" prefix and non-ASCII digits,
    # turning text such as "&#1_0;" into a character.
    return re.sub(r"&(?:#[xX][0-9a-fA-F]+|#[0-9]+|\w+);", fixup, text)

التغييرات الرئيسية هي أن htmlentitydefs أصبح الآن html.entities وأن unichr أصبح الآن chr. انظر دليل النقل إلى بايثون 3.

مرخصة بموجب: CC-BY-SA مع الإسناد

لا تنتمي إلى StackOverflow