用 Python 正则表达式解码 XML 实体(decode XML entities)

来源:互联网 发布:澳洲金融专业知乎 编辑:程序博客网 时间:2024/05/21 19:21

法1:

import re

# ``unichr`` exists only on Python 2; on Python 3 ``chr`` does the same job.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# Code points of the five named entities this decoder understands.
_XML_ENTITY_CODES = {
    'quot': 34,
    'amp': 38,
    'apos': 39,
    'lt': 60,
    'gt': 62,
}


def xmlEntityDecode(capture1=None, capture2=None, capture3=None):
    """Return the character denoted by one captured entity reference.

    The three arguments correspond to the three capture groups of the
    pattern used below; the first one that is not ``None`` wins.

    Args:
        capture1: Decimal digits of a ``&#NNN;`` reference.
        capture2: Hexadecimal digits of a ``&#xHHH;`` reference.
        capture3: Name of a named reference (``quot``, ``amp``, ``apos``,
            ``lt`` or ``gt``).

    Returns:
        A one-character (unicode) string.

    Raises:
        KeyError: If ``capture3`` is used and is not one of the known names.
        ValueError: If the digits cannot be parsed or the code point is
            outside the range accepted by ``chr``/``unichr``.
    """
    if capture1 is not None:
        charCode = int(capture1, 10)
    elif capture2 is not None:
        charCode = int(capture2, 16)
    else:
        charCode = _XML_ENTITY_CODES[capture3]
    return _unichr(charCode)


# Demo. The sample has to be an entity *reference* for the pattern to match;
# '&#931;' is the decimal character reference for the Greek capital sigma.
subject = '&#931;'
match = re.search("&(?:#([0-9]+)|#x([0-9a-fA-F]+)|([0-9a-zA-Z]+));", subject)
if match:
    capture1 = match.group(1)
    capture2 = match.group(2)
    capture3 = match.group(3)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(xmlEntityDecode(capture1=capture1, capture2=capture2, capture3=capture3))

法2:

import re

# The entity table moved between Python 2 and 3; ``name2codepoint`` has the
# same shape (entity name -> integer code point) in both.
try:
    from html.entities import name2codepoint as _name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint as _name2codepoint

# ``unichr`` exists only on Python 2; on Python 3 ``chr`` does the same job.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# A tag (``<...>``) or something shaped like a character/entity reference.
_MARKUP_RE = re.compile(r"(?s)<[^>]*>|&#?\w+;")


def strip_html(text):
    """Remove HTML markup from a text string.

    Tags are dropped, and numeric (``&#NNN;``, ``&#xHHH;``) and named
    (``&amp;``) references are replaced by the character they denote.
    References that cannot be decoded (unknown name, malformed or
    out-of-range number) are left in the output unchanged.

    Args:
        text: The HTML source.

    Returns:
        The plain text. If the source contains non-ASCII entities or
        character references, the result contains the corresponding
        Unicode characters.
    """
    def fixup(m):
        token = m.group(0)
        if token[:1] == "<":
            return ""  # ignore tags
        if token[:2] == "&#":
            try:
                # Accept both "&#x" and "&#X" as the hexadecimal prefix.
                if token[:3].lower() == "&#x":
                    return _unichr(int(token[3:-1], 16))
                return _unichr(int(token[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point: keep the text.
                pass
        else:
            # The pattern guarantees the token starts with "&" here.
            codepoint = _name2codepoint.get(token[1:-1])
            if codepoint is not None:
                return _unichr(codepoint)
        return token  # leave as is

    return _MARKUP_RE.sub(fixup, text)

附:

HTML entities

0 0
原创粉丝点击