中文的处理

来源：互联网 | 发布：淘宝黑设备 | 编辑：程序博客网 | 时间：2024/06/05 11:04

# Demo: decode HTML numeric character references into the characters they
# stand for.
try:
    import HTMLParser  # Python 2 module name
except ImportError:
    # Python 3 has no HTMLParser module; html.unescape does the same job.
    from html import unescape
else:
    html_parser = HTMLParser.HTMLParser()
    unescape = html_parser.unescape

# The references must be written without embedded spaces ("&#23567;", not
# "& # 23567;"); otherwise they are not recognised and come back unchanged.
s = unescape('&#23567;&#23376;')
print(s)

#coding:utf-8
import urllib
import re
from urllib import quote
from uConvert import parser as u


def getWeiboInfo(keyword):
    """Fetch a page, extract each post's markup and image URLs, and save them.

    For every ``<dd class="content">`` fragment found in the downloaded
    page, the fragment is passed through ``u`` (the ``\\uXXXX`` decoder
    imported from ``uConvert``), encoded as UTF-8 and written to
    ``result.inc`` followed by the URLs of the images found in it, one per
    line. Image URLs pointing at ``/square/`` or ``/thumbnail/`` variants
    are rewritten to the ``/bmiddle/`` variant.

    Args:
        keyword: Search keyword. NOTE(review): it is not used by the visible
            code; see the note on ``url`` below.

    Returns:
        None. The result is written to ``result.inc`` in the current
        working directory, replacing any previous content.
    """
    # NOTE(review): `url` is not defined anywhere in the visible source, and
    # neither `keyword` nor the imported `quote` is used. Presumably the line
    # that built the URL from quote(keyword) was lost -- confirm and restore
    # it; until then this call raises NameError unless a global `url` exists.
    response = urllib.urlopen(url)
    try:
        webcontent = response.read()
    finally:
        response.close()

    # Undo the backslash escaping of quotes, newlines and slashes so the
    # patterns below can match plain HTML.
    webcontent = webcontent.replace('\\"', '"')
    webcontent = webcontent.replace('\\n', '')
    webcontent = webcontent.replace('\\/', '/')

    txts = re.findall(
        r'<dd class="content">([\s\S]*?)<p class="info W_linkb W_textb">',
        webcontent)
    print(repr(txts))

    entries = []
    for txt in txts:
        imageList = re.findall('<img class="" src="(.*?)"', txt)
        imageList += re.findall('<img class="bigcursor" src="(.*?)"', txt)

        if imageList:
            print("Images Found...")
        else:
            print("No Image...")

        # Point every image at its medium-sized ("bmiddle") variant.
        imageList = [
            src.replace("/square/", "/bmiddle/")
               .replace("/thumbnail/", "/bmiddle/")
            for src in imageList
        ]

        entries.append(
            u(txt).encode("utf-8") + "\n" + "\n".join(imageList) + "\n")

    # `with` guarantees the file is closed even if the write fails.
    with open("result.inc", 'w') as f:
        f.write("".join(entries))

# Scanner states. The numeric order matters: the end-of-input handling in
# parser() compares states with >= to decide how much of an unfinished
# escape has to be written back.
STATE_NORMAL = -1   # outside any escape sequence
STATE_SLASH = 0     # a backslash has been read
STATE_BEGIN = 1     # "\u" has been read, no hex digit yet
STATE_UNICODE = 2   # "\u" plus one to three hex digits have been read

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def parser(inpstr):
    """Replace every ``\\uXXXX`` escape in *inpstr* with its character.

    An escape is a backslash, the letter ``u`` and exactly four hexadecimal
    digits. Everything else is copied through unchanged, including
    backslashes that do not start such an escape and escapes that are
    malformed or cut off by the end of the input.

    Args:
        inpstr: Text that may contain literal ``\\uXXXX`` sequences.

    Returns:
        The text with all well-formed escapes decoded.
    """
    pieces = []
    state = STATE_NORMAL
    unicode_hex = ''

    for char in inpstr:
        if state == STATE_SLASH:
            if char == 'u':
                state = STATE_BEGIN
                unicode_hex = ''
                continue
            # Not a \u escape: keep the backslash and handle this character
            # as ordinary input below.
            pieces.append('\\')
            state = STATE_NORMAL
        elif state in (STATE_BEGIN, STATE_UNICODE):
            if char in _HEX_DIGITS:
                unicode_hex += char
                state = STATE_UNICODE
                if len(unicode_hex) == 4:
                    pieces.append(_unichr(int(unicode_hex, 16)))
                    state = STATE_NORMAL
                continue
            # Malformed escape: write back what was consumed so far and
            # handle this character as ordinary input below.
            pieces.append('\\u' + unicode_hex)
            state = STATE_NORMAL

        if char == '\\':
            state = STATE_SLASH
        else:
            pieces.append(char)

    # The input ended inside an escape: write back the consumed prefix.
    if state >= STATE_SLASH:
        pieces.append('\\')
    if state >= STATE_BEGIN:
        pieces.append('u')
    if state == STATE_UNICODE:
        pieces.append(unicode_hex)

    return ''.join(pieces)