中文的处理
来源:互联网 发布:淘宝黑设备 编辑:程序博客网 时间:2024/06/05 11:04
# Demo: decode HTML numeric character references into the characters they
# stand for.
try:
    import HTMLParser  # Python 2 module name

    html_parser = HTMLParser.HTMLParser()
    unescape = html_parser.unescape
except ImportError:
    # Python 3: the module was renamed to html.parser and the
    # HTMLParser.unescape() method was removed in 3.9; html.unescape() is
    # the replacement.
    from html import unescape

# The references must be written without embedded spaces ("&#23567;", not
# "& # 23567;"); with spaces they are not recognised as character references
# and the string would come back unchanged.
s = unescape('&#23567;&#23376;')
print(s)
#coding:utf-8
import urllib
import re
from urllib import quote
from uConvert import parser as u


def getWeiboInfo(keyword):
    """Download a page, pull out the post blocks and save them to ``result.inc``.

    The page body is read from ``url``, the escaped sequences ``\\"``,
    ``\\n`` and ``\\/`` are replaced with ``"``, nothing and ``/``, and every
    span between ``<dd class="content">`` and
    ``<p class="info W_linkb W_textb">`` is treated as one post.  For each
    post the ``src`` of its ``<img class="">`` and ``<img class="bigcursor">``
    tags is collected, with ``/square/`` and ``/thumbnail/`` path segments
    rewritten to ``/bmiddle/``.  Each post is written as its text (passed
    through ``u`` and encoded as UTF-8), a newline, one image URL per line,
    and a trailing newline.

    NOTE(review): ``url`` is not defined anywhere in the visible code, and
    ``keyword`` and the imported ``quote`` are never used.  Presumably the
    request URL was meant to be built from ``quote(keyword)`` -- confirm and
    restore the missing assignment.

    :param keyword: search keyword (currently unused, see note above).
    :return: None.  The result is written to ``result.inc`` in the current
        working directory, overwriting any existing file.
    """
    page = urllib.urlopen(url).read()
    # Undo the escaping applied to the embedded markup, in the same order as
    # before: \" -> ", \n -> (removed), \/ -> /
    for escaped, plain in (('\\"', '"'), ('\\n', ''), ('\\/', '/')):
        page = page.replace(escaped, plain)

    posts = re.findall(
        r'<dd class="content">([\s\S]*?)<p class="info W_linkb W_textb">',
        page,
    )
    print(repr(posts))

    entries = []
    for post in posts:
        images = re.findall(r'<img class="" src="(.*?)"', post)
        images += re.findall(r'<img class="bigcursor" src="(.*?)"', post)
        if images:
            print("Images Found...")
            # Point thumbnail-sized variants at the medium-sized image path.
            images = [
                src.replace("/square/", "/bmiddle/")
                   .replace("/thumbnail/", "/bmiddle/")
                for src in images
            ]
        else:
            print("No Image...")
        # With no images the join is empty, leaving a blank line after the
        # post text, exactly as before.
        entries.append(
            u(post).encode("utf-8") + "\n" + "\n".join(images) + "\n"
        )

    # The context manager closes the file even if the write fails.
    with open("result.inc", 'w') as f:
        f.write("".join(entries))
# States of the scanner used by parser().
STATE_NORMAL = -1   # not inside an escape sequence
STATE_SLASH = 0     # a backslash has been read and is still pending
STATE_BEGIN = 1     # "\u" has been read, no hex digit yet
STATE_UNICODE = 2   # collecting the four hex digits that follow "\u"

_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns a Unicode character


def parser(inpstr):
    r"""Replace ``\uXXXX`` escape sequences in *inpstr* with their characters.

    The input is scanned one character at a time.  A backslash, a ``u`` and
    exactly four hexadecimal digits are replaced by the character with that
    code point.  Everything else is copied through unchanged, including:

    * a backslash followed by any character other than ``u``;
    * ``\u`` followed by four characters that are not all hex digits;
    * a ``\u`` sequence cut short by another backslash (the new backslash
      may itself start an escape) or by the end of the input.

    :param inpstr: text that may contain literal ``\uXXXX`` sequences.
    :return: the text with every complete, valid sequence decoded.
    """
    pieces = []
    state = STATE_NORMAL
    unicode_hex = ''

    for char in inpstr:
        if state == STATE_NORMAL:
            if char == '\\':
                state = STATE_SLASH
            else:
                pieces.append(char)
        elif state == STATE_SLASH:
            if char == 'u':
                state = STATE_BEGIN
                unicode_hex = ''
            elif char == '\\':
                # The pending backslash is literal; this one may still
                # start an escape, so stay in STATE_SLASH.
                pieces.append('\\')
            else:
                # Not a \u escape: emit the backslash in place, not at
                # the end of the output.
                pieces.append('\\' + char)
                state = STATE_NORMAL
        else:
            # STATE_BEGIN or STATE_UNICODE: gathering hex digits.
            if char == '\\':
                # Sequence cut short: emit what was read verbatim and
                # treat this backslash as a possible new escape.
                pieces.append('\\u' + unicode_hex)
                state = STATE_SLASH
                continue
            unicode_hex += char
            state = STATE_UNICODE
            if len(unicode_hex) == 4:
                # Check the digits explicitly: int(x, 16) would also accept
                # forms such as "0x1f" or "+fff".
                if all(digit in _HEX_DIGITS for digit in unicode_hex):
                    pieces.append(_unichr(int(unicode_hex, 16)))
                else:
                    pieces.append('\\u' + unicode_hex)
                state = STATE_NORMAL

    # Flush an escape sequence left unfinished at the end of the input.
    if state >= STATE_SLASH:
        pieces.append('\\')
    if state >= STATE_BEGIN:
        pieces.append('u')
    if state == STATE_UNICODE:
        pieces.append(unicode_hex)
    return ''.join(pieces)