lxml 中文乱码解决

来源:互联网 发布:淘宝的安逸猿 编辑:程序博客网 时间:2024/06/03 13:59

原文  http://www.waitalone.cn/lxml-text.html

# -*- coding: utf_8 -*-
# Date: 2016/2/14
# Created by: 独自等待
# Blog: http://www.waitalone.cn/
"""Fetch a news page with lxml and save its paragraph text as UTF-8.

This is a Python 2 script (it relies on ``urllib2`` and ``unicode``).
The raw response bytes are decoded to ``unicode`` once, right after the
download, and the extracted text is encoded back to UTF-8 once, right
before it is written to disk.  Handling the encoding explicitly at these
two boundaries is what prevents garbled Chinese output, so no global
``sys.setdefaultencoding`` override is needed.
"""
import sys
import urllib2

from lxml import etree
from lxml.html.clean import Cleaner

# Page to scrape, XPath of the text nodes to keep, and where to save them.
NEWS_URL = 'http://www.qiandaoribao.com/news/95682'
PARAGRAPH_XPATH = '//div[@id="news-detail"]/p/text()'
OUTPUT_PATH = r'D:/1.txt'


def getText(url):
    '''
    Fetch ``url`` and return the cleaned page as a parsed lxml tree.

    The response body is decoded as UTF-8 before parsing; a page served in
    another encoding will raise ``UnicodeDecodeError`` or come out garbled.

    :param url: URL of the page to fetch
    :return: root element built by ``etree.HTML``; query it with ``.xpath()``
    '''
    response = urllib2.urlopen(url, timeout=10)
    try:
        raw = response.read()
    finally:
        # Release the socket even if reading the body fails.
        response.close()

    # Decode to unicode up front, otherwise the extracted text is garbled.
    page = unicode(raw, "utf-8")

    # Strip <style>/<script> content but keep the page structure and all
    # attributes, so the id-based XPath lookup still works afterwards.
    cleaner = Cleaner(style=True, scripts=True, page_structure=False,
                      safe_attrs_only=False)
    return etree.HTML(cleaner.clean_html(page))


def main():
    """Download the article, join its paragraph text, and write it to disk."""
    texts = getText(NEWS_URL).xpath(PARAGRAPH_XPATH)  # all paragraph text nodes

    # Drop every CR and LF.  Removing '\n' before '\r\n' (as an earlier
    # version did) left stray '\r' characters inside the text.
    body = u''.join(
        t.replace('\r', '').replace('\n', '').strip() for t in texts
    )

    # Encode once at the output boundary; binary mode writes the UTF-8
    # bytes unchanged.
    with open(OUTPUT_PATH, 'wb') as f:
        f.write(body.encode('utf-8'))


if __name__ == '__main__':
    sys.exit(main())