Python: Some Crawler Examples

来源:互联网 发布:备忘录软件哪个好 编辑:程序博客网 时间:2024/06/05 23:06

re

xpath

JSON parsing

Using the Chrome driver

Demo 1 (the simplest one, using urllib and re):

# encoding=utf-8
# Look up a word on Youdao and print its translation inside a starred banner.
import contextlib
import re

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
except ImportError:  # Python 2, which the original snippet targeted
    from urllib import quote, urlopen

# Body of the first <div class="trans-container"> ... </div> on the page.
_TRANS_RE = re.compile(r'<div class="trans-container">(.*?)</div>', re.S | re.M)
# Any HTML tag; [^>]* (unlike .*?) also matches tags that wrap across lines.
_TAG_RE = re.compile(r'<[^>]*>')


def youdao(keyword, page=None):
    """Print and return the Youdao translation of ``keyword``.

    The text of the first ``trans-container`` block is stripped of HTML tags,
    printed between two rows of asterisks as wide as its longest line, and
    returned.

    Args:
        keyword: The word or phrase to look up. It is percent-encoded before
            being appended to the URL, so non-ASCII text and spaces are safe.
        page: Optional HTML of the result page (text or bytes). When omitted,
            the page is downloaded from www.youdao.com.

    Returns:
        The string ``'\\n<keyword> : <translation>\\n'``.

    Raises:
        IndexError: If the page contains no ``trans-container`` block.
    """
    if page is None:
        url = 'http://www.youdao.com/w/eng/' + quote(keyword)
        # closing() releases the connection even if read() fails.
        with contextlib.closing(urlopen(url)) as response:
            page = response.read()

    # On Python 3 the download is bytes while the patterns are text. Decode
    # once here; assumes the page is UTF-8, with undecodable bytes replaced
    # rather than raising. On Python 2 ``bytes is str`` and nothing changes.
    if bytes is not str and isinstance(page, bytes):
        page = page.decode('utf-8', 'replace')

    find_result = _TRANS_RE.findall(page)
    if not find_result:
        raise IndexError('no translation block found for %r' % (keyword,))

    return_string = _TAG_RE.sub('', find_result[0].strip()).strip()

    # The banner is as wide as the longest line of the translation.
    num = max(map(len, return_string.split('\n')))
    banner = '*' * num
    print(banner)
    print(return_string)
    print(banner)
    return '\n' + keyword + ' : ' + return_string + '\n'


if __name__ == '__main__':
    # Demo lookups; guarded so importing this module does not hit the network.
    youdao('你好')
    youdao('hello')

Demo 2 (XPath with lxml):

# Fetch a page and pull data out of it with an XPath expression (needs lxml).
import contextlib

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2, which the original snippet targeted
    from urllib import urlopen

from lxml import etree

url = "http://www.dioenglish.com/home.php?mod=space&uid=114322&do=blog&id=55535"
xp = '//div[@id="blog_article"]'


def get(url, xp):
    """Download ``url`` and evaluate the XPath expression ``xp`` against it.

    Args:
        url: Address of the HTML page to fetch.
        xp: XPath expression. If it ends in ``()`` (for example
            ``//div[@id="blog_article"]/p/span/font/text()``) the raw XPath
            result is returned. Otherwise it is treated as an element query
            and the text of the first match is extracted.

    Returns:
        For an expression ending in ``()``, whatever ``xpath()`` returns.
        Otherwise the string value of the first matching element, encoded
        as UTF-8.

    Raises:
        IndexError: If an element query matches nothing.
    """
    # closing() releases the connection even if read() fails.
    with contextlib.closing(urlopen(url)) as response:
        t = response.read()
    sele = etree.HTML(t)

    if xp.endswith('()'):
        # The expression already yields values, so hand them back untouched.
        return sele.xpath(xp)

    content = sele.xpath(xp)
    if not content:
        raise IndexError('XPath %r matched nothing at %s' % (xp, url))
    # string(.) concatenates all descendant text of the first matching node.
    return content[0].xpath('string(.)').encode('utf-8')