python 爬虫百度搜索结果

来源:互联网 发布:高原红怎么去除知乎 编辑:程序博客网 时间:2024/04/26 08:54

因为合作的关系,用户在百度上搜索一些检索词时,百度会把我们网站放到搜索结果的第一位,但是经过实践发现,好多关键字都不是这样的。

所以写了一个小爬虫来检验2000-3000个关键字的移动和pc端百度搜索结果。

利用get的方法请求url,然后对返回的页面数据进行正则匹配,

找出第一位的搜索结果是不是含有我们网站的链接。

python 

# coding=utf-8
r"""Check Baidu mobile (wap/h5) search rankings for a list of keywords.

For each keyword read from D:\test.txt, fetch the Baidu mobile search
results page and check via regex whether the FIRST result links to our
site (zongheng.com).  Keywords whose first result does not contain our
link are appended to D:\result.txt.

Ported from Python 2 (urllib/urllib2) to Python 3, fixing:
  * the keyword is now URL-encoded before being put in the query string
    (it was concatenated raw, breaking non-ASCII keywords);
  * the User-Agent header is actually sent (it was built but unused,
    as were the urlencode'd form values);
  * unbounded retry recursion on network errors — which also left
    ``response`` unbound on the fall-through path — is replaced by a
    bounded retry loop;
  * the result file is no longer closed inside the keyword loop
    (the original closed it after the first keyword).
"""
import random
import re
import time
import urllib.parse
import urllib.request
from urllib.error import URLError

# First-position wap/h5 result that links to zongheng.com.
# PC variant kept for reference:
# pattern = re.compile(r'(id="1"){1}.{2,30}(book.zongheng.com){1}.*\.html')  # pc
pattern = re.compile(r'(>1.{0,6}<em>){1}.*(zongheng\.com){1}.*(2&?#?160)')  # wap\h5

USER_AGENT = 'Mozilla/4.1 (compatible; MSIE 5.5; Windows NT)'

k = 0          # number of requests issued so far (drives the periodic pause)
output = None  # result file handle, opened lazily by _get_output()


def _get_output():
    r"""Open D:\result.txt for appending on first use and cache the handle."""
    global output
    if output is None:
        output = open('D:\\result.txt', 'a')
    return output


def search(key, max_retries=5):
    """Query Baidu mobile for *key*; record it if we are not ranked first.

    Retries up to *max_retries* times on network errors or when Baidu
    serves its rate-limit interstitial, then gives up with a message.
    """
    global k
    # URL-encode the keyword (the original concatenated it raw).
    url = 'http://m.baidu.com/s?word=' + urllib.parse.quote(key)
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    for _ in range(max_retries):
        try:
            response = urllib.request.urlopen(req, timeout=30)
        except URLError as e:
            # HTTPError carries .code; a bare URLError carries .reason.
            if hasattr(e, 'code'):
                print(key + " The server couldn't fulfill the request.")
                print('Error code: ', e.code)
            elif hasattr(e, 'reason'):
                print(key + ' We failed to reach a server.')
                print('Reason: ', e.reason)
            continue  # bounded retry (the original recursed without limit)
        with response:
            # assumes the page is UTF-8 — TODO confirm against live responses
            the_page = response.read().decode('utf-8', errors='replace')
        k += 1
        if k % 20 == 0:
            time.sleep(30)  # periodic pause so Baidu does not throttle us
        if '即可恢复使用' in the_page:
            # Baidu's "verify to restore access" interstitial: back off, retry.
            print('wait')
            time.sleep(30)
            print('wait done')
            continue
        if not pattern.search(the_page):
            print(key + " this key is not ok")
            _get_output().write(key + "\n\t")
        return
    print(key + ' giving up after ' + str(max_retries) + ' failed attempts')


def main():
    r"""Check one smoke-test keyword, then every keyword in D:\test.txt."""
    try:
        search('水系法师的春天')
        with open('D:\\test.txt') as keyword_file:
            for line in keyword_file:
                time.sleep(2 * random.random())  # jitter between requests
                search(line.strip())
    finally:
        if output is not None:
            output.close()  # original closed it inside the loop — a bug


if __name__ == '__main__':
    main()


0 0
原创粉丝点击