A Python crawler for Baidu search syntax (dorking)

For details, see my personal Weibo.
# -*- coding: utf-8 -*-
# Runs under Python 3. Installing the modules (requests, bs4, lxml) may take
# some troubleshooting; search for the error messages if anything fails.
# Based on a friend's blog post: http://www.warmeng.com/2017/06/09/spider/
import urllib.request
from bs4 import BeautifulSoup
import requests
from socket import *

# Build the request headers.
header = {}
header['Host'] = 'www.baidu.com'
header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
header['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
header['Accept-Language'] = 'zh-CN,zh;q=0.8'
header['Accept-Encoding'] = 'gzip, deflate, sdch, br'
header['Cookie'] = ''
header['Connection'] = 'close'
header['Upgrade-Insecure-Requests'] = '1'
# header['X-Forwarded-For'] = '1.1.1.1'

indexUrl = 'https://www.baidu.com'
words = input("Enter the Baidu search syntax: ")
# words = 'inurl:asp?id='
start_number = input("Enter the start page: ")          # pages are counted from 1
page_count = input("Enter the number of pages to crawl: ")

f = open('./result.txt', 'a+')  # open (or create) result.txt to save the harvested URLs
fal_num = int(start_number) + int(page_count)
f.write('Keyword: ' + words + '\n' + 'Page range: ' + start_number + '--' + str(fal_num) + '\n')

setdefaulttimeout(1)  # global socket timeout; this is what limits the urlopen calls below

# Baidu's pn parameter advances in steps of 10 (one results page each).
for pn in range(int(start_number) * 10, int(start_number) * 10 + int(page_count) * 10, 10):
    targetUrl = (indexUrl + '/s?wd=' + words + '&pn=' + str(pn) + '&oq=' + words
                 + '&tn=93063693_hao_pg&ie=utf-8&usm=1&rsv_pq=93cdb6350000eadd'
                 + '&rsv_t=150bff5LzGew8qDr0ARHTq%2BNBvCwnE7s0KgrfxwcY5Sqc4xAsDyOFQIo%2FUOfuybbSkFMa5Cz&rsv_jmp=slow')
    results = requests.get(targetUrl, headers=header, timeout=15)  # fetch one page of results
    # print(results.status_code)  # print the returned status code
    results.raise_for_status()    # raise an exception on HTTP errors
    # print(results.text)
    detail = BeautifulSoup(results.content, 'lxml')  # parse the fetched page with the lxml parser
    # BeautifulSoup makes it easy to pull content out of the page;
    # see http://cuiqingcai.com/1319.html for a detailed introduction.
    for x in detail.find_all('div'):   # iterate over every div tag
        link = x.get('data-tools')     # read the value of the data-tools attribute directly
        if link:
            try:
                url = str(link)[link.find('"url"'):]
                # print(link.find('"url"'))
                print(url)
                url = url[7:-2]  # slice off the surrounding '"url":"' and '"}' to leave the bare URL
                final_url = urllib.request.urlopen(url).geturl()  # follow Baidu's redirect to the real URL
                print(final_url + '\n')
                if words[6:] in final_url:  # second filter: keep only URLs containing e.g. asp?id=
                    f.write(final_url + '\n')
            except Exception:
                pass

f.close()
print('end!')
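The string slicing around data-tools is brittle: it assumes the attribute always ends with '"}' right after the URL. Since the attribute appears to hold a small JSON object (the script searches for a '"url"' key), a sketch like the following parses it with json.loads instead. This is a minimal variant under that assumption; extract_real_url is a hypothetical helper, not part of the original script, and the attribute's format may change on Baidu's side.

import json
import urllib.request

def extract_real_url(data_tools_value, timeout=5):
    """Pull the redirect URL out of a data-tools attribute value and
    follow it to the landing page. Assumes the attribute is a JSON
    object with a "url" key (an assumption, not guaranteed)."""
    try:
        info = json.loads(data_tools_value)  # e.g. {"title": "...", "url": "http://www.baidu.com/link?url=..."}
        redirect = info.get('url')
        if not redirect:
            return None
        return urllib.request.urlopen(redirect, timeout=timeout).geturl()
    except (ValueError, OSError):  # malformed JSON, or a failed/slow connection
        return None

With this helper, the body of the inner loop would reduce to final_url = extract_real_url(link), with the same asp?id= filter applied to the result before writing it to result.txt.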
