A Simple Crawler for Baidu Tieba
The script below uses only the standard library: it prompts for a Tieba forum (bar) name and a page range, builds the paginated request URL for each page, and saves every result page as a local HTML file under a directory named after the forum.

import urllib.request
import urllib.parse
import os
import ssl

# Skip HTTPS certificate verification so urlopen works on machines
# without a proper certificate store
ssl._create_default_https_context = ssl._create_unverified_context

# Build the request object for one page of the forum
def request_handle(base_url, bar_name, page):
    # Tieba paginates 50 posts per page via the pn parameter
    pn = (page - 1) * 50
    data = {
        'kw': bar_name,
        'pn': pn
    }
    data = urllib.parse.urlencode(data)
    # Append the encoded query string to the base URL
    url = base_url + data
    # Send a browser User-Agent so the server returns the normal page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

# Download one page and save it as an HTML file
def download(request, page, bar_name):
    response = urllib.request.urlopen(request)
    # Create a directory named after the forum if it does not exist yet
    if not os.path.exists(bar_name):
        os.mkdir(os.path.join(os.getcwd(), bar_name))
    filename = 'page_' + str(page) + '.html'
    filepath = os.path.join(bar_name, filename)
    with open(filepath, 'wb') as f1:
        f1.write(response.read())

# Main entry point
def main():
    bar_name = input('Enter the name of the bar (forum) to crawl: ')
    start_page = int(input('Enter the first page to crawl: '))
    end_page = int(input('Enter the last page to crawl: '))
    base_url = 'http://tieba.baidu.com/f?ie=utf-8&'
    # Crawl each requested page in turn
    for page in range(start_page, end_page + 1):
        request = request_handle(base_url, bar_name, page)
        print('Start downloading page ' + str(page))
        download(request, page, bar_name)
        print('Finished downloading page ' + str(page))

if __name__ == '__main__':
    main()
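As a quick sanity check of the pagination math, the snippet below (not part of the original script; the forum name 'python' and page number 3 are made-up values) reproduces the URL that request_handle would build: page 3 should map to pn = (3 - 1) * 50 = 100.

import urllib.parse

# Hypothetical values for illustration only
base_url = 'http://tieba.baidu.com/f?ie=utf-8&'
data = urllib.parse.urlencode({'kw': 'python', 'pn': (3 - 1) * 50})
print(base_url + data)
# Prints: http://tieba.baidu.com/f?ie=utf-8&kw=python&pn=100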