Scraping Qiushibaike resources with Python and bs4
First, create the following files:
Main.py (entry file)
html_parser (parses the downloaded HTML)
downloader (downloads pages)
outputer (stores the data)
url_manager (manages the crawl urls)
mysql_help (MySQL helper)
How the crawler works:
It first fetches a page and parses the <a> tags in it, putting the extracted hyperlinks into url_manager (the crawl-url manager) to be taken out later. It then uses BS4 to pull the useful content out of the downloaded page and stores it in a dict; once the whole crawl is finished, the collected records are written to the database in a loop.
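Before the full source, here is a minimal sketch of the <a>-extraction step (Python 2, matching the code below; the HTML snippet is made up for illustration):

from bs4 import BeautifulSoup

html = '<a href="/article/123">one</a><a href="/article/456">two</a>'
soup = BeautifulSoup(html, 'html.parser')
for link in soup.find_all('a'):
    print link['href']   # prints /article/123, then /article/456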
The full source code follows.
Main.py (entry file)
# -*- coding:utf-8 -*-
import url_manager
import html_parser
import htmldownloader
import htmloutputer

class Main(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.parser = html_parser.HtmlParser()
        self.downloader = htmldownloader.HtmlDownloader()
        self.outputer = htmloutputer.OutPuter()

    def craw(self, root_url):
        self.urls.add_new_url(root_url)
        count = 1
        while self.urls.has_new_url():
            # try:
            new_url = self.urls.get_new_url()
            print u"Crawling: %d %s" % (count, new_url)
            html_cont = self.downloader.download(new_url)   # download the page content
            new_urls, new_data = self.parser.parser(new_url, html_cont)   # parse it
            self.urls.add_new_urls(new_urls)
            self.outputer.collect_data(new_data)
            if count == 1000:   # stop after this many pages
                break
            count = count + 1
            # except:
            #     print u"Failed to fetch the page"
        # self.outputer.output_html()
        self.outputer.set_mysql()

if __name__ == "__main__":
    root_url = "https://www.qiushibaike.com/article/118853097"
    obj_main = Main()
    obj_main.craw(root_url)

html_parser (parses the downloaded HTML)
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import urlparse

class HtmlParser(object):
    # collect every article url on the page
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/article/\d+"))
        for link in links:
            new_url = link['href']
            new_urls.add(urlparse.urljoin(page_url, new_url))
        return new_urls

    # extract the data
    def _get_new_data(self, page_url, soup):
        res_data = {}
        # the url being scraped
        res_data['url'] = page_url
        # the user's avatar
        head_img_url = soup.find('div', class_='author').find('img')
        res_data['head_img_url'] = urlparse.urljoin(page_url, head_img_url['src'])
        # the user's name
        title_node = soup.find('div', class_='author').find('h2')
        res_data['name'] = title_node.get_text()
        # the joke itself
        summary_node = soup.find('div', id="single-next-link")
        res_data['content'] = summary_node.get_text()
        try:
            content_img = summary_node.find('div', class_='thumb').find('img')
            res_data['content_img'] = urlparse.urljoin(page_url, content_img['src'])
        except:
            res_data['content_img'] = ''
        return res_data

    # parse out the new urls and the data
    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
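A rough usage sketch of the parser, assuming the class above is saved as html_parser.py and fed markup shaped like the article pages were at the time (the snippet here is made up):

import html_parser

html_cont = '''
<div class="author"><img src="/static/avatar.jpg"/><h2>someone</h2></div>
<div id="single-next-link">a short joke</div>
<a href="/article/100000001">next</a>
'''
parser = html_parser.HtmlParser()
new_urls, new_data = parser.parser('https://www.qiushibaike.com/article/1', html_cont)
print new_urls            # a set holding the absolute /article/ url
print new_data['name']    # someone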
downloader (downloads pages)
# -*- coding:utf-8 -*-
import urllib2

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return
        # send a browser User-Agent so the site does not reject the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        req = urllib2.Request(
            url=url,
            headers=headers
        )
        res = urllib2.urlopen(req)
        if res.getcode() != 200:
            return
        return res.read()
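The downloader can be tried on its own (network access assumed, and assuming the class is saved as htmldownloader.py, the name Main.py imports):

import htmldownloader

d = htmldownloader.HtmlDownloader()
page = d.download('https://www.qiushibaike.com/article/118853097')
if page:
    print len(page)   # size of the fetched HTML in bytes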
outputer (stores the data)
# -*- coding:utf-8 -*-
import mysql_help
import codecs

class OutPuter(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    # write to a file
    def output_html(self):
        # codecs handles the encoding, so write the unicode values directly
        file = codecs.open('text.html', 'a', encoding='utf-8')
        for v in self.datas:
            file.write("{url:%s,title:%s,summary:%s}" % (v['url'], v['name'], v['content']))
        file.flush()
        file.close()

    # write to the database
    def set_mysql(self):
        my_help = mysql_help.MysqlHelp()
        for v in self.datas:
            try:
                if v['content_img'] == '':
                    sql = "insert into qqbk (name,content,head_img_url) VALUES ('" + v['name'].encode('utf-8') + "','" + v['content'].encode('utf-8') + "','" + v['head_img_url'].encode('utf-8') + "')"
                else:
                    sql = "insert into qqbk (name,content,content_img,head_img_url) VALUES ('" + v['name'].encode('utf-8') + "','" + v['content'].encode('utf-8') + "','" + v['content_img'].encode('utf-8') + "','" + v['head_img_url'].encode('utf-8') + "')"
                my_help.execute(sql)
            except:
                continue
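Note that set_mysql builds its SQL by string concatenation, which breaks as soon as a joke contains a quote and is open to SQL injection. A safer sketch using MySQLdb's parameter binding (the values shown are placeholders):

import MySQLdb

db = MySQLdb.connect("127.0.0.1", "root", "", "python", charset="utf8")
cursor = db.cursor()
sql = "insert into qqbk (name, content, head_img_url) VALUES (%s, %s, %s)"
cursor.execute(sql, (u"someone", u"a short joke", u"/static/avatar.jpg"))   # the driver escapes the values
db.commit()
db.close()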
url_manager (manages the crawl urls)
# -*- coding:utf-8 -*-
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # add a single url
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # add urls in batch
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # take one url to crawl
    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    # check whether any urls are left
    def has_new_url(self):
        return len(self.new_urls) != 0
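A quick sanity check of the dedup logic, assuming the class above is saved as url_manager.py:

import url_manager

m = url_manager.UrlManager()
m.add_new_url('https://www.qiushibaike.com/article/1')
m.add_new_url('https://www.qiushibaike.com/article/1')   # duplicate, ignored
print m.has_new_url()   # True
print m.get_new_url()   # the url, now moved to old_urls
print m.has_new_url()   # False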
mysql_help (MySQL helper)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import MySQLdb

class MysqlHelp:
    "MySQL helper class"
    localhost = "127.0.0.1"   # host
    db_name = "python"        # database name
    access = "root"           # user
    password = ""             # password
    db = ""
    cursor = ""

    def __init__(self):
        # open the database connection
        self.db = MySQLdb.connect(self.localhost, self.access, self.password, self.db_name, charset="utf8")
        # get a cursor with the cursor() method
        self.cursor = self.db.cursor()

    # insert / update / delete
    def execute(self, sql):
        try:
            # run the SQL statement
            self.cursor.execute(sql)
            # commit it to the database
            res = self.db.commit()
            return res
        except:
            # roll back on error; keep the connection open for the next statement
            self.db.rollback()

    # select
    def query(self, sql):
        try:
            self.cursor.execute(sql)
            res = self.cursor.fetchall()
        except:
            res = "Error: unable to fetch data"
        return res
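The inserts above assume a qqbk table already exists; the post does not show its schema, but something like the following (column types are a guess, based only on the columns the INSERT statements use) would match:

import mysql_help

my_help = mysql_help.MysqlHelp()
my_help.execute("""
    CREATE TABLE IF NOT EXISTS qqbk (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255),
        content TEXT,
        content_img VARCHAR(512),
        head_img_url VARCHAR(512)
    ) DEFAULT CHARSET=utf8
""")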
Run Main.py and the console shows:
Crawling: 93 https://www.qiushibaike.com/article/115767514
Crawling: 94 https://www.qiushibaike.com/article/118964920
Crawling: 95 https://www.qiushibaike.com/article/51456225
Crawling: 96 https://www.qiushibaike.com/article/116486378
Crawling: 97 https://www.qiushibaike.com/article/288171
Crawling: 98 https://www.qiushibaike.com/article/119549247
Crawling: 99 https://www.qiushibaike.com/article/213184
Crawling: 100 https://www.qiushibaike.com/article/119441053
Now check the database:
there are 100 new records.