Scraping qiushibaike (糗事百科) with Python and bs4


First, create the following files:

Main.py (entry point)

html_parser (parses the downloaded HTML)

downloader (downloads pages)

outputer (stores the results)

url_manager (manages the crawl URLs)

mysql_help (MySQL helper)

How the crawler works:

Start by fetching one page. Parse its <a> tags and hand the extracted links to url_manager (the crawl-URL manager), where they wait to be taken out later. Then use BS4 to pull the useful content out of the downloaded page into a dict; once the whole crawl has finished, loop over the collected dicts and write them to the database.
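Before the full code, here is a minimal sketch of that link-extraction step. The HTML snippet is made up for illustration; the real selectors live in html_parser below.

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urlparse

page_url = "https://www.qiushibaike.com/article/118853097"
# invented markup, just to show the mechanics
html = '<div><a href="/article/123">next</a><a href="/article/456">more</a></div>'

soup = BeautifulSoup(html, 'html.parser')
new_urls = set()
for link in soup.find_all('a'):
    # urljoin turns the relative href into an absolute URL for url_manager
    new_urls.add(urlparse.urljoin(page_url, link['href']))
print new_urls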

The full source code follows.

Main.py (entry point)

# -*- coding:utf-8 -*-
import url_manager
import html_parser
import downloader
import outputer


class Main(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.parser = html_parser.HtmlParser()
        self.downloader = downloader.HtmlDownloader()
        self.outputer = outputer.OutPuter()

    def craw(self, root_url):
        self.urls.add_new_url(root_url)
        count = 1
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print u"Crawling: %d  %s" % (count, new_url)
                html_cont = self.downloader.download(new_url)  # download the page
                new_urls, new_data = self.parser.parser(new_url, html_cont)  # parse it
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:  # how many pages before stopping
                    break
                count = count + 1
            except:
                print u"Failed to fetch page"
        # self.outputer.output_html()  # alternative: dump to a file instead
        self.outputer.set_mysql()


if __name__ == "__main__":
    root_url = "https://www.qiushibaike.com/article/118853097"
    obj_main = Main()
    obj_main.craw(root_url)
html_parser (parses the downloaded HTML)

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):
    # collect every article URL on this page
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # article links look like /article/118853097
        links = soup.find_all('a', href=re.compile(r"/article/\d+"))
        for link in links:
            new_url = link['href']
            new_urls.add(urlparse.urljoin(page_url, new_url))
        return new_urls

    # extract the data we care about
    def _get_new_data(self, page_url, soup):
        res_data = {}
        # the URL being scraped
        res_data['url'] = page_url
        # the author's avatar
        head_img_url = soup.find('div', class_='author').find('img')
        res_data['head_img_url'] = urlparse.urljoin(page_url, head_img_url['src'])
        # the author's name
        title_node = soup.find('div', class_='author').find('h2')
        res_data['name'] = title_node.get_text()
        # the joke itself
        summary_node = soup.find('div', id="single-next-link")
        res_data['content'] = summary_node.get_text()
        # the joke's image, if there is one
        try:
            content_img = summary_node.find('div', class_='thumb').find('img')
            res_data['content_img'] = urlparse.urljoin(page_url, content_img['src'])
        except AttributeError:
            res_data['content_img'] = ''
        return res_data

    # parse out the new URLs and the page data
    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
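The selectors above (div.author, #single-next-link, div.thumb) match qiushibaike's markup at the time of writing and will break if the site changes. One way to sanity-check the parser is to feed it a small hand-written fixture; the snippet below uses an assumed, simplified version of that structure, not the site's actual HTML.

# -*- coding:utf-8 -*-
import html_parser

# hypothetical fixture approximating the structure _get_new_data expects
fixture = u'''
<div class="author">
  <img src="/avatar/1.jpg"/>
  <h2>someone</h2>
</div>
<div id="single-next-link">
  a short joke
  <div class="thumb"><img src="/pic/1.jpg"/></div>
</div>
'''

parser = html_parser.HtmlParser()
urls, data = parser.parser("https://www.qiushibaike.com/article/1",
                           fixture.encode('utf-8'))
print data['name']         # someone
print data['content_img']  # https://www.qiushibaike.com/pic/1.jpg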

downloader (downloads pages)

# -*- coding:utf-8 -*-
import urllib2


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return
        # send a browser User-Agent so the site doesn't reject the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        req = urllib2.Request(url=url, headers=headers)
        res = urllib2.urlopen(req)
        if res.getcode() != 200:
            return
        return res.read()
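urlopen will hang on a flaky connection and raise on temporary failures, which matters on a 1000-page crawl. A timeout plus a simple retry helps; this wrapper is my own sketch, not part of the downloader above.

# -*- coding:utf-8 -*-
import time
import urllib2

def download_with_retry(req, retries=2, timeout=10):
    # hypothetical helper: try the request a few times before giving up
    for _ in range(retries + 1):
        try:
            res = urllib2.urlopen(req, timeout=timeout)
            if res.getcode() == 200:
                return res.read()
        except urllib2.URLError:
            time.sleep(1)  # back off briefly before retrying
    return None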

outputer (stores the results)

# -*- coding:utf-8 -*-
import mysql_help
import codecs


class OutPuter(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    # write the collected records to a file
    def output_html(self):
        f = codecs.open('text.html', 'a', encoding='utf-8')
        for v in self.datas:
            f.write(u"{url:%s,title:%s,summary:%s}\n" % (v['url'], v['name'], v['content']))
        f.flush()
        f.close()

    # write the collected records to the database
    def set_mysql(self):
        my_help = mysql_help.MysqlHelp()
        for v in self.datas:
            try:
                # parameterized queries let MySQLdb escape quotes in the joke text
                if v['content_img'] == '':
                    sql = "insert into qqbk (name,content,head_img_url) VALUES (%s,%s,%s)"
                    my_help.execute(sql, (v['name'], v['content'], v['head_img_url']))
                else:
                    sql = "insert into qqbk (name,content,content_img,head_img_url) VALUES (%s,%s,%s,%s)"
                    my_help.execute(sql, (v['name'], v['content'], v['content_img'], v['head_img_url']))
            except:
                continue
        my_help.close()

url_manager (manages the crawl URLs)

# -*- coding:utf-8 -*-
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # add a single URL
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # add URLs in bulk
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # take one URL out
    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    # is there anything left to crawl?
    def has_new_url(self):
        return len(self.new_urls) != 0
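A quick check of the dedup behaviour: once a URL has been queued or fetched, adding it again is a no-op, so the crawler never revisits a page.

# -*- coding:utf-8 -*-
import url_manager

m = url_manager.UrlManager()
m.add_new_url("https://www.qiushibaike.com/article/1")
m.add_new_urls(["https://www.qiushibaike.com/article/1",   # duplicate, ignored
                "https://www.qiushibaike.com/article/2"])
print m.get_new_url()   # pops one of the two queued URLs into old_urls
print m.has_new_url()   # True, one URL still queued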

mysql_help (MySQL helper)

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import MySQLdb


class MysqlHelp:
    "MySQL helper class"
    localhost = "127.0.0.1"  # host
    db_name = "python"       # database name
    access = "root"          # user
    password = ""            # password

    def __init__(self):
        # open the database connection
        self.db = MySQLdb.connect(self.localhost, self.access, self.password,
                                  self.db_name, charset="utf8")
        # get a cursor
        self.cursor = self.db.cursor()

    # insert / update / delete
    def execute(self, sql, params=None):
        try:
            # run the statement; params are escaped by MySQLdb
            self.cursor.execute(sql, params)
            # commit the transaction
            return self.db.commit()
        except:
            # roll back on error and let the caller decide what to do
            self.db.rollback()
            raise

    # select
    def query(self, sql, params=None):
        try:
            self.cursor.execute(sql, params)
            return self.cursor.fetchall()
        except:
            return "Error: unable to fetch data"

    # close the connection once all statements have run
    def close(self):
        self.db.close()
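The crawler assumes a qqbk table already exists in the python database; its definition never appears here, so the column types below are my guess at a workable schema.

# -*- coding:utf-8 -*-
import mysql_help

# assumed schema; adjust types and lengths to taste
schema = """
CREATE TABLE IF NOT EXISTS qqbk (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255),
    content TEXT,
    content_img VARCHAR(512),
    head_img_url VARCHAR(512)
) DEFAULT CHARSET=utf8
"""

helper = mysql_help.MysqlHelp()
helper.execute(schema)
helper.close()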

Running Main.py, the log looks like this:

Crawling: 93  https://www.qiushibaike.com/article/115767514
Crawling: 94  https://www.qiushibaike.com/article/118964920
Crawling: 95  https://www.qiushibaike.com/article/51456225
Crawling: 96  https://www.qiushibaike.com/article/116486378
Crawling: 97  https://www.qiushibaike.com/article/288171
Crawling: 98  https://www.qiushibaike.com/article/119549247
Crawling: 99  https://www.qiushibaike.com/article/213184
Crawling: 100  https://www.qiushibaike.com/article/119441053

Check the database:

There are 100 new records.