Python Crawler (Using Horse Racing Data as an Example): Overall Architecture


I. Example: crawling the Hong Kong Jockey Club's racing data

http://racing.hkjc.com/racing/Info/meeting/Results/chinese/Local/20170625/ST/2



The goal is to crawl all of the data in the results table on that page.


The project defines several data classes:

(1) The course info class ChangInfo covers the content above the results table: the race class (dijiban), distance in metres (mi), going, i.e. track condition (changdizhuangkuang), race type (bisaileixing), course (saidao), and prize money (qian).

class ChangInfo(object):
    def __init__(self, dijiban='', mi='', changdizhuangkuang='', bisaileixing='', saidao='', qian=''):
        self.dijiban = dijiban                        # race class
        self.mi = mi                                  # distance in metres
        self.changdizhuangkuang = changdizhuangkuang  # going (track condition)
        self.bisaileixing = bisaileixing              # race type
        self.saidao = saidao                          # course
        self.qian = qian                              # prize money

    def __str__(self):
        return "ChangInfo:" + self.dijiban + "," + self.mi + "," + self.changdizhuangkuang + "," + self.bisaileixing + "," + self.saidao + "," + self.qian

(2) The horse info class MaInfo, corresponding to the rows of the results table.
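MaInfo itself is not listed in the article; below is a minimal sketch, assuming one attribute per results-table column. The field names (mingci, mahao, maming, qishi, lianmashi, paiwei, wanchengshijian, peilv) are illustrative placeholders, not necessarily the original project's names.

class MaInfo(object):
    # A sketch of the horse info class; field list is assumed, one
    # attribute per column of the HKJC results table.
    def __init__(self, mingci='', mahao='', maming='', qishi='', lianmashi='',
                 paiwei='', wanchengshijian='', peilv=''):
        self.mingci = mingci                    # placing
        self.mahao = mahao                      # horse number
        self.maming = maming                    # horse name
        self.qishi = qishi                      # jockey
        self.lianmashi = lianmashi              # trainer
        self.paiwei = paiwei                    # draw
        self.wanchengshijian = wanchengshijian  # finish time
        self.peilv = peilv                      # win odds

    def __str__(self):
        return "MaInfo:" + ",".join([self.mingci, self.mahao, self.maming,
                                     self.qishi, self.lianmashi, self.paiwei,
                                     self.wanchengshijian, self.peilv])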

(3) The access-path class Path, which holds each race's page URL (url), the race date (date), the venue flag (flag), the race number (changci), and a finished flag (isFinished):
class Path(object):
    def __init__(self, url, date, flag, changci, isFinished):
        self.__url = url
        self.__date = date
        self.__flag = flag
        self.__changci = changci
        self.__isFinished = isFinished

    @property
    def url(self):
        return self.__url

    @url.setter
    def url(self, url):
        self.__url = url

    @property
    def date(self):
        return self.__date

    @date.setter
    def date(self, date):
        self.__date = date

    @property
    def flag(self):
        return self.__flag

    @flag.setter
    def flag(self, flag):
        self.__flag = flag

    @property
    def changci(self):
        return self.__changci

    @changci.setter
    def changci(self, changci):
        self.__changci = changci

    @property
    def isFinished(self):
        return self.__isFinished

    @isFinished.setter
    def isFinished(self, isFinished):
        self.__isFinished = isFinished

    def __str__(self):
        return "Path:" + "url=" + self.__url + ",date=" + self.__date + ",flag=" + self.__flag + ",changci=" + str(self.__changci) + ",isFinished=" + str(self.__isFinished)
(4) The database access object classes PathDao and MainfoDao for Path and MaInfo, which encapsulate all database access operations.
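The DAO classes are likewise not shown; here is a minimal sketch of MainfoDao.save, assuming a local MySQL database accessed through pymysql. The connection parameters, table name, and columns are illustrative assumptions, and the original project's driver and schema may differ.

import pymysql

class MainfoDao(object):
    def save(self, data):
        # 'data' is assumed to be a MaInfo object; host, credentials,
        # database, table, and columns below are illustrative only.
        conn = pymysql.connect(host='localhost', user='root', password='root',
                               db='horse', charset='utf8')
        try:
            with conn.cursor() as cursor:
                cursor.execute(
                    "INSERT INTO mainfo (mingci, maming, qishi) VALUES (%s, %s, %s)",
                    (data.mingci, data.maming, data.qishi))
            conn.commit()  # persist the inserted row
        finally:
            conn.close()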
II. Overall framework

The crawler uses Python's built-in urllib to fetch each page's source, and the third-party library BeautifulSoup to parse the HTML and extract the data.
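As a minimal sketch of that division of labour, the downloader might look like the following, assuming Python 3's urllib.request and a UTF-8 encoded page; only the method name download, which the main loop below calls, comes from the original code.

import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        # Fetch one results page and return its HTML source.
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None  # treat any non-200 status as a failed download
        return response.read().decode('utf-8')  # page encoding assumed UTF-8

The parser would then feed this HTML to BeautifulSoup, e.g. BeautifulSoup(html_content, 'html.parser'), and walk the result table's rows to build MaInfo objects.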

import chenjie.url_manager      # custom URL manager
import chenjie.html_parser      # custom parser
import chenjie.html_outputer    # custom outputer
import chenjie.html_downloader  # custom downloader
import chenjie.path             # access-path class
import chenjie.mainfodao        # DAO for horse data
import chenjie.pathdao          # DAO for access paths

class SpiderMain(object):
    def __init__(self):
        self.paths = chenjie.url_manager.UrlManager()               # paths holds the list of access paths
        self.downloader = chenjie.html_downloader.HtmlDownloader()  # custom downloader
        self.parser = chenjie.html_parser.HtmlParser()              # custom parser
        self.outputer = chenjie.html_outputer.HtmlOutputer()        # custom outputer

    def craw(self):
        '''
        new_path = chenjie.path.Path("http://racing.hkjc.com/racing/Info/meeting/Results/chinese/Local/20170625/ST/2",
                                     "20170625",
                                     "ST",
                                     3,
                                     0
                                     )
        '''
        paths = chenjie.pathdao.PathDao().getAllPath()  # load all access paths from the database
        for path in paths:
            # wrap each record as a Path object and add it to the path list
            print(path)
            new_path = chenjie.path.Path(path['url'], path['date'], path['flag'],
                                         path['changci'], path['isFinished'])
            self.paths.add_new_path(new_path)
        while self.paths.has_new_path():
            # while the path list still holds unvisited paths
            try:
                new_path = self.paths.get_new_path()  # take one access path
                print('craw  : %s' % new_path)
                html_content = self.downloader.download(new_path.url)
                # download the page, then parse it into new paths and new data
                new_paths, new_data = self.parser.parse(new_path, html_content)
                print("main received new_data:", new_data)
                self.paths.add_new_paths(new_paths)  # queue the newly found paths
                #self.outputer.collect_data(new_data)
                for data in new_data:
                    # save each parsed record to the MySQL database via the DAO
                    dao = chenjie.mainfodao.MainfoDao()
                    dao.save(data)
            except Exception as e:
                print("crawl failed:", repr(e))
        #self.outputer.output_html()

if __name__ == "__main__":
    #root_url = "http://baike.baidu.com/item/Python"
    obj_spider = SpiderMain()
    while True:
        obj_spider.craw()
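For completeness, the UrlManager used above must expose add_new_path, add_new_paths, has_new_path, and get_new_path, since the main loop calls all four. A minimal in-memory sketch follows; the original implementation may also deduplicate paths or track visited state in the database.

class UrlManager(object):
    def __init__(self):
        self.new_paths = []  # paths not yet crawled
        self.old_paths = []  # paths already crawled

    def add_new_path(self, path):
        if path is not None:
            self.new_paths.append(path)

    def add_new_paths(self, paths):
        if paths is None:
            return
        for path in paths:
            self.add_new_path(path)

    def has_new_path(self):
        return len(self.new_paths) != 0

    def get_new_path(self):
        path = self.new_paths.pop(0)  # FIFO: oldest path first
        self.old_paths.append(path)
        return path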


