python爬取豆瓣图书

来源:互联网 发布:网络销售要求 编辑:程序博客网 时间:2024/03/29 17:39

最近突然想学下爬虫爬取一下豆瓣的图书,按类别来爬取并分别存储,然后就用正则写了一份初级爬虫,目前只是分类的页面爬取,后面完善一下,希望能够得到每本书的isbn编号,生成自己的数据库。

# -*- coding: utf-8 -*-
"""Crawl book listings from book.douban.com by tag and dump them to per-tag files.

Ported from Python 2 (urllib2 / print statements / sys.setdefaultencoding) to
Python 3.  Fixes over the original:
  * the request URL now carries the ``start`` offset, so successive GetPage
    calls fetch successive pages instead of page 1 forever;
  * the (non-ASCII) tag is percent-encoded — urllib raises on raw non-ASCII URLs;
  * the str+int TypeError inside GetBook's own error handler;
  * output files are opened via ``with`` and an explicit UTF-8 encoding;
  * the discarded no-op ``fileName.decode().encode()`` call was dropped;
  * the scraper no longer runs at import time (``__main__`` guard).
"""
import re
import urllib.error
import urllib.parse
import urllib.request

# Tag categories to crawl; one output file is produced per tag.
tags = [u'小说', u'散文', u'历史', u'爱情', u'管理', u'编程', u'生活', u'心理']
# NOTE(review): never read or written by the crawler below — kept for compatibility.
haveBooked = set()


class BookSpider:
    """Fetch tag listing pages from book.douban.com and extract book info."""

    def __init__(self):
        self.start = 0          # result offset within the current tag (20 items/page)
        self.tagIndex = 0       # index into ``tags``
        self.param = '&filter=&type='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
        self.filePath = 'DoubanTop250.txt'

    def GetPage(self):
        """Fetch the next listing page for the current tag.

        Returns the page HTML as ``str``, or ``None`` when the request fails.
        Side effect: advances ``self.start`` by one page (20 items).
        """
        try:
            # Percent-encode the tag and pass the offset so paging actually works.
            url = ('https://book.douban.com/tag/'
                   + urllib.parse.quote(tags[self.tagIndex])
                   + '?start=' + str(self.start))
            request = urllib.request.Request(url=url, headers=self.headers)
            response = urllib.request.urlopen(request)
            page = response.read().decode('utf-8')
            self.start += 20
            pageNum = self.start // 20
            print('正在抓取 ' + tags[self.tagIndex] + ' 的第' + str(pageNum) + '页数据')
            return page
        except urllib.error.URLError as e:
            if hasattr(e, 'reason'):
                print('抓取失败,具体原因:', e.reason)

    def WriteBookTitle(self, titleInfo, fileBook):
        """Write ``title: …`` to fileBook; titles split across a <span> are re-joined."""
        patternTitle = re.compile(u'(.*?)<span.*?>(.*?)</span>.*?', re.S)
        result = re.match(patternTitle, titleInfo)
        if result is None:
            # Plain title with no markup — write it verbatim.
            fileBook.write('title: ' + titleInfo + '\r\n')
        else:
            titles = re.findall(patternTitle, titleInfo)
            fileBook.write('title: ' + titles[0][0].strip() + titles[0][1].strip() + '\r\n')

    def WriteBookPubInfo(self, pubInfo, fileBook):
        """Parse the slash-separated publication line and write its fields.

        Five slash-separated fields mean author/translator/publisher/date/price;
        four mean the translator is absent.
        """
        pubInfo += '!'  # sentinel so the last field has a terminator to match
        patternPub1 = re.compile(u'(.*?)/(.*?)/(.*?)/(.*?)/(.*?)!', re.S)
        patternPub2 = re.compile(u'(.*?)/(.*?)/(.*?)/(.*?)!', re.S)
        result = re.match(patternPub1, pubInfo)
        if result is None:
            # Only four fields: no translator.
            pubs = re.findall(patternPub2, pubInfo)
            fileBook.write('author: ' + pubs[0][0].strip() + '\r\n')
            fileBook.write('pubHouse: ' + pubs[0][1].strip() + '\r\n')
            fileBook.write('pubData: ' + pubs[0][2].strip() + '\r\n')
            fileBook.write('price: ' + pubs[0][3].strip() + '\r\n\r\n')
        else:
            pubs = re.findall(patternPub1, pubInfo)
            fileBook.write('author: ' + pubs[0][0].strip() + '\r\n')
            fileBook.write('translator: ' + pubs[0][1].strip() + '\r\n')
            fileBook.write('pubHouse: ' + pubs[0][2].strip() + '\r\n')
            fileBook.write('pubData: ' + pubs[0][3].strip() + '\r\n')
            fileBook.write('price: ' + pubs[0][4].strip() + '\r\n\r\n')

    def GetBook(self):
        """Crawl the first page of the first two tags into book<tag>.txt files."""
        pattern = re.compile(
            u'<li.*?class="subject-item">.*?<div.*?class="info">.*?'
            u'<h2.*?class="">.*?<a.*?subject_id:\'(.*?)\'.*?>(.*?)</a>'
            u'.*?<div.*?class="pub">(.*?)</div>', re.S)
        try:
            while self.tagIndex < 2:
                fileName = 'book' + tags[self.tagIndex] + '.txt'
                print(fileName)
                with open(fileName, 'w', encoding='utf-8') as fileBook:
                    # One page per tag for now (this inner loop body runs once:
                    # GetPage bumps start to 20, then +1 below exits the loop).
                    while self.start < 1:
                        page = self.GetPage()
                        books = re.findall(pattern, page)
                        for book in books:
                            fileBook.write('subject_id: ' + book[0].strip() + '\r\n')
                            self.WriteBookTitle(book[1].strip(), fileBook)
                            self.WriteBookPubInfo(book[2].strip(), fileBook)
                        self.start += 1
                self.start = 0
                self.tagIndex += 1
        except Exception:
            # Original handler crashed here with str+int TypeError; fixed with str().
            print('抓取 ' + tags[self.tagIndex] + ' 第 '
                  + str(self.start // 20 + 1) + ' 页失败')

    def main(self):
        print('开始抓取图书数据')
        self.GetBook()
        print('抓取完毕...')


if __name__ == '__main__':
    DoubanSpoder = BookSpider()
    DoubanSpoder.main()


0 0
原创粉丝点击