Python3实现豆瓣读书爬虫
来源:互联网 发布:真三国无双6 知乎 编辑:程序博客网 时间:2024/05/16 12:19
doubanSpider.py
# -*- coding: UTF-8 -*-
"""Douban book spider.

Crawls http://www.douban.com/tag/<tag>/book listing pages for each tag,
collects (title, rating, number of raters, author info, publisher info)
per book, and writes one Excel sheet per tag via openpyxl.
"""
import sys
import time
import urllib.request   # fix: `import urllib` alone does not load the submodules
import urllib.error
import requests
import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Rotate through several User-Agent headers to reduce the chance of the IP
# being blocked by Douban.
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
       {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
       {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'}]


def book_spider(book_tag):
    """Crawl the Douban listing pages for one tag.

    Fetches up to 3 listing pages (15 books each) and returns a list of
    [title, rating, people_num, author_info, pub_info] rows (all strings
    except as scraped).
    """
    page_num = 0
    book_list = []
    try_times = 0
    while True:
        # url='http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/book?start=0'  # For Test
        url = ('http://www.douban.com/tag/' + urllib.request.quote(book_tag)
               + '/book?start=' + str(page_num * 15))
        # Random delay (0-5 s) so requests do not hammer the server.
        time.sleep(np.random.rand() * 5)
        try:
            req = urllib.request.Request(url, headers=hds[page_num % len(hds)])
            plain_text = urllib.request.urlopen(req).read()
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            print(e)
            continue
        # Previous version used requests directly, but the IP was easily banned:
        #   source_code = requests.get(url)
        #   plain_text = source_code.text

        soup = BeautifulSoup(plain_text, "lxml")
        list_soup = soup.find('div', {'class': 'mod book-list'})

        try_times += 1
        if try_times > 3:
            # Give up after repeated requests (try_times is never reset, so
            # at most 3 pages are scraped per tag).
            break
        if list_soup is None:
            # No listing on this response; retry. (Fix: the original crashed
            # with AttributeError when list_soup was None on the 3rd try.)
            continue

        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', {'class': 'title'}).string.strip()
            desc = book_info.find('div', {'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            book_url = book_info.find('a', {'class': 'title'}).get('href')

            # The desc field is "author [/ translator ...] / publisher / date / price";
            # the last three segments are the publication info.
            try:
                author_info = '作者/译者: ' + '/'.join(desc_list[0:-3])
            except (TypeError, IndexError):
                author_info = '作者/译者: 暂无'
            try:
                pub_info = '出版信息: ' + '/'.join(desc_list[-3:])
            except (TypeError, IndexError):
                pub_info = '出版信息: 暂无'
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except AttributeError:
                rating = '0.0'
            try:
                # people_num = book_info.findAll('span')[2].string.strip()
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人评价')
            except (AttributeError, IndexError, TypeError):
                people_num = '0'

            book_list.append([title, rating, people_num, author_info, pub_info])
            # try_times = 0  # set 0 when got valid information

        page_num += 1
        print('从页面%d 下载信息 ' % page_num)
    return book_list


def get_people_num(url):
    """Fetch a book's subject page and return its rater count as a string.

    Returns '0' when the page cannot be fetched (fix: the original fell
    through and raised NameError on an undefined variable).
    """
    # url='http://book.douban.com/subject/6082808/?from=tag_all'  # For Test
    try:
        req = urllib.request.Request(url, headers=hds[np.random.randint(0, len(hds))])
        source_code = urllib.request.urlopen(req).read()
        plain_text = str(source_code)
    except (urllib.error.HTTPError, urllib.error.URLError) as e:
        print(e)
        return '0'
    soup = BeautifulSoup(plain_text, "lxml")
    people_num = soup.find('div', {'class': 'rating_sum'}).findAll('span')[1].string.strip()
    return people_num


def do_spider(book_tag_lists):
    """Run book_spider for every tag; return one rating-sorted list per tag."""
    book_lists = []
    for book_tag in book_tag_lists:
        book_list = book_spider(book_tag)
        # Fix: sort numerically — the original sorted the rating *string*,
        # which orders '10.0' below '9.x'.
        book_list = sorted(book_list, key=lambda x: float(x[1]), reverse=True)
        book_lists.append(book_list)
    return book_lists


def print_book_lists_excel(book_lists, book_tag_lists):
    """Write the scraped rows to book_list-<tag>-...xlsx, one sheet per tag."""
    wb = Workbook(write_only=True)
    ws = []
    for i in range(len(book_tag_lists)):
        ws.append(wb.create_sheet(title=book_tag_lists[i]))
    for i in range(len(book_tag_lists)):
        ws[i].append(['序号', '书名', '评分', '评价人数', '作者', '出版社'])
        count = 1
        for bl in book_lists[i]:
            ws[i].append([count, bl[0], float(bl[1]), int(bl[2]), bl[3], bl[4]])
            count += 1
    save_path = 'book_list'
    for i in range(len(book_tag_lists)):
        save_path += ('-' + book_tag_lists[i])
    save_path += '.xlsx'
    wb.save(save_path)


if __name__ == '__main__':
    # book_tag_lists = ['心理','判断与决策','算法','数据结构','经济','历史']
    # book_tag_lists = ['传记','哲学','编程','创业','理财','社会学','佛教']
    # book_tag_lists = ['思想','科技','科学','web','股票','爱情','两性']
    # book_tag_lists = ['计算机','机器学习','linux','android','数据库','互联网']
    # book_tag_lists = ['数学']
    # book_tag_lists = ['摄影','设计','音乐','旅行','教育','成长','情感','育儿','健康','养生']
    # book_tag_lists = ['商业','理财','管理']
    # book_tag_lists = ['名著']
    # book_tag_lists = ['科普','经典','生活','心灵','文学']
    # book_tag_lists = ['科幻','思维','金融']
    book_tag_lists = ['计算机', '编程', '科幻', '创业']
    book_lists = do_spider(book_tag_lists)
    print_book_lists_excel(book_lists, book_tag_lists)
运行结果如图:
每页15条,爬取3页,每个工作簿都是45条记录
阅读全文
0 0
- Python3实现豆瓣读书爬虫
- Python3之爬虫爬取豆瓣读书Top250
- python3 [入门基础实战] 爬虫入门之爬取豆瓣读书随笔页面
- Python爬虫 爬取豆瓣读书
- 使用Jsoup对豆瓣读书进行爬虫
- Python爬虫-爬取豆瓣读书
- Python 爬虫 抓取豆瓣读书TOP250
- python3 爬虫 模拟登陆豆瓣修改签名
- Python爬虫实现豆瓣登陆
- Python爬虫豆瓣读书评分9分以上榜单
- Python3实现简单爬虫
- Python3 爬虫(三) -- 爬取豆瓣首页图片
- Python3 爬虫(三) -- 爬取豆瓣首页图片
- Python3爬虫入门之爬取豆瓣Top250电影名称
- python3[爬虫基础入门实战] 爬取豆瓣电影排行top250
- python3 爬虫—爬取豆瓣电影图片(一)
- python3--爬虫实战一:爬取豆瓣电影250
- 使用BeautifulSoup实现简单豆瓣爬虫
- popupWindow+listView实现qq登录下拉最近登录账号
- Delphi实现Android 广播事件监听(动态注册广播)
- Java菜鸟学习日记4
- HTML笔记
- 【剑指offer】题53:正则表达式匹配
- Python3实现豆瓣读书爬虫
- 31-连续子数组的最大和
- json抓取疑难杂症
- 并查集——Codeforces744A Hongcow Builds A Nation
- 2^x mod n = 1
- 数组的排序,方法Arrays.sort(你创建的数组)
- 浅谈C/C++动态内存管理
- 5-8 哈利·波特的考试
- 微服务的一种开源实现方式——dubbo+zookeeper