A novel crawler written in Python that downloads a whole book (and writes it to a txt file)


I have finally finished a crawler that downloads an entire novel. Here I crawl the book 剑来 from the 去看看小说网 site.

The parts of the code that extract the chapter list and chapter text are the ones worth studying; readers can use them as a reference and adapt the selectors to whatever pages they want to crawl.

(The code below is the complete script and has been tested.)
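Before pointing the full script at a different site, it can help to check the chapter-list selector on its own. The sketch below is not part of the original script; it fetches the same index page used in this article and prints a few of the matched dd / col-md-3 entries. Swap in your own URL and tag/class to see whether your selector returns what you expect.

# -*- coding: utf-8 -*-
# Quick check of the chapter-list selector (illustration only, not part of the tested script).
# The URL and the 'dd' / 'col-md-3' selector are the ones used for 去看看小说网 below;
# replace them with the index page and markup of the site you want to crawl.
import urllib2
from bs4 import BeautifulSoup

test_url = 'http://www.7kankan.la/book/1/'      # chapter index page of 剑来
html = urllib2.urlopen(test_url).read()
chapters = BeautifulSoup(html).find_all('dd', {'class': 'col-md-3'})
for dd in chapters[:5]:                          # print a few entries to verify the selector
    print dd.string, dd.find('a')['href']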

# -*- coding: utf-8 -*-
import urllib2
import urlparse
import Queue
import time
from bs4 import BeautifulSoup

# Crawl the whole novel and write it into a txt file
def link_crawler(seed_url):
    firstTitle = BeautifulSoup(download(seed_url)).title.text.split('(')[0]    # novel title, taken from the index page's <title>
    filename = firstTitle + '.txt'
    file = open(filename, 'w+')                 # create a txt file named after the novel
    file.write(firstTitle + '\n\n')
    crawler_queue = Queue.deque([seed_url])     # queue of links still to be crawled
    seen = set(crawler_queue)                   # links that have already been visited
    while crawler_queue:
        url = crawler_queue.pop()               # take the next link
        html = download(url)
        soup = BeautifulSoup(html).find_all('dd', {'class': 'col-md-3'})    # chapter list entries
        for link in soup:
            title = link.string                 # chapter title
            file.write(title + '\n')
            link = link.find('a')['href']
            print title
            if link not in seen:
                # The first link is the author's preface; the "next page" link inside it
                # differs from the regular chapters, so keep its page number to tell them apart
                first = link.split('.')[0]
                link = urlparse.urljoin(url, link)    # turn the relative href into an absolute URL
                html2 = download(link)
                content1 = BeautifulSoup(html2).find(id='htmlContent')    # chapter text (first page)
                neilink = BeautifulSoup(html2).find(id='linkNext')        # "next page" link of the chapter
                bb = neilink['href']
                if first == '1':
                    # the preface is a single page
                    content = content1.text
                    file.write(content + '\n\n')
                else:
                    # regular chapters are split over two pages, so fetch the second half as well
                    html3 = download(bb)
                    content2 = BeautifulSoup(html3).find(id='htmlContent')
                    content1 = content1.text
                    content2 = content2.text
                    file.write(content1 + '\n')
                    file.write(content2 + '\n\n')
                    # content = content1 + content2    (the author notes this concatenation failed, so the two halves are written separately)
                seen.add(link)
                crawler_queue.append(link)
                time.sleep(1)                   # wait one second between requests
    file.close()

# Fetch the HTML of a URL (each line is explained in earlier posts, so not repeated here)
def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print 'downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'download error', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, user_agent, proxy, num_retries - 1)
    return html

seed_url = 'http://www.7kankan.la/book/1/'
# seed_url = 'http://www.biquge5200.com/52_52542/'
link_crawler(seed_url)
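The script targets Python 2 (urllib2, urlparse, Queue and print statements). If you would rather run it under Python 3, the download() helper would look roughly like the sketch below. This is only a rough, untested port that keeps the same retry-on-5xx idea; the rest of the script would need the matching print() and queue changes as well.

# A rough Python 3 sketch of the download() helper above (an untested port, not the article's code).
import urllib.request
import urllib.error
import urllib.parse

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print('downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    opener = urllib.request.build_opener()
    if proxy:
        # route the request through the given proxy for this URL's scheme
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read().decode('utf-8', errors='ignore')
    except urllib.error.URLError as e:
        print('download error', e.reason)
        html = None
        # retry a couple of times on 5xx server errors
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            html = download(url, user_agent, proxy, num_retries - 1)
    return html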

