Web Scraping Practice (Scraping Novels)


Scraping multiple novels

The page to scrape:


http://www.tianyashuku.com/kehuan/

Libraries used: urllib2, re, bs4
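
Note that urllib2 exists only under Python 2, so everything below assumes a Python 2 interpreter. For reference, a rough Python 3 equivalent of the fetch used throughout (a sketch, not part of the original script) would be:

# Rough Python 3 equivalent of the urllib2 fetch below (a sketch, not the original code)
import urllib.request

def get_page(url):
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')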

Get the page source:

    def getPage(self):
        # fetch the page and return its decoded HTML
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        url = self.url
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        return response.read().decode('utf-8')
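
The fetch can be sanity-checked on its own before wiring it into the class (a hypothetical standalone run, assuming Python 2 and network access):

# Standalone check of the same fetch (hypothetical usage)
import urllib2

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
request = urllib2.Request('http://www.tianyashuku.com/kehuan/', headers=headers)
print len(urllib2.urlopen(request).read().decode('utf-8'))  # rough size of the page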

Use a regular expression to get each novel's URL (that is, the novel's table-of-contents page):

    def geturl(self, page):
        # regex out the novel/chapter URLs from the list items
        pattern = re.compile('<li><a href="(.*?)" title=', re.S)
        result = re.findall(pattern, page)
        return result
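
To see what the pattern captures, here is a minimal sketch run against a hand-written list item (the markup and href are invented; the real pages are assumed to look similar):

# Demo of the link regex on invented markup
import re

sample = '<li><a href="/kehuan/santi/" title="Santi">Santi</a></li>'
pattern = re.compile('<li><a href="(.*?)" title=', re.S)
print re.findall(pattern, sample)  # -> ['/kehuan/santi/']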
Get the novel's title from its table-of-contents page:

    def getTitle(self, page):
        # extract the title from the <h1> tag
        pattern = re.compile('<h1>(.*?)</h1>', re.S)
        result = re.search(pattern, page)
        return result.group(1).strip()
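
The re.S flag lets the dot match newlines, so a title broken across lines is still captured; a small sketch with invented markup:

# Demo of the title regex; the <h1> content is invented
import re

sample = '<h1>\n  Santi\n</h1>'
pattern = re.compile('<h1>(.*?)</h1>', re.S)
print re.search(pattern, sample).group(1).strip()  # -> Santi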
Each chapter's URL is obtained with the same geturl method shown above; the captured paths then need the domain prefixed, as sketched below.
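
The captured hrefs are site-relative, so the domain has to be prefixed before fetching, just as the complete code does (the example paths here are invented):

base = "http://www.tianyashuku.com"
for k in ['/kehuan/santi/1.html', '/kehuan/santi/2.html']:  # invented relative paths
    print base + k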


Then use the resulting URLs to fetch the content of every chapter of each novel:

    def neirong(self, page):
        # use bs4 to extract the novel text from the "neirong" div
        soup = BeautifulSoup(page)
        text = soup.find_all("div", class_="neirong")
        for k in text:
            return k.get_text(strip=True)
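
A minimal sketch of the same extraction against hand-written HTML (the markup is invented; the real pages are assumed to wrap the text in a div with class "neirong"):

# Demo of the bs4 extraction on invented markup
from bs4 import BeautifulSoup

sample = '<div class="neirong"> First line. <br/> Second line. </div>'
soup = BeautifulSoup(sample, "html.parser")
for div in soup.find_all("div", class_="neirong"):
    print div.get_text(strip=True)  # -> First line.Second line.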

Set the file name used to save the novel:

    def setFileTitle(self, title):
        if title is not None:
            fileName = re.sub('[\/:*?"<>|]', '-', title)  # strip characters illegal in file names
            self.file = open(fileName + ".txt", "w+")
        else:
            self.file = open('xiaoshuo' + ".txt", "w+")
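
For example, the substitution maps every character Windows forbids in file names to a dash (the title string here is made up):

# Demo of the illegal-character cleanup; the title is invented
import re

title = 'santi: the dark forest?'
print re.sub('[\/:*?"<>|]', '-', title)  # -> santi- the dark forest-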

The complete code:

# -*- coding:utf-8 -*-
import urllib2
import re
from bs4 import BeautifulSoup


class XiaoShuo:
    def __init__(self, url):
        self.url = url

    def geturl(self, page):
        # regex out the novel/chapter URLs from the list items
        pattern = re.compile('<li><a href="(.*?)" title=', re.S)
        result = re.findall(pattern, page)
        return result

    def seturl(self, result):
        # prefix the domain onto each relative URL (defined but unused; start() inlines this)
        for k in result:
            newurl = "http://www.tianyashuku.com" + k

    def getPage(self):
        # fetch the category index page
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        url = self.url
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        return response.read().decode('utf-8')

    def getxiaoshuo(self, url):
        # fetch any page by URL
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        return response.read().decode('utf-8')

    def getTitle(self, page):
        # extract the title from the <h1> tag
        pattern = re.compile('<h1>(.*?)</h1>', re.S)
        result = re.search(pattern, page)
        return result.group(1).strip()

    def setFileTitle(self, title):
        # open an output file named after the title (defined but unused; start() opens files itself)
        if title is not None:
            fileName = re.sub('[\/:*?"<>|]', '-', title)  # strip characters illegal in file names
            self.file = open(fileName + ".txt", "w+")
        else:
            self.file = open('xiaoshuo' + ".txt", "w+")

    def neirong(self, page):
        # use bs4 to extract the novel text from the "neirong" div
        soup = BeautifulSoup(page)
        text = soup.find_all("div", class_="neirong")
        for k in text:
            return k.get_text(strip=True)

    def start(self):
        # entry point: walk every novel, then every chapter, appending to one file per novel
        page = self.getPage()
        URL = self.geturl(page)  # the novels' table-of-contents pages
        print URL
        for m in URL:
            newurl1 = re.sub('" target="_blank', '', m)
            xiaoshuourl = "http://www.tianyashuku.com" + newurl1
            request = urllib2.Request(xiaoshuourl)
            response = urllib2.urlopen(request)
            content = response.read().decode('utf-8')
            m1 = self.geturl(content)
            title = self.getTitle(content)  # the novel's title
            for k in m1:
                newurl = "http://www.tianyashuku.com" + k
                print newurl
                xiaoshuo = self.getxiaoshuo(newurl)
                a = self.getTitle(xiaoshuo)
                b = self.neirong(xiaoshuo)
                chapter = a + '\n' + b + '\n'
                print chapter
                f1 = open(title + ".txt", 'a')
                f1.write(chapter.encode('utf-8'))
                f1.close()


# url = 'http://www.51shucheng.net/kehuan/santi/santi1/174.html'
URL = 'http://www.tianyashuku.com/kehuan/'
html = XiaoShuo(URL)
html.start()
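
Because the index URL is just the constructor argument, pointing the scraper at a different listing only takes one changed line (the category path below is a made-up example):

# Hypothetical: scrape a different category on the same site
html = XiaoShuo('http://www.tianyashuku.com/lishi/')  # invented category URL
html.start()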



