neihan8段子爬取

来源:互联网 发布:淘宝客手机可以开通吗 编辑:程序博客网 时间:2024/05/21 09:20
# coding:utf-8
"""Scrape joke text from neihan8.com listing pages into duanzi.txt.

Originally written for Python 2 (urllib2 / raw_input); the small shims
below keep it runnable on Python 3 as well.  Page content is handled as
raw bytes from download to disk, so no character encoding is guessed.
"""
import contextlib
import re
import time  # kept from the original import list (currently unused)

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the same Request/urlopen API.
    import urllib.request as urllib2

try:
    _read_input = raw_input  # Python 2
except NameError:
    _read_input = input  # Python 3


class Spider(object):
    """Download neihan8.com listing pages and append their jokes to a file."""

    # Browser-like User-Agent header sent with every request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    }
    # Page 1 has its own URL; every other page carries its number.
    FIRST_PAGE_URL = "http://www.neihan8.com/article/index.html"
    PAGE_URL_TEMPLATE = "http://www.neihan8.com/article/index_%s.html"
    # Joke bodies are the contents of <div class="desc">...</div>.  re.S lets
    # "." span newlines.  A bytes pattern is used because the HTTP body is
    # matched without decoding.  Compiled once instead of once per page.
    DESC_PATTERN = re.compile(br'<div\sclass="desc">(.*?)</div>', re.S)
    OUTPUT_PATH = "duanzi.txt"
    # Written after every joke so entries are separated by an empty line.
    SEPARATOR = b"\r\n\r\n"

    def __init__(self):
        """Create a spider; it keeps no per-instance state."""
        pass

    def loadPage(self):
        """Prompt for a page range, then download and save every page in it.

        The first and last page numbers (both inclusive) are read from
        standard input.  For each page the URL is printed, the page is
        downloaded, and the extracted joke bodies are handed to writePage.
        An end page smaller than the start page results in no downloads.

        Raises:
            ValueError: if either answer cannot be parsed as an integer.
            Errors raised by urlopen (e.g. network failures) propagate.
        """
        startNum = int(_read_input("请输入起始页号:"))
        endNum = int(_read_input("请输入结束页号:"))
        for num in range(startNum, endNum + 1):
            url = self._page_url(num)
            print(url)
            html = self._fetch(url)
            self.writePage(self._extract_jokes(html))

    def writePage(self, content_list):
        """Append every item of content_list to duanzi.txt.

        Each item is followed by an empty line (CRLF CRLF).  The file is
        opened in binary append mode so the bytes land on disk exactly as
        given, on every platform.  The file is created even when
        content_list is empty.

        Args:
            content_list: iterable of joke bodies.  Byte strings are written
                unchanged; text (unicode) items are encoded as UTF-8 first.
        """
        with open(self.OUTPUT_PATH, "ab") as f:
            for content in content_list:
                if not isinstance(content, bytes):
                    content = content.encode("utf-8")
                f.write(content + self.SEPARATOR)

    @classmethod
    def _page_url(cls, num):
        """Return the listing URL for page number num."""
        if num == 1:
            return cls.FIRST_PAGE_URL
        return cls.PAGE_URL_TEMPLATE % str(num)

    def _fetch(self, url):
        """Download url with the spider's headers and return the raw body."""
        request = urllib2.Request(url, headers=self.HEADERS)
        # closing() releases the connection even if read() raises.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            return response.read()

    @classmethod
    def _extract_jokes(cls, html):
        """Return the contents of every <div class="desc"> in html (bytes)."""
        return cls.DESC_PATTERN.findall(html)


if __name__ == "__main__":
    Spider().loadPage()