爬取GEEKPARK最近一年活跃会员

来源:互联网 发布:淘宝与03年的非典 编辑:程序博客网 时间:2024/04/29 07:37

最简单的小爬虫:没有使用代理、并发或重试,只是按帖子编号逐页顺序抓取,所以比较慢。

# -*- coding: utf-8 -*-__author__ = 'wangjingyao'from BeautifulSoup import BeautifulSoupimport urllibimport  urllib2import reimport sysimport proxyIPimport  user_agents,random,timeclass JKGY:    def __init__(self):        self.pageIndex = 210714        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36'        self.headers={'User-Agent' : self.user_agent}        self.stories=[]        self.errorFlag = 0    def getPage(self,pageIndex):        try:            url='http://www.geekpark.net/topics/'+str(pageIndex)            # proxy_ip={'http':'27.24.158.155:84'}            # print proxy_ip            # proxy_support=urllib2.ProxyHandler(proxy_ip)            # opener=urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))            # urllib2.install_opener(opener)            # request=urllib2.Request(url)            # user_agent = random.choice(user_agents.user_agents)  #在user_agents中随机取一个做user_agent            # print user_agent            # headers={'User-Agent' : user_agent}            # request.add_header('User-Agent',user_agent) #修改user-Agent字段            # request=urllib2.Request(url,headers = headers)            # response=urllib2.urlopen(request)            request = urllib2.Request(url,headers = self.headers)            response = urllib2.urlopen(request)            reload(sys)            sys.setdefaultencoding('utf8')            pageCode = response.read().encode('gbk','ignore')            return pageCode        except urllib2.URLError,e :            if hasattr(e,'reason'):                print u'connection error:',e.reason                return  None    def getPageItems(self,pageIndex):        pageCode=self.getPage(pageIndex)        if not pageCode:            print 'pageCode init error'+str(self.errorFlag)            self.errorFlag +=1            return  None        # 作者爬取        pattern = re.compile('<span itemprop="author">(.*?)</span>')        items = 
re.findall(pattern,pageCode)        for item in items:            self.stories.append(item)        # 评论会员爬取        partternComment = re.compile('<div class="comment-detail"><a href=".*?">(.*?)</a>')        itemcomments= re.findall(partternComment,pageCode)        for itemcomment in itemcomments:            if itemcomment.decode('gbk') != '极客漫游者':                self.stories.append(itemcomment)    def loadPage(self):            if self.pageIndex <= 213394:#213394                self.getPageItems(self.pageIndex)                self.pageIndex += 1            else:                self.enable=False    def start(self):        self.enable=True        while self.enable:            print '---------------------'            self.loadPage()        # print self.stories        fp =open("out.txt","w")#a是追加        self.stories=list(set(self.stories))        for i in self.stories:            fp.write(i+'\t')        fp.close()spiler = JKGY()spiler.start()

0 0
原创粉丝点击