# Qiushibaike ("qiubai", qiushibaike.com) joke crawler
# Provenance: collected from the internet (programmer-blog aggregator), 2024/05/16 19:16
from BeautifulSoup import BeautifulSoupimport urllib2import urlliboutfile = open("qiubai1.txt", "w")def formalize(text): result = '' lines = text.split(u'\n') for line in lines: line = line.strip() if len(line) == 0: continue result += line + u'\n\n' return resultdef writeIO(text): #text=text+r"\r\n" outfile.write(text) #print >> outfile, text + "\r\n" def qiuBaiDemo(page): url="http://www.qiushibaike.com/hot/page/"+page #print url data = urllib2.urlopen(url).readlines() #print len(data) soup = BeautifulSoup("".join(data)) #print soup.div.content #print soup['title'] #print type(soup) contents = soup.findAll('div',"content") contentss=[] for i in range(0,len(contents)): try: title=contents[i]['title'] contentss.append(contents[i]) except: print "" stories = [str(text) for text in contentss] count=0 for story in stories: count+=1 minisoup = BeautifulSoup(story) text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)]) #text = urllib.unquote(unescape(text, {'"':'"'})) text = formalize(text).encode("gb18030") print >> outfile, '-' * 20 + " %05d " % count + '-' * 20 + "\n" print text writeIO(text) if __name__ == '__main__': page=raw_input('Enter the page you want view : ') qiuBaiDemo(page) outfile.close()