糗百爬虫

来源:互联网 发布:mac文件怎么转word 编辑:程序博客网 时间:2024/05/16 19:16
# Qiushibaike "hot" page scraper (Python 2, BeautifulSoup 3).
#
# Downloads one page of http://www.qiushibaike.com/hot/, extracts the text of
# every <div class="content"> element that carries a ``title`` attribute, and
# writes the stories, separated by numbered rules, to ``qiubai1.txt`` while
# echoing each story to stdout.

from BeautifulSoup import BeautifulSoup
import urllib2
import urllib

# Base URL of the listing; the page number is appended to it.
HOT_PAGE_URL = "http://www.qiushibaike.com/hot/page/"

# Encoding used for both the console echo and the output file.
OUTPUT_ENCODING = "gb18030"

# Opened at import time because writeIO() and qiuBaiDemo() write to it as a
# module-level global; the __main__ section below closes it.
outfile = open("qiubai1.txt", "w")


def formalize(text):
    """Normalize the whitespace of a scraped story.

    Each line of *text* is stripped, blank lines are dropped, and every
    remaining line is followed by one empty line.

    Args:
        text: Unicode text, possibly containing several lines.

    Returns:
        The normalized unicode text; an empty string when *text* holds no
        non-blank line.
    """
    stripped_lines = (line.strip() for line in text.split(u'\n'))
    return u''.join(line + u'\n\n' for line in stripped_lines if line)


def writeIO(text):
    """Append *text* verbatim to the module-level output file."""
    outfile.write(text)


def _titled_content_divs(soup):
    """Return the <div class="content"> tags of *soup* that have a title."""
    titled = []
    for div in soup.findAll('div', "content"):
        try:
            # Item access on a tag raises KeyError when the attribute is
            # absent; only the presence of ``title`` matters here.
            div['title']
        except KeyError:
            continue
        titled.append(div)
    return titled


def _story_text(div):
    """Return the concatenated text nodes found inside *div*."""
    # The tag is serialized and parsed again, exactly as the original script
    # did, so the extracted strings stay the same.
    minisoup = BeautifulSoup(str(div))
    return u''.join(
        node for node in minisoup.recursiveChildGenerator()
        if isinstance(node, unicode)
    )


def qiuBaiDemo(page):
    """Scrape one "hot" listing page and store its stories.

    For every story a numbered separator line is written to ``outfile``,
    then the story text (encoded with ``OUTPUT_ENCODING``) is printed to
    stdout and appended to ``outfile``.

    Args:
        page: Page number to fetch. Anything ``str()`` accepts; surrounding
            whitespace is ignored.

    Raises:
        urllib2.URLError: If the page cannot be downloaded.
    """
    url = HOT_PAGE_URL + str(page).strip()

    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    # Collect all stories first, so that a page without any matching <div>
    # simply produces no output instead of failing on an unbound name.
    stories = _titled_content_divs(BeautifulSoup(html))

    for count, div in enumerate(stories, 1):
        text = formalize(_story_text(div)).encode(OUTPUT_ENCODING)
        # Separator line followed by one blank line.
        outfile.write('-' * 20 + " %05d " % count + '-' * 20 + "\n\n")
        print(text)
        writeIO(text)


if __name__ == '__main__':
    page = raw_input('Enter the page you want view : ')
    try:
        qiuBaiDemo(page)
    finally:
        # Flush and close the output file even if the scrape fails midway.
        outfile.close()