Scraping a novel with a Python crawler

The full script. It downloads the table of contents of a novel on Sina's VIP book site, follows every link whose href starts with /chapter, strips the chapter body out of <div id="contTxt">, and writes the whole book to heike.txt. It targets Python 2 with the legacy BeautifulSoup 3 package.

# -*- coding: utf-8 -*-
# Crawl a serialized novel from vip.book.sina.com.cn chapter by chapter
# and save the whole text to a local file. (Python 2 + BeautifulSoup 3.)
from BeautifulSoup import BeautifulSoup
import urllib2
import sys
import traceback

# Force the default encoding to UTF-8 so mixing str and unicode
# does not raise UnicodeDecodeError (a common Python 2 workaround).
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)


def write1(url, title):
    """Fetch one chapter page and return its title plus plain-text body."""
    print "start ->" + title
    f = urllib2.urlopen(url)
    soup = BeautifulSoup(f.read().decode('utf-8', 'ignore'))
    # The chapter body lives in <div id="contTxt">; strip its markup
    # by replacing the wrapping tags with newlines.
    str1 = '\n\n' + title + '\n\t' + str(soup.find('div', id='contTxt'))
    str1 = str1.replace('<div id="contTxt" class="contTxt1"><p>', '\n')
    str1 = str1.replace('</p><p>', '\n')
    str1 = str1.replace('</p></div>', '')
    return str1


if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; '
                             'en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    url = 'http://vip.book.sina.com.cn/books/142600'    # table of contents
    url2 = 'http://vip.book.sina.com.cn'                # site root for relative links
    url3 = 'http://vip.book.sina.com.cn/chapter/142600/94509.html'  # sample chapter

    # Fetch the table of contents, sending the browser User-Agent
    # (the original defined headers but never used them).
    req = urllib2.Request(url, headers=headers)
    f = urllib2.urlopen(req)
    soup = BeautifulSoup(f.read().decode('utf-8', 'ignore'))

    # Collect every <li> inside every <ul>; chapter links start with /chapter.
    uls = soup.findAll('ul')
    delli = BeautifulSoup(str(uls)).findAll('li')

    str3 = ""
    for i in delli:
        try:
            title = i.a.string
            href = i.a['href']
            if href.startswith('/chapter'):
                str2 = url2 + str(href)
                print title + '\n' + str2
                str3 += write1(str2, title)
            else:
                print 'no'
        except:
            # Some <li> items carry no <a>; log the traceback and skip them.
            print traceback.format_exc()

    # The original opened with mode 'wr+', which is not valid; 'w' is what's meant.
    ff = open('heike.txt', 'w')
    ff.write(str3)
    ff.close()
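The script above depends on Python 2 and the BeautifulSoup 3 API, both long end-of-life. For reference, here is a minimal sketch of the same crawl on Python 3 using requests and bs4; the URLs and the div#contTxt selector are taken from the original script and may no longer resolve, so treat this as an illustration of the approach rather than a tested scraper.

# -*- coding: utf-8 -*-
# Python 3 sketch of the same crawl, using requests + bs4.
# Assumes: pip install requests beautifulsoup4. The Sina URLs and the
# div#contTxt selector come from the original script and may be stale.
import requests
from bs4 import BeautifulSoup

BASE = 'http://vip.book.sina.com.cn'
TOC = BASE + '/books/142600'
HEADERS = {'User-Agent': 'Mozilla/5.0'}


def fetch(url):
    """Download a page and parse it, forcing UTF-8 decoding."""
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.encoding = 'utf-8'
    return BeautifulSoup(resp.text, 'html.parser')


def chapter_text(url, title):
    """Fetch one chapter and return its title plus plain-text body."""
    body = fetch(url).find('div', id='contTxt')
    # get_text() replaces the manual tag-stripping of the Python 2 version.
    text = body.get_text('\n', strip=True) if body else ''
    return '\n\n%s\n\t%s' % (title, text)


def main():
    toc = fetch(TOC)
    parts = []
    # Walk every link on the TOC page and keep only the chapter links.
    for a in toc.find_all('a', href=True):
        if a['href'].startswith('/chapter'):
            parts.append(chapter_text(BASE + a['href'], a.get_text(strip=True)))
    with open('heike.txt', 'w', encoding='utf-8') as fh:
        fh.write(''.join(parts))


if __name__ == '__main__':
    main()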
