使用python抓取小说

来源：互联网　发布：网络祭奠亲人　编辑：程序博客网　时间：2024/04/28 02:34

# coding: utf-8import bs4from bs4 import BeautifulSoupimport urllib2import codecsimport timeimport jsonimport sysdef novelFilter(content):  content=content.replace('<br />\n<br />','')  content=content.replace('<br />','')  content=content.replace(' ','')  content=content.replace('<dd id="contents">','')  content=content.replace('</dd>','')  return contentdef novelFetch(url, title):  novel = urllib2.urlopen(url)  soup = BeautifulSoup(novel.read().decode('gbk', 'ignore'))  contents = '\n' + title + '\n' + str(soup.find('dd', id = 'contents'))  contents = novelFilter(contents)  print title.decode("utf-8")  return contentsdef novelSelect(url, mark):  link = urllib2.urlopen(url)  soup = BeautifulSoup(link.read().decode('gbk', 'ignore'))  body = soup.findAll('td')  flag = False  cont = ''  href = ''  title = ''  for i in body:    try:      href = url + i.a['href']      title = str(i.a.string)      if flag and href:        cont += novelFetch(href, title)      if title.decode("utf-8") == mark:        flag = True    except:      pass  return {    'contents': cont,    'bookmark': title  }def novelManage(info=0):  if info:    f = codecs.open('novel.json', 'w')    f.write(json.dumps(info, indent=2, ensure_ascii=False))    f.close()  else:    f = codecs.open('novel.json', 'r')    info = json.loads(f.read())  return infoif __name__=='__main__':  novels = novelManage()  hasUpdate = False  for title in novels:    novel = novelSelect(novels[title]['url'], novels[title]['bookmark'])    cont = novel['contents']    bookmark = novel['bookmark']    if cont and bookmark:      novels[title]['bookmark'] = bookmark      timesamp = time.strftime("%Y%m%d%H%M", time.localtime())      f = codecs.open(title + '_' + timesamp + '.txt', 'w', 'utf-8') #使用gbk格式      f.write(cont)      f.close()      hasUpdate = True  if not hasUpdate:    print '小说没有更新。'.decode('utf-8')  else:    novelManage(novels)

novel.json（脚本读取并回写的书签配置文件，需与脚本放在同一目录）：

{  "不败战神": {    "url": "http://www.23us.com/html/27/27736/",     "bookmark": "第两百八十八节 唐天的判断"  },   "大主宰": {    "url": "http://www.23us.com/html/28/28373/",     "bookmark": "第一百九十五章 取巧"  },   "神级英雄": {    "url": "http://www.23us.com/html/42/42368/",     "bookmark": "第101章 牵动公会巨头的卷轴"  }}

转自：http://www.oschina.net/code/snippet_254703_25144

运行结果截图：