A First Attempt at Web Crawling


An example of crawling comics from Tencent (腾讯漫画)

# encoding: utf-8
from __future__ import print_function
from __future__ import unicode_literals
import requests
import base64
import re
import json
import os

requestSession = requests.session()

def getId(url):
    numRE = re.compile(r'\d+')  # r means a raw string: backslashes are kept literally, so \d is not mis-parsed as an escape sequence
    id = numRE.findall(url)[0]
    return id

def getContent(id):
    getComicInfoUrl = 'http://m.ac.qq.com/GetData/getComicInfo?id={}'.format(id)
    getComicInfo = requestSession.get(getComicInfoUrl)
    comicInfoJson = getComicInfo.text
    comicInfo = json.loads(comicInfoJson)
    comicName = comicInfo['title']  # parse out the comic title
    getChapterListUrl = 'http://m.ac.qq.com/GetData/getChapterList?id={}'.format(id)
    getChapterList = requestSession.get(getChapterListUrl)
    contentJson = json.loads(getChapterList.text)
    count = contentJson['length']  # number of chapters
    sortedContentList = []
    # arrange the chapters in order
    for i in range(count + 1):
        for item in contentJson:
            if isinstance(contentJson[item], dict) and contentJson[item]['seq'] == i:
                sortedContentList.append({item: contentJson[item]})
                break
    return (comicName, count, sortedContentList)

def getImgList(contentJson, id):
    #cid = contentJson.keys()[0]  # the original author used Python 2
    # In Python 2, dict.keys() returns a list; in Python 3 it returns a dict_keys view,
    # which behaves more like a set and does not support indexing.
    # Workaround: list(dict.keys())[index]
    cid = list(contentJson.keys())[0]
    #getPicHashURL = 'http://m.ac.qq.com/View/mGetPicHash?id={}&cid={}'.format(id, cid)  # the original author's request no longer works
    getPicHashURL = 'http://m.ac.qq.com/chapter/index/id/{}/cid/{}'.format(id, cid)  # this page is loaded asynchronously, so even the modified request is still wrong
    picJsonPage = requestSession.get(getPicHashURL).text
    picJson = json.loads(picJsonPage)
    count = picJson['pCount']  # number of images
    pHash = picJson['pHash']
    sortedImgDictList = []
    for i in range(1, count + 1):
        for item in pHash:
            if pHash[item]['seq'] == i:
                sortedImgDictList.append(pHash[item])
                break
    imgList = []
    for imgDict in sortedImgDictList:
        k = imgDict['cid']
        m = imgDict['pid']
        j = int(id)
        uin = max(j + k + m, 10001)
        l = [j % 1000 // 100, j % 100, j, k]  # use integer division (//); under Python 3, / yields a float and breaks the URL path
        n = '/mif800/' + '/'.join(str(j) for j in l) + '/'
        h = str(m) + '.mif2'
        g = "http://ac.tc.qq.com/store_file_download?buid=15017&uin=" + str(uin) + "&dir_path=" + n + "&name=" + h
        imgList.append(g)
    return imgList

def downloadImg(imgUrlList, contentPath):
    count = len(imgUrlList)
    print('该集漫画共计{}张图片'.format(count))
    i = 1
    for imgUrl in imgUrlList:
        print('\r正在下载第{}张图片...'.format(i), end='')
        imgPath = os.path.join(contentPath, '{0:0>3}.jpg'.format(i))
        downloadRequest = requestSession.get(imgUrl, stream=True)
        with open(imgPath, 'wb') as f:
            for chunk in downloadRequest.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
        i += 1
    print('完毕!\n')

def main():
    url = 'http://m.ac.qq.com/comic/index/id/623622'  # home page of the comic to crawl
    path = 'E:\\WorkSpace\\test'  # local directory for the downloaded images
    if not os.path.isdir(path):
        os.mkdir(path)
    id = getId(url)
    comicName, count, contentList = getContent(id)
    contentNameList = []
    for item in contentList:
        for k in item:
            contentNameList.append(item[k]['t'])
    print('漫画名: {}'.format(comicName))
    print('章节数: {}'.format(count))
    print('章节列表:')
    print('\n'.join(contentNameList))
    comicPath = os.path.join(path, comicName)
    if not os.path.isdir(comicPath):
        os.mkdir(comicPath)
    print()
    i = 0
    for content in contentList:
        print('正在下载第{0:0>4}话: {1}'.format(i + 1, contentNameList[i]))
        contentPath = os.path.join(comicPath, '第{0:0>4}话-{1}'.format(i + 1, contentNameList[i]))
        if not os.path.isdir(contentPath):
            os.mkdir(contentPath)
        imgList = getImgList(content, id)
        downloadImg(imgList, contentPath)
        i += 1

if __name__ == '__main__':
    main()
A second example fetches the same kind of data through the yyhao comic API instead of the Tencent mobile pages:

#coding=utf-8
# reverse-engineer and scrape the image data
"""
http://api.yyhao.com/app_api/v3/getcomicinfo/?comic_id=27284      # returns the catalogue info for a comic
"""
import requests
import base64
import re
import json
import os
import urllib.parse  # urllib.parse.quote is used below, so import the submodule explicitly

# fetch the page source
def getHtmlText(url):
    try:
        head = {}
        head['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
        r = requests.get(url, headers=head)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print("获取网页产生异常")

# get the comic metadata
def getComicInfo(comicId):
    url = 'http://api.yyhao.com/app_api/v3/getcomicinfo/?comic_id={}'.format(comicId)
    info = json.loads(getHtmlText(url))
    comicName = info['comic_name']
    comicAuthor = info['comic_author']
    comicDesc = info['comic_desc']
    chapterList = info['comic_chapter'][1]['chapter_list']
    return (comicName, comicAuthor, comicDesc, chapterList)

# download the image at the given url
def downloadImag(url, picPath, num):
    head = {}
    head['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
    image = requests.get(url, headers=head)
    if image.status_code == 200:
        picPath = picPath + str(num) + '.jpg'
        f = open(picPath, 'wb')
        f.write(image.content)
        f.close()
    else:
        print("下载图片出错!")

# download every image of a comic
def downloadAll(path, chapterList):
    for chapter in chapterList:
        chapterPath = path + chapter['chapter_name'] + '/'
        if not os.path.exists(chapterPath):
            os.mkdir(chapterPath)
        start = chapter['chapter_source'][0]['start_num']
        end = chapter['chapter_source'][0]['end_num']
        ofDomain = chapter['chapter_source'][0]['chapter_domain']
        if ofDomain == '':
            domain = 'mhpic.samanlehua.com'
        else:
            domain = 'mhpic.' + ofDomain
        rule = urllib.parse.quote(chapter['chapter_source'][0]['rule'])
        imageUrl = 'http://' + domain + rule
        imageUrl = imageUrl.replace("%24%24", '{}')
        #imageUrl = urllib.parse.quote(imageUrl)
        while start <= end:
            imageUrlAll = imageUrl.format(start)  #+'-kmw.middle'
            downloadImag(imageUrlAll, chapterPath, start)
            start += 1
        print(chapter['chapter_name'] + "  has downloaded")

if __name__ == '__main__':
    comicId = 3130
    comicName, comicAuthor, comicDesc, chapterList = getComicInfo(comicId)
    print(comicName + '  downloading...')
    # download directory
    path = 'E:/WorkSpace/comic/' + comicName + '/'
    if not os.path.exists(path):
        os.mkdir(path)
    downloadAll(path, chapterList)
    print("All finished!")

Other material
The site www.manhuagui.com loads its content dynamically, so the static extraction approach failed; a quick check is sketched below.
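A minimal sketch of such a check (my own addition, not from the original post; the chapter URL is a placeholder and the <img> pattern is an assumption): fetch the page statically and count how many image URLs actually appear in the raw HTML.

import re
import requests

# placeholder chapter URL on www.manhuagui.com; substitute a real one
url = 'https://www.manhuagui.com/comic/0000/000000.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36'}
html = requests.get(url, headers=headers).text

# if the images are injected by JavaScript, few or no image URLs show up in the static source
imgs = re.findall(r'<img[^>]+src="([^"]+\.(?:jpg|png|webp))"', html)
print('image URLs found in static HTML:', len(imgs))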
Reference blogs
[1] http://blog.sina.com.cn/s/blog_700376570102x80k.html  (static scraping with scrapy)
[2] http://blog.sina.com.cn/s/blog_700376570102x8er.html
[3] http://blog.sina.com.cn/s/blog_700376570102x96k.html  (dynamic scraping)
For data like this, the usual approach is Ajax: the page requests the data asynchronously and then renders it into the page locally. So we suspect there is an Ajax request behind the comic viewer; let's look under the Network tab and see whether the relevant requests show up there.
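If a matching XHR request does show up in the Network tab, it can usually be replayed directly with requests and parsed as JSON, with no page rendering at all. A minimal sketch, reusing the getChapterList endpoint from the first example above (the endpoint and its parameters may have changed since the post was written):

import json
import requests

# chapter-list endpoint observed in the Network tab (same one used in the first example)
ajaxUrl = 'http://m.ac.qq.com/GetData/getChapterList?id=623622'
resp = requests.get(ajaxUrl)
data = json.loads(resp.text)

# inspect the top-level keys to understand the structure of the response
print(list(data.keys()))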

Using PhantomJS

#coding=utf-8
from selenium import webdriver
import time
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# create a PhantomJS-based browser and set a user agent, otherwise the page may not render correctly
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/4.0 (compatible; MSIE 5.5; windows NT)")
browser = webdriver.PhantomJS(desired_capabilities=dcap)

# open the first page of the comic
browser.get("http://ac.qq.com/ComicView/index/id/539443/cid/1")

# scroll down step by step; each simulated scroll triggers loading of the next batch of images
for i in range(10):
    js = 'window.scrollTo(' + str(i * 1280) + ',' + str((i + 1) * 1280) + ')'
    browser.execute_script(js)
    time.sleep(1)

# save a screenshot of the rendered page for inspection
a = browser.get_screenshot_as_file("E:/WorkSpace/Python/test.jpg")

# grab the full page source (which now includes the asynchronously loaded resources)
data = browser.page_source

# write the page source to a local file for later analysis
fh = open("E:/WorkSpace/Python/dongman.html", "w", encoding="utf-8")
fh.write(data)
fh.close()

# once we are done with PhantomJS, close the browser
browser.quit()

# With the regular expression '<img src="(http:..ac.tc.qq.com.store_file_download.buid=.*?name=.*?).jpg"'
# we can extract every comic image URL from the source, then download the images locally with urllib.
import re
import urllib.request  # urllib.request.urlretrieve is used below

# regex that extracts the comic image URLs
pat = '<img src="(http:..ac.tc.qq.com.store_file_download.buid=.*?name=.*?).jpg"'
# find all image URLs
allid = re.compile(pat).findall(data)
for i in range(0, len(allid)):
    # current URL
    thisurl = allid[i]
    # strip the redundant 'amp;' left over from HTML escaping
    thisurl2 = thisurl.replace("amp;", "") + ".jpg"
    # print the URL being downloaded
    print(thisurl2)
    # local path for this image
    localpath = "E:/WorkSpace/dongman/" + str(i) + ".jpg"
    # download the image with urllib
    urllib.request.urlretrieve(thisurl2, filename=localpath)

An example of crawling a novel

#coding=utf-8
# BeautifulSoup can search and traverse HTML tags, but turning an HTML document into readable plain
# text is a browser-rendering job that bs4 generally does not do. Possible workarounds: regular
# expressions, nltk, or the JS innerHTML.
#import nltk
#content = nltk.clean_html(contentHTML)  # newer nltk versions no longer support clean_html()/clean_url()
#[s.extract() for s in contentHTML.find_all('p', text="")]  # removes all empty <p> tags
import requests
from bs4 import BeautifulSoup
#from HTMLParser import HTMLParser  # Python 2
from html.parser import HTMLParser  # Python 3
from re import sub
from sys import stderr
from traceback import print_exc
import os

class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.texts = ""  # accumulates the extracted text

    def handle_data(self, data):
        data = data.strip()  # strip leading/trailing whitespace
        data = '  ' + data   # indent the start of each paragraph
        self.texts += data

    # tag handling
    #def handle_starttag(self, tag, attrs):  # called when an opening tag is encountered
    #def handle_startendtag(self, tag, attrs):  # called for self-closing (start-end) tags
    def handle_endtag(self, tag):  # called when a closing tag is encountered
        if tag == 'p':
            self.texts += '\n\n'

    def text(self):
        return self.texts

# convert HTML to plain text
def dehtml(text):
    try:
        parser = MyHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except:
        print_exc(file=stderr)
        return text

# fetch the page source
def getHTMLText(url):
    try:
        head = {}
        head['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
        r = requests.get(url, headers=head)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "获取网页产生异常"

if __name__ == "__main__":
    print('starting...')
    bookUrl = "http://www.shuhuanghuang.com/book/44674.html"  # home page of the novel
    path = "E:/WorkSpace/novel/"  # download directory
    soup = BeautifulSoup(getHTMLText(bookUrl), 'lxml')
    infoSoup = soup.find('div', class_='info-box')
    bookTitle = infoSoup.find('p', class_='title').text
    bookDescription = infoSoup.get_text()
    bookPath = path + bookTitle
    if not os.path.exists(bookPath):  # note: negation in Python is `not`, not `!`
        os.makedirs(bookPath)
    descriptionFile = bookPath + '/' + 'Description.txt'
    if not os.path.exists(descriptionFile):
        open(descriptionFile, 'w+').close()
    f = open(descriptionFile, 'w', encoding='utf-8')  # write as UTF-8 to avoid platform-dependent encoding errors
    f.write(bookDescription)
    f.close()
    print('downloading...')
    chapterUrl = soup.find('a', class_='read', text='阅读')['href']  # URL of the first chapter
    hasNext = True
    while hasNext:
        chapterSoup = BeautifulSoup(getHTMLText(chapterUrl), 'lxml')
        title = chapterSoup.find('div', class_='title').string  # chapter title
        title = title.replace('\r\n', '').replace('\t', '')
        contentHTML = chapterSoup.find('div', class_='content')
        contentStr = contentHTML.prettify().replace('\n', '').replace('<p> </p>', '')
        content = dehtml(contentStr)
        chapterPath = bookPath + '/' + title + '.txt'
        if not os.path.exists(chapterPath):
            open(chapterPath, 'w+').close()
        f = open(chapterPath, 'w', encoding='utf-8')
        f.write(content)
        f.close()
        print(title + '  has downloaded!')
        nextChapter = chapterSoup.find('a', class_='next')
        if not nextChapter:
            hasNext = False
            break
        chapterUrl = nextChapter['href']
    print('All finished!')