Dynamic Web Page Scraping with PhantomJS + Selenium on Python 3


Scraping Toutiao (今日头条) with Python 3 + Scrapy + PhantomJS + Selenium

In the course of building a crawler, we inevitably run into sites whose content is generated by dynamic techniques such as JavaScript and Ajax; Toutiao is a good example.

This article introduces a dynamic-page scraping technique based on Python 3 with the Scrapy + PhantomJS + Selenium stack.

The two projects implemented in this article have been uploaded to GitHub — stars appreciated!
1. Scraping Toutiao news-list URLs:
2. Scraping Toutiao news content:

For static-page scraping techniques and setting up a crawler environment on Windows, see the previous posts; the required installers were also provided in the previous post.

This article shows how to scrape news content with PhantomJS + Selenium; scraping the news-list URLs works on the same principle and is not repeated here.

Project Structure

(Screenshot of the project directory structure.)

How It Works

The code is written in Python 3, with Scrapy as the underlying crawling framework. Because the target pages are dynamic — the full page is not delivered up front but is generated on the fly via Ajax and similar techniques — we use PhantomJS + Selenium to run a headless browser that simulates user actions and captures the rendered page source.
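To make the idea concrete before walking through the project code, here is a minimal standalone sketch of rendering a dynamic page with PhantomJS + Selenium (the executable path matches the one used later in the middleware; the target URL is illustrative):

from selenium import webdriver

# Launch a headless PhantomJS browser
# (path as used later in the middleware; adjust to your own install)
driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe")

# Fetch a JS/Ajax-rendered page (URL is illustrative)
driver.get("https://www.toutiao.com/")

# Give asynchronous requests time to finish before reading the DOM
driver.implicitly_wait(10)

# page_source now holds the rendered HTML, not just the initial payload
html = driver.page_source
driver.quit()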

Code File Overview

From top to bottom, the project files are:

  • middleware.py: the core of the project; a downloader middleware that simulates a user operating a browser while Scrapy processes each request
  • ContentSpider.py: the spider class file, which defines the crawler
  • commonUtils: utility functions
  • items.py: the item class that stores the scraped fields
  • pipelines.py: the class that processes the scraped data

These five files hold the key classes; the remaining code is business-specific.

Key Code Walkthrough

  1. middleware.py
# douguo request middleware
# for the page which loaded by js/ajax
# any changes should be recorded here:
#
# @author zhangjianfei
# @date 2017/05/04
from selenium import webdriver
from scrapy.http import HtmlResponse
from DgSpiderPhantomJS import settings
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import random


class JavaScriptMiddleware(object):
    print("LOGS  Starting Middleware  ...")

    def process_request(self, request, spider):
        print("LOGS:  process_request is starting  ...")
        # Configure the headless browser
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Pick a random user agent
        dcap["phantomjs.page.settings.userAgent"] = random.choice(settings.USER_AGENTS)
        # Launch PhantomJS
        driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe",
                                     desired_capabilities=dcap)
        # Abort page loads that take longer than 60 seconds
        driver.set_page_load_timeout(60)
        # Abort scripts that run longer than 60 seconds
        driver.set_script_timeout(60)
        # Fetch the requested page
        driver.get(request.url)
        # Execute JS to simulate user behavior:
        # scroll the page down (scrollTop=10000) to trigger lazy loading
        js = "document.body.scrollTop=10000"
        driver.execute_script(js)
        # Wait for asynchronous requests to finish
        driver.implicitly_wait(20)
        # Grab the rendered page source
        body = driver.page_source
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
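Note that the middleware and pipeline only take effect once they are registered in the project's settings.py, which the article does not show. A minimal sketch, assuming the class paths implied by the project structure above (the priority numbers and USER_AGENTS entries are illustrative):

# settings.py (sketch; not shown in the original article)

# Route every request through the PhantomJS middleware
DOWNLOADER_MIDDLEWARES = {
    'DgSpiderPhantomJS.middleware.JavaScriptMiddleware': 543,
}

# Enable the pipeline that post-processes and stores scraped items
ITEM_PIPELINES = {
    'DgSpiderPhantomJS.pipelines.DgspiderphantomjsPipeline': 300,
}

# User agents for the middleware to pick from at random (illustrative entries)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30',
]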
  2. ContentSpider.py
# -*- coding: utf-8 -*-
import scrapy
import random
import time
from DgSpiderPhantomJS.items import DgspiderPostItem
from scrapy.selector import Selector
from DgSpiderPhantomJS import urlSettings
from DgSpiderPhantomJS import contentSettings
from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status
from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl


class DgContentSpider(scrapy.Spider):
    print('LOGS: Spider Content_Spider Starting  ...')

    # Sleep a random interval to throttle crawling
    sleep_time = random.randint(60, 90)
    print("LOGS: Sleeping :" + str(sleep_time))
    time.sleep(sleep_time)

    # Get the next URL to crawl from the database
    result = dbhandle_geturl()
    url = result[0]
    # spider_name = result[1]
    site = result[2]
    gid = result[3]
    module = result[4]

    # Set the spider name
    name = 'Content_Spider'
    # name = 'DgUrlSpiderPhantomJS'

    # Restrict crawling to the target domain
    allowed_domains = [site]

    # Seed Scrapy with the URL fetched from the database
    start_urls = [url]

    # Set status to 1 whether or not the crawl succeeds,
    # to avoid an infinite loop
    dbhandle_update_status(url, 1)

    # scrapy crawl
    def parse(self, response):
        # Initialize the item
        item = DgspiderPostItem()

        # Wrap the page source in a Selector
        sel = Selector(response)
        print(sel)

        # Extract the post title
        title_date = sel.xpath(contentSettings.POST_TITLE_XPATH)
        item['title'] = title_date.xpath('string(.)').extract()

        # Extract the post body HTML
        item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract()

        # Record the URL
        item['url'] = DgContentSpider.url

        yield item
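The spider depends on dbhandle_geturl and dbhandle_update_status from mysqlUtils, which the article does not list. Based on how the spider consumes the result tuple, a hypothetical sketch of those two helpers (database, table, and column names are assumptions, not from the original project):

import pymysql

# Hypothetical sketch: the original mysqlUtils is not shown in the article.
# Assumes a url table with (url, spider_name, site, gid, module, status) columns.

def _get_connection():
    return pymysql.connect(host='localhost', user='root', password='',
                           db='spider', charset='utf8')

def dbhandle_geturl():
    # Fetch one uncrawled record as (url, spider_name, site, gid, module)
    conn = _get_connection()
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "SELECT url, spider_name, site, gid, module "
                "FROM url WHERE status = 0 LIMIT 1")
            return cursor.fetchone()
    finally:
        conn.close()

def dbhandle_update_status(url, status):
    # Mark a URL so it is not crawled again
    conn = _get_connection()
    try:
        with conn.cursor() as cursor:
            cursor.execute("UPDATE url SET status = %s WHERE url = %s",
                           (status, url))
        conn.commit()
    finally:
        conn.close()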
  3. commonUtils
import random
from hashlib import md5


# Pick a random poster ID from a comma-separated string of IDs
def get_random_user(user_str):
    user_list = []
    for user_id in str(user_str).split(','):
        user_list.append(user_id)
    userid_idx = random.randint(1, len(user_list))
    user_chosen = user_list[userid_idx - 1]
    return user_chosen


# Get the MD5 hash of a URL
def get_linkmd5id(url):
    # Hash the URL with MD5 to avoid collecting the same page twice
    md5_url = md5(url.encode("utf8")).hexdigest()
    return md5_url
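For illustration, the two helpers behave as follows (the user IDs and URL are made up):

user_id = get_random_user('1001,1002,1003')   # e.g. '1002'
link_id = get_linkmd5id('https://www.toutiao.com/a6412345678901234567/')
print(user_id, link_id)   # link_id is a 32-character hex digest

Incidentally, get_random_user could be written more simply as random.choice(str(user_str).split(',')).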
  4. items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DgspiderUrlItem(scrapy.Item):
    url = scrapy.Field()


class DgspiderPostItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
  5. pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import re
import datetime
import urllib.request
from DgSpiderPhantomJS import urlSettings
from DgSpiderPhantomJS import contentSettings
from DgSpiderPhantomJS.mysqlUtils import dbhandle_insert_content
from DgSpiderPhantomJS.uploadUtils import uploadImage
from DgSpiderPhantomJS.mysqlUtils import dbhandle_online
from DgSpiderPhantomJS.PostHandle import post_handel
from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status
from bs4 import BeautifulSoup
from DgSpiderPhantomJS.commonUtils import get_random_user
from DgSpiderPhantomJS.commonUtils import get_linkmd5id


class DgspiderphantomjsPipeline(object):
    # Fragments for building the post reply
    cs = []
    # Post title
    title = ''
    # Post body text
    text = ''
    # URL currently being crawled
    url = ''
    # Random user ID
    user_id = ''
    # Image flag
    has_img = 0
    # Flag: has the title been extracted yet?
    get_title_flag = 0

    def __init__(self):
        DgspiderphantomjsPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER)

    # Process the scraped data
    def process_item(self, item, spider):
        self.get_title_flag += 1

        # Record the current page URL
        DgspiderphantomjsPipeline.url = item['url']

        # Get the post title
        if len(item['title']) == 0:
            title_tmp = ''
        else:
            title_tmp = item['title'][0]

        # Replace characters in the title that could break the SQL syntax.
        # For paginated articles, keep only the first page's title.
        if self.get_title_flag == 1:
            # Normalize the title with BeautifulSoup
            soup_title = BeautifulSoup(title_tmp, "lxml")
            title = ''
            # Don't use .prettify() on the parsed tree: it puts every tag on
            # its own line, producing extra spaces and line breaks.
            # Use stripped_strings to collect the text instead.
            for string in soup_title.stripped_strings:
                title += string
            title = title.replace("'", "”").replace('"', '“')
            DgspiderphantomjsPipeline.title = title

        # Get the post body
        if len(item['text']) == 0:
            text_temp = ''
        else:
            text_temp = item['text'][0]
        soup = BeautifulSoup(text_temp, "lxml")
        text_temp = str(soup)

        # Extract the images
        reg_img = re.compile(r'<img.*?>')
        imgs = reg_img.findall(text_temp)
        for img in imgs:
            DgspiderphantomjsPipeline.has_img = 1
            # match_obj = re.search('.*src="(.*)"{2}.*', img, re.M | re.I)
            match_obj = re.search('.*src="(.*)".*', img, re.M | re.I)
            img_url_tmp = match_obj.group(1)

            # Strip the "http:" prefix
            img_url_tmp = img_url_tmp.replace("http:", "")

            # Handle cases like <img src="http://a.jpg" title="a.jpg">:
            # keep only the part before the next double quote
            imgUrl_tmp_list = img_url_tmp.split('"')
            img_url_tmp = imgUrl_tmp_list[0]

            # Re-add the "http:" prefix
            imgUrl = 'http:' + img_url_tmp
            list_name = imgUrl.split('/')
            file_name = list_name[len(list_name) - 1]

            # if os.path.exists(settings.IMAGES_STORE):
            #     os.makedirs(settings.IMAGES_STORE)

            # Local path for the downloaded image
            file_path = contentSettings.IMAGES_STORE + file_name

            # Download the image locally, then upload it
            urllib.request.urlretrieve(imgUrl, file_path)
            upload_img_result_json = uploadImage(file_path, 'image/jpeg', DgspiderphantomjsPipeline.user_id)

            # Server-side image URL, width, and height returned by the upload
            img_u = upload_img_result_json['result']['image_url']
            img_w = upload_img_result_json['result']['w']
            img_h = upload_img_result_json['result']['h']
            img_upload_flag = str(img_u) + ';' + str(img_w) + ';' + str(img_h)

            # Replace the original <img> tag with marker tags around the upload info
            text_temp = text_temp.replace(img, '[dgimg]' + img_upload_flag + '[/dgimg]')

        # Drop <strong> tags
        text_temp = text_temp.replace('<strong>', '').replace('</strong>', '')

        # Normalize the HTML with BeautifulSoup; as with the title, avoid
        # .prettify() and use stripped_strings to collect the text
        soup = BeautifulSoup(text_temp, "lxml")
        text = ''
        for string in soup.stripped_strings:
            text += string + '\n\n'

        # Replace double quotes with Chinese quotes to avoid MySQL syntax errors
        DgspiderphantomjsPipeline.text = self.text + text.replace('"', '“')

        return item

    # Called when the spider is opened
    def open_spider(self, spider):
        pass

    # Called when the spider is closed
    def close_spider(self, spider):
        # Write the record to the database
        url = DgspiderphantomjsPipeline.url
        title = DgspiderphantomjsPipeline.title
        content = DgspiderphantomjsPipeline.text
        user_id = DgspiderphantomjsPipeline.user_id
        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        dbhandle_insert_content(url, title, content, user_id,
                                DgspiderphantomjsPipeline.has_img, create_time)

        # Post-process the text, set the status, and upload to dgCommunity.dg_post.
        # Only upload the post if has_img is 1.
        if DgspiderphantomjsPipeline.has_img == 1:
            if title.strip() != '' and content.strip() != '':
                spider.logger.info('status=2 , has_img=1, title and content is not null! Uploading post into db...')
                post_handel(url)
            else:
                spider.logger.info('status=1 , has_img=1, but title or content is null! ready to exit...')
            pass
        else:
            spider.logger.info('status=1 , has_img=0, changing status and ready to exit...')
            pass
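To make the image-replacement step concrete, here is what that loop produces for a single tag (the URLs and dimensions are illustrative, not real upload results):

# Illustrative: what the image-replacement loop does to one <img> tag
text_temp = '<p>Hello <img src="http://p1.pstatp.com/large/abc.jpg"></p>'
# Suppose the upload returned image_url='http://img.example.com/abc.jpg', w=640, h=480
img_upload_flag = 'http://img.example.com/abc.jpg;640;480'
text_temp = text_temp.replace('<img src="http://p1.pstatp.com/large/abc.jpg">',
                              '[dgimg]' + img_upload_flag + '[/dgimg]')
print(text_temp)
# -> <p>Hello [dgimg]http://img.example.com/abc.jpg;640;480[/dgimg]</p>

With everything wired up, the crawl is launched with scrapy crawl Content_Spider, matching the name attribute defined in ContentSpider.py.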