基于Python3的phantomJs+Selenium动态网页爬取技术
来源:互联网 发布:韩国网络剧推荐 编辑:程序博客网 时间:2024/05/17 08:26
Python3+Scrapy+phantomJs+Selenium爬取今日头条
在实现爬虫的过程中,我们不可避免的会爬取又JS以及Ajax等动态网页技术生成网页内容的网站,今日头条就是一个很好的例子。
本文所要介绍的是基于Python3,配合Scrapy+phantomjs+selenium框架的动态网页爬取技术。
本文所实现的2个项目已上传至Github中,求Star~
1. 爬取今日头条新闻列表URL:
2. 爬取今日头条新闻内容:静态网页爬取技术以及windows下爬虫环境搭建移步上几篇博客,必要的安装软件也在上一篇博客中提供。
本文介绍使用PhantongJs + Selenium实现新闻内容的爬取,爬取新闻列表的url也是相同的原理,不再赘述。
项目结构
项目原理
底层代码使用Python3,网络爬虫基础框架采用Scrapy,由于爬取的是动态网页,整个网页并不是直接生成页面,动过Ajax等技术动态生成。所以这里考虑采用 PhantomJs+Selenium模拟实现一个无界面的浏览器,去模拟用户操作,抓取网页代码内容。
代码文件说明
项目结构从上到下依次为:
- middleware.py:整个项目的核心,用于启动中间件,在Scrapy抓取调用request的过程中实现模拟用户操作浏览器
- ContentSpider.py:爬虫类文件,定义爬虫
- commonUtils:工具类
- items.py:爬虫所抓取到的字段存储类
- pipelines.py:抓取到的数据处理类
这5个为关键类代码,其余的代码为业务相关代码。
关键代码讲解
- middleware.py
# douguo request middleware# for the page which loaded by js/ajax# ang changes should be recored here:## @author zhangjianfei# @date 2017/05/04from selenium import webdriverfrom scrapy.http import HtmlResponsefrom DgSpiderPhantomJS import settingsfrom selenium.webdriver.common.desired_capabilities import DesiredCapabilitiesimport timeimport randomclass JavaScriptMiddleware(object): print("LOGS Starting Middleware ...") def process_request(self, request, spider): print("LOGS: process_request is starting ...") # 开启虚拟浏览器参数 dcap = dict(DesiredCapabilities.PHANTOMJS) # 设置agents dcap["phantomjs.page.settings.userAgent"] = (random.choice(settings.USER_AGENTS)) # 启动phantomjs driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe", desired_capabilities=dcap) # 设置60秒页面超时返回 driver.set_page_load_timeout(60) # 设置60秒脚本超时时间 driver.set_script_timeout(60) # get page request driver.get(request.url) # simulate user behavior js = "document.body.scrollTop=10000" driver.execute_script(js) # 可执行js,模仿用户操作。此处为将页面拉至1000。 # 等待异步请求响应 driver.implicitly_wait(20) # 获取页面源码 body = driver.page_source return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
- ContentSpider.py
# -*- coding: utf-8 -*-import scrapyimport randomimport timefrom DgSpiderPhantomJS.items import DgspiderPostItemfrom scrapy.selector import Selectorfrom DgSpiderPhantomJS import urlSettingsfrom DgSpiderPhantomJS import contentSettingsfrom DgSpiderPhantomJS.mysqlUtils import dbhandle_update_statusfrom DgSpiderPhantomJS.mysqlUtils import dbhandle_geturlclass DgContentSpider(scrapy.Spider): print('LOGS: Spider Content_Spider Staring ...') sleep_time = random.randint(60, 90) print("LOGS: Sleeping :" + str(sleep_time)) time.sleep(sleep_time) # get url from db result = dbhandle_geturl() url = result[0] # spider_name = result[1] site = result[2] gid = result[3] module = result[4] # set spider name name = 'Content_Spider' # name = 'DgUrlSpiderPhantomJS' # set domains allowed_domains = [site] # set scrapy url start_urls = [url] # change status """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" dbhandle_update_status(url, 1) # scrapy crawl def parse(self, response): # init the item item = DgspiderPostItem() # get the page source sel = Selector(response) print(sel) # get post title title_date = sel.xpath(contentSettings.POST_TITLE_XPATH) item['title'] = title_date.xpath('string(.)').extract() # get post page source item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() # get url item['url'] = DgContentSpider.url yield item
- commonUtils
import randomfrom hashlib import md5# 获取随机发帖IDdef get_random_user(user_str): user_list = [] for user_id in str(user_str).split(','): user_list.append(user_id) userid_idx = random.randint(1, len(user_list)) user_chooesd = user_list[userid_idx-1] return user_chooesd# 获取MD5加密URLdef get_linkmd5id(url): # url进行md5处理,为避免重复采集设计 md5_url = md5(url.encode("utf8")).hexdigest() return md5_url
- items.py
# -*- coding: utf-8 -*-# Define here the models for your scraped items## See documentation in:# http://doc.scrapy.org/en/latest/topics/items.htmlimport scrapyclass DgspiderUrlItem(scrapy.Item): url = scrapy.Field()class DgspiderPostItem(scrapy.Item): url = scrapy.Field() title = scrapy.Field() text = scrapy.Field()
- pipelines.py
# -*- coding: utf-8 -*-# Define your item pipelines here## Don't forget to add your pipeline to the ITEM_PIPELINES setting# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.htmlimport reimport datetimeimport urllib.requestfrom DgSpiderPhantomJS import urlSettingsfrom DgSpiderPhantomJS import contentSettingsfrom DgSpiderPhantomJS.mysqlUtils import dbhandle_insert_contentfrom DgSpiderPhantomJS.uploadUtils import uploadImagefrom DgSpiderPhantomJS.mysqlUtils import dbhandle_onlinefrom DgSpiderPhantomJS.PostHandle import post_handelfrom DgSpiderPhantomJS.mysqlUtils import dbhandle_update_statusfrom bs4 import BeautifulSoupfrom DgSpiderPhantomJS.commonUtils import get_random_userfrom DgSpiderPhantomJS.commonUtils import get_linkmd5idclass DgspiderphantomjsPipeline(object): # post构造reply cs = [] # 帖子title title = '' # 帖子文本 text = '' # 当前爬取的url url = '' # 随机用户ID user_id = '' # 图片flag has_img = 0 # get title flag get_title_flag = 0 def __init__(self): DgspiderphantomjsPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER) # process the data def process_item(self, item, spider): self.get_title_flag += 1 # 获取当前网页url DgspiderphantomjsPipeline.url = item['url'] # 获取post title if len(item['title']) == 0: title_tmp = '' else: title_tmp = item['title'][0] # 替换标题中可能会引起 sql syntax 的符号 # 对于分页的文章,只取得第一页的标题 if self.get_title_flag == 1: # 使用beautifulSoup格什化标题 soup_title = BeautifulSoup(title_tmp, "lxml") title = '' # 对于bs之后的html树形结构,不使用.prettify(),对于bs, prettify后每一个标签自动换行,造成多个、 # 多行的空格、换行,使用stripped_strings获取文本 for string in soup_title.stripped_strings: title += string title = title.replace("'", "”").replace('"', '“') DgspiderphantomjsPipeline.title = title # 获取正post内容 if len(item['text']) == 0: text_temp = '' else: text_temp = item['text'][0] soup = BeautifulSoup(text_temp, "lxml") text_temp = str(soup) # 获取图片 reg_img = re.compile(r'<img.*?>') imgs = reg_img.findall(text_temp) for img in imgs: DgspiderphantomjsPipeline.has_img = 1 # matchObj = re.search('.*src="(.*)"{2}.*', img, re.M | re.I) match_obj = re.search('.*src="(.*)".*', img, re.M | re.I) img_url_tmp = match_obj.group(1) # 去除所有Http:标签 img_url_tmp = img_url_tmp.replace("http:", "") # 对于<img src="http://a.jpg" title="a.jpg">这种情况单独处理 imgUrl_tmp_list = img_url_tmp.split('"') img_url_tmp = imgUrl_tmp_list[0] # 加入http imgUrl = 'http:' + img_url_tmp list_name = imgUrl.split('/') file_name = list_name[len(list_name)-1] # if os.path.exists(settings.IMAGES_STORE): # os.makedirs(settings.IMAGES_STORE) # 获取图片本地存储路径 file_path = contentSettings.IMAGES_STORE + file_name # 获取图片并上传至本地 urllib.request.urlretrieve(imgUrl, file_path) upload_img_result_json = uploadImage(file_path, 'image/jpeg', DgspiderphantomjsPipeline.user_id) # 获取上传之后返回的服务器图片路径、宽、高 img_u = upload_img_result_json['result']['image_url'] img_w = upload_img_result_json['result']['w'] img_h = upload_img_result_json['result']['h'] img_upload_flag = str(img_u)+';'+str(img_w)+';'+str(img_h) # 在图片前后插入字符标记 text_temp = text_temp.replace(img, '[dgimg]' + img_upload_flag + '[/dgimg]') # 替换<strong>标签 text_temp = text_temp.replace('<strong>', '').replace('</strong>', '') # 使用beautifulSoup格什化HTML soup = BeautifulSoup(text_temp, "lxml") text = '' # 对于bs之后的html树形结构,不使用.prettify(),对于bs, prettify后每一个标签自动换行,造成多个、 # 多行的空格、换行 for string in soup.stripped_strings: text += string + '\n\n' # 替换因为双引号为中文双引号,避免 mysql syntax DgspiderphantomjsPipeline.text = self.text + text.replace('"', '“') return item # spider开启时被调用 def open_spider(self, spider): pass # sipder 关闭时被调用 def close_spider(self, spider): # 数据入库:235 url = DgspiderphantomjsPipeline.url title = DgspiderphantomjsPipeline.title content = DgspiderphantomjsPipeline.text user_id = DgspiderphantomjsPipeline.user_id create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') dbhandle_insert_content(url, title, content, user_id, DgspiderphantomjsPipeline.has_img, create_time) # 处理文本、设置status、上传至dgCommunity.dg_post # 如果判断has_img为1,那么上传帖子 if DgspiderphantomjsPipeline.has_img == 1: if title.strip() != '' and content.strip() != '': spider.logger.info('status=2 , has_img=1, title and content is not null! Uploading post into db...') post_handel(url) else: spider.logger.info('status=1 , has_img=1, but title or content is null! ready to exit...') pass else: spider.logger.info('status=1 , has_img=0, changing status and ready to exit...') pass
阅读全文
1 0
- 基于Python3的phantomJs+Selenium动态网页爬取技术
- selenium+ phantomjs实现动态网页爬取
- selenium+ Phantomjs爬取动态网页
- 动态网页爬取例子(WebCollector+selenium+phantomjs)
- python+selenium+PhantomJS爬取网页动态加载内容
- 基于Python,Selenium和PhantomJS实现动态页面爬取
- Selenium + phantomJS 爬取动态网站
- [Python爬虫]Scrapy配合Selenium和PhantomJS爬取动态网页
- Scrapy+phantomjs爬取动态网页数据
- Python使用Selenium和PhantomJS解析动态JS的网页
- Python使用Selenium和PhantomJS解析动态JS的网页
- Selenium+PhantomJS 爬取页面
- Selenium+PhantomJS爬取淘宝
- python3 scrapy 使用PhantomJS作为middlewares爬取动态加载的数据
- selenium+phantomjs+js动态网页截图
- Node.js 动态网页爬取 PhantomJS 使用入门
- Python3+phantomjs+selenium配置
- 基于phantomJs的Java后台网页截图技术
- 管道
- NIO-缓存-通道-选择器
- HEVC专业术语对译
- Android动画 SVG VectorDrawable 基础二
- 数据挖掘考试重点(条理版)
- 基于Python3的phantomJs+Selenium动态网页爬取技术
- F1V3.0-图形-自动成图-系统图算法介绍
- 【编程语言】C#中字符串的内存分配
- JVM里java对象的创建及内存分配和访问定位
- Linux下安装python-opencv ,亲测可用
- 个人博客
- 【HOJ 1867】经理的烦恼(树状数组)
- numpy库中一维矩阵的一些坑点
- 虚拟机的安装