cve_details按照时间爬取(pyspider)
来源:互联网 发布:ddos攻击器软件 编辑:程序博客网 时间:2024/05/17 09:27
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-08-11 10:19:23
#Created by zhangguodong
# Project: CVE_Details
from pyspider.libs.base_handler import *
import re
from collections import defaultdict
#url = "www.cvedetails.com/cve/CVE-2007-6593/"
#pattern = "<table class=\"listtable\" .*?>.*?<tr.*?>.*?</tr>.*?(<tr.*?>.*?</tr>)+</table>"
class Handler(BaseHandler):
    """pyspider handler that crawls cvedetails.com one year at a time.

    Callback chain: ``on_start`` queues one listing URL per year,
    ``index_page`` follows the links found under ``#pagingb``,
    ``list_page`` follows links that look like CVE detail URLs, and
    ``detail_page`` extracts the result record.
    """

    crawl_config = {
    }

    # Used with re.match, so only the beginning of the URL has to fit.
    # Raw string with escaped dots; both http and https links are accepted.
    _CVE_URL_PATTERN = r"https?://www\.cvedetails\.com/cve/CVE-\w+"

    # detail_page walks every <td> of #vulnprodstable with a 1-based counter
    # and classifies cells by ``counter % 9``.
    # NOTE(review): this presumes each table row has exactly 9 cells, with
    # the product name in the 4th and the version in the 5th -- confirm
    # against the live page layout.
    _CELLS_PER_ROW = 9
    _PRODUCT_CELL = 4
    _VERSION_CELL = 5

    def __init__(self):
        self.base_url = 'http://www.cvedetails.com/vulnerability-list/year-'
        # Inclusive range of years whose listings are crawled.
        self.start_year = 1999
        self.last_year = 2017

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the vulnerability listing page of every configured year.

        A local loop variable is used instead of incrementing
        ``self.start_year``; mutating the attribute would leave it past
        ``self.last_year`` after the first run, so a later run on the same
        instance would queue nothing.
        """
        for year in range(self.start_year, self.last_year + 1):
            url = self.base_url + str(year) + '/vulnerabilities.html'
            self.crawl(url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every link directly under ``#pagingb`` to ``list_page``."""
        for each in response.doc('#pagingb>a').items():
            href = each.attr.href
            # Skip anchors without a target instead of crawling ``None``.
            if href:
                self.crawl(href, callback=self.list_page)

    def list_page(self, response):
        """Queue ``detail_page`` for every link matching the CVE URL pattern."""
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if re.match(self._CVE_URL_PATTERN, href, re.U):
                self.crawl(href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one result record from a CVE detail page.

        Returns a dict with the CVE id, a list of ``(product, version)``
        tuples collected from ``#vulnprodstable``, the text of the table's
        2nd and 3rd cells (stored as "Product Type" and "Vendor") and the
        page URL.
        """
        products = []
        versions = []
        cells = response.doc('#vulnprodstable td').items()
        for position, cell in enumerate(cells, 1):
            slot = position % self._CELLS_PER_ROW
            if slot == self._VERSION_CELL:
                versions.append(cell.text())
            elif slot == self._PRODUCT_CELL:
                products.append(cell.text())

        return {
            "cve_id": response.doc('#cvedetails>h1>a').text(),
            # zip truncates to the shorter list if the counts ever differ.
            "(product,version)": list(zip(products, versions)),
            "Product Type": response.doc('#vulnprodstable td ').eq(1).text(),
            "Vendor": response.doc('#vulnprodstable td ').eq(2).text(),
            "url": response.url,
        }
# -*- encoding: utf-8 -*-
# Created on 2017-08-11 10:19:23
#Created by zhangguodong
# Project: CVE_Details
from pyspider.libs.base_handler import *
import re
from collections import defaultdict
#url = "www.cvedetails.com/cve/CVE-2007-6593/"
#pattern = "<table class=\"listtable\" .*?>.*?<tr.*?>.*?</tr>.*?(<tr.*?>.*?</tr>)+</table>"
class Handler(BaseHandler):
    """pyspider handler that crawls cvedetails.com one year at a time.

    Callback chain: ``on_start`` queues one listing URL per year,
    ``index_page`` follows the links found under ``#pagingb``,
    ``list_page`` follows links that look like CVE detail URLs, and
    ``detail_page`` extracts the result record.
    """

    crawl_config = {
    }

    # Used with re.match, so only the beginning of the URL has to fit.
    # Raw string with escaped dots; both http and https links are accepted.
    _CVE_URL_PATTERN = r"https?://www\.cvedetails\.com/cve/CVE-\w+"

    # detail_page walks every <td> of #vulnprodstable with a 1-based counter
    # and classifies cells by ``counter % 9``.
    # NOTE(review): this presumes each table row has exactly 9 cells, with
    # the product name in the 4th and the version in the 5th -- confirm
    # against the live page layout.
    _CELLS_PER_ROW = 9
    _PRODUCT_CELL = 4
    _VERSION_CELL = 5

    def __init__(self):
        self.base_url = 'http://www.cvedetails.com/vulnerability-list/year-'
        # Inclusive range of years whose listings are crawled.
        self.start_year = 1999
        self.last_year = 2017

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue the vulnerability listing page of every configured year.

        A local loop variable is used instead of incrementing
        ``self.start_year``; mutating the attribute would leave it past
        ``self.last_year`` after the first run, so a later run on the same
        instance would queue nothing.
        """
        for year in range(self.start_year, self.last_year + 1):
            url = self.base_url + str(year) + '/vulnerabilities.html'
            self.crawl(url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every link directly under ``#pagingb`` to ``list_page``."""
        for each in response.doc('#pagingb>a').items():
            href = each.attr.href
            # Skip anchors without a target instead of crawling ``None``.
            if href:
                self.crawl(href, callback=self.list_page)

    def list_page(self, response):
        """Queue ``detail_page`` for every link matching the CVE URL pattern."""
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if re.match(self._CVE_URL_PATTERN, href, re.U):
                self.crawl(href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one result record from a CVE detail page.

        Returns a dict with the CVE id, a list of ``(product, version)``
        tuples collected from ``#vulnprodstable``, the text of the table's
        2nd and 3rd cells (stored as "Product Type" and "Vendor") and the
        page URL.
        """
        products = []
        versions = []
        cells = response.doc('#vulnprodstable td').items()
        for position, cell in enumerate(cells, 1):
            slot = position % self._CELLS_PER_ROW
            if slot == self._VERSION_CELL:
                versions.append(cell.text())
            elif slot == self._PRODUCT_CELL:
                products.append(cell.text())

        return {
            "cve_id": response.doc('#cvedetails>h1>a').text(),
            # zip truncates to the shorter list if the counts ever differ.
            "(product,version)": list(zip(products, versions)),
            "Product Type": response.doc('#vulnprodstable td ').eq(1).text(),
            "Vendor": response.doc('#vulnprodstable td ').eq(2).text(),
            "url": response.url,
        }
阅读全文
1 0
- cve_details按照时间爬取(pyspider)
- pyspider 爬取淘宝食品
- 使用pyspider爬虫爬取百度图片
- 基于pyspider的大众点评数据爬取总结
- 使用Pyspider 框架爬取全球的注册公司列表
- python网络爬虫学习(六)利用Pyspider+Phantomjs爬取淘宝模特图片
- Pyspider框架 —— Python爬虫实战之爬取 V2EX 网站帖子
- pyspider
- pyspider
- pyspider
- pyspider
- 爬取新闻时间
- pyspider 爬豆瓣电影信息
- mysql按照时间查询
- 按照时间排序
- list按照时间排列
- mysql按照时间分区
- mongodb按照时间查询
- dubbox 发布rest服务调用过程记录
- Spring事务机制详解
- 关于app目前第三方托管平台整理
- SpringCloud教程 | 第三篇: 服务消费者(Feign)
- emWin 2天速成实例教程-000
- cve_details按照时间爬取(pyspider)
- 三、高并发秒杀API之Service层设计与实现
- 【POJ
- 牛客网优惠码-直通BAT面试算法精品课购买
- Oracle之:Function :dateToNumber()
- CSS 盒子模型
- 多态原理实现分析及重载,隐藏,覆盖的辨析
- Mysql错误:Unable to connect to remote host. Catalog download has failed.
- 微信小程序开发效果:animation心跳动画