Beautiful Soup 爬网页
来源:互联网 发布:支持php免费空间 编辑:程序博客网 时间:2024/04/28 20:46
Python Beautiful Soup 是一款强大的 HTML 解析工具,堪称网络爬虫利器。
下面代码读取 cvelist.csv 文件中的 CVE ID, 分别爬取对应 CVE 的信息。供记录。
# -*- coding: utf-8 -*-
"""Scrape CVE detail pages from cve.scap.org.cn.

Reads CVE ids (one per line, without the "CVE-" prefix) from cvelist.csv,
fetches each detail page (saving a raw copy under ./file/), parses out the
vulnerability name, CVSS score/severity and affected platforms with
Beautiful Soup, and dumps the result to result.json.
"""
import json
import logging
import sys

import gevent
from bs4 import BeautifulSoup
from urllib2 import Request, urlopen, HTTPError

# HACK: Python 2 default-encoding override carried over from the original
# script; kept so implicit str/unicode coercions behave the same.
reload(sys)
sys.setdefaultencoding('utf-8')

# Detail-page URL template; %s is the full id, e.g. "CVE-2015-1234".
URL = "http://cve.scap.org.cn/%s.html"


def fetchCVE(sid):
    """Fetch the detail page for one CVE id (given without the "CVE-" prefix).

    Returns the decoded page body, or "" on any error. Fetching is
    best-effort: failures are logged instead of raised so a batch run
    continues past dead ids.
    """
    sid = "CVE-" + str(sid).strip()
    request_url = URL % (sid,)
    request_settings = {
        'content-type': 'text/plain',
        'Accept-Encoding': 'deflate',
        'User-Agent': 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    }
    req = Request(request_url, headers=request_settings)
    content = ""
    try:
        response = urlopen(req)
        content = response.read().decode('utf8')
    except HTTPError as e:
        # Was a silent pass; log so missing/4xx pages are visible.
        logging.warning("HTTP error fetching %s: %s", sid, e)
    except Exception:
        logging.exception("failed to fetch %s", sid)
    return content


def fetchCVEByList(sidList, sidContentPair):
    """Fetch every id in sidList; store content in sidContentPair and ./file/<sid>."""
    length = len(sidList)
    for count, sid in enumerate(sidList, 1):
        logging.info("Process %d of %d", count, length)
        content = fetchCVE(sid)
        sid = sid.strip()
        sidContentPair[sid] = content
        fp = open("./file/" + sid, "w")
        try:
            fp.write(content.encode("utf-8"))
        finally:
            fp.close()


def parseTD(table):
    """Return all <td> elements of the given HTML table fragment."""
    soup = BeautifulSoup(table, "lxml")
    return soup.find_all("td")


def getScoreAndSeverity(table):
    """Extract (severity, score) from the CVSS table; ("", "") when absent.

    NOTE: despite the name, callers unpack the tuple as (severity, score) —
    td[2] is the severity cell, td[1] the score cell.
    """
    tds = parseTD(table)
    if len(tds) > 2:
        return (tds[2].string, tds[1].string)
    return ("", "")


def getPlatform(table):
    """Concatenate the text of every <td> in the CPE table, newline-prefixed."""
    parts = ""
    for td in parseTD(table):
        if td.string is not None:
            parts = parts + "\n" + td.string
    return parts


def getSummary(summary):
    """Return the text of the first <strong> element in the summary fragment."""
    soup = BeautifulSoup(summary, "lxml")
    return soup.find_all("strong")[0].string


def writeCVEList(sidContentPair):
    """Parse each fetched page and write {sid: [name, score, severity, platform]} to result.json."""
    length = len(sidContentPair)  # was hard-coded to 4152
    counter = 0
    logging.info("begin")
    sidInfoDic = {}
    for sid in sidContentPair.keys():
        counter = counter + 1
        content = sidContentPair.get(sid)
        if content == "":
            logging.error("sid:" + sid + " content is none")
            continue
        try:
            soup = BeautifulSoup(content, "lxml")
            summary = soup.find_all("div", {'class': 'summary'})
            cvsstable = soup.find_all(id="cvss")
            cpetable = soup.find_all(id="cpe")
            (severity, score) = getScoreAndSeverity(cvsstable[0].encode("utf-8"))
            if severity == "" or score == "":
                logging.error("sid %s no score", sid)
            name = ""
            # Find the child fragment that carries the <strong> vulnerability
            # name. (Loop variable renamed: it previously shadowed `content`.)
            for child in summary[0].contents:
                if child.encode("utf-8").find("strong") != -1:
                    name = getSummary(child.encode("utf-8"))
            platform = getPlatform(cpetable[0].encode("utf-8"))
            sidInfoDic[sid] = [name, score.strip(), severity, platform]
            logging.info("process %d of total %d rule: SUCCEED\n", counter, length)
        except Exception as e:
            logging.exception(e)
            logging.info("process %d of total %d rule: FAIL,sid=" + sid + "\n", counter, length)
    fp = open("result.json", "w")
    try:
        json.dump(sidInfoDic, fp, ensure_ascii=False, indent=4)
    finally:
        fp.close()


def dumpResult():
    """Fetch all ids from cvelist.csv concurrently (one greenlet per 500 ids), then parse."""
    sidContentPair = {}
    fp = open("cvelist.csv", 'r')
    try:
        lines = fp.readlines()
    finally:
        fp.close()
    length = len(lines)
    taskPerThread = 500
    threadNumber = length / taskPerThread + 1
    threadList = []
    for i in xrange(threadNumber + 1):
        taskBegin = i * taskPerThread
        taskEnd = min((i + 1) * taskPerThread, length)
        t = gevent.spawn(fetchCVEByList, lines[taskBegin:taskEnd], sidContentPair)
        threadList.append(t)
    gevent.joinall(threadList)
    writeCVEList(sidContentPair)


def dumpResultByFile():
    """Re-parse pages previously saved under ./file/ (offline mode, no network)."""
    sidContentPair = {}
    # cvelist.csv: one CVE id per line
    fp = open("cvelist.csv", 'r')
    try:
        lines = fp.readlines()
    finally:
        fp.close()
    for line in lines:
        sid = line.strip()
        fp = open("./file/" + sid, "r")
        try:
            sidContentPair[sid] = fp.read()
        finally:
            fp.close()
    writeCVEList(sidContentPair)


if __name__ == '__main__':
    # dumpResult()  # online mode: fetch first, then parse
    dumpResultByFile()
0 0
- Beautiful SOAP 爬网页
- beautiful soap simple examples
- Beautiful Soup 提取网页
- python Beautiful Soup分析网页
- Beautiful Soup 4解析网页
- 使用 Beautiful Soup 解析网页内容
- Python网页抓取之Beautiful Soup
- 网页正文提取工具Beautiful Soup
- python Beautiful soup网页解析-星座网
- Python网页抓取工具Beautiful Soup面面观!
- Beautiful
- 利用urllib和beautiful soup下载网页图片
- 网页解析器和beautiful soup 实例测试
- SOAP
- SOAP
- SOAP
- SOAP
- SOAP
- hdu 1968 Just a Hook 线段树区间更新
- Struts 2快速上手
- ORA-00054: resource busy and acquire with NOWAIT specified
- Pascal's Triangle
- Scalaz(46)- scalaz-stream 基础介绍
- Beautiful SOAP 爬网页
- Linux的多任务多进程
- STM32F302R8 + CubeMx + USB +VCP(虚拟串口)一
- Python代码模块热更新机制实现(reload)
- getpid()函数
- 51NOD 1098 最小方差(基础数学)
- poi获取excel2003,excel2007,ppt2007图表类型
- 问题记录—1(递归函数转换)
- 使用php调用shell脚本同步文件