Crawling Web Pages with Beautiful Soup



Python's Beautiful Soup is a powerful HTML parsing library and a first-rate tool for web crawling.
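Before the full script, here is a minimal sketch of the Beautiful Soup calls it relies on. The HTML fragment below is invented for illustration; it mirrors the structure the script expects on each CVE page (a summary div plus a table with id "cvss"):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# Made-up HTML fragment, just to demonstrate the parsing API.
html = ('<html><body>'
        '<div class="summary"><strong>CVE-2015-0001</strong></div>'
        '<table id="cvss"><tr><td>7.5</td><td>HIGH</td></tr></table>'
        '</body></html>')

soup = BeautifulSoup(html, "lxml")           # parse with the lxml backend
div = soup.find("div", {"class": "summary"})  # locate by tag name + attributes
print(div.strong.string)                      # -> CVE-2015-0001
for td in soup.find(id="cvss").find_all("td"):
    print(td.string)                          # -> 7.5, then HIGH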

The code below takes the CVE IDs listed in a cvelist.csv file and crawls the details page for each CVE. Kept here for the record.


# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack: let implicit str/unicode conversions use UTF-8

import json
import logging

import gevent
from gevent import monkey
monkey.patch_all()  # make urllib2's blocking sockets cooperative under gevent

from urllib2 import Request, urlopen, HTTPError
from bs4 import BeautifulSoup

URL = "http://cve.scap.org.cn/%s.html"


def fetchCVE(sid):
    """Download the detail page for one CVE ID and return it as unicode."""
    sid = "CVE-" + str(sid).strip()
    request_url = URL % sid
    request_settings = {
        'content-type': 'text/plain',
        'Accept-Encoding': 'deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/44.0.2403.157 Safari/537.36'
    }
    req = Request(request_url, headers=request_settings)
    content = ""
    try:
        response = urlopen(req)
        content = response.read().decode('utf8')
    except HTTPError:
        logging.warning("HTTP error fetching %s", sid)
    except Exception:
        logging.warning("failed to fetch %s", sid)
    return content


def fetchCVEByList(sidList, sidContentPair):
    """Fetch every CVE in sidList, caching each page under ./file/ and in sidContentPair."""
    length = len(sidList)
    count = 0
    for sid in sidList:
        count += 1
        logging.info("Process %d of %d", count, length)
        content = fetchCVE(sid)
        sidContentPair[sid.strip()] = content
        fp = open("./file/" + sid.strip(), "w")
        fp.write(content.encode("utf-8"))
        fp.close()


def parseTD(table):
    """Return all <td> cells of an HTML table fragment."""
    soup = BeautifulSoup(table, "lxml")
    return soup.find_all("td")


def getScoreAndSeverity(table):
    """Pull the severity and CVSS score cells out of the #cvss table (column order fixed by the page layout)."""
    tds = parseTD(table)
    if len(tds) > 2:
        return (tds[2].string, tds[1].string)
    return ("", "")


def getPlatform(table):
    """Concatenate the affected-platform (CPE) entries, one per line."""
    tds = parseTD(table)
    content = ""
    for td in tds:
        if td.string is not None:
            content = content + "\n" + td.string
    return content


def getSummary(summary):
    """The CVE name sits in the first <strong> tag of the summary div."""
    soup = BeautifulSoup(summary, "lxml")
    strongs = soup.find_all("strong")
    return strongs[0].string


def writeCVEList(sidContentPair):
    """Parse every cached page and dump {sid: [name, score, severity, platform]} to result.json."""
    length = len(sidContentPair)
    counter = 0
    logging.info("begin")
    sidInfoDic = {}
    for sid in sidContentPair.keys():
        counter += 1
        content = sidContentPair.get(sid)
        if content == "":
            logging.error("sid:" + sid + " content is none")
            continue
        try:
            soup = BeautifulSoup(content, "lxml")
            summary = soup.find_all("div", {'class': 'summary'})
            cvsstable = soup.find_all(id="cvss")
            cpetable = soup.find_all(id="cpe")
            (severity, score) = getScoreAndSeverity(cvsstable[0].encode("utf-8"))
            if severity == "" or score == "":
                logging.error("sid %s no score", sid)
            name = ""
            for node in summary[0].contents:  # renamed from `content` to avoid shadowing
                if node.encode("utf-8").find("strong") != -1:
                    name = getSummary(node.encode("utf-8"))
            platform = getPlatform(cpetable[0].encode("utf-8"))
            sidInfoDic[sid] = [name, score.strip(), severity, platform]
            logging.info("process %d of total %d rule: SUCCEED", counter, length)
        except Exception as e:
            logging.exception(e)
            logging.info("process %d of total %d rule: FAIL, sid=%s", counter, length, sid)
    fp = open("result.json", "w")
    json.dump(sidInfoDic, fp, ensure_ascii=False, indent=4)
    fp.close()


def dumpResult():
    """Crawl all CVEs in cvelist.csv, 500 per greenlet, then write result.json."""
    sidContentPair = {}
    fp = open("cvelist.csv", 'r')
    lines = fp.readlines()
    fp.close()
    length = len(lines)
    taskPerThread = 500
    threadNumber = length / taskPerThread + 1
    threadList = []
    for i in xrange(threadNumber):
        taskBegin = i * taskPerThread
        taskEnd = min((i + 1) * taskPerThread, length)
        t = gevent.spawn(fetchCVEByList, lines[taskBegin:taskEnd], sidContentPair)
        threadList.append(t)
    gevent.joinall(threadList)
    writeCVEList(sidContentPair)


def dumpResultByFile():
    """Rebuild result.json from the pages already cached under ./file/, without re-crawling."""
    sidContentPair = {}
    # CVE list file: one CVE ID per line
    fp = open("cvelist.csv", 'r')
    lines = fp.readlines()
    fp.close()
    for line in lines:
        fp = open("./file/" + line.strip(), "r")
        content = fp.read()
        fp.close()
        sidContentPair[line.strip()] = content
    writeCVEList(sidContentPair)


if __name__ == '__main__':
    # dumpResult()
    dumpResultByFile()
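A usage note, inferred from the code above: since fetchCVE prepends the "CVE-" prefix itself, cvelist.csv is expected to contain one bare CVE identifier per line, for example:

2015-0001
2015-0002

The ./file/ directory must exist before the first run, because fetchCVEByList caches each downloaded page there. Run dumpResult() once to crawl everything (batched 500 IDs per gevent greenlet so requests overlap); afterwards dumpResultByFile() can re-parse the cached pages into result.json without touching the network again, which is why it is the call left enabled in __main__.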

