FreeBuf Crawler


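The script below crawls FreeBuf's topic channels (vulnerabilities, security tools, web security, and so on). For each topic it walks the paginated listing at <channel URL>/page/N, cuts the article list out of every page with a regular expression, and appends the markup to a local <topic>.html file; three 404 responses in a row are taken to mean the channel has run out of pages. It targets Python 2 and needs the requests library.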

#!C:\Python27\python.exe
# coding: utf-8
# Crawl each FreeBuf topic channel page by page and dump the article
# listing markup into a local <topic>.html file. Python 2 only.
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # force UTF-8 so Chinese filenames/content write cleanly

import re
import os
import urllib
import requests
from multiprocessing import Pool

# Topic name -> channel URL; the name doubles as the output filename.
subject_dict = {u'漏洞': 'http://www.freebuf.com/vuls',
                u'安全工具': 'http://www.freebuf.com/sectool',
                u'WEB安全': 'http://www.freebuf.com/articles/web',
                u'系统安全': 'http://www.freebuf.com/articles/system',
                u'网络安全': 'http://www.freebuf.com/articles/network',
                u'无线安全': 'http://www.freebuf.com/articles/wireless',
                u'终端安全': 'http://www.freebuf.com/articles/terminal',
                u'数据安全': 'http://www.freebuf.com/articles/database',
                u'安全管理': 'http://www.freebuf.com/articles/security-management',
                u'企业安全': 'http://www.freebuf.com/articles/es',
                u'极客': 'http://www.freebuf.com/geek'}

def spider(filename, url):
    print "Crawling subject: %s" % filename
    # Start from a clean output file for this topic.
    if os.path.isfile(filename + ".html"):
        os.remove(filename + ".html")
    with open(filename + ".html", 'a') as f:
        page = 0
        error_counter = 0
        while True:
            page += 1
            try:
                html = requests.get(url + '/page/' + str(page))
                code = html.status_code
                if code == 404:
                    # A 404 usually means we ran past the last page;
                    # retry up to three times before giving up.
                    error_counter += 1
                    if error_counter == 1:
                        print "Subject %s may only have %s pages." % (filename, str(page - 1))
                    if error_counter <= 3:
                        print "Retrying %s: 404 Not Found!" % str(error_counter)
                        page -= 1  # undo the increment so we retry the same page
                        continue
                    else:
                        print "Subject %s finished!" % filename
                        print "#################################"
                        break
                else:
                    print u"Parsing page: " + str(page)
                    # Cut the article listing out of the page; page 1 has a
                    # slightly different layout than the later pages.
                    if page == 1:
                        site = re.findall('([\s\S]*)      </div>\n      <div class="news-more" id="pagination">', html.text, re.S)
                    else:
                        site = re.findall('<div id="timeline" class="news-detial">([\s\S]*?)      </div>\n      <div class="news-more" id="pagination">', html.text, re.S)
                    for each in site:
                        f.write(urllib.unquote(each.encode('utf-8')))
            except Exception as e:
                # Network errors are only logged; the loop moves on.
                print e

def main():
    # Crawl the topics one by one; the commented-out Pool variant below
    # would run up to four topics in parallel instead.
    for key, value in subject_dict.items():
        spider(key, value)
    # pool = Pool(processes=4)
    # for i in range(0, subject_dict.__len__()):
    #     arg_list = subject_dict.items()[i]
    #     pool.apply_async(spider, (arg_list[0], arg_list[1],)).get(timeout=None)
    # pool.close()
    # pool.join()

if __name__ == '__main__':
    main()
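Python 2 is long past end of life, so here is a minimal Python 3 sketch of the same crawl-until-404 loop for reference. It is an illustration rather than a drop-in replacement: it assumes requests and beautifulsoup4 are installed, and that FreeBuf still wraps each listing in <div id="timeline"> the way the original regular expression expects; spider3 and max_retries are names introduced here, not part of the original script.

# Minimal Python 3 sketch of the pagination loop (assumptions noted above).
import requests
from bs4 import BeautifulSoup

def spider3(filename, url, max_retries=3):
    """Fetch url/page/1, url/page/2, ... and save each listing block,
    stopping after max_retries consecutive 404 responses."""
    with open(filename + ".html", "w", encoding="utf-8") as f:
        page, errors = 1, 0
        while True:
            resp = requests.get("%s/page/%d" % (url, page), timeout=10)
            if resp.status_code == 404:
                errors += 1
                if errors > max_retries:
                    break    # the channel really is out of pages
                continue     # retry the same page
            errors = 0
            soup = BeautifulSoup(resp.text, "html.parser")
            timeline = soup.find("div", id="timeline")  # article listing container
            if timeline is not None:
                f.write(str(timeline))
            page += 1

if __name__ == "__main__":
    spider3("漏洞", "http://www.freebuf.com/vuls")

Swapping BeautifulSoup in for the layout-sensitive regex means the scrape survives whitespace changes in the page source, at the cost of an extra dependency.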

