FreeBuf爬虫
来源:互联网 发布:java教程软件 编辑:程序博客网 时间:2024/05/04 23:42
FreeBuf爬虫
#C:\Python27\python.exe
#coding:utf-8
"""Crawl the FreeBuf article listings and save each subject as a local HTML file.

For every entry in ``subject_dict`` the crawler requests
``<subject url>/page/<n>`` for n = 1, 2, 3, ... and appends the article
timeline found on each page to ``<subject name>.html`` in the current
working directory.

The script targets Python 2.7 but only uses constructs that also compile on
Python 3 (every ``print`` call takes a single parenthesised argument).
"""
import sys

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec so the
    # Chinese subject names can be formatted and printed.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

import re
import os
import urllib
import requests
from multiprocessing import Pool

try:
    # Python 2: urllib.unquote maps a byte string to a byte string.
    _unquote_to_bytes = urllib.unquote
except AttributeError:
    # Python 3 equivalent working at the byte level.
    from urllib.parse import unquote_to_bytes as _unquote_to_bytes

# Number of 404 responses tolerated (cumulatively) before a subject is
# considered finished.
MAX_NOT_FOUND = 3
# Number of back-to-back failed pages (network error, HTTP error status,
# parse/write error) after which a subject is abandoned.
MAX_CONSECUTIVE_FAILURES = 3
# Seconds to wait for the server before a request counts as failed.
REQUEST_TIMEOUT = 30

# NOTE(review): the exact literal whitespace around "</div>" could not be
# recovered from the collapsed source, so both patterns accept any run of
# whitespace there -- confirm against the live markup.
#
# Page 1 has no opening anchor: it captures everything from the start of the
# document up to the pagination block (presumably to keep the page head and
# styles in the output file -- confirm).
_FIRST_PAGE_RE = re.compile(
    r'([\s\S]*)\s+</div>\s+<div class="news-more" id="pagination">',
    re.S)
# Later pages contribute only the inside of the timeline container.
_NEXT_PAGE_RE = re.compile(
    r'<div id="timeline" class="news-detial">([\s\S]*?)'
    r'\s+</div>\s+<div class="news-more" id="pagination">',
    re.S)

subject_dict = {
    u'漏洞': 'http://www.freebuf.com/vuls',
    u'安全工具': 'http://www.freebuf.com/sectool',
    u'WEB安全': 'http://www.freebuf.com/articles/web',
    u'系统安全': 'http://www.freebuf.com/articles/system',
    u'网络安全': 'http://www.freebuf.com/articles/network',
    u'无线安全': 'http://www.freebuf.com/articles/wireless',
    u'终端安全': 'http://www.freebuf.com/articles/terminal',
    u'数据安全': 'http://www.freebuf.com/articles/database',
    u'安全管理': 'http://www.freebuf.com/articles/security-management',
    u'企业安全': 'http://www.freebuf.com/articles/es',
    u'极客': 'http://www.freebuf.com/geek',
}


def spider(filename, url):
    """Download every listing page of one subject into ``<filename>.html``.

    Args:
        filename: Subject name; ".html" is appended to form the output path.
            An existing file with that name is overwritten.
        url: Base URL of the subject; pages are fetched from
            ``url + '/page/' + str(n)``.

    Pages are requested in increasing order.  A 404 does not stop the crawl
    immediately: the next page numbers are still probed, and the subject is
    finished once more than ``MAX_NOT_FOUND`` 404 responses have been seen.
    Any other failure (request exception, non-404 HTTP error status, error
    while parsing or writing) is printed and the page is skipped; after
    ``MAX_CONSECUTIVE_FAILURES`` such failures in a row the subject is
    aborted so that an unreachable site cannot keep the loop running forever.
    """
    print("Crawling subject: %s" % filename)
    # Binary mode: the unquoted page fragments are written as raw bytes.
    # Opening with 'wb' truncates, replacing the old remove-then-append.
    with open(filename + ".html", 'wb') as f:
        page = 0
        error_couter = 0   # cumulative number of 404 responses
        failures = 0       # consecutive non-404 failures
        while True:
            page += 1
            try:
                html = requests.get(url + '/page/' + str(page),
                                    timeout=REQUEST_TIMEOUT)
                if html.status_code == 404:
                    error_couter += 1
                    if error_couter == 1:
                        print("Subject %s may only have %s pages."
                              % (filename, str(page - 1)))
                    if error_couter <= MAX_NOT_FOUND:
                        # The "retry" moves on to the next page number.
                        print("Retrying %s: 404 not Found!"
                              % str(error_couter))
                        continue
                    print("Subject %s finished!" % filename)
                    print("#################################")
                    break

                # Treat any other HTTP error status as a failed page instead
                # of parsing an error document.
                html.raise_for_status()

                print(u"Parsing page: " + str(page))
                pattern = _FIRST_PAGE_RE if page == 1 else _NEXT_PAGE_RE
                for each in pattern.findall(html.text):
                    f.write(_unquote_to_bytes(each.encode('utf-8')))
                failures = 0
            except Exception as e:
                # Best effort: report the problem and skip this page, but
                # give up when failures keep repeating.
                print(e)
                failures += 1
                if failures >= MAX_CONSECUTIVE_FAILURES:
                    print("Subject %s aborted after %s consecutive failures."
                          % (filename, str(failures)))
                    break


def main():
    """Crawl every subject listed in ``subject_dict``, one after another."""
    for key, value in subject_dict.items():
        spider(key, value)
    # Disabled multiprocessing variant kept from the original script:
    # pool = Pool(processes=4)
    # for i in range(0, subject_dict.__len__()):
    #     arg_list = subject_dict.items()[i]
    #     pool.apply_async(spider, (arg_list[0], arg_list[1],)).get(timeout=None)
    # pool.close()
    # pool.join()


if __name__ == '__main__':
    main()
阅读全文
0 0
- FreeBuf爬虫
- freebuf
- http://www.freebuf.com/
- 【原创】FreeBuf 晒书会
- pyquery示例-获取FreeBuf新闻标题
- Socks代理反弹突破内网(freebuf)
- RSA推荐的知名博客(转载freebuf)
- SQL 注入学习(来自freebuf的总结)
- 逆向工程入门学习(FreeBuf)
- freebuf上一篇关于waf绕过的介绍
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- 爬虫
- PAT乙级1007. 素数对猜想 (20)
- 用数组模拟栈的peek,pop,push
- 弹窗之二:PopupWindow的使用
- 在MyEclipse中使用SVN进行多人协作开发
- ADS1298学习笔记2——导联脱落检测正端和负端错位
- FreeBuf爬虫
- 职场赛道转换之数据分析师
- GIT修改commit信息
- CNN网络层详解
- Servlet生命周期,工作原理
- HDU
- 我的编码习惯
- 如何在CSDN上编辑公式
- 解决电脑连接正常,但浏览器无法打开网页的问题