爬取环境信息实例
来源:互联网 发布:mac 结构图软件 编辑:程序博客网 时间:2024/06/04 23:32
【python3】
# -*- coding: utf-8 -*-
# author: lvfengwen
# date: 2017/10/12
# Crawl environment-configuration info (per-service ports) from the internal
# Docker management console.  Python 3 version.
import http.cookiejar
import re
import urllib.request

# NOTE(review): the login URL below embeds plaintext credentials; they should
# be moved to a config file or environment variables.

# (result-key prefix, regex fragment identifying the service's link).
# Matches are recorded in result_dic as "<prefix>_port" and "<prefix>_ip".
# Dots are escaped so they match literally (the original patterns left them
# unescaped, where "." matched any character).
_SERVICES = [
    ("micro", r"micro_site\.tomcat"),            # mobile-loan micro site
    ("business_xs", r"business_xs\.tomcat"),     # credit-audit interface
    ("financial_web", r"financial-web\.netty"),  # finance interface
    ("yhfq_micro", r"yh-openapi\.netty"),        # installment-data interface
    ("kdc_micro", r"kdc-api\.netty"),            # card-compensation interface
]


class EnvironmentSpider:
    """Log in to the Docker console, list the environments configured for one
    host, and record which port each known service listens on."""

    def __init__(self, db_ip):
        """:param db_ip: host-address fragment passed to the search servlet."""
        self.base_url = "http://docker.ql.corp/LoginServlet?username=lvfengwen%40mobanker.com&password=123456&action=login"
        self.db_ip = db_ip
        self.result_dic = {}  # accumulated "<service>_port"/"<service>_ip" entries
        self.opener = None    # cookie-aware opener, built in make_fake_request()

    def make_fake_request(self):
        """Log in (storing the session cookie) and fetch the environment-search
        page for self.db_ip.  Returns the page HTML as str."""
        cookie = http.cookiejar.CookieJar()
        handler = urllib.request.HTTPCookieProcessor(cookie)
        self.opener = urllib.request.build_opener(handler)
        # Hitting the login URL first makes the server set the session cookie.
        self.opener.open(self.base_url)
        search_url = ('http://docker.ql.corp/DockerMachineServlet?ipaddress='
                      + self.db_ip + '&action=environmentSearch')
        result = self.opener.open(search_url)
        return result.read().decode('UTF-8')

    def get_detail_page(self, url_para):
        """Fetch one environment's detail page; ``url_para`` is the query-string
        tail captured from the list page."""
        base_url = "http://docker.ql.corp/DockerMachineServlet?action=environmentdetail&"
        detail_page = self.opener.open(base_url + url_para)
        return detail_page.read().decode('UTF-8')

    def get_server_port(self, html_str, ip):
        """Scan a detail page for every known service; for each one found,
        record its port ("4" + captured digits) and the host ip.

        Replaces five copy-pasted regex blocks with a table-driven loop."""
        for name, fragment in _SERVICES:
            pattern = re.compile(r"<a href=.*?/" + fragment + r".*?:4(\d*?)-->", re.S)
            found = pattern.search(html_str)
            if found:
                self.result_dic[name + '_port'] = "4" + found.group(1).strip()
                self.result_dic[name + '_ip'] = ip

    def get_environment_list(self, html_str):
        """Parse the environment-list page into (url_para, ip) pairs and crawl
        each detail page.  Returns None when the page yields no entries."""
        ip_list = re.findall(
            r"<a href='./DockerMachineServlet.*?/>(.*?)</td>", html_str, re.S)
        url_para_list = re.findall(
            r"<a href='./DockerMachineServlet\?action=environmentdetail&(.*?)/>",
            html_str, re.S)
        if not url_para_list:
            return None
        # The first row is skipped, as in the original (presumably a header or
        # the host's own entry -- TODO confirm against the live page).
        for url_para, ip in list(zip(url_para_list, ip_list))[1:]:
            if not url_para.strip():
                continue
            detail_page = self.get_detail_page(url_para)
            self.get_server_port(detail_page, ip)

    def get_environment_result(self):
        """Full crawl: log in, fetch the list page, then every detail page.
        Results accumulate in self.result_dic."""
        # BUG FIX: the original referenced the module-level global ``es`` here
        # instead of ``self``, so the method only worked for that one instance.
        html_str = self.make_fake_request()
        self.get_environment_list(html_str)


if __name__ == "__main__":
    # Guarded so importing this module no longer fires network requests.
    es = EnvironmentSpider("33.71")
    es.get_environment_result()
【python 2】
# -*- coding: utf-8 -*-
# author: lvfengwen
# date: 2017/10/12
# Crawl environment-configuration info from the internal Docker management
# console -- Python 2 version.
import urllib
import urllib2
import cookielib
import re
import pprint


class EnvironmentSpider:
    """Logs in to the Docker console with a cookie-aware opener, then walks
    the environment list and records each known service's port."""

    def __init__(self):
        self.base_url = "http://docker.ql.corp/LoginServlet?username=lvfengwen%40mobanker.com&password=123456&action=login"
        self.filename = 'cookie.txt'  # cookie-file name (kept, currently unused)
        self.result_dic = {}          # "<service>_port"/"<service>_ip" entries
        self.opener = None            # built in make_fake_request()

    def make_fake_request(self):
        """Log in (capturing the session cookie) and return the HTML of the
        environment-search page for the hard-coded host 33.71."""
        jar = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        # Opening the login URL makes the server set the session cookie.
        self.opener.open(self.base_url)
        search_url = 'http://docker.ql.corp/DockerMachineServlet?ipaddress=33.71&action=environmentSearch'
        response = self.opener.open(search_url)
        return response.read()

    def get_detail_page(self, url_para):
        """Fetch one environment's detail page."""
        target_url = ("http://docker.ql.corp/DockerMachineServlet?action=environmentdetail&"
                      + url_para)
        return self.opener.open(target_url).read()

    def _record_service(self, html_str, ip, fragment, key, echo_match):
        """Search the detail page for one service and, if present, store its
        port ("4" + captured digits) and ip under <key>_port / <key>_ip.
        ``echo_match`` mirrors the original's debug print of the match."""
        regex = re.compile("<a href=.*?/" + fragment + ".*?:4(\\d*?)-->", re.S)
        print("----")
        found = re.search(regex, html_str)
        if echo_match:
            print(found)
        if found:
            self.result_dic[key + '_port'] = "4" + str(found.group(1).strip())
            self.result_dic[key + '_ip'] = ip

    def get_server_port(self, html_str, ip):
        """Record the ports of every known service found on a detail page."""
        self._record_service(html_str, ip, "micro_site.tomcat", "micro", False)
        self._record_service(html_str, ip, "business_xs.tomcat", "business_xs", True)
        self._record_service(html_str, ip, "financial-web.netty", "financial_web", True)
        self._record_service(html_str, ip, "yh-openapi.netty", "yhfq_micro", True)
        self._record_service(html_str, ip, "kdc-api.netty", "kdc_micro", True)

    def get_environment_list(self, html_str):
        """Collect (url_para, ip) pairs from the list page and crawl each
        detail page; returns None when no entries were found."""
        ip_list = re.findall(
            re.compile("<a href='./DockerMachineServlet.*?/>(.*?)</td>", re.S),
            html_str)
        url_para_list = re.findall(
            re.compile("<a href='./DockerMachineServlet\\?action=environmentdetail&(.*?)/>", re.S),
            html_str)
        print(url_para_list)
        print(1)
        print(2)
        if url_para_list:
            print(3)
            for index, (url_para, ip) in enumerate(zip(url_para_list, ip_list)):
                print(4)
                if index == 0:
                    # skip the first row, as the original did via skip_first
                    print(5)
                    continue
                if url_para.strip() == "":
                    print(6)
                    continue
                print("-----------------start---------------------")
                print(url_para)
                # follow the link and crawl the detail page
                print("深入链接去爬取")
                detail_page = self.get_detail_page(url_para)
                self.get_server_port(detail_page, ip)
                print("-----------------end---------------------")
                print(self.result_dic)
        else:
            return None

    def get_sjd_config(self, some_ip):
        # placeholder -- not implemented in the original either
        pass

    def get_yhfq_config(self, some_ip):
        pass

    def get_kdc_config(self, some_ip):
        pass


es = EnvironmentSpider()
html_str = es.make_fake_request()
print(html_str)
es.get_environment_list(html_str)
阅读全文
0 0
- 爬取环境信息实例
- 爬取招聘信息
- 爬取二手房信息
- 大学排名信息爬取
- python的bs的简单实例爬取58同城手机信息
- python爬取网页信息
- HttpClient 登录爬取信息
- 安居客信息爬取
- urllib2 爬取网页信息
- python3爬取淘宝信息
- python3爬取淘宝信息!
- 爬取二手房信息v2
- python 爬取淘宝信息
- 携程网旅游信息爬取
- Python爬取国家信息
- 爬取12306站点信息
- Python-爬取网页信息
- Python爬取天气信息
- Python optparser库详解
- <设计模式可复用面向对象软件的基础> [1.4]、原型(C#)
- js中html,text,val 比较
- 51nod 1289 大鱼吃小鱼 【stack的使用】
- 三条命令解决Git 如何删除远程服务器文件同时保留本地文件
- 爬取环境信息实例
- 上传大文件失败问题记录
- SQLSTATE[HY000] [1130] Host '127.0.0.1' is not allowed to connect to this MySQL server怎么解决?
- 用jQuery实现简单的表单验证
- UVa11292
- 解决使用libhdfs.so连接hdfs出错问题java.lang.ClassNotFoundException: org.apache.hadoop.fs.F
- 在Java中按字节获得字符串长度的两种方法
- redis设置开机启动
- 2017.10.13