玩玩python之爬取补天厂商列表

来源:互联网 发布:python 字符串变json 编辑:程序博客网 时间:2024/06/06 08:46

今天写了一个爬取补天厂商列表的爬虫,方便后续进行渗透测试。

直接贴出代码:

"""Scrape the Butian (butian.360.cn) vendor list into a local text file.

Third-party dependencies (install with pip): ``requests`` and ``lxml``.
"""
import os

import requests
from lxml import etree


def Save_File(messageList):
    """Append vendor records to ``补天厂商列表/厂商列表.txt``.

    Args:
        messageList: iterable of ``(title, url)`` pairs as produced by
            ``load_message``; each element is a non-empty list of strings and
            only the first string of each list is written.
    """
    path = '补天厂商列表'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)
    new_path = os.path.join(path, '厂商列表.txt')
    # Append mode so that every scraped page accumulates in the same file.
    with open(new_path, 'a', encoding='utf8') as f:
        f.writelines(
            '%s   %s\n' % (title[0], url[0]) for title, url in messageList
        )


def load_message(page_message):
    """Extract ``(title, url)`` pairs from the table rows of one page.

    Args:
        page_message: HTML text of a vendor-list page.

    Returns:
        A list of ``(title, url)`` tuples. Both elements are the lists
        returned by the XPath queries; ``url`` is ``['URL丢失']`` when the row
        has no text in its second cell.
    """
    dom = etree.HTML(page_message)
    LM_messageList = []
    # Rows are read starting from tr[2]; tr[1] is presumably the table
    # header -- confirm against the live page.
    row = 2
    while True:
        # XPath: vendor name is the link text in the first cell.
        title = dom.xpath('//table/tr[%d]/td[1]/a/text()' % row)
        if not title:
            # The first row without a name marks the end of the table.
            break
        # XPath: vendor URL is the text of the second cell; use a
        # placeholder when it is missing.
        url = dom.xpath('//table/tr[%d]/td[2]/text()' % row) or ['URL丢失']
        LM_messageList.append((title, url))
        row += 1
    return LM_messageList


def Spider(file_URL, last_message):
    """Fetch one list page and save its rows unless it repeats the last page.

    Args:
        file_URL: URL of the page to fetch.
        last_message: the tuple returned by the previous ``Spider`` call;
            only its first element (the previous page's first title) is used.

    Returns:
        ``(first_title, is_repeat)`` where ``first_title`` is the title list
        of the first row on this page and ``is_repeat`` is True when it equals
        the previous page's first title, or when the page has no rows. The
        caller uses ``is_repeat`` as the stop signal.
    """
    print(file_URL)
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(file_URL, timeout=30)
    # The page is decoded as UTF-8 explicitly.
    S_page_message = response.content.decode('utf8')
    S_messageList = load_message(S_page_message)
    if not S_messageList:
        # No rows at all: treat as end of pagination instead of raising
        # IndexError on S_messageList[0].
        return last_message[0], True
    first_title = S_messageList[0][0]
    is_repeat = first_title == last_message[0]
    # Only save pages that differ from the previous one. The original compared
    # against the whole ``last_message`` tuple, which never matched, so the
    # repeated final page was written to the file twice.
    if not is_repeat:
        Save_File(S_messageList)
    return first_title, is_repeat


if __name__ == '__main__':
    new_url = 'https://butian.360.cn/company/lists'
    # Seed with a dummy "previous page" so that the first page is always saved.
    t = Spider(new_url, ('a', 'b'))
    i = 2
    # Walk /page/2, /page/3, ... until a page repeats the previous one.
    while True:
        P_new_url = new_url + '/page/' + str(i)
        t = Spider(P_new_url, t)
        if t[1]:
            break
        i += 1
    print('finish')

至此,补天厂商列表已全部抓取完毕。

0 0