# coding = utf-8
# __author__ = Christopher
from urllib import request, parse
from bs4 import BeautifulSoup
import re
import os


class Spider:
    """Scrape Baidu search-result pages and collect the redirect links of each hit."""

    def __init__(self):
        self.header = {}                        # HTTP headers sent with every request
        self.url = 'http://www.baidu.com/s?wd='  # base search URL; encoded term is appended
        self.page = 0
        self.word = 'inurl:action'              # default search term when the user enters none

    def EncodingWd(self, wd):
        """Percent-encode a search term for safe use in a URL query string."""
        return parse.quote(wd)

    def struct_url(self, search_word, page=0):
        """Build the Baidu search URL for *search_word* on *page*.

        An empty *search_word* keeps the current ``self.word``.
        BUG FIX: the original assigned ``self.url = search_word``, which
        clobbered the base URL for every later call while having no effect
        on the current one (the local copy of ``self.url`` was taken first);
        the URL is built from ``self.word``, so that is what should be set.
        """
        if search_word:
            self.word = search_word
        url = self.url + self.EncodingWd(self.word)
        if page != 0:
            # Baidu paginates with pn=10,20,... — ten results per page.
            url = url + '&pn=' + str(page) + '0'
        return url

    def spider(self, page, search_word):
        """Fetch one result page; return the hrefs of all ``a.c-showurl`` links."""
        self.header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.1'
        baidu_search_url = self.struct_url(search_word, page)
        req = request.Request(baidu_search_url, headers=self.header)
        # Close the response socket deterministically instead of leaking it.
        with request.urlopen(req) as rsp:
            html = rsp.read()
        soup = BeautifulSoup(html, 'html.parser')
        return [a.get('href') for a in soup.find_all('a', {'class': 'c-showurl'})]

    def auth_url(self, crypt_url):
        """Follow a Baidu redirect link; return ``[final_url]`` or ``[]`` on failure.

        Best-effort by design: any fetch error is reported and skipped.
        Narrowed from a bare ``except:`` so Ctrl-C / SystemExit still propagate.
        """
        result = []
        try:
            req = request.Request(crypt_url, headers=self.header)
            with request.urlopen(req) as rsp:
                result.append(rsp.geturl())
        except Exception:
            print('[*]Can\'t get auth_url!%s' % str(crypt_url))
        return result


def main():
    """Prompt for a search term and page range, then append resolved URLs to auth_url.txt."""
    # os.path.join is equivalent to the old '\\' concatenation on Windows and portable elsewhere.
    path = os.path.join(os.getcwd(), 'auth_url.txt')
    spi_ob = Spider()
    print('[#]Version 0.3\n[#]__Author__=ChristopherLam\n[#]qq:770304694', end='\n')
    search_word = str(input('[*]请输入搜索关键词(可不填):'))
    subscript_page = int(input('[*]请输入页码下限(0为第一页):'))
    superscript_page = int(input('[*]请输入页码上限:'))
    print('[*]Spider is under running...')
    # with-block guarantees the file is closed even if a page fetch raises mid-loop
    # (the original only closed it on the success path).
    with open(path, 'a') as file:
        for k in range(subscript_page, superscript_page):
            for crypt_url in spi_ob.spider(page=k, search_word=search_word):
                for resolved in spi_ob.auth_url(crypt_url):
                    file.write(resolved + '\n')
    print('[*]Success. Quit...')


if __name__ == '__main__':
    main()