《百度URL采集器》

来源:互联网 发布:unity3d ios 交互 编辑:程序博客网 时间:2024/05/29 06:27
# coding = utf-8
# __author__ = Christopher
from urllib import request, parse
from bs4 import BeautifulSoup
import re
import os
# from time import strftime


class Spider:
    """Collect result URLs from Baidu searches (dork-style, e.g. 'inurl:action').

    Baidu wraps real result links in redirect URLs; ``spider`` scrapes the
    redirectors from a result page and ``auth_url`` follows one redirect to
    recover the true destination URL.
    """

    def __init__(self):
        self.header = {}                          # HTTP headers, filled in by spider()
        self.url = 'http://www.baidu.com/s?wd='   # base search endpoint
        self.page = 0
        self.word = 'inurl:action'                # default search term (set your own here)

    def EncodingWd(self, wd):
        """Percent-encode a search term for use in a URL query string."""
        return parse.quote(wd)

    def struct_url(self, search_word, page=0):
        """Build the Baidu search URL for *search_word* at result *page*.

        An empty *search_word* falls back to ``self.word``. *page* N maps to
        Baidu's ``&pn=N0`` offset (10 results per page).

        BUG FIX: the original assigned ``self.url = search_word``, clobbering
        the base endpoint and never actually searching for *search_word*;
        the intent was to override the search term, i.e. ``self.word``.
        """
        if search_word != '':
            self.word = search_word
        url = self.url + self.EncodingWd(self.word)
        if page != 0:
            # Baidu paginates with &pn=<page * 10>; appending '0' multiplies by ten.
            url = url + '&pn=' + str(page) + '0'
        return url

    def spider(self, page, search_word):
        """Fetch one Baidu result page; return the redirect URLs found on it."""
        self.header['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.1'
        baidu_search_url = self.struct_url(search_word, page)
        req = request.Request(baidu_search_url, headers=self.header)
        rsp = request.urlopen(req)
        soup = BeautifulSoup(rsp.read(), 'html.parser')
        # Baidu marks the visible result links with class "c-showurl";
        # their hrefs are Baidu redirect URLs, resolved later by auth_url().
        return [a.get('href') for a in soup.find_all('a', {'class': 'c-showurl'})]

    def auth_url(self, crypt_url):
        """Resolve a Baidu redirect URL to its final destination.

        Returns a single-element list holding the resolved URL, or an empty
        list when the request fails (best-effort, matching the original
        contract; the bare ``except:`` is narrowed to ``Exception``).
        """
        result = []
        try:
            req = request.Request(crypt_url, headers=self.header)
            url = request.urlopen(req)
            result.append(url.geturl())  # final URL after following redirects
        except Exception:
            print('[*]Can\'t get auth_url!%s' % str(crypt_url))
        return result


def main():
    """Interactive entry point: crawl a page range, append resolved URLs to a file."""
    # os.path.join replaces the original hard-coded '\\' separator, which
    # broke the output path on non-Windows systems.
    path = os.path.join(os.getcwd(), 'auth_url.txt')
    spi_ob = Spider()
    print('[#]Version 0.3\n[#]__Author__=ChristopherLam\n[#]qq:770304694', end='\n')
    search_word = str(input('[*]请输入搜索关键词(可不填):'))
    subscript_page = int(input('[*]请输入页码下限(0为第一页):'))
    superscript_page = int(input('[*]请输入页码上限:'))
    print('[*]Spider is under running...')
    # Context manager guarantees the file is closed even if a request raises
    # mid-crawl (the original used open()/close() and leaked on error).
    with open(path, 'a') as file:
        for k in range(subscript_page, superscript_page):
            auth_url_lis = spi_ob.spider(page=k, search_word=search_word)
            while auth_url_lis:
                url_result = spi_ob.auth_url(auth_url_lis.pop())
                while url_result:
                    file.write(url_result.pop() + '\n')
    print('[*]Success. Quit...')


if __name__ == '__main__':
    main()


                                             
1 0
原创粉丝点击