Getting started with Python crawlers: notes <4>


Based on the results from the previous notes, after revising the code it now looks like this.


import re
import time
from urllib import parse
from urllib.request import urlopen

from bs4 import BeautifulSoup
from selenium import webdriver


class ImgCrawler:
    def __init__(self, searchlink=None):
        self.link = searchlink
        self.soupheader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3176.2 Safari/537.36"}  # request header for the BeautifulSoup side
        self.scrolldown = None
        self.jsdriver = None
        self.soup = None
        self.message = []

    def __iter__(self):
        return self

    def getChromeCanary(self):
        browser_options = webdriver.ChromeOptions()   # options object for webdriver.Chrome
        browser_options.add_argument("--headless")    # run without a visible window
        # location of the headless browser executable, here Chrome Canary 60
        browser_options.binary_location = r"C:\Users\luyi\AppData\Local\Google\Chrome SxS\Application\chrome.exe"
        # start Chrome through chromedriver (the bootstrap program provided by the Chromium project), using the options above
        self.jsdriver = webdriver.Chrome(r"f:\software\python\chromedriver_win32\chromedriver.exe", chrome_options=browser_options)
        # self.jsdriver = webdriver.Chrome(r"f:\software\python\chromedriver_win32\chromedriver.exe")
        self.jsdriver.implicitly_wait(30)  # wait at most 30 s for elements to appear
        try:
            self.jsdriver.get(self.link)
        except:
            self.message.append(self.link + ' connect error')

    def scrollDownUseChromeCanary(self, scrolltimes=1, sleeptime=10):
        for i in range(scrolltimes):
            self.message.append(u'starting scroll-down no. ' + str(i + 1))
            # run JavaScript to scroll the page to the bottom
            self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            self.message.append(u'scroll-down no. ' + str(i + 1) + u' finished, waiting for the page to load')
            time.sleep(sleeptime)
            # WebDriverWait(self.jsdriver, sleeptime)

    def getSoupLink(self):
        # fetch the page with urllib only, for pages that need no JavaScript
        try:
            html = urlopen(self.link)
            self.soup = BeautifulSoup(html, 'html.parser')
        except:
            self.message.append(self.link + ' connect error')

    def findKeyInSoup(self, key):
        return self.soup.findAll(key)

    # build the html structure from the webdriver's page source
    def getSoupWebdriver(self, parser=None):
        self.soup = BeautifulSoup(self.jsdriver.page_source, parser)

    def updateSoup(self, newsoup):
        self.soup = newsoup

    def getActualUrl(self, key=None):
        actualurl = []
        for a in self.soup.findAll('a', href=key):
            parsed = parse.urlparse(a['href'])
            actualurl.append(parsed)
        return actualurl


class GoogleImgCrawler(ImgCrawler):
    def getActualUrl(self):
        actualurl = []
        key = re.compile(r"/imgres\?imgurl=")  # pattern to match; every browser shows a different value here
        for a in self.soup.findAll('a', href=key):
            parsed = parse.urlparse(a['href'])
            # extract the 'imgurl' query value from parsed; the anchor looks like
            # <a jsname="aaa" href="/imgres?imgurl=https%3A%2F%2Fwww.sss.gov.ph%2Fsss%2FShowProperty%2FCommunity_Repository%2FSSS_News%2FCorporate%2520Governance%2520Seal%2Fimage&imgrefurl=https%3A%2F%2Fwww.sss.gov.ph%2F&docid=19zjQZ8ta7LBAM&tbnid=H7YTjaHNI7KPRM%3A&vet=10ahUKEwiBhbzfrNTVAhWMvLwKHVIbABQQMwgkKAAwAA..i&w=320&h=250&hl=zh-CN&safe=images&bih=927&biw=1249&as_q=sss&ved=0ahUKEwiBhbzfrNTVAhWMvLwKHVIbABQQMwgkKAAwAA&iact=mrc&uact=8">
            # print(parsed.query)
            url = parse.parse_qs(parsed.query)['imgurl']
            actualurl.append(url)
        return actualurl

    def clickNextPage(self, key):
        return self.jsdriver.find_elements_by_css_selector(key)[0].click()
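The core of GoogleImgCrawler.getActualUrl is pulling the real image address out of the /imgres?imgurl=... anchor. A minimal standalone sketch of that step, using a shortened version of the example href quoted in the comment above:

from urllib import parse

# shortened example href, taken from the comment in GoogleImgCrawler.getActualUrl
href = ("/imgres?imgurl=https%3A%2F%2Fwww.sss.gov.ph%2Fsss%2FShowProperty"
        "%2FCommunity_Repository%2FSSS_News%2FCorporate%2520Governance%2520Seal%2Fimage"
        "&imgrefurl=https%3A%2F%2Fwww.sss.gov.ph%2F&w=320&h=250")

parsed = parse.urlparse(href)                   # split the href into path and query string
url = parse.parse_qs(parsed.query)['imgurl']    # decode the query; parse_qs returns a one-element list
print(url)  # ['https://www.sss.gov.ph/sss/ShowProperty/Community_Repository/SSS_News/Corporate%20Governance%20Seal/image']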
With my limited experience I cannot yet abstract a good common model, so the Google-specific part is written as its own subclass. I also added functions that only need BeautifulSoup, so that simple pages can be handled with BeautifulSoup directly.
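As a rough usage sketch of these classes (the search URL, scroll count and example page below are invented for illustration, and the chromedriver and Chrome Canary paths hard-coded in getChromeCanary are assumed to be valid):

# hypothetical Google image-search URL, not taken from the original post
link = 'https://www.google.com/search?q=sss&tbm=isch'

crawler = GoogleImgCrawler(link)
crawler.getChromeCanary()                                        # start headless Chrome and open the page
crawler.scrollDownUseChromeCanary(scrolltimes=3, sleeptime=10)   # scroll a few times so more thumbnails load
crawler.getSoupWebdriver('html.parser')                          # hand the rendered page source to BeautifulSoup
img_urls = crawler.getActualUrl()                                # each entry is the one-element list parse_qs returns

# for a simple page that needs no JavaScript, the BeautifulSoup-only path is enough
simple = ImgCrawler('http://example.com/')   # placeholder URL
simple.getSoupLink()
links = simple.getActualUrl(key=True)        # href=True makes BeautifulSoup match every <a> that has an href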
After crawling you naturally want to download something, so the image-saving operations are written separately. Naturally, BeautifulSoup is still used along the way.
# _*_ coding: utf-8 _*_
import os
import urllib.request

import win32api   # not used in this class yet
import win32con   # not used in this class yet


class ImgSave:
    def __init__(self, save_path, folder_name, file_list):
        self.save_path = save_path
        self.folder_name = folder_name
        self.file_list = file_list
        self.urlheader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3176.2 Safari/537.36"}
        self.message = []

    # drop duplicate links; this assumes identical links point at identical content,
    # while different links are different files even when the file names match
    def sortFiles(self):
        return sorted(set(self.file_list), key=self.file_list.index)

    def creatFolder(self):
        if os.path.exists(self.save_path + u'\\' + self.folder_name):
            pass
        else:
            os.makedirs(self.save_path + u'\\' + self.folder_name)

    def downloadImg(self, filelink):
        try:
            n = 0
            raw_img = None
            while raw_img is None:
                if n > 0:
                    print(u'download repeat ' + filelink + str(n))
                req = urllib.request.Request(filelink, headers=self.urlheader)
                raw_img = urllib.request.urlopen(req).read()
                n = n + 1
            # number the file after the files already in the folder
            cntr = len(os.listdir(self.save_path + '\\' + self.folder_name)) + 1
            file_name = u'img' + str(cntr)
            imgfile = open(os.path.join(self.save_path + '\\' + self.folder_name, file_name + filelink[-4:]), 'wb')
            imgfile.write(raw_img)
            imgfile.close()
        except:
            self.message.append(filelink + u' download failed')

    # download every linked file, then rename the results
    def downloadImgTotal(self, file_list):
        __cur_path = self.save_path + u'\\' + self.folder_name
        for f in file_list:
            try:
                __req = urllib.request.Request(f, headers=self.urlheader)
                __raw_img = urllib.request.urlopen(__req).read()
                # save under the file name taken from the link itself
                ff = open(os.path.join(__cur_path, f.split('/')[-1]), 'wb')
                ff.write(__raw_img)
                ff.close()
            except:
                self.message.append(f + u' download failed')
        __cntr = 1
        # rename the downloaded files to image_0001, image_0002, ... keeping each extension
        for df in os.listdir(__cur_path):
            __ppos = df.rfind('.')
            __file_new_name = u'image_' + '%04d' % __cntr + df[__ppos:]
            os.rename(__cur_path + u'\\' + df, __cur_path + u'\\' + __file_new_name)
            __cntr = __cntr + 1
        self.message.append(self.folder_name + u' collection finished')
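Putting the two parts together, a minimal end-to-end sketch; the save path, folder name and image links are invented placeholders, and img_urls would normally come from GoogleImgCrawler.getActualUrl():

# placeholder links; in practice these come from the crawler above
img_urls = ['https://www.example.com/a.jpg', 'https://www.example.com/b.png']

saver = ImgSave(save_path=u'f:\\crawl', folder_name=u'sss', file_list=img_urls)
saver.creatFolder()                  # create f:\crawl\sss if it does not exist yet
for url in saver.sortFiles():        # duplicates removed, original order kept
    saver.downloadImg(url)           # saved as img1.jpg, img2.png, ... inside the folder
print(saver.message)                 # links that failed to download are collected here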