python爬虫上手 笔记<4>
来源:互联网 发布:投资公司的网络销售 编辑:程序博客网 时间:2024/06/05 06:17
根据之前的结果,把代码修改完,就成这个样子。
class ImgCrawler:
    """Generic image crawler.

    Fetches a search-result page either through headless Chrome (for
    JavaScript-rendered pages, via selenium) or plain ``urlopen``, parses it
    with BeautifulSoup, and extracts link URLs from the resulting soup.
    """

    def __init__(self, searchlink=None):
        self.link = searchlink
        # User-Agent header used for the BeautifulSoup/urllib requests.
        self.soupheader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3176.2 Safari/537.36"}
        self.scrolldown = None
        self.jsdriver = None  # selenium webdriver, set by getChromeCanary()
        self.soup = None      # BeautifulSoup doc, set by getSoupLink()/getSoupWebdriver()
        self.message = []     # human-readable progress/error log

    def __iter__(self):
        # NOTE(review): no __next__ is defined, so actually iterating this
        # object raises TypeError; kept unchanged for interface compatibility.
        return self

    def getChromeCanary(self):
        """Launch a headless Chrome Canary via chromedriver and open self.link.

        Connection failures are recorded in self.message instead of raised.
        """
        browser_options = webdriver.ChromeOptions()
        browser_options.add_argument("--headless")  # run without a visible window
        # Location of the Chrome Canary 60 executable.
        browser_options.binary_location = r"C:\Users\luyi\AppData\Local\Google\Chrome SxS\Application\chrome.exe"
        # BUGFIX: raw string — the original non-raw literal only worked because
        # \s, \p, \c happen not to be escape sequences; one rename would break it.
        self.jsdriver = webdriver.Chrome(r"f:\software\python\chromedriver_win32\chromedriver.exe",
                                         chrome_options=browser_options)
        self.jsdriver.implicitly_wait(30)  # wait up to 30 s for elements to appear
        try:
            self.jsdriver.get(self.link)
        except Exception:  # was a bare except; let SystemExit/KeyboardInterrupt through
            self.message.append(self.link + ' connect error')

    def scrollDownUseChromeCanary(self, scrolltimes=1, sleeptime=10):
        """Scroll to the bottom of the page ``scrolltimes`` times.

        Sleeps ``sleeptime`` seconds after each scroll so lazily-loaded
        content has time to appear.
        """
        for i in range(scrolltimes):
            self.message.append(u"开始执行第" + str(i + 1) + u'次下拉操作')
            # Execute JavaScript to jump to the bottom of the document.
            self.jsdriver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            self.message.append(u'第' + str(i + 1) + u'次下拉操作执行完成,等待网页加载')
            time.sleep(sleeptime)

    def getSoupLink(self):
        """Fetch self.link with plain urlopen and parse it into self.soup."""
        try:
            html = urlopen(self.link)
            self.soup = BeautifulSoup(html, 'html.parser')
        except Exception:  # was a bare except
            self.message.append(self.link + ' connect error')

    def findKeyInSoup(self, key):
        """Return all tags in the soup matching ``key``."""
        return self.soup.findAll(key)

    # Build the soup from the webdriver's rendered page source.
    def getSoupWebdriver(self, parser=None):
        self.soup = BeautifulSoup(self.jsdriver.page_source, parser)

    def updateSoup(self, newsoup):
        """Replace the current soup with ``newsoup``."""
        self.soup = newsoup

    def getActualUrl(self, key=None):
        """Return urlparse results for every <a> whose href matches ``key``."""
        actualurl = []
        for a in self.soup.findAll('a', href=key):
            actualurl.append(parse.urlparse(a['href']))
        return actualurl


class GoogleImgCrawler(ImgCrawler):
    """ImgCrawler specialised for Google image-search result pages."""

    def getActualUrl(self):
        """Extract real image URLs from Google's ``/imgres?imgurl=...`` links.

        Each matching href looks like
        ``/imgres?imgurl=https%3A%2F%2F...&imgrefurl=...&docid=...``,
        so the actual image URL is the ``imgurl`` query parameter.
        Returns a list of ``parse_qs`` value lists.
        """
        actualurl = []
        # Marker for image-result anchors; the value differs per browser.
        key = re.compile(r"/imgres\?imgurl=")
        for a in self.soup.findAll('a', href=key):
            parsed = parse.urlparse(a['href'])
            url = parse.parse_qs(parsed.query)['imgurl']
            actualurl.append(url)
        return actualurl

    def clickNextPage(self, key):
        """Click the first element matching the CSS selector ``key``."""
        return self.jsdriver.find_elements_by_css_selector(key)[0].click()
由于经验不足,还不足以抽象出一个较好的模型,因此只能将Google的部分单独写一个。另外增加只用beautifulsoup就可以处理的函数。这样遇到简单的网页,也可以直接调用beautifulsoup处理
爬完自然要下载点啥,所以单独再写一些图片存储的操作。自然也要用到beautifulsoup
# _*_ coding: utf-8 _*_
import os
import urllib.request

# NOTE(review): win32api/win32con are never used in the code shown; imported
# defensively so the module still loads where pywin32 is unavailable —
# TODO confirm whether they are actually needed.
try:
    import win32api
    import win32con
except ImportError:
    win32api = None
    win32con = None


class ImgSave:
    """Download a list of image URLs into ``<save_path>/<folder_name>``."""

    def __init__(self, save_path, folder_name, file_list):
        self.save_path = save_path      # root directory for downloads
        self.folder_name = folder_name  # sub-folder created under save_path
        self.file_list = file_list     # list of image URLs to download
        self.urlheader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3176.2 Safari/537.36"}
        self.message = []              # progress/error log

    def sortFiles(self):
        """Deduplicate self.file_list while keeping first-occurrence order.

        Assumes identical links point to identical content; different links
        are treated as different files even if the filename part matches.
        BUGFIX: in Python 3 ``sorted`` takes ``key`` keyword-only, so the
        original ``sorted(set(...), self.file_list.index)`` raised TypeError.
        """
        return sorted(set(self.file_list), key=self.file_list.index)

    def creatFolder(self):
        """Create ``<save_path>/<folder_name>`` if it does not exist yet."""
        target = os.path.join(self.save_path, self.folder_name)
        if not os.path.exists(target):
            os.makedirs(target)

    def downloadImg(self, filelink):
        """Download one image and store it as ``imgN.<ext>`` in the folder.

        Failures are logged to self.message rather than raised.
        """
        try:
            n = 0
            raw_img = None
            while raw_img is None:
                if n > 0:
                    print(u'download repeat ' + filelink + str(n))
                req = urllib.request.Request(filelink, headers=self.urlheader)
                raw_img = urllib.request.urlopen(req).read()
                n = n + 1
            folder = os.path.join(self.save_path, self.folder_name)
            # Name the file after the current count of files already saved.
            cntr = len(os.listdir(folder)) + 1
            file_name = u'img' + str(cntr)
            # BUGFIX: context manager closes the handle even if write() fails.
            # Extension is assumed to be the last 4 chars of the URL (".jpg" etc).
            with open(os.path.join(folder, file_name + filelink[-4:]), 'wb') as imgfile:
                imgfile.write(raw_img)
        except Exception:  # was a bare except
            self.message.append(filelink + u'下载失败')

    def downloadImgTotal(self, file_list):
        """Download every link in ``file_list``, then rename the saved files
        sequentially as ``image_0001.<ext>``, ``image_0002.<ext>``, ...
        """
        cur_path = os.path.join(self.save_path, self.folder_name)
        for idx, link in enumerate(file_list):
            try:
                req = urllib.request.Request(link, headers=self.urlheader)
                raw_img = urllib.request.urlopen(req).read()
                # BUGFIX: the original passed the raw image *bytes* as the file
                # name (TypeError) and concatenated save_path+folder_name with
                # no separator; write to a unique temporary name instead.
                tmp_name = u'tmp_%06d' % idx + os.path.splitext(link)[1]
                with open(os.path.join(cur_path, tmp_name), 'wb') as fh:
                    fh.write(raw_img)
            except Exception:  # was a bare except
                self.message.append(link + u'下载失败')
        # Rename downloads to the requested sequential scheme.
        cntr = 1
        for df in os.listdir(cur_path):
            # BUGFIX: take the extension from the file being renamed (the
            # original used the stale loop variable ``f``) and actually
            # advance the counter so successive renames do not collide.
            file_new_name = u'image_' + '%04d' % cntr + os.path.splitext(df)[1]
            os.rename(os.path.join(cur_path, df), os.path.join(cur_path, file_new_name))
            cntr += 1
        self.message.append(self.folder_name + u'完成采集')
阅读全文
0 0
- python爬虫上手 笔记<4>
- python爬虫上手 笔记<1>
- python爬虫上手 笔记<2>
- python爬虫上手 笔记<3>
- python爬虫笔记 --------scrapy框架(4)
- Python爬虫笔记一
- Python爬虫笔记
- python 爬虫笔记
- python爬虫项目笔记
- python爬虫笔记
- Python爬虫学习笔记
- python爬虫入门笔记
- python爬虫学习笔记
- python爬虫学习笔记
- python爬虫学习笔记
- python 爬虫笔记
- Python 爬虫学习笔记
- Python网络爬虫笔记
- C语言中的 (void*)0 与 (void)0
- 设计模式---构造者模式
- CODECHEF Palindromic Game
- C++中 atexit函数 exit函数
- JetBrains正式发布Kotlin 1.0:JVM和Android上更好用的语言
- python爬虫上手 笔记<4>
- Markdown的一些语法(好难记啊QAQ)
- mvc、mvp 和mvvm区别
- 【广告项目】STS实现SVN账户轻松切换
- 习题 3-7 DNA序列(DNA Consesus String) UVa 1368
- SpringBoot专题1----springboot与mybatis的完美融合
- 数组
- NOI2017同步赛游记
- 大二课本之链表理解