Stage6--Python简单爬虫
来源:互联网 发布:按键精灵压枪源码 编辑:程序博客网 时间:2024/05/29 09:44
正则表达式简单介绍
正则表达式(regular expression)描述了一种字符串匹配的模式,可以用来检查一个串是否含有某种子串、将匹配的子串做替换或者从某个串中取出符合某个条件的子串等。
字符 [a-z]
数字 [0-9] 或 \d
* 匹配前面的子表达式零次或多次
+ 匹配前面的子表达式一次或多次
? 匹配前面的子表达式零次或一次
一个简单爬虫例子
"""Minimal image crawler: download product images from the CSDN coin mall."""
import os
import re
import urllib.request

url = "http://mall.csdn.net/coin"
savePath = "G:/QQData/"


def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text."""
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')


def getImg(html):
    """Return every CSDN image URL (``http://img.bss.csdn.net/<digits>.jpg``) in *html*."""
    # Dots in the host name are escaped so they only match a literal '.'.
    reg = r'http://img\.bss\.csdn\.net/[0-9]+\.jpg'
    return re.findall(reg, html)


def getFileName(url):
    """Return the ``<name>.jpg`` part of *url*, or ``None`` if there is none."""
    matchObj = re.search(r'[a-z0-9]+\.jpg', url)
    return matchObj.group() if matchObj else None


def saveImg(url):
    """Download the image at *url* into ``savePath``.

    Raises:
        ValueError: if no ``.jpg`` file name can be derived from *url*.
    """
    fileName = getFileName(url)
    if fileName is None:
        # Fail before any network I/O with a message that names the URL.
        raise ValueError("cannot derive a .jpg file name from %r" % (url,))
    with urllib.request.urlopen(url) as conn:
        data = conn.read()
    with open(os.path.join(savePath, fileName), 'wb') as file:
        file.write(data)


def main():
    """Crawl the configured page and save every image that is found."""
    for imgurl in getImg(getHtml(url)):
        print(imgurl)
        saveImg(imgurl)


# Guarded so that importing this module does not trigger network access.
if __name__ == "__main__":
    main()
这个例子很简单,里面的正则简单得不能再简单,用来爬取CSDN U币商城的图片,很容易看懂。
可以自定义正则和Url的爬虫
"""Tkinter front end for an image crawler with a user-supplied URL and regex."""
import os
import re
import urllib.request
from tkinter import BOTTOM, LEFT, RIGHT, SUNKEN, W, X
from tkinter import Button, Entry, Frame, Label, Tk

savePath = "G:/QQData/"


class ControllPanel(Frame):
    """Main panel: URL and regex inputs, action buttons and a status bar."""

    default_url = "http://www.xingyongshe.com/man/ddfeijibei"
    default_regix = r'<img src="(.+)&w=228&q=90&type=jpg'
    # Widgets are created in __init__; the class-level None lets methods
    # detect a panel whose widgets have not been built yet.
    status_label = None
    url_input = None
    regix_input = None

    def __init__(self, master):
        """Build all widgets inside *master*."""
        super().__init__(master, width=360, height=260)
        self.master = master
        self.pack()
        self.add_urlarea()
        self.add_regixarea()
        self.add_buttonarea()
        self.add_statusbar()

    def add_urlarea(self):
        """Create the labelled entry holding the page URL."""
        urlAreaFrame = Frame(self)
        urlAreaFrame.pack(pady=20)
        label = Label(urlAreaFrame, text="网址:")
        label.pack(side=LEFT)
        self.url_input = Entry(urlAreaFrame, width=40)
        self.url_input.insert(0, ControllPanel.default_url)
        self.url_input.pack(side=RIGHT)

    def add_regixarea(self):
        """Create the labelled entry holding the image regex."""
        regixAreaFrame = Frame(self)
        regixAreaFrame.pack()
        label = Label(regixAreaFrame, text="正则:")
        label.pack(side=LEFT)
        self.regix_input = Entry(regixAreaFrame, width=40)
        # Note: the class attribute is reached through self here, whereas
        # add_urlarea goes through the class name; both work.
        self.regix_input.insert(0, self.default_regix)
        self.regix_input.pack(side=RIGHT)

    def add_buttonarea(self):
        """Create the "verify regex" and "start crawling" buttons."""
        buttonAreaFrame = Frame(self)
        buttonAreaFrame.pack(pady=40)
        verify_button = Button(buttonAreaFrame, text="验证正则",
                               command=self.verifyRegix)
        verify_button.pack(padx=50, side=LEFT)
        start_button = Button(buttonAreaFrame, text="开始爬取",
                              command=self.startRun)
        start_button.pack(padx=50, side=RIGHT)

    def verifyRegix(self):
        """Run the regex against the page and report how many images match.

        Returns:
            The list of matched image URLs; an empty list when the input
            widgets do not exist yet or nothing matched.
        """
        if self.url_input is None or self.regix_input is None:
            # Return an empty list (not None) so startRun can iterate it.
            return []
        url = self.url_input.get()
        regix = self.regix_input.get()
        imgurls = RunFunction(url, regix).verifyRegix()
        if imgurls:
            self.setStatus("有%d张图片可以爬取", len(imgurls))
            for imgurl in imgurls:
                print(imgurl)
        return imgurls

    def startRun(self):
        """Download every image matched by the current URL and regex."""
        imgurls = self.verifyRegix()
        size = RunFunction(None, None).runTask(imgurls)
        # savePath is passed as an argument so a '%' in the path cannot be
        # mistaken for a format directive.
        self.setStatus("爬取了%d张图片放在了%s", size, savePath)

    def add_statusbar(self):
        """Create the status bar at the bottom of the master window."""
        statusBarFrame = Frame(self.master)
        statusBarFrame.pack(side=BOTTOM, fill=X)
        self.status_label = Label(statusBarFrame, bd=1, relief=SUNKEN,
                                  anchor=W)
        self.status_label.pack(fill=X)

    def setStatus(self, format, *args):
        """Show ``format % args`` in the status bar (no-op before it exists)."""
        if self.status_label is None:
            return
        self.status_label.config(text=format % args)
        self.status_label.update_idletasks()

    def clearStatus(self):
        """Blank the status bar (no-op before it exists)."""
        if self.status_label is None:
            return
        self.status_label.config(text="")
        self.status_label.update_idletasks()


class RunFunction:
    """Crawler back end: fetch a page, extract image URLs, save the images."""

    def __init__(self, url, regix):
        """Remember the page *url* and the image regex *regix*."""
        self.url = url
        self.regix = regix

    def getHtml(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text."""
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(url) as page:
            return page.read().decode('utf-8')

    def getImg(self, html, reg):
        """Return ``re.findall(reg, html)``."""
        return re.findall(reg, html)

    def saveImg(self, url):
        """Download the image at *url* into ``savePath``.

        Raises:
            ValueError: if no ``.jpg`` file name can be derived from *url*.
        """
        fileName = self.getFileName(url)
        if fileName is None:
            # Fail before any network I/O with a message that names the URL.
            raise ValueError(
                "cannot derive a .jpg file name from %r" % (url,))
        with urllib.request.urlopen(url) as conn:
            data = conn.read()
        with open(os.path.join(savePath, fileName), 'wb') as file:
            file.write(data)

    def getFileName(self, url):
        """Return the ``<name>.jpg`` part of *url*, or ``None`` if absent."""
        matchObj = re.search(r'[a-z0-9]+\.jpg', url)
        return matchObj.group() if matchObj else None

    def verifyRegix(self):
        """Fetch ``self.url`` and return all matches of ``self.regix``."""
        return self.getImg(self.getHtml(self.url), self.regix)

    def runTask(self, imgurls=None):
        """Save every URL in *imgurls* and return how many were processed.

        When *imgurls* is omitted, the URLs are obtained by running
        ``verifyRegix()`` on this instance's own url and regex. This merges
        the two former ``runTask`` definitions, of which only the second
        was ever reachable.
        """
        if imgurls is None:
            imgurls = self.verifyRegix()
        for imgurl in imgurls:
            self.saveImg(imgurl)
        return len(imgurls)


def main():
    """Create the crawler window and run the Tk event loop."""
    root = Tk()
    root.title("爬虫管理窗口")
    # Center the window on the screen.
    scnWidth = root.winfo_screenwidth()
    scnHeight = root.winfo_screenheight()
    tmpcnf = '%dx%d+%d+%d' % (308, 101,
                              (scnWidth - 308) // 2, (scnHeight - 101) // 2)
    root.geometry(tmpcnf)
    root.maxsize(600, 300)
    root.minsize(360, 220)
    # root.resizable(False, False)  # uncomment to lock the window size
    controllPanel = ControllPanel(root)
    controllPanel.setStatus("等待爬取……")
    # Closing the window already destroys root, so no explicit
    # root.destroy() follows mainloop(); calling it would raise TclError.
    root.mainloop()


if __name__ == "__main__":
    main()
1 0
- Stage6--Python简单爬虫
- python爬虫入门简单爬虫
- python-简单爬虫
- 简单python爬虫
- Python简单爬虫
- python 简单爬虫
- 简单python爬虫
- Python简单的爬虫
- Python简单爬虫
- python 简单爬虫实现
- python简单文本爬虫
- python超级简单爬虫
- python简单爬虫
- python简单爬虫程序
- python简单爬虫
- Python开发简单爬虫
- Python简单爬虫
- python 简单爬虫原理
- I/O多路转接之epoll
- hdu5773The All-purpose Zero,1257最少拦截次数
- ajax
- hdu 3487 Play with Chain splay
- 在64位ubuntu中安装代码比较工具beyond compare
- Stage6--Python简单爬虫
- epoll详解
- Swift与OC语言中语法的一些区别
- spring 导入到eclipse
- 第9课:Scala类和对象彻底实战和Spark源码鉴赏
- 【英语总结】六月
- 折腾记要——Ubuntu 14.04系统安装Nvidia CUDA7.5并搭建Python Theano深度学习开发环境
- redis 配置文件详解
- unity知识结构,可检查自身,可用来教学。