python实现简单抓图并打包成exe

来源：互联网发布：天津seo 页面优化编辑：程序博客网时间：2024/06/04 05:29

打包：

在CMD命令行运行:python setup.py py2exe

注意:
1.需要安装py2exe
2.修改setup.py中的脚本文件名，使其指向需要打包的py文件
3.打包后，包括exe文件在内的所有必需文件都在dist目录下
4.用命令行跑时，应先将路径cd到py文件的目录下

setup.py:

#!/usr/bin/env python
#coding=utf-8

from distutils.core import setup
import py2exe

# Script to freeze. Passed to setup() through the ``console`` option, which
# py2exe uses for scripts that should become console executables.
# Named SCRIPT rather than ``file`` so the Python 2 builtin is not shadowed.
SCRIPT = "python_oschina_picture_new.py"
setup(console=[SCRIPT])

python代码：(python_oschina_picture_new.py)

#!/usr/bin/env python
#coding=utf-8

import urllib
import urllib2
import os
import re
import sys
# Site root; module names and album ids are appended to it to build URLs.
BaseURL = 'http://www.mzitu.com/'
# Names of the site modules (sections) whose pictures should be downloaded.
Module = ['model']

def schedule(a, b, c):
    """Print download progress; usable as a ``urllib.urlretrieve`` report hook.

    a: number of data blocks transferred so far
    b: size of one data block
    c: total size of the remote file (``urlretrieve`` may pass -1 when the
       size is unknown)

    Prints the completed percentage, capped at 100. When the total size is
    zero or negative no percentage can be computed, so the number of bytes
    received so far is printed instead.
    """
    if c <= 0:
        # Unknown/empty size: avoid ZeroDivisionError and negative percentages.
        print('%d bytes' % (a * b))
        return
    # The last block usually overshoots the file size, hence the cap.
    per = min(100.0 * a * b / c, 100)
    print('%.2f%%' % per)

def GetHtml(URL):
    """Fetch ``URL`` and return the raw response body.

    URL: address of the page to fetch

    Returns the bytes read from the response, undecoded. The response is
    always closed, even when reading fails.

    To send a custom User-Agent instead, build a ``urllib2.Request``, call
    ``add_header('User-agent', ...)`` on it and open it with
    ``urllib2.urlopen``.
    """
    response = urllib.urlopen(URL)
    try:
        return response.read()
    finally:
        # Release the connection instead of waiting for garbage collection.
        response.close()

def GetPage(URL, content, flag):
    """Return the last page number advertised in ``content`` for ``URL``.

    URL: address of the page whose pagination links are searched for
    content: HTML text of that page
    flag: 0 to look for listing links of the form ``URL/page/N``;
          1 to look for the ``URL/N`` link that follows the 'dots' span

    Returns the number captured from the last matching link, as a string.
    When no pagination link matches, '1' is returned so that callers treat
    the resource as a single page instead of failing with IndexError.
    Raises ValueError when ``flag`` is neither 0 nor 1.
    """
    # Escape the URL so characters such as '.' only match themselves.
    prefix = re.escape(URL)
    if flag == 0:
        # e.g. <a class='page-numbers' href='http://www.mzitu.com/model/page/47'><span>47</span></a>
        regex = prefix + r"/page/(\d*)\s*'\s*>\s*<span>\s*\d*\s*</span>"
    elif flag == 1:
        # e.g. <span class='dots'>...</span><a href='http://www.mzitu.com/41633/30'>
        regex = (r"span\s*class\s*=\s*'dots'.*?</span>\s*<\s*a\s*href\s*=\s*'\s*"
                 + prefix + r"/(\d*)'>")
    else:
        raise ValueError('flag must be 0 or 1, got %r' % (flag,))

    # re.S lets '.' match newlines, so a link may span several lines.
    res = re.findall(regex, content, re.S)
    if not res:
        return '1'
    return res[-1]

def GetList(content):
    """Extract the album links found on a listing page.

    content: HTML text of a listing page

    Returns a list of ``(album_id, title)`` tuples, one per matching link,
    in document order; the list is empty when nothing matches.
    """
    # e.g. "http://www.mzitu.com/41107" title="..." target="_blank">...</a></h2>
    # BaseURL is escaped so characters such as '.' only match themselves.
    regex = (re.escape(BaseURL)
             + r'(\d*)"\s*title\s*=\s*"(.*?)"\s*target\s*=\s*"_blank\s*">.*?</a>\s*</h2>')
    # re.S lets '.' match newlines, so an entry may span several lines.
    return re.findall(regex, content, re.S)

def Download(URL, num, dirName):
    """Download picture number ``num`` of the album at ``URL``.

    URL: address of the album's first page
    num: picture index as a string; picture 1 is on ``URL`` itself and
         picture N on ``URL/N``
    dirName: UTF-8 encoded byte string naming the target directory

    The picture is saved as ``<dirName>/<num>.jpg``; the directory is
    created when missing and an already existing file is left untouched.
    Download problems are reported on stdout rather than raised, so one
    failing picture does not stop the caller. KeyboardInterrupt and
    SystemExit are deliberately not swallowed.
    """
    # Decode so non-ASCII (e.g. Chinese) directory names are handled as unicode.
    Dir = '%s' % unicode(dirName, 'utf-8')
    picture = Dir + '/%s.jpg' % num
    started = False  # True once urlretrieve may have created the file
    try:
        if not os.path.exists(Dir):
            os.makedirs(Dir)
        if os.path.exists(picture):
            return
        # e.g. <p><a href="http://www.mzitu.com/41633/3" ><img src="http://pic.dofay.com/2015/05/27t02.jpg" alt="..." /></a></p>
        # URL is escaped so characters such as '.' only match themselves.
        regex = (r'href\s*=\s*"' + re.escape(URL)
                 + r'/{0,1}\d*".*?<\s*img\s*src\s*=\s*"(.*?)"\s*alt')
        htmlURL = URL if int(num) == 1 else URL + '/' + num
        found = re.findall(regex, GetHtml(htmlURL), re.S)
        if not found:
            print('no picture found on %s' % htmlURL)
            return
        print('downloading %s' % picture)
        started = True
        # Pass schedule as a third argument here to print download progress.
        urllib.urlretrieve(found[0], picture)
    except Exception as err:
        print('exception! when download %s %s.jpg (%r)' % (Dir, num, err))
        # Remove a partial file; otherwise the exists() check above would
        # treat it as complete and skip it on every later run.
        if started and os.path.exists(picture):
            try:
                os.remove(picture)
            except OSError:
                # Cleanup is best effort; the failure was already reported.
                pass

def Handle(URL):
    """Crawl every listing page of the module at ``URL`` and download its pictures.

    URL: address of the module's first listing page (BaseURL + module name)

    The page count is read from the first listing page; listing page N > 1
    is fetched from ``URL/page/N``. For every album found on a listing page
    the album's picture count is read and each picture is passed to
    Download, with the album title used as the directory name.
    """
    first_listing = GetHtml(URL)
    page_count = int(GetPage(URL, first_listing, 0))
    for page in range(1, page_count + 1):
        if page == 1:
            listing_html = first_listing  # already fetched above
        else:
            # Always built from the module URL; album URLs live in their own
            # variable so they cannot leak into the next listing-page address.
            listing_html = GetHtml(URL + '/page/' + str(page))
        for album_id, title in GetList(listing_html):
            album_url = BaseURL + album_id
            album_html = GetHtml(album_url)
            picture_count = int(GetPage(album_url, album_html, 1))
            for page_jpg in range(1, picture_count + 1):
                Download(album_url, str(page_jpg), str(title))

if __name__ == '__main__':
    # Crawl each configured module; its first listing page is at
    # BaseURL + module name. The loop variable is not called ``list`` so the
    # builtin stays usable at module scope.
    for module in Module:
        Handle(BaseURL + module)

0 0