Python 批处理：实现爬取网页静态图片、文件重命名、图片统一修改大小等功能

来源：互联网发布：php mysql 存储过程编辑：程序博客网时间：2024/06/04 23:09

按照特定字符（此处选为字符 'j'）将文档分割成多个小文档：包含该字符的行作为分隔行，其前 6 个字符用作输出文件名，分隔行之后直到下一个分隔行之前的内容写入该文件

def main(source="dir/origin.txt", out_dir="dir", marker="j"):
    """Split a text file into several smaller files at marker lines.

    Every line of *source* that contains *marker* starts a new section.
    The first six characters of that line (line ending excluded) become
    the name of the section's output file, ``<out_dir>/<name>.txt``.  The
    lines that follow, up to but not including the next marker line, are
    written to that file.  Lines before the first marker line are
    discarded, and the marker lines themselves are never written.

    Args:
        source: Path of the file to split.
        out_dir: Existing directory that receives the section files.
        marker: Substring that identifies a section-header line.
    """
    with open(source, "r") as f:
        lines = f.readlines()

    section = None  # handle of the section file currently being written
    try:
        for line in lines:
            if marker in line:
                if section is not None:
                    section.close()
                # Strip the line ending first so that a header shorter than
                # six characters does not put a newline into the file name.
                name = line.rstrip("\r\n")[0:6]
                section = open(out_dir + "/" + name + ".txt", "w")
            elif section is not None:
                section.write(line)
    finally:
        if section is not None:
            section.close()


if __name__ == '__main__':
    main()

爬取网页静态图片

# Created on 2017-4-17
# @author: Administrator
#
# Download every .jpg image referenced by an image-search result page.
import os
import re
import urllib.request

# Captures the image address from src="<address>.jpg" when the attribute is
# immediately followed by ' pic_ext'.
IMG_PATTERN = re.compile(r'src="(.+?\.jpg)" pic_ext')

SEARCH_URL = "http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=%BF%DA%D5%D6%C3%B1%D7%D3%D5%DA%B5%B2%C8%CB%C1%B3%CD%BC%C6%AC&hs=2&xthttps=000000&fr=ala&ori_query=%E5%8F%A3%E7%BD%A9%E5%B8%BD%E5%AD%90%E9%81%AE%E6%8C%A1%E4%BA%BA%E8%84%B8%E5%9B%BE%E7%89%87&ala=0&alatpl=sp&pos=0"


def gethtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    return data.decode("utf-8")


def getimg(html, save_dir="/Users/c-ten/Desktop/worm/"):
    """Download every image address found in *html* into *save_dir*.

    Each address matched by ``IMG_PATTERN`` is printed and then saved as
    ``0.jpg``, ``1.jpg``, ... in the order the matches appear.

    Args:
        html: Page source to scan for image addresses.
        save_dir: Existing directory that receives the downloaded files.
    """
    for index, img_url in enumerate(IMG_PATTERN.findall(html)):
        print(img_url)
        target = os.path.join(save_dir, "{}.jpg".format(index))
        urllib.request.urlretrieve(img_url, target)


if __name__ == "__main__":
    print('gethtml')
    html = gethtml(SEARCH_URL)
    print('getimg')
    getimg(html)

计算文档行数

def main(path="list.txt"):
    """Print the number of lines in the text file at *path*.

    Args:
        path: File whose lines are counted.
    """
    # Count while iterating so the whole file is never held in memory; the
    # with-block closes the file, so no explicit close() is needed.
    with open(path, "r") as f:
        count = sum(1 for _ in f)
    print(count)


if __name__ == '__main__':
    main()

批量增加后缀名

import os


def add_suffix(path='jpg_files_dir', suffix='.jpg'):
    """Append *suffix* to every regular file in *path* whose name has no dot.

    Sub-directories and names that already contain a '.' are left alone.

    Args:
        path: Directory whose entries are examined (not recursive).
        suffix: Text appended to each qualifying file name.
    """
    for name in os.listdir(path):
        current = os.path.join(path, name)
        if os.path.isfile(current) and '.' not in name:
            os.rename(current, os.path.join(path, name + suffix))


if __name__ == '__main__':
    add_suffix()

阅读全文

1 0