python 下载百度贴吧图片
来源:互联网 发布:人海战术 知乎 编辑:程序博客网 时间:2024/05/01 07:49
主程序
#!/usr/bin/python
# -*-coding:utf-8-*-
"""Download the images of a Baidu Tieba thread, a few pages per run.

Progress (the page range handled by each run) is recorded in the MySQL table
``tieba`` so that the next run continues after the last recorded page.
"""
import http.cookiejar
import re
import threading
import time
import urllib.parse
import urllib.request

import tools
from db import db


class tieba(threading.Thread):
    """Worker thread that downloads every image found on one page of a thread."""

    # URL template: first %s is the thread id, second %s is the page number,
    # e.g. http://tieba.baidu.com/p/4519246742?see_lz=1&pn=1
    url = 'http://tieba.baidu.com/p/%s?see_lz=1&pn=%s'

    def __init__(self, tieid=4690733195, page=1):
        """Remember which thread id and which page this worker handles."""
        threading.Thread.__init__(self)
        self.tieid = tieid
        self.page = page

    @staticmethod
    def _fetchText(url):
        """Return the UTF-8 decoded body of ``url``, closing the response."""
        with urllib.request.urlopen(url) as res:
            return res.read().decode('utf-8')

    @staticmethod
    def getEndPage(tieid):
        """Return the number of pages of thread ``tieid``.

        The count is scraped from the first page; when the pager markup is
        not found the thread is treated as having a single page.
        """
        text = tieba._fetchText(tieba.url % (tieid, 1))
        pattern = r'(\d+)</span>回复贴,共<span class="red">(\d+)</span>页'
        match = re.search(pattern, text)
        return int(match.group(2)) if match else 1

    def run(self):
        """Thread body: fetch this worker's page and download each image on it."""
        text = self._fetchText(tieba.url % (self.tieid, self.page))
        pattern = r'<img class="BDE_Image"([\s\S]*?)src="(.*?)"'
        # findall yields (attributes-before-src, src) tuples; only src is used.
        for _attrs, src in re.findall(pattern, text):
            print('第%s页,下载地址:%s' % (self.page, src))
            tools.download(src)


@tools.runTime('tieba.log')
def main():
    """Download the next batch of pages, one thread per page, and wait for them."""
    # Thread id, e.g. http://tieba.baidu.com/p/4671247923 -> id 4671247923
    tieID = 4519246742
    endpage = tieba.getEndPage(tieID)
    mysql = db('127.0.0.1', 'root', '', 'test')
    sql = 'select * from tieba where tieid=%s order by id desc limit 1' % tieID
    data = mysql.queryRow(sql)
    # Number of pages handled per run.
    size = 5
    if data:
        lastPage = int(data['EndPage'])
        # ">=": when the last recorded page IS the final page there is nothing
        # left, so stop before inserting a useless bookkeeping row.
        if lastPage >= endpage:
            print("已到尾页,结束下载!")
            # Returning (instead of exit()) lets the runTime decorator write
            # its end-of-run log entry.
            return
        start = lastPage + 1
    else:
        start = 1
    end = start + size - 1
    sql = """ INSERT INTO `test`.`tieba` ( `tieid`, `StartPage`, `EndPage`) VALUES('%s','%s', '%s'); """ % (tieID, start, end)
    mysql.execute(sql)
    threads = []
    for page in range(start, end + 1):
        if page > endpage:
            print("已到尾页,结束下载!!")
            break
        thread = tieba(tieID, page)
        thread.start()
        threads.append(thread)
    # Wait for every page worker to finish.
    for t in threads:
        t.join()
    print("退出主线程")


if __name__ == '__main__':
    main()
tools.py
#!/usr/bin/python
# -*-coding:utf-8-*-
"""Custom helper utilities (tools.py): run-time logging and file download."""
import functools
import os.path
import random
import re
import time
import urllib.parse
import urllib.request


def runTime(file='test.log'):
    """Return a decorator that logs the start, end and duration of a call.

    :param file: str path of the log file the entries are appended to
    :return: decorator; the wrapped function returns whatever ``func`` returns
    """
    def _runTime(func):
        @functools.wraps(func)  # keep the wrapped function's name/docstring
        def newFunc(*args, **kwargs):
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # documented replacement for measuring elapsed time.
            start = time.perf_counter()
            log('开始任务', file)
            res = func(*args, **kwargs)
            end = time.perf_counter()
            msg = "结束任务,运行了: %f 秒" % (end - start)
            log(msg, file)
            print(msg)
            return res
        return newFunc
    return _runTime


def log(content, file='test.log', type=1):
    """Write one timestamped entry to ``file``.

    :param content: str message text
    :param file: str log file path
    :param type: 1 appends to the file; any other value truncates it first
    """
    mode = 'a+' if type == 1 else 'w+'
    t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # NOTE(review): entries are terminated with a bare '\r', kept as in the
    # original; confirm whether '\n' was intended.
    content = t + ' : ' + content + '\r'
    # "with" guarantees the handle is flushed and closed after every entry.
    with open(file, mode, encoding='utf-8') as f:
        f.write(content)


def download(url, filename='', foldername='', useOldName=False):
    """Download ``url`` into a local folder.

    :param url: str download address; an empty value is ignored
    :param filename: str target file name, default: yyyymmddHHMMSS + 3 random
        digits + the extension of the original name
    :param foldername: str target directory, default: "<year>-<month>-<day>-<hour>"
        (not zero padded) relative to the working directory; created if missing
    :param useOldName: bool, use the file name taken from the URL; when true
        it overrides ``filename``
    :return: str path of the downloaded file, or None when ``url`` is empty
    """
    if not url:
        return
    # Take the name from the URL path only, so a query string or fragment
    # never ends up in the local file name.
    oldFileName = os.path.basename(urllib.parse.urlsplit(url).path)
    # splitext gives '' when there is no extension (the former regex crashed)
    # and only the last extension for names such as "a.b.jpg".
    suffix = os.path.splitext(oldFileName)[1]
    t = time.localtime()
    if foldername == '':
        foldername = '%d-%d-%d-%d' % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour)
    # exist_ok avoids the check-then-create race when several threads
    # download into the same folder at once.
    os.makedirs(foldername, exist_ok=True)
    if filename == '':
        filename = time.strftime("%Y%m%d%H%M%S", t) + str(random.randint(100, 999)) + suffix
    if useOldName:
        filename = oldFileName
    # os.path.join uses the platform separator instead of a hard-coded '\\'.
    target = os.path.join(foldername, filename)
    urllib.request.urlretrieve(url, target)
    return target
db.py
# -*- coding: utf-8 -*-
import pymysql


class db:
    """Small convenience wrapper around a PyMySQL connection.

    Rows are returned as dicts (DictCursor). When the connection could not be
    opened, the query methods return the stored error string instead of
    raising.
    """

    dbconnect = ''  # connection object; '' while not connected
    error = ''  # text of the most recent database error

    def __init__(self, host, username, password, db='', port=3306):
        """Open the connection; on failure keep the message in ``self.error``."""
        try:
            # Keyword arguments: PyMySQL 1.0+ no longer accepts positional
            # ones, and ``port`` was previously accepted but never passed on.
            self.dbconnect = pymysql.connect(
                host=host,
                user=username,
                password=password,
                database=db,
                port=port,
                cursorclass=pymysql.cursors.DictCursor,
                charset='utf8',
            )
        except pymysql.Error as e:
            self.error = str(e)

    def __del__(self):
        """Close the connection when the wrapper is garbage collected."""
        self.close()

    def execute(self, sql):
        """Execute a write statement and commit it.

        On a database error the transaction is rolled back and the message is
        stored in ``self.error``.

        :return: the cursor used (e.g. for ``rowcount``), or the error string
            when there is no connection
        """
        if self.dbconnect == '':
            return self.error
        cursor = self.dbconnect.cursor()
        try:
            cursor.execute(sql)
            self.dbconnect.commit()
        except pymysql.Error as e:
            # Roll back, and keep the reason instead of silently dropping it.
            self.dbconnect.rollback()
            self.error = str(e)
        return cursor

    def queryAll(self, sql):
        """Run a SELECT and return all rows (or the error string when not connected)."""
        if self.dbconnect == '':
            return self.error
        cursor = self.dbconnect.cursor()
        cursor.execute(sql)
        return cursor.fetchall()

    def queryRow(self, sql):
        """Run a SELECT and return the first row, or None when there is none."""
        if self.dbconnect == '':
            return self.error
        cursor = self.dbconnect.cursor()
        cursor.execute(sql)
        return cursor.fetchone()

    def queryScalar(self, sql):
        """Run a SELECT and return the first column of the first row.

        Returns '' when the query yields no row.
        """
        if self.dbconnect == '':
            return self.error
        data = self.queryRow(sql)
        if not data:
            # No row: previously this crashed on ``None.values()``.
            return ''
        return next(iter(data.values()), '')

    def close(self):
        """Close the connection once; later calls are no-ops."""
        conn, self.dbconnect = self.dbconnect, ''
        if conn:
            conn.close()


if __name__ == '__main__':
    # Manual smoke test. WARNING: it runs DELETE and UPDATE statements against
    # the database configured below.
    conn = db('172.23.16.91', 'unipei', 'jiaparts', 'jpd')
    one = conn.queryRow('select * from jpd.jpd_user limit 1')
    rows = conn.queryAll('select * from jpd.jpd_organ limit 10')
    count = conn.queryScalar('select count(*) from jpd.jpd_organ')
    delSql = 'delete from pap.pap_evaluation_system_history limit 100'
    delRes = conn.execute(delSql)
    updata = 'update jpd.jpd_user set lastvisittime=1 where id=61'
    updateRes = conn.execute(updata)
    print(one)
    print(rows)
    print(count)
    print(delRes.rowcount)
    print(updateRes.rowcount)
sql
-- Bookkeeping table for the downloader: one row per run, holding the thread id
-- and the page range that run covered.
-- NOTE(review): CreateTime is not set by the INSERT in the main program.
CREATE TABLE `tieba` (
  `ID` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
  `TieID` bigint(11) DEFAULT NULL,
  `StartPage` int(11) DEFAULT NULL COMMENT '开始页面',
  `EndPage` int(11) DEFAULT NULL COMMENT '结束页面',
  `CreateTime` int(13) DEFAULT NULL COMMENT '创建时间',
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=31 DEFAULT CHARSET=utf8;
0 0
- Python 下载百度贴吧的图片
- python 下载百度贴吧图片
- python 下载百度图片
- python 百度贴吧爬虫(下载图片)
- Python下载百度贴吧帖子里面的图片
- python爬虫:下载百度贴吧图片学习笔记
- 批量下载百度贴吧帖子图片
- 我的第一个python爬虫程序(从百度贴吧自动下载图片)
- python爬虫:下载百度贴吧图片(多页)学习笔记
- Python爬虫实战(五) :下载百度贴吧帖子里的所有图片
- 用python抓包下载百度图片
- python-下载固定百度图片地址
- Python根据关键字百度搜索下载图片
- 利用Python 实现下载百度图片
- Python 下载百度图片搜索结果
- Python学习--下载图片--下载百度的固定页面图片
- Python 爬虫获取百度贴吧图片
- Python 3 抓取百度贴吧图片
- java基础总结_06
- 有序数组创建高度最小的二叉查找树
- POJ 1753 Flip Game (状态压缩+BFS) -- 解题报告
- Android子线程居然可以更新UI?
- 以前遗留下的一个问题(关于手机解锁)
- python 下载百度贴吧图片
- Log4j使用详解
- jsp静态包含和动态包含的区别?
- 统计单词数
- js使用ajax方法遇到的servlet传值失败的可能情况
- Linux(centOS6.5)下SVN的安装、配置及开机启动
- Activiti 工作流 5.19.0 教程(1)
- AngularJS API
- 异步编程之Promise