Python 下载百度贴吧图片

来源:互联网 发布:人海战术 知乎 编辑:程序博客网 时间:2024/05/01 07:49

主程序

#!/usr/bin/python
# -*-coding:utf-8-*-
"""Download the images of a Baidu Tieba thread, a few pages per run.

Progress is stored in the MySQL table ``tieba`` so that each run resumes
after the last page range recorded by the previous run.
"""
import http.cookiejar
import re
import threading
import time
import urllib.parse
import urllib.request

import tools
from db import db


class tieba(threading.Thread):
    """Worker thread that downloads every image found on one thread page."""

    # Example: http://tieba.baidu.com/p/4519246742?see_lz=1&pn=1
    # First placeholder is the thread id, second is the page number.
    url = 'http://tieba.baidu.com/p/%s?see_lz=1&pn=%s'

    def __init__(self, tieid=4690733195, page=1):
        """Remember which thread id and page number this worker handles."""
        super().__init__()
        self.tieid = tieid
        self.page = page

    @staticmethod
    def _fetch(tieid, page):
        """Return the decoded HTML of page ``page`` of thread ``tieid``."""
        # The context manager closes the HTTP response even if read/decode fails.
        with urllib.request.urlopen(tieba.url % (tieid, page)) as res:
            return res.read().decode('utf-8')

    @staticmethod
    def getEndPage(tieid):
        """Return the total number of pages of the thread (1 if not found)."""
        text = tieba._fetch(tieid, 1)
        pattern = r'(\d+)</span>回复贴,共<span class="red">(\d+)</span>页'
        match = re.search(pattern, text)
        return int(match.group(2)) if match else 1

    def run(self):
        """Thread body: find all post images on the page and download them."""
        text = tieba._fetch(self.tieid, self.page)
        pattern = r'<img class="BDE_Image"([\s\S]*?)src="(.*?)"'
        for _attrs, img_url in re.findall(pattern, text):
            print('第%s页,下载地址:%s' % (self.page, img_url))
            try:
                tools.download(img_url)
            except OSError as e:
                # Network and file errors (URLError is an OSError) must not
                # abort the remaining images of this page.
                print('第%s页,下载失败:%s (%s)' % (self.page, img_url, e))


@tools.runTime('tieba.log')
def main():
    """Download the next batch of pages and record the progress in MySQL."""
    # Thread id, e.g. http://tieba.baidu.com/p/4671247923 -> id 4671247923
    tieID = 4519246742
    endpage = tieba.getEndPage(tieID)
    mysql = db('127.0.0.1', 'root', '', 'test')
    sql = 'select * from tieba where tieid=%s order by id desc limit 1' % tieID
    data = mysql.queryRow(sql)
    if isinstance(data, str):
        # The db helper returns its error message (a str) instead of a row
        # when the connection could not be established.
        print('数据库查询失败:%s' % data)
        return

    # Number of pages downloaded per run.
    size = 5
    if data:
        last_done = int(data['EndPage'])
        # ">=": once the recorded end page reaches the last page there is
        # nothing left; ">" alone would insert an empty progress row.
        if last_done >= endpage:
            print("已到尾页,结束下载!")
            # Return instead of exit() so the timing decorator still logs
            # the end of the task.
            return
        start = last_done + 1
    else:
        start = 1
    end = start + size - 1
    if end > endpage:
        # Clamp so the stored EndPage never exceeds what was really fetched;
        # otherwise pages added to the thread later would be skipped.
        print("已到尾页,结束下载!!")
        end = endpage

    sql = """
        INSERT INTO `test`.`tieba` (
        `tieid`,
        `StartPage`,
        `EndPage`)
        VALUES('%s','%s', '%s');
        """ % (tieID, start, end)
    mysql.execute(sql)

    # One worker thread per page of the batch.
    threads = [tieba(tieID, page) for page in range(start, end + 1)]
    for thread in threads:
        thread.start()
    for thread in threads:
        # Wait for all workers to finish.
        thread.join()
    print("退出主线程")


if __name__ == '__main__':
    main()

tools.py
#!/usr/bin/python
# -*-coding:utf-8-*-
"""Custom utility helpers (tools.py): timing decorator, logging, download."""
import functools
import os.path
import random
import re
import time
import urllib.parse
import urllib.request


def runTime(file='test.log'):
    """Return a decorator that logs start/end and elapsed time of a call.

    :param file: str, path of the log file the messages are appended to
    :return: decorator; the wrapped function returns the original result
    """
    def _runTime(func):
        @functools.wraps(func)  # keep the wrapped function's name/docstring
        def newFunc(*args, **kwargs):
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # documented replacement for measuring elapsed time.
            start = time.perf_counter()
            log('开始任务', file)
            res = func(*args, **kwargs)
            elapsed = time.perf_counter() - start
            msg = "结束任务,运行了: %f 秒" % elapsed
            log(msg, file)
            print(msg)
            return res
        return newFunc
    return _runTime


def log(content, file='test.log', type=1):
    """Write one timestamped line to a log file.

    :param content: str, message text
    :param file: str, path of the log file
    :param type: 1 appends to the file, any other value overwrites it
    """
    mode = 'a+' if type == 1 else 'w+'
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # "with" closes the handle after every write; end the record with a real
    # newline (a bare '\r' does not separate lines in most tools).
    with open(file, mode, encoding='utf-8') as f:
        f.write(stamp + ' : ' + content + '\n')


def download(url, filename='', foldername='', useOldName=False):
    """Download ``url`` into a local folder.

    :param url: str, address of the file to download; empty -> no-op
    :param filename: str, local file name; default is yyyymmddHHMMSS plus a
        3-digit random number plus the extension taken from the URL
    :param foldername: str, target directory; default is "yyyy-m-d-H" in the
        current working directory. Prefer an absolute path.
    :param useOldName: bool, use the file name from the URL instead
    :return: str path of the saved file, or None when ``url`` is empty
    """
    if not url:
        return None
    # Take the name from the URL path only, so a query string or fragment
    # never ends up in the local file name.
    oldFileName = os.path.basename(urllib.parse.urlsplit(url).path)
    # splitext yields '' when there is no extension (the old regex crashed)
    # and only the last extension for names such as "a.b.jpg".
    suffix = os.path.splitext(oldFileName)[1]

    now = time.localtime()
    if foldername == '':
        foldername = '%d-%d-%d-%d' % (
            now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour)
    # exist_ok avoids the exists()/makedirs() race when several downloader
    # threads create the same folder at once.
    os.makedirs(foldername, exist_ok=True)

    if filename == '':
        filename = (time.strftime("%Y%m%d%H%M%S", now)
                    + str(random.randint(100, 999)) + suffix)
    if useOldName and oldFileName:
        filename = oldFileName

    # os.path.join instead of a hard-coded '\\' so the path is valid on
    # every platform.
    target = os.path.join(foldername, filename)
    urllib.request.urlretrieve(url, target)
    return target

db.py

# -*- coding: utf-8 -*-
import pymysql


class db:
    """Thin wrapper around a PyMySQL connection that returns dict rows.

    When the connection cannot be established the instance stays usable:
    every method then returns the stored error message instead of a result.
    """

    dbconnect = ''  # connection object; '' means "not connected"
    error = ''  # message of the most recent database error

    def __init__(self, host, username, password, db='', port=3306):
        """Open the connection; on failure store the message in ``error``."""
        try:
            # Keyword arguments: PyMySQL 1.0+ no longer accepts positional
            # connect() arguments. ``port`` is now actually forwarded.
            self.dbconnect = pymysql.connect(
                host=host,
                user=username,
                password=password,
                database=db,
                port=port,
                cursorclass=pymysql.cursors.DictCursor,
                charset='utf8',
            )
        except pymysql.Error as e:
            self.error = str(e)

    def __del__(self):
        """Close the connection when the object is garbage-collected."""
        self.close()

    def execute(self, sql, params=None):
        """Run a data-modifying statement and commit it.

        :param sql: statement text, optionally with ``%s`` placeholders
        :param params: optional values for the placeholders (escaped by the
            driver); ``None`` sends ``sql`` unchanged
        :return: the cursor (e.g. for ``rowcount``), or the error message
            string when not connected
        """
        if self.dbconnect == '':
            return self.error
        cursor = self.dbconnect.cursor()
        try:
            cursor.execute(sql, params)
            self.dbconnect.commit()
        except pymysql.Error as e:
            # Roll back and keep the message so callers can inspect it,
            # instead of silently discarding the failure.
            self.dbconnect.rollback()
            self.error = str(e)
        return cursor

    def queryAll(self, sql, params=None):
        """Run a SELECT and return all rows (error string if not connected)."""
        if self.dbconnect == '':
            return self.error
        with self.dbconnect.cursor() as cursor:
            cursor.execute(sql, params)
            return cursor.fetchall()

    def queryRow(self, sql, params=None):
        """Run a SELECT and return the first row, or None if there is none.

        Returns the error message string when not connected.
        """
        if self.dbconnect == '':
            return self.error
        with self.dbconnect.cursor() as cursor:
            cursor.execute(sql, params)
            return cursor.fetchone()

    def queryScalar(self, sql, params=None):
        """Run a SELECT and return the first column of the first row.

        Returns '' when the query yields no row, and the error message
        string when not connected.
        """
        if self.dbconnect == '':
            return self.error
        row = self.queryRow(sql, params)
        if not row:
            return ''
        return next(iter(row.values()))

    def close(self):
        """Close the connection; safe to call more than once."""
        if self.dbconnect:
            self.dbconnect.close()
            # Reset so a later close()/__del__ does not close twice.
            self.dbconnect = ''


if __name__ == '__main__':
    # Manual smoke test against a development database.
    # NOTE(review): this runs real DELETE/UPDATE statements with hard-coded
    # credentials -- confirm the target before executing.
    conn = db('172.23.16.91', 'unipei', 'jiaparts', 'jpd')
    one = conn.queryRow('select * from jpd.jpd_user limit 1')
    rows = conn.queryAll('select * from jpd.jpd_organ limit 10')
    count = conn.queryScalar('select count(*) from jpd.jpd_organ')
    delSql = 'delete from pap.pap_evaluation_system_history limit 100'
    delRes = conn.execute(delSql)
    updata = 'update jpd.jpd_user set lastvisittime=1 where id=61'
    updateRes = conn.execute(updata)
    print(one)
    print(rows)
    print(count)
    print(delRes.rowcount)
    print(updateRes.rowcount)

SQL

-- Download-progress table: stores a thread id together with the start and
-- end page of a download batch.
CREATE TABLE `tieba` (
  `ID` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
  `TieID` bigint(11) DEFAULT NULL,
  `StartPage` int(11) DEFAULT NULL COMMENT '开始页面',
  `EndPage` int(11) DEFAULT NULL COMMENT '结束页面',
  `CreateTime` int(13) DEFAULT NULL COMMENT '创建时间',
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=31 DEFAULT CHARSET=utf8;



0 0
原创粉丝点击