Scraping a Douban Group in Python by Disguising Requests as a Browser


A Python crawler that downloads the images posted in a Douban group. It disguises its requests as a normal browser by sending a randomly chosen User-Agent header (plus Accept and Referer), keeps cookies like a browser would, then walks the discussion list pages and pulls the image URLs out of each topic with regular expressions.
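Before the full script, here is a minimal, self-contained sketch of just the disguise trick: pick a random User-Agent and send it along with the request. The URL below is a placeholder and the User-Agent list is shortened for brevity; this is an illustration of the idea, not part of the original program.

# Minimal sketch of the browser-disguise idea; placeholder URL,
# shortened User-Agent list.
import random
import urllib.request

user_agents = [
    'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Opera/9.25 (Windows NT 5.1; U; en)',
]

url = 'http://www.douban.com/group/blabla/discussion?start=0'  # placeholder
req = urllib.request.Request(url, headers={
    'User-Agent': random.choice(user_agents),  # pretend to be a browser
    'Accept': '*/*',
    'Referer': url,
})
html = urllib.request.urlopen(req, timeout=20).read().decode('utf-8')
print(html[:200])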

# -*- coding: utf-8 -*-
# -----------------------------------------------
#   Program:  Douban group image crawler
#   Version:  1.0
#   Language: Python 3.4
#   Author:   gdp12315
#   Usage:    enter the group discussion URL, a start page and an end page
#   Purpose:  download the images posted in the group's topics
#   Note:     the save path below is the author's local path;
#             change it to suit your own machine
# -----------------------------------------------
import random
import re
import socket
import time
import http.cookiejar
import urllib.request

# Error messages (kept from the original; the script does not use them yet)
ERROR = {
    '0': 'Can not open the url, check your net',
    '1': 'Create download dir error',
    '2': 'The image links are empty',
    '3': 'Download failed',
    '4': 'Build soup error, the html is empty',
    '5': 'Can not save the image to your disk',
}

class BrowserBase(object):

    def __init__(self):
        socket.setdefaulttimeout(20)

    def speak(self, name, content):
        print('[%s] %s' % (name, content))

    def openurl(self, url):
        """Open a page while pretending to be a regular browser."""
        # Cookie support, so the session behaves like a browser's
        cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
        self.opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
        # Install globally so urllib.request.urlretrieve uses the same opener
        urllib.request.install_opener(self.opener)
        user_agents = [
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
            'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
            'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7',
            'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
        ]
        # A random User-Agent per request makes the traffic look less like a bot
        agent = random.choice(user_agents)
        self.opener.addheaders = [('User-agent', agent), ('Accept', '*/*'), ('Referer', url)]
        try:
            res = self.opener.open(url)
        except Exception as e:
            self.speak(str(e), url)
            raise
        else:
            return res

if __name__ == '__main__':
    splider = BrowserBase()
    # ------------ begin ----------------------------
    # Example input:
    # http://www.douban.com/group/Xsz/discussion?start=
    # 1
    # 2
    url = str(input('Enter the Douban group URL, without the number after start=:\n'))
    # url = 'http://www.douban.com/group/blabla/discussion?start='
    page_bgn = int(input('Enter the start page number:\n'))
    page_end = int(input('Enter the end page number:\n'))
    # Douban lists 25 topics per page, so page n starts at offset (n-1)*25
    num_end = (page_end - 1) * 25
    num_now = (page_bgn - 1) * 25
    while num_now <= num_end:
        # Fetch the topic list page
        html_topic_list = splider.openurl(url + str(num_now)).read().decode('utf-8')
        # Collect the topic URLs
        re_topic_list = re.compile(r'http://www\.douban\.com/group/topic/\d+')
        topic_list = re_topic_list.findall(html_topic_list)
        # Visit each topic and download the images posted in it
        for topic_url in topic_list:
            print('topic_url ' + topic_url)
            html_topic = splider.openurl(topic_url).read().decode('utf-8')
            # Inside the topic, collect the image URLs (there may be several)
            re_img_list = re.compile(r'http://img\d\.douban\.com/view/group_topic/large/public/.+\.jpg')
            img_list = re_img_list.findall(html_topic)
            # Save every image; change the path to one that exists on your machine
            for img_url in img_list:
                print('img_url: ' + img_url)
                img_name = re.findall(r'p\d{7}', img_url)[0]  # findall returns a list
                urllib.request.urlretrieve(img_url, r'D:\Python\pics\%s.jpg' % img_name)
                time.sleep(2)  # be polite: pause between downloads
        num_now = num_now + 25
    else:
        # a while/else "else" runs once the loop condition becomes false
        print('Scraping finished!')
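The two regular expressions do all of the extraction, so it is worth sanity-checking them in isolation. The fragment below is made-up HTML, not real Douban markup; it shows what findall returns and why the script indexes into the result when building the file name:

# Sanity-check the two extraction regexes on a made-up HTML fragment.
import re

sample = ('<a href="http://www.douban.com/group/topic/12345678/">a topic</a>'
          '<img src="http://img3.douban.com/view/group_topic/large/public/p1234567.jpg">')

topics = re.findall(r'http://www\.douban\.com/group/topic/\d+', sample)
print(topics)  # ['http://www.douban.com/group/topic/12345678']

names = re.findall(r'p\d{7}',
                   'http://img3.douban.com/view/group_topic/large/public/p1234567.jpg')
print(names)   # ['p1234567'] -- a list, hence the [0] when naming the file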

