The Little Crawler Goes Berserk ----- day01


Environment for this little crawler:

1. Python 2.7

2. A working network connection.

3. The required libraries installed. My experience with pybloomfilter: do not run pip install pybloomfilter. It installs without any error, but the result simply will not run. Use pip install pybloomfiltermmap instead (note the extra mmap, with two m's). A quick sanity check for the install is shown right after this list.

4. I ran the crawler on Ubuntu 16.04 (good thing QQ was there, or I would not even have known how to take a screenshot on Ubuntu). The screenshot of the finished run is omitted here; the crawler will happily keep going for as long as you let it, though I interrupted it after it ran for a while.




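Here is the sanity check mentioned in point 3. It is a minimal sketch, assuming the same two-argument BloomFilter(capacity, error_rate) constructor that the crawler below relies on; if the import succeeds and the two membership tests print as expected, the install is good:

#coding=utf-8
from pybloomfilter import BloomFilter

# A small filter: 10000 entries, 1% false-positive rate
bf = BloomFilter(10000, 0.01)
bf.add('http://www.mafengwo.cn/mdd/')
print 'http://www.mafengwo.cn/mdd/' in bf   # True
print 'http://example.com/' in bf           # False (with ~99% probability)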
Enough small talk; here is the code:
#coding=utf-8
import urllib2
import httplib
import re
import os
from pybloomfilter import BloomFilter

request_headers = {
    'host': "www.mafengwo.cn",
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
}

city_home_pages = []
city_ids = []
dirname = 'mafengwo_notes/'

# Create the Bloom filter: capacity 16M entries, 1% false-positive rate
download_bf = BloomFilter(1024 * 1024 * 16, 0.01)

# Make sure the folder used to store downloaded pages exists
if not os.path.exists(dirname):
    os.makedirs(dirname)


def download_city_notes(id):
    for i in range(1, 999):
        # URL of page i of the travel-note listing for this city
        url = 'http://www.mafengwo.cn/yj/%s/1-0-%d.html' % (id, i)
        if url in download_bf:
            continue
        print 'open url %s' % (url)
        download_bf.add(url)
        req = urllib2.Request(url, headers=request_headers)
        response = urllib2.urlopen(req)
        htmlcontent = response.read()
        city_notes = re.findall(r'href="/i/\d{7}.html', htmlcontent)
        # A listing page with zero notes means we have walked past the last
        # 1-0-xxx.html page, so we are done with this city
        if len(city_notes) == 0:
            return
        for city_note in city_notes:
            try:
                # city_note looks like 'href="/i/1234567.html'; [6:] drops 'href="'
                city_url = 'http://www.mafengwo.cn%s' % (city_note[6:])
                if city_url in download_bf:
                    continue
                print 'download %s' % (city_url)
                req = urllib2.Request(city_url, headers=request_headers)
                response = urllib2.urlopen(req)
                html = response.read()
                # Derive a flat filename from the URL: strip 'http://', replace '/'
                filename = city_url[7:].replace('/', '_')
                fo = open("%s%s" % (dirname, filename), 'wb+')
                fo.write(html)
                fo.close()
                download_bf.add(city_url)
            except Exception, Arguments:
                print Arguments
                continue


try:
    # Download the destinations front page
    req = urllib2.Request('http://www.mafengwo.cn/mdd/', headers=request_headers)
    response = urllib2.urlopen(req)
    htmlcontent = response.read()
    # Use a regex to pick out every city home page, e.g.
    # '/travel-scenic-spot/mafengwo/10065.html'
    city_home_pages = re.findall(r'/travel-scenic-spot/mafengwo/\d{5}.html', htmlcontent)
    print len(city_home_pages)
    # Walk the cities and download every travel note under each one;
    # city[29:34] is the five-digit city id inside the path
    for city in city_home_pages:
        city_ids.append(city[29:34])
        download_city_notes(city[29:34])
except urllib2.HTTPError, Arguments:
    print Arguments
except httplib.BadStatusLine:
    print 'BadStatusLine'
except Exception, Arguments:
    print Arguments
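The magic offsets in the code ([6:], [7:], [29:34]) are easier to see with a small worked example. This is a standalone sketch; the HTML fragment is made up to mirror the structure the regexes assume:

#coding=utf-8
import re

# Hypothetical fragment of a listing page, mirroring the assumed markup
html = '<a href="/i/1234567.html">note A</a> <a href="/i/7654321.html">note B</a>'
matches = re.findall(r'href="/i/\d{7}.html', html)
print matches            # ['href="/i/1234567.html', 'href="/i/7654321.html']
print matches[0][6:]     # /i/1234567.html  ('href="' is 6 characters)

city = '/travel-scenic-spot/mafengwo/10065.html'
print city[29:34]        # 10065  (the path prefix is 29 characters long)

city_url = 'http://www.mafengwo.cn/i/1234567.html'
print city_url[7:].replace('/', '_')   # www.mafengwo.cn_i_1234567.html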


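One possible improvement, since I killed the run myself: back the Bloom filter with a file, so a restart does not re-download everything already fetched. This is only a sketch based on the mmap-backed API that pybloomfiltermmap exposes in the versions I have seen; the filename argument and BloomFilter.open are assumptions to check against your installed version:

#coding=utf-8
import os
from pybloomfilter import BloomFilter

bf_file = 'download.bloom'   # hypothetical path for the persisted filter
if os.path.exists(bf_file):
    # Reopen the filter left behind by the previous run
    download_bf = BloomFilter.open(bf_file)
else:
    # Create a new file-backed filter: 16M entries, 1% false positives
    download_bf = BloomFilter(1024 * 1024 * 16, 0.01, bf_file)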