【python】爬虫2——下载亦舒博客首页所有文章

来源:互联网 发布:创奇老照片修复软件 编辑:程序博客网 时间:2024/05/23 01:16
#! /usr/bin/env python
# coding=utf-8
"""Download the articles linked from the first list page of a Sina blog.

The script fetches the article-list page, pulls out the article links with
plain substring searches, and saves each linked page under ``yishu/``.
It runs on both Python 2 and Python 3.
"""
import os
import time
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

ARTICLE_LIST_URL = 'http://blog.sina.com.cn/s/articlelist_1227636382_0_1.html'
OUTPUT_DIR = 'yishu'
MAX_ARTICLES = 40


def extract_article_urls(page, limit=MAX_ARTICLES):
    """Return the article URLs found in *page*, in document order.

    A link is recognised as the text between the quote following ``href=``
    and the next ``.html`` (inclusive), where that ``href=`` is the first one
    after an ``<a title=`` marker.  Scanning stops at the first marker that
    cannot be completed, or once *limit* URLs have been collected.
    """
    urls = []
    pos = 0
    while len(urls) < limit:
        title = page.find('<a title=', pos)
        if title == -1:
            break
        href = page.find('href=', title)
        if href == -1:
            break
        end = page.find('.html', href)
        if end == -1:
            break
        end += len('.html')
        # +1 skips the single quote character that opens the attribute value.
        urls.append(page[href + len('href=') + 1:end])
        pos = end
    return urls


def article_filename(url):
    """Return the last path component of *url*, used as the local file name."""
    return url.rsplit('/', 1)[-1]


def fetch(url):
    """Return the raw body of *url*, closing the connection afterwards."""
    with closing(urlopen(url)) as response:
        return response.read()


def download_articles(urls, out_dir=OUTPUT_DIR, delay=1):
    """Save every page in *urls* into *out_dir*, pausing *delay* seconds each.

    *out_dir* is created when it does not exist.  Pages are written as the
    raw bytes received, so no re-encoding takes place.
    """
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for url in urls:
        content = fetch(url)
        filename = article_filename(url)
        print(filename)
        with open(os.path.join(out_dir, filename), 'wb') as out:
            out.write(content)
        print('downloading ' + url)
        # Be polite to the server: one request per *delay* seconds.
        time.sleep(delay)


def main():
    """Fetch the list page, report the links found, then download them."""
    # Only the ASCII URLs are needed from the page, so undecodable bytes are
    # replaced rather than treated as errors (page encoding is assumed to be
    # UTF-8 compatible for the markup -- TODO confirm).
    page = fetch(ARTICLE_LIST_URL).decode('utf-8', 'replace')
    urls = extract_article_urls(page)
    for url in urls:
        print(url)
    print('find end')
    download_articles(urls)
    print('download article finished')


if __name__ == '__main__':
    main()






0 0