python_爬取博客内容

来源：互联网 | 发布：数控切割机编程实例 | 编辑：程序博客网 | 时间：2024/05/16 02:13

# -*- coding: utf-8 -*-
"""Download one Sina blog article and save its paragraph text to a file.

The article body is located with plain substring searches instead of an HTML
parser.  All processing is done on the raw bytes returned by the server, so
the paragraph offsets used below are byte offsets and no page encoding has to
be assumed.
"""
__author__ = 'YangShengjie'

import sys

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen

# Article to download.  It was chosen because its page structure is unusual:
# a video is embedded in the middle of the text.
url = "http://blog.sina.com.cn/s/blog_4701280b0100h3c8.html"

# Markers that delimit the article region and each paragraph inside it.
# The searches are case-sensitive, so the mixed case is intentional.
AREA_OPEN = b'<div id="sina_keyword_ad_area2'
AREA_CLOSE = b'</div>'
PARA_OPEN = b'<p STYLE'
PARA_CLOSE = b'</P>'


def fetch_page(page_url):
    """Return the raw bytes of the page at *page_url*."""
    response = urlopen(page_url)
    try:
        return response.read()
    finally:
        # The response holds a socket; release it even if read() fails.
        response.close()


def extract_paragraphs(page, first_offset=29, next_offset=45):
    """Return the article paragraphs found in *page* as a list of bytes.

    The article region runs from ``AREA_OPEN`` to the first ``AREA_CLOSE``
    after it.  Inside that region every ``PARA_OPEN`` ... ``PARA_CLOSE`` pair
    yields one paragraph.

    Args:
        page: Raw page content (bytes).
        first_offset: Bytes to skip from the start of the first paragraph's
            ``PARA_OPEN`` marker to reach its text.  The first paragraph is
            laid out differently from the others, hence the separate value.
        next_offset: Same, for every paragraph after the first.

    Returns:
        The paragraph bodies in page order; an empty list when the article
        region or its first paragraph cannot be found.
    """
    area_start = page.find(AREA_OPEN)
    if area_start == -1:
        return []
    area_end = page.find(AREA_CLOSE, area_start)
    if area_end == -1:
        # No closing tag: fall back to everything after the opening marker.
        area = page[area_start:]
    else:
        area = page[area_start:area_end]

    paragraphs = []
    offset = first_offset
    start = area.find(PARA_OPEN)
    while start != -1:
        end = area.find(PARA_CLOSE, start)
        if end == -1:
            # An unterminated paragraph ends the scan.
            break
        paragraphs.append(area[start + offset:end])
        offset = next_offset
        # Search positions are relative to the previous closing tag so the
        # same paragraph is never matched twice.
        start = area.find(PARA_OPEN, end)
    return paragraphs


def build_content(paragraphs):
    """Join *paragraphs* into the saved text, one paragraph per line.

    The text starts with a single space followed by a newline before the
    first paragraph; with no paragraphs the result is just that space.
    """
    return b"\n".join([b" "] + list(paragraphs))


def filename_for(page_url):
    """Return the last path component of *page_url*, used as the file name."""
    return page_url.rsplit("/", 1)[-1]


def main():
    """Download the article at ``url``, print its text and save it locally."""
    content = build_content(extract_paragraphs(fetch_page(url)))

    # Emit the bytes unchanged: Python 3 exposes the byte stream as
    # sys.stdout.buffer, Python 2's sys.stdout accepts bytes directly.
    stdout_bytes = getattr(sys.stdout, "buffer", sys.stdout)
    stdout_bytes.write(content + b"\n")
    stdout_bytes.flush()

    # Save the article; the context manager guarantees the file is closed.
    with open(filename_for(url), "wb") as out_file:
        out_file.write(content)


if __name__ == "__main__":
    main()

0 0