Python网络爬虫对知乎首页进行爬取

来源：互联网 发布：时间顺序数据挖掘 编辑：程序博客网 时间：2024/05/16 01:24
# -*- coding: UTF-8 -*-import urllib, urllib2, cookielib, re, time, osimport requestsprint 'zhihu_QA'print 'Please input your email:'MyEmail = raw_input()print 'Please input your password:'MyPassWord = raw_input()print '---------------------------------------------'#基本信息Url = 'http://www.zhihu.com/login'#User_Agent每个电脑都不一样User_Agent = **********************************************'#MyReferer = 'http://www.zhihu.com/'MyValues = {'email' : MyEmail, 'password' : MyPassWord}MyHeaders = {'User-Agent' : User_Agent, }MyRequests = requests.session()MyCont = MyRequests.post(Url, data = MyValues, headers = MyHeaders)MyCont2 = MyCont.text.encode('UTF-8')print MyCont2def GetNowTime():return time.strftime("%Y-%m-%d_%H-%M-%S",time.localtime(time.time()))WordPathName = r'D:\\MyZhiHu' + GetNowTime() + '.docx'String1 = 'h2'String2 = 'div'from bs4 import BeautifulSoupMySoup1 = BeautifulSoup(MyCont2)def has_need(tag):return tag.has_attr('class') and tag.has_attr('feed-item-a') and tag.has_attr('data-type')def has_need_photo(tag):return tag.has_attr('class') and tag.has_attr('src')QuestionNum = 1from docx import Documentfrom docx.shared import InchesMyPhotoPath = 'D:\\MyZhiHuTempPhoto.jpg'if os.path.exists(WordPathName):print '文件存在'MyDocument = Document(WordPathName)else:print '文件不存在'MyDocument = Document()MyDocument.add_heading(u'我的知乎', 0)for MyTag in MySoup1.find_all(has_need):print '****************************************************************'#找问题标题MyQuestion = MyTag.find(name = 'a', class_ ='question_link')#找问题详情MyMoreQuestion = MyTag.find(name = 'div', class_ ='question-description zm-editable-content')#找问题回答MyAnswer = MyTag.find(name = 'textarea', class_ = 'content hidden')#找回答者和回答者的个人说明MyFindPerson = MyTag.find(name = 'h3', class_ = 'zm-item-answer-author-wrap')#找赞同数MyFindLabel = MyTag.find(name = 'span', class_ = 'count')MyDocument.add_page_break()QN = '这是第' + str(QuestionNum) + '个问题'print QNQuestionNum += 1MyDocument.add_paragraph(QN.decode('UTF-8'))if MyQuestion is not 
None:print ('问题：')print ''MyDocument.add_heading(u'问题：', level=1)print ''MyTempSoup = BeautifulSoup(str(MyQuestion))MyTST = MyTempSoup.textprint MyTSTMyDocument.add_paragraph(MyTST)else:print '该问题不存在！'MyDocument.add_heading(u'该问题不存在！', level=1)if MyMoreQuestion is not None:print '问题详情：'MyDocument.add_heading(u'问题详情：', level=1)MyTempSoup = BeautifulSoup(str(MyMoreQuestion))MyTST = MyTempSoup.textprint MyTSTMyDocument.add_paragraph(MyTST)else:print '该问题没有详情！'MyDocument.add_heading(u'该问题没有详情！', level=1)print ''if MyFindPerson is not None:print '回答作者和个人说明：'MyDocument.add_heading(u'回答作者和个人说明：', level=1)MyTempSoup = BeautifulSoup(str(MyFindPerson))#print MyTempSoupMyTST = MyTempSoup.textprint MyTSTMyDocument.add_paragraph(MyTST)else:print '该问题没有作者！'MyDocument.add_heading(u'该问题没有作者！', level=1)if MyFindLabel is not None:print '回答赞同数：'MyDocument.add_heading(u'回答赞同数：', level=1)MyTempSoup = BeautifulSoup(str(MyFindLabel))MyTST = MyTempSoup.textprint MyTSTMyDocument.add_paragraph(MyTST)else:print '该问题没有人赞同！'MyDocument.add_heading(u'该问题没有人赞同！', level=1)if MyAnswer is not None:print '问题回答：'MyDocument.add_heading(u'问题回答：', level=1)MyTempSoup = BeautifulSoup(str(MyAnswer))MyTST = MyTempSoup.textMyTempAnswer = BeautifulSoup(str(MyTST.encode('UTF-8')))print '以下是回答正文（含图片链接）'print MyTempAnswerprint '以上是回答正文（含图片链接）'MyDocument.add_paragraph(u'以下是回答正文（含图片链接）')MyDocument.add_paragraph(MyTST)MyDocument.add_paragraph(u'以上是回答正文（含图片链接）')PhotoNum = 1MyStartS = 'src="'MyEndS = 'jpg'for MyPhotos in MyTempAnswer.find_all(has_need_photo):if str(MyPhotos).find(MyStartS) >= 0:MyUrlS = str(MyPhotos).find(MyStartS)MyUrlE = str(MyPhotos).find(MyEndS, MyUrlS + 1)MyUrlP = str(MyPhotos)[MyUrlS + len(MyStartS) : MyUrlE + len(MyEndS)]#print MyUrlP#下载图片，用同一个名字命名，因为如果重名会覆盖，所以最后删除就好了urllib.urlretrieve(MyUrlP, MyPhotoPath)#这是测试校验用的图片urllib.urlretrieve(MyUrlP, 'd:\\11\\'+ str(QuestionNum) + '-' + str(PhotoNum) + '.jpg')PhotoNum += 1try:if os.path.exists(MyPhotoPath):MyDocument.add_picture(MyPhotoPath, width = 
Inches(3))else:print '没有下载的图片（错误）'MyDocument.add_paragraph(u'没有下载的图片（错误）')except Exception, e:print '图片写入失败'print eErrorPhoto = '第' + str(hahatest) + '个问题的第' + str(a) + '张图片写入word失败'print ErrorPhotoMyDocument.add_paragraph(ErrorPhoto.decode('UTF-8'))else:print '该问题没有回答！'MyDocument.add_heading(u'该问题没有回答！', level=1)print '================================================================'MyDocument.add_heading(u'=========================问题分割线=========================', level=1)MyDocument.save(WordPathName)if os.path.exists(MyPhotoPath):#存在图片，删除图片os.remove(MyPhotoPath)print '最后一张缓存图片已删除'else:print '最后不存在缓存图片（错误）'
0 0