python爬虫——获取新浪新闻前两页新闻信息

来源:互联网 发布:软装搭配软件 编辑:程序博客网 时间:2024/06/05 16:03
"""Crawl the first two pages of Sina's domestic-news list.

For each linked article, collect title, source, publish time, body text,
editor and comment count.
"""
import json
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

# Comment-count API endpoint; {} is replaced by the news id parsed from an
# article URL.  Built from adjacent string literals so that no stray
# whitespace (as with a backslash-continued literal) leaks into the URL.
commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1&'
              'format=js&channel=sh&newsid=comos-{}&'
              'group=&compress=0&ie=utf-8&oe=utf-8&page=1&'
              'page_size=20')


def _remove_prefix(text, prefix):
    """Return *text* with an exact leading *prefix* removed, if present.

    str.lstrip(prefix) would treat *prefix* as a character SET and could
    also eat legitimate leading characters of the payload.
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def getCommentCount(newsurl):
    """Return the total number of comments for the article at *newsurl*.

    The news id is extracted from the URL (doc-i<ID>.shtml) and used to
    query Sina's comment API, which answers with a 'var data={...}' blob.
    """
    m = re.search(r'doc-i(.+).shtml', newsurl)
    newsid = m.group(1)
    comments = requests.get(commentURL.format(newsid))
    # Unwrap the JavaScript assignment around the JSON payload.
    payload = _remove_prefix(comments.text.strip(), 'var data=')
    jd = json.loads(payload)
    return jd['result']['count']['total']


def getNewsDetail(newsurl):
    """Fetch one article page and return a dict with title, newssource,
    dt (publish datetime), article body, editor and comment count.
    """
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text
    timesource = soup.select('.time-source')[0].contents[0].strip()
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    # The last <p> is the editor line, so it is excluded from the body.
    result['article'] = ' '.join(
        p.text.strip() for p in soup.select('#artibody p')[:-1])
    # Remove the literal '责任编辑:' label; the original lstrip('责任编辑:')
    # treated it as a character set and could over-strip.
    editor = soup.select('.show_author')[0].text
    result['editor'] = _remove_prefix(editor, '责任编辑:')
    result['comments'] = getCommentCount(newsurl)
    return result


def parserlistlinks(url):
    """Fetch one JSONP page of the news list and return a list of detail
    dicts (one per linked article).
    """
    res = requests.get(url)
    # Response is JSONP: 'newsloadercallback({...});' — unwrap the callback.
    body = _remove_prefix(res.text.strip(), 'newsloadercallback(')
    jd = json.loads(body.rstrip(');'))
    # NOTE: the original mangled source had 'return' separated from
    # 'newsdetails', which would return None; the list is returned here.
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]


# List-API URL template; {} is the 1-based page number.
url = ('http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&'
       'cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&'
       'show_all=1&show_num=22&tag=1&format=json&page={}&'
       'callback=newsloadercallback&_=1512876098022')


def main():
    """Crawl list pages 1-2, accumulating and printing the results."""
    news_total = []
    for page in range(1, 3):
        newsurl = url.format(page)
        news_total.extend(parserlistlinks(newsurl))
        print(news_total)
    # Optional Excel export, as in the original commented-out tail:
    # import pandas
    # pandas.DataFrame(news_total).to_excel('news.xlsx')


if __name__ == '__main__':
    main()
原创粉丝点击