python3.5 beautiful4.4 扣扣国内新闻 爬虫
来源:互联网 发布:python 命令行输入 编辑:程序博客网 时间:2024/04/28 13:34
- #!/usr/bin/python3
- # -*- coding: UTF-8 -*-
- '''
- Created on 2016年11月18日
- @author: baoyou curiousby@163.com
- '''
- 下载
- #http://ssdfz001.iteye.com/blog/2228685
- import urllib.request
- import urllib.parse
- import os, sys
- import codecs
- import bs4
- from bs4 import BeautifulSoup
- import re
- import urllib.request, urllib.parse, http.cookiejar
# Root URL of the site; the listing page is http://news.qq.com/c/816guonei_1.htm
base_url = 'http://news.qq.com/'
url = 'http://news.qq.com/c/816guonei_1.htm'

# Local storage root and the per-article sub-directory names.
save_path = 'C:/Users/cmcc-B100036/Desktop/'
save_img = 'img'
save_txt = 'text'

# Extraction regex for one listing entry. Capture groups:
#   1 = article href, 2 = image src, 3 = title text, 4 = summary text.
# The text groups use [^<]* ("everything up to the next tag"). The previous
# form, [^</a>]*, was a character class that rejected any title containing
# the letters "a" or "/" rather than stopping at the closing tag.
reg = (
    r'<a target="_blank" class="pic" href="([^"]*)">'
    r'<img class="picto" src="([^"]*)"></a>'
    r'<em class="f14 l24"><a target="_blank" class="linkto" href="[^"]*">'
    r'([^<]*)</a></em>'
    r'<p class="l22">([^<]*)</p>'
)

# Browser-like request headers.
# NOTE(review): nothing in the visible code passes these to urlopen. If they
# are ever sent, the Accept-Encoding value invites a compressed response,
# which urllib does not decompress on its own - confirm before using.
heads = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Host': 'news.qq.com',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}
- #获取网页信息
def getHtml(url, encoding='gbk'):
    """Fetch ``url`` and return the response body decoded as text.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        encoding: Codec used to decode the raw body. Defaults to ``'gbk'``,
            the codec this function previously hard-coded.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid in ``encoding``.
    """
    # The context manager closes the response even when read() raises.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return raw.decode(encoding)
- #获取新闻列表
def getList(url):
    """Return the news entries found on the listing page at ``url``.

    The page is fetched with ``getHtml`` and parsed with BeautifulSoup's
    ``html.parser``; every ``<div>`` whose class is ``Q-tpList`` is one entry.

    Args:
        url: Address of the listing page.

    Returns:
        A plain ``list`` of the matching elements, in document order.
    """
    markup = getHtml(url)
    parsed = BeautifulSoup(markup, 'html.parser')
    # Copy the matches into a plain list instead of returning the result set.
    return list(parsed.find_all('div', class_='Q-tpList'))
- #获取文本信息到本地
def loadText(contents):
    """Save every news entry in ``contents`` by handing each one to ``load``.

    Args:
        contents: Iterable of entry elements, as returned by ``getList``.
    """
    for entry in contents:
        load(entry)
- #下载
def load(content):
    """Download one news entry's image and write its text record to disk.

    Files are placed under ``save_path`` in a directory derived from the
    article URL's path, with the image in the ``save_img`` sub-directory
    (as ``img.png``) and the record in the ``save_txt`` sub-directory
    (as ``text.txt``). The record is a single tab-separated line:
    article URL, image URL, title, summary.

    Args:
        content: One listing entry element. It is expected to contain an
            ``<a class="pic">`` wrapping an ``<img>``, an
            ``<a class="linkto">`` title link and a ``<p>`` summary.

    Returns:
        None. Entries without the picture link or its image are skipped.

    Raises:
        urllib.error.URLError: If the image cannot be downloaded.
        OSError: If the directories or files cannot be created.
    """
    pic_link = content.find('a', class_='pic')
    if pic_link is None or pic_link.img is None:
        # Not a picture-style entry: there is nothing to download.
        return

    urlsuffix = pic_link['href']
    # urljoin copes with relative, root-relative and absolute hrefs; plain
    # concatenation produced a doubled host or a "//" path for the latter two.
    detailurl = urllib.parse.urljoin(base_url, urlsuffix)
    detailimg = urllib.parse.urljoin(base_url, pic_link.img['src'])

    title_link = content.find('a', class_='linkto')
    detailtitle = title_link.get_text() if title_link is not None else ''
    summary = content.find('p')
    detailcontent = summary.get_text() if summary is not None else ''

    # Build the article directory from the URL *path* only, so an absolute
    # href can never inject "http://host" into a filesystem path.
    article_path = urllib.parse.urlparse(detailurl).path.lstrip('/')
    stem, ext = os.path.splitext(article_path)
    if ext.lower() in ('.htm', '.html'):
        # Drop only a real page extension; str.replace(".htm", "") turned
        # "x.html" into "xl" and also hit ".htm" in the middle of a path.
        article_path = stem

    article_dir = os.path.join(save_path, article_path)
    newstext = os.path.join(article_dir, save_txt)
    newsimg = os.path.join(article_dir, save_img)
    # exist_ok avoids the check-then-create race; parents are created too.
    os.makedirs(newstext, 0o755, exist_ok=True)
    os.makedirs(newsimg, 0o755, exist_ok=True)

    urllib.request.urlretrieve(detailimg, os.path.join(newsimg, 'img.png'))
    with codecs.open(os.path.join(newstext, 'text.txt'), 'w+', 'utf-8') as fp:
        fp.write('\t'.join((detailurl, detailimg, detailtitle, detailcontent)))
if __name__ == "__main__":
    # Script entry point: crawl the fixed domestic-news listing page and save
    # every entry found on it. (An interactive URL prompt was once considered
    # here but is not used.)
    print('---------------------start--------------------------------------')
    url = 'http://news.qq.com/c/816guonei_1.htm'
    contents = getList(url)
    loadText(contents)
    print('---------------------end---------------------------------------')
0 0
- python3.5 beautiful4.4 扣扣国内新闻 爬虫
- python3.4-小爬虫
- Python3.4爬虫编程
- python3.0 网络爬虫 5
- python3.5以上 爬虫 (模块化)
- python3.4 百度贴吧小爬虫
- Python3.4 简单抓取爬虫
- python3.4-小爬虫2
- python3.0 网络爬虫 4
- python3 爬虫
- python3爬虫
- python3 爬虫
- Python3 爬虫
- python3.5 爬虫 基于广度优先算法
- Python3.5爬虫urllib系列之三
- 小白Python3爬虫3-5
- python3.4爬虫批量下载音乐
- Python3.4网页爬虫,提取图片
- 【iOS】Swift3 报错:Value of type 'AppDelegate' has no member 'managedObjectContext',UIApplication has no
- 兼容ie8的rgba()方法
- DataNode启动优化改进:磁盘检测并行化
- 【转载】Android酷炫实用的开源框架(UI框架)
- 手机签到页面
- python3.5 beautiful4.4 扣扣国内新闻 爬虫
- Caused by: org.hibernate.MappingException: Repeated column in mapping for entity: com.yyf.entity.Cus
- Java集合
- sdk+ndk+eclipse搭建android开发环境
- IOS绘制虚线的方法,可以给cell设置虚线分割线
- [LeetCode]Flatten Binary Tree to Linked List
- BLE-CC2640之添加自定义服务
- iOS 自定义多选相册
- Remote development of CUDA Applications CUDA应用远程编译及运行