Java爬虫入门(二):单机单程序单线程-提供种子url用广度优先算法实现新闻资讯获取
来源:互联网 发布:王者荣耀聊天软件 编辑:程序博客网 时间:2024/06/06 15:50
在一的基础上,简单新增了广度url爬取算法。
缺点:单线程,url爬取算法,新闻内容爬取,都丢在同一个线程,效率很慢。 后续继续优化。
(自己有留意,没有爬取过疯狂(程序刚入门),所以没有ip跳板。)
待解决问题: 用多线程,实现业务分离(内容爬取算法,url爬取算法),提高抓取效率,优化抓取算法,以及待抓取队列数据结构的选用。
----------------main测试方法-------------
package com.kimt.newsdrawler;

import com.kimt.newsdrawler.crawler.IFengCrawler;
import com.kimt.newsdrawler.dto.News;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Entry point for the single-threaded breadth-first news crawler.
 *
 * <p>Seeds {@link IFengCrawler} with a start URL; the crawler performs the
 * breadth-first URL discovery and content extraction in one pass and returns
 * the list of articles it managed to parse.
 *
 * Created by man on 2017/11/21.
 */
public class UserMain {

    /** Logger constants should be static final (one per class, immutable). */
    private static final Logger logger = LoggerFactory.getLogger(UserMain.class);

    public static void main(String[] args) {
        // Shared crawl state: URLs waiting to be fetched, and URLs already fetched.
        // Both are handed to the crawler, which also passes them on to its URL catcher.
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        HashSet<String> catchedUrl = new HashSet<String>();

        // version_2.1: breadth-first traversal plus article extraction.
        List<News> list = new IFengCrawler(toCatcheUrl, catchedUrl)
                .parserForNews("http://news.ifeng.com/");

        // Parameterized logging instead of string concatenation
        // (the original built the message eagerly with '+').
        logger.info("一共爬取了 {} 条新闻", list.size());
    }
}
----------------提供种子url,广度url爬取算法-------------
package com.kimt.newsdrawler.urlcatcher;

import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Breadth-first URL collector for news.ifeng.com.
 *
 * <p>Fetches a page, extracts every {@code <a href>} that points back into the
 * news site, and enqueues the ones that have not been crawled yet. The queue
 * and the crawled-set are shared with the caller (see IFengCrawler), which is
 * why they are injected through the constructor.
 *
 * @author man
 * @Date create on 2017/11/22
 */
public class IFengUrlCatcher extends AbstractUrlCatcher {

    private static final Logger logger = LoggerFactory.getLogger(IFengUrlCatcher.class);

    /** Queue of URLs still waiting to be crawled (shared with the crawler). */
    private LinkedBlockingQueue<String> toCatcheUrl;
    /** Set of URLs that have already been crawled (shared with the crawler). */
    private HashSet<String> catchedUrl;

    /**
     * @param toCatcheUrl queue of URLs waiting to be crawled
     * @param catchedUrl  set of URLs already crawled
     */
    public IFengUrlCatcher(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
    }

    /**
     * Fetches {@code seedUrl} and enqueues every crawlable news URL found on it.
     * Network failures are logged and swallowed so one bad page cannot kill the crawl.
     */
    @Override
    public void urlCatch(String seedUrl) {
        CloseableHttpResponse httpResponse = null;
        try {
            httpResponse = HttpClientUtil.getHttpResponse(seedUrl);
            HttpEntity entity = httpResponse.getEntity();
            // Decode the entity into an HTML string.
            String html = EntityUtils.toString(entity, "utf-8");
            // Walk the page and enqueue every article URL found on it.
            traversalUrlForIFengNews(html);
        } catch (IOException e) {
            // Fix: log with the exception object instead of printStackTrace(),
            // so the stack trace goes through the logging framework.
            logger.error("IOException while fetching {}", seedUrl, e);
        } finally {
            // Fix: the response was never closed, leaking the underlying connection.
            if (httpResponse != null) {
                try {
                    httpResponse.close();
                } catch (IOException closeEx) {
                    logger.warn("Failed to close response for {}", seedUrl, closeEx);
                }
            }
        }
    }

    /**
     * Extracts every {@code <a>} tag from the page and enqueues links that
     * point into news.ifeng.com and have not been crawled or queued yet.
     * The matching rule is site-specific; other news sites need their own catcher.
     *
     * @param html raw HTML of the page just fetched
     */
    private void traversalUrlForIFengNews(String html) {
        String baseUrl = "news.ifeng.com";
        Document doc = Jsoup.parse(html);
        // All anchor tags on the page.
        Elements elements = doc.getElementsByTag("a");
        for (Element e : elements) {
            // Fix: trim before the de-dup checks, so "url " and "url" are the same key.
            String url = e.attr("href").trim();
            // Fix: also skip URLs already sitting in the queue; the original could
            // enqueue the same URL many times (contains() on the queue is O(n),
            // acceptable at this scale).
            if (url.contains(baseUrl) && !catchedUrl.contains(url) && !toCatcheUrl.contains(url)) {
                try {
                    toCatcheUrl.put(url);
                } catch (InterruptedException e1) {
                    // Fix: restore the interrupt flag instead of swallowing it, and
                    // log the exception itself (the original passed getMessage() as a
                    // placeholder argument to a format string with no placeholder).
                    Thread.currentThread().interrupt();
                    logger.error("Interrupted while enqueueing url {}", url, e1);
                    return;
                }
            }
        }
    }
}
----------------包装httpClient工具类,减少重复代码(后续待优化)-------------
package com.kimt.newsdrawler.httpclientutils;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Thin wrapper around Apache HttpClient that issues GET requests with a
 * fixed set of browser-like headers, removing duplicated setup code.
 *
 * @author man
 * @Date create on 2017/11/22
 */
public class HttpClientUtil {

    private static final Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);

    /**
     * Fix: one shared client for the whole process. The original created a new
     * CloseableHttpClient on every call and never closed it, leaking its
     * connection pool on each request. CloseableHttpClient is thread-safe.
     */
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    /**
     * Executes a GET request against {@code url} with standard headers.
     *
     * <p>Callers own the returned response and must close it when done.
     *
     * @param url absolute URL to fetch
     * @return the (still open) HTTP response
     * @throws IOException on connection or protocol failure
     */
    public static CloseableHttpResponse getHttpResponse(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        // Browser-like request headers (kept byte-identical to the original).
        httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
        httpGet.setHeader("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        // Execute the GET request on the shared client.
        return CLIENT.execute(httpGet);
    }
}
------------新闻爬取类-----------
package com.kimt.newsdrawler.crawler;

import com.kimt.newsdrawler.dto.News;
import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import com.kimt.newsdrawler.urlcatcher.IFengUrlCatcher;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Breadth-first crawler for news.ifeng.com article pages.
 *
 * <p>Starting from a seed URL it repeatedly: takes a URL off the shared queue,
 * lets {@link IFengUrlCatcher} enqueue the links found on that page, downloads
 * the page, and parses it into a {@link News} object using one of two
 * site-specific page templates.
 *
 * @author kimt
 * Created by man on 2017/11/23.
 */
public class IFengCrawler extends AbstractCrawler {

    private static final Logger logger = LoggerFactory.getLogger(IFengCrawler.class);

    /** Upper bound on pages fetched in one crawl (was a magic constant 200). */
    private static final int MAX_PAGES = 200;

    /** Queue of URLs still waiting to be crawled (shared with the catcher). */
    private LinkedBlockingQueue<String> toCatcheUrl;
    /** Set of URLs already crawled (shared with the catcher). */
    private HashSet<String> catchedUrl;
    /** Breadth-first URL discoverer sharing the same queue/set. */
    private IFengUrlCatcher urlCatcher;

    /**
     * @param toCatcheUrl queue of URLs waiting to be crawled
     * @param catchedUrl  set of URLs already crawled
     */
    public IFengCrawler(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
        this.urlCatcher = new IFengUrlCatcher(toCatcheUrl, catchedUrl);
    }

    /**
     * Crawls up to {@link #MAX_PAGES} pages breadth-first starting from
     * {@code seedUrl} and returns the articles that were parsed successfully.
     *
     * <p>Fix: the original wrapped the entire 200-iteration loop in one
     * try/catch, so a single IOException or ParseException aborted the whole
     * crawl. Errors are now handled per page.
     *
     * @param seedUrl the start page whose links seed the queue
     * @return list of parsed articles (possibly empty, never null)
     */
    @Override
    public List<News> parserForNews(String seedUrl) {
        // Seed the queue with the links found on the start page.
        urlCatcher.urlCatch(seedUrl);
        List<News> list = new ArrayList<News>();
        for (int i = 0; i < MAX_PAGES; i++) {
            String url;
            try {
                // NOTE(review): take() blocks forever if the queue drains before
                // MAX_PAGES is reached — consider poll(timeout) in a later version.
                url = toCatcheUrl.take();
            } catch (InterruptedException e) {
                // Fix: restore the interrupt flag and stop instead of swallowing it.
                Thread.currentThread().interrupt();
                logger.error("Interrupted while waiting for a url", e);
                break;
            }
            // Breadth-first step: enqueue this page's outgoing links.
            urlCatcher.urlCatch(url);
            CloseableHttpResponse response = null;
            try {
                response = HttpClientUtil.getHttpResponse(url);
                int stateCode = response.getStatusLine().getStatusCode();
                if (stateCode == AbstractCrawler.HTTP_RESPONSE_CODE_SUCCESS) {
                    HttpEntity entity = response.getEntity();
                    String html = EntityUtils.toString(entity, "utf-8");
                    News news = parsePage(html);
                    if (news != null) {
                        list.add(news);
                    }
                    // Mark this url as done so the catcher never re-enqueues it.
                    catchedUrl.add(url);
                    // Drain the entity so the connection can be reused.
                    EntityUtils.consume(entity);
                }
            } catch (IOException e) {
                // Fix: parameterized logging with the exception object; the original
                // concatenated getMessage() and lost the stack trace.
                logger.error("IOException while crawling {}", url, e);
            } catch (ParseException e) {
                logger.error("ParseException while crawling {}", url, e);
            } finally {
                // Fix: the response was never closed, leaking the connection.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException closeEx) {
                        logger.warn("Failed to close response for {}", url, closeEx);
                    }
                }
            }
        }
        return list;
    }

    /**
     * Dispatches a fetched page to the matching template parser, or returns
     * null when the page is not a recognizable article page.
     *
     * @param html raw HTML of a fetched page
     * @return parsed article, or null if neither template matches
     * @throws ParseException when a publish-time string does not match its pattern
     */
    private News parsePage(String html) throws ParseException {
        Document doc = Jsoup.parse(html);
        String title = doc.title();
        // Template one: classic article layout, anchored on id="artical".
        Element articleDiv = doc.getElementById("artical");
        if (articleDiv != null) {
            return parseOne(articleDiv, title);
        }
        // Template two: newer layout, anchored on class "yc_main wrap".
        Element article2Div = doc.getElementsByClass("yc_main wrap").first();
        if (article2Div != null) {
            return parseTwo(article2Div, title);
        }
        return null;
    }

    /**
     * Parses the first ifeng article template (id="artical").
     * DOM node ids/classes were found by inspecting page source in the browser.
     *
     * @param articleDiv the div closest to the article content
     * @param title      page title used as the article title
     * @return populated News object (fields left null when nodes are missing)
     * @throws ParseException when the publish-time string does not match the pattern
     */
    private News parseOne(Element articleDiv, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (articleDiv != null) {
            // Header block holding source and publish time.
            Element headDiv = articleDiv.getElementById("artical_sth");
            // Body block holding the article text.
            Element contentDiv = articleDiv.getElementById("main_content");
            if (headDiv != null) {
                String publishTime = headDiv.getElementsByClass("ss01").text();
                String origin = headDiv.getElementsByClass("ss03").text();
                // String -> Date; pattern matches e.g. "2017年11月21日 10:00:00".
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");
                Date date = sdf.parse(publishTime);
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Strip images; only text content is wanted.
                contentDiv.select("img").remove();
                // Keep html (not text()) so paragraph structure survives for later splitting.
                String content = contentDiv.html();
                news.setContent(content);
            }
        }
        return news;
    }

    /**
     * Parses the second ifeng article template (class "yc_main wrap").
     * DOM node ids/classes were found by inspecting page source in the browser.
     *
     * @param article2Div the div closest to the article content
     * @param title       page title used as the article title
     * @return populated News object (fields left null when nodes are missing)
     * @throws ParseException when the publish-time string does not match the pattern
     */
    private News parseTwo(Element article2Div, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (article2Div != null) {
            Element headDiv = article2Div.getElementsByClass("yc_tit").first();
            Element contentDiv = article2Div.getElementById("yc_con_txt");
            if (headDiv != null) {
                String publishTime = headDiv.getElementsByTag("span").text();
                // Fix: the original called .first().text() unconditionally and threw
                // NullPointerException on pages without an <a> in the header.
                Element originLink = headDiv.getElementsByTag("a").first();
                String origin = originLink != null ? originLink.text() : "";
                // String -> Date; pattern matches e.g. "2017-11-21 10:00:00".
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                Date date = sdf.parse(publishTime);
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // Drop nested divs (ads/related links) and scripts before extracting.
                contentDiv.select("div").remove();
                contentDiv.select("script").remove();
                // Keep html (not text()) so paragraph structure survives for later splitting.
                String content = contentDiv.html();
                news.setContent(content);
            }
        }
        return news;
    }
}
阅读全文
0 0
- Java爬虫入门(二):单机单程序单线程-提供种子url用广度优先算法实现新闻资讯获取
- Java爬虫入门(一):单机单程序单线程-手动输入url获取新闻内容
- 单线程非递归的广度优先遍历算法
- Python爬虫入门学习--(单线程爬虫)
- Python爬虫学习(单线程爬虫(二))
- 中断可恢复性-爬虫系统(广度优先-python单进程版)
- OC实现算法(二)- 广度优先搜索
- 广度优先入门程序
- Python爬虫(单线程爬虫(三))
- Java线程之单生产者单消费者示例(二)
- 广度优先,模拟爬虫,嗅探URL
- 网络爬虫单线程的实现
- Python小程序:用广度优先搜索算法查询两个url之间的最短路径
- 单线程爬虫
- python-单线程爬虫
- 单线程爬虫
- (算法入门)基本图论-广度优先搜索之JAVA实现
- python3.5 爬虫 基于广度优先算法
- Android按钮单击事件的四种常用写法总结
- Android Studio与夜神模拟器开发调试 error: could not install *smartsocket* listener: cannot bind to 127.0.0.1:
- Vim常用操作-Nginx配置文件批量加注释。
- linux挂载光盘时出现mount: block device /dev/sr0 is write-protected, mounting read-only
- Base64与UTF-8
- Java爬虫入门(二):单机单程序单线程-提供种子url用广度优先算法实现新闻资讯获取
- 利用Tensorflow实现SSD架构model训练(voc2012)
- 你不知道的JS-读书笔记(一)--作用域
- 深度学习库(排名总结)
- 优化数据库第五章
- Oracle性能优化-读懂执行计划
- Log4 日志级别
- Struts2(十一)---类型转换
- DB