Getting Started with Java Crawlers (Part 2): Single Machine, Single Program, Single Thread - Fetching News from a Seed URL with Breadth-First Crawling


Building on Part 1, this post adds a simple breadth-first URL crawling algorithm; the core idea is sketched below before the full code.
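A minimal breadth-first skeleton looks like this. The class and variable names here are illustrative only and are not the classes used in this post (those are listed further down); the point is just the queue of pending URLs plus the set of already-visited URLs.

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

/**
 * Minimal breadth-first crawl skeleton (illustrative sketch, not the post's classes).
 */
public class BfsSketch {

    public static void main(String[] args) {
        Queue<String> toCrawl = new LinkedList<String>();  // URLs waiting to be crawled
        Set<String> crawled = new HashSet<String>();       // URLs already crawled

        toCrawl.add("http://news.ifeng.com/");             // seed URL
        while (!toCrawl.isEmpty()) {
            String url = toCrawl.poll();
            if (crawled.contains(url)) {
                continue;                                  // skip pages we have already seen
            }
            // 1) download the page (HttpClient), 2) parse the news content (Jsoup),
            // 3) enqueue every same-site link on the page that is not yet in 'crawled'
            crawled.add(url);
        }
    }
}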

Drawback: everything is single-threaded. URL discovery and news-content crawling run in the same thread, so it is very slow. This will be optimized in later posts.

(Note: I keep the crawl rate modest and have never crawled aggressively, since this is a beginner program, so there is no IP proxy/rotation in place.)

Open issues: use multiple threads to separate the concerns (content crawling vs. URL discovery) and improve crawl efficiency; refine the crawling algorithm; choose a better data structure for the to-crawl queue. A rough sketch of the thread separation follows.
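One way the separation could look, using the same LinkedBlockingQueue as the hand-off point. This is an assumed design for the follow-up work, not code from this post; thread counts, timeouts, and the termination strategy are all placeholders.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

/**
 * Assumed producer/consumer split for a later version (not code from this post):
 * one thread only discovers URLs, several worker threads only parse news content.
 */
public class ThreadedCrawlerSketch {

    public static void main(String[] args) {
        final LinkedBlockingQueue<String> toCatchUrl = new LinkedBlockingQueue<String>();
        // once several threads share it, the "already crawled" set must be thread-safe
        final Set<String> catchedUrl = Collections.synchronizedSet(new HashSet<String>());

        ExecutorService pool = Executors.newFixedThreadPool(4);

        // producer: URL discovery only (the role IFengUrlCatcher plays today)
        pool.submit(new Runnable() {
            public void run() {
                // seed the queue, then keep enqueuing links found on crawled pages
                toCatchUrl.offer("http://news.ifeng.com/");
            }
        });

        // consumers: content crawling only (the role IFengCrawler plays today)
        for (int i = 0; i < 3; i++) {
            pool.submit(new Runnable() {
                public void run() {
                    try {
                        while (true) {
                            // stop when no new URL shows up for a while (termination kept simple)
                            String url = toCatchUrl.poll(5, TimeUnit.SECONDS);
                            if (url == null) {
                                break;
                            }
                            if (catchedUrl.add(url)) {
                                // download and parse the page here, build a News object
                            }
                        }
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            });
        }

        pool.shutdown();
    }
}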

----------------main test method-------------

package com.kimt.newsdrawler;

import com.kimt.newsdrawler.crawler.IFengCrawler;
import com.kimt.newsdrawler.dto.News;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * Created by man on 2017/11/21.
 */
public class UserMain {

    private static Logger logger = LoggerFactory.getLogger(UserMain.class);

    public static void main(String[] args) {
       /* version_1.0
        new IFengCrawler("http://news.ifeng.com/a/20171121/53459907_0.shtml").parserForNews();*/

       /* version_2.0: test the breadth-first traversal algorithm
        // initialize the to-crawl URL queue and the crawled URL set
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        HashSet<String> catchedUrl = new HashSet<String>();
        // pass in the seed URL and collect URLs into the queue
        new IFengUrlCatcher(toCatcheUrl, catchedUrl).urlCatch("http://news.ifeng.com/");
        logger.info("info:", toCatcheUrl);*/

        /* version_2.1: test the breadth-first traversal and crawl the news content */
        LinkedBlockingQueue<String> toCatcheUrl = new LinkedBlockingQueue<String>();
        HashSet<String> catchedUrl = new HashSet<String>();
        List<News> list = new IFengCrawler(toCatcheUrl, catchedUrl).parserForNews("http://news.ifeng.com/");
        logger.info("Crawled " + list.size() + " news articles in total");
    }
}
----------------seed-URL breadth-first URL crawling algorithm-------------
package com.kimt.newsdrawler.urlcatcher;

import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * @Date create on 2017/11/22
 * @author man
 * @Description
 */
public class IFengUrlCatcher extends AbstractUrlCatcher {

    private static Logger logger = LoggerFactory.getLogger(IFengUrlCatcher.class);

    /** queue of URLs waiting to be crawled */
    private LinkedBlockingQueue<String> toCatcheUrl;

    /** set of URLs that have already been crawled */
    private HashSet<String> catchedUrl;

    /**
     * @param toCatcheUrl queue of URLs waiting to be crawled
     * @param catchedUrl  set of URLs that have already been crawled
     */
    public IFengUrlCatcher(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
    }

    @Override
    public void urlCatch(String seedUrl) {
        try {
            CloseableHttpResponse httpResponse = HttpClientUtil.getHttpResponse(seedUrl);
            HttpEntity entity = httpResponse.getEntity();
            // convert the entity to an HTML string
            String html = EntityUtils.toString(entity, "utf-8");
            // traverse the page and enqueue the URLs that lead to news content
            traversalUrlForIFengNews(html);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Traverse the HTML page, collect every URL that leads to news content and enqueue it.
     * The extraction rules differ from one news site to another.
     * @param html page source
     */
    private void traversalUrlForIFengNews(String html) {
        String baseUrl = "news.ifeng.com";
        String url;
        Document doc = Jsoup.parse(html);
        // get all <a> tags on the page
        Elements elements = doc.getElementsByTag("a");
        // iterate over the <a> tags
        for (Element e : elements) {
            // read the href attribute (the URL)
            url = e.attr("href");
            // enqueue the URL if it belongs to the news site and has not been crawled yet
            if (url.contains(baseUrl) && !catchedUrl.contains(url)) {
                try {
                    toCatcheUrl.put(url.trim());
                } catch (InterruptedException e1) {
                    e1.printStackTrace();
                    logger.error("InterruptedException: {}", e1.getMessage());
                }
            }
        }
    }
}
----------------HttpClient wrapper utility to reduce duplicated code (to be optimized later)-------------
package com.kimt.newsdrawler.httpclientutils;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * @author man
 * @Date create on 2017/11/22
 * @Description
 */
public class HttpClientUtil {

    private static Logger logger = LoggerFactory.getLogger(HttpClientUtil.class);

    public static CloseableHttpResponse getHttpResponse(String url) throws IOException {
        CloseableHttpClient client;
        client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        // set the request headers
        httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
        httpGet.setHeader("Accept-Charset", "utf-8;q=0.7,*;q=0.7");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        httpGet.setHeader("Accept-Encoding", "gzip, deflate");
        // execute the GET request
        return client.execute(httpGet);
    }
}
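As one possible later optimization (my assumption of what "to be optimized" could mean, not something stated in the post), the utility could reuse a single shared CloseableHttpClient instead of creating a new one per request, and add timeouts; class and constant names below are placeholders.

package com.kimt.newsdrawler.httpclientutils;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.IOException;

/**
 * Sketch of a possible optimization (assumption, not from the post):
 * share one client across requests and set timeouts, instead of building
 * a new CloseableHttpClient for every call.
 */
public class SharedHttpClientUtil {

    // a single client can be reused; it manages its connection pool internally
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    // connect/read timeouts so a slow page cannot block the crawler indefinitely
    private static final RequestConfig CONFIG = RequestConfig.custom()
            .setConnectTimeout(5000)
            .setSocketTimeout(10000)
            .build();

    public static CloseableHttpResponse get(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(CONFIG);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        // the caller should close the returned response to free the connection
        return CLIENT.execute(httpGet);
    }
}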

------------news crawler class-----------

package com.kimt.newsdrawler.crawler;

import com.kimt.newsdrawler.dto.News;
import com.kimt.newsdrawler.httpclientutils.HttpClientUtil;
import com.kimt.newsdrawler.urlcatcher.IFengUrlCatcher;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * @author kimt
 * Created by man on 2017/11/23.
 */
public class IFengCrawler extends AbstractCrawler {

    private Logger logger = LoggerFactory.getLogger(IFengCrawler.class);

    /** queue of URLs waiting to be crawled */
    private LinkedBlockingQueue<String> toCatcheUrl;

    /** set of URLs that have already been crawled */
    private HashSet<String> catchedUrl;

    private IFengUrlCatcher urlCatcher;

    public IFengCrawler(LinkedBlockingQueue<String> toCatcheUrl, HashSet<String> catchedUrl) {
        this.toCatcheUrl = toCatcheUrl;
        this.catchedUrl = catchedUrl;
        this.urlCatcher = new IFengUrlCatcher(toCatcheUrl, catchedUrl);
    }

    @Override
    public List<News> parserForNews(String seedUrl) {
        // crawl the seed URL first to initialize the to-crawl queue
        urlCatcher.urlCatch(seedUrl);
        List<News> list = new ArrayList<News>();
        try {
            String url;
            int stateCode;
            HttpEntity entity;
            CloseableHttpResponse response;
            // crawl up to 200 URLs from the to-crawl queue
            for (int i = 0; i < 200; i++) {
                // take one URL from the to-crawl queue
                url = toCatcheUrl.take();
                // breadth-first step: collect every crawlable URL on this page and enqueue it
                urlCatcher.urlCatch(url);
                // send a GET request to the URL
                response = HttpClientUtil.getHttpResponse(url);
                // read the HTTP status code
                stateCode = response.getStatusLine().getStatusCode();
                if (stateCode == AbstractCrawler.HTTP_RESPONSE_CODE_SUCCESS) {
                    News news = null;
                    // get the entity from the response
                    entity = response.getEntity();
                    // convert the entity to an HTML string
                    String html = EntityUtils.toString(entity, "utf-8");
                    // parse the HTML with Jsoup
                    Document doc = Jsoup.parse(html);
                    String title = doc.title();
                    // first kind of ifeng news page layout
                    Element articleDiv = doc.getElementById("artical");
                    if (articleDiv != null) {
                        news = parseOne(articleDiv, title);
                    } else {
                        // second kind of ifeng news page layout, parsed differently
                        Element article2Div = doc.getElementsByClass("yc_main wrap").first();
                        if (article2Div != null) {
                            news = parseTwo(article2Div, title);
                        }
                    }
                    // keep the crawled News object
                    if (news != null) {
                        list.add(news);
                    }
                    // mark the URL as crawled
                    catchedUrl.add(url);
                    // release resources
                    EntityUtils.consume(entity);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            logger.error("IOException: " + e.getMessage());
        } catch (ParseException e) {
            e.printStackTrace();
            logger.error("ParseException: " + e.getMessage());
        } catch (InterruptedException e) {
            e.printStackTrace();
            logger.error("InterruptedException: " + e.getMessage());
        }
        return list;
    }

    /**
     * @param articleDiv the div closest to the news content
     * @param title article title
     * @return a News object
     * Inspect the page source with the browser dev tools, locate the relevant DOM nodes, and extract the data with Jsoup.
     */
    private News parseOne(Element articleDiv, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (articleDiv != null) {
            // node holding the news source and publish time
            Element headDiv = articleDiv.getElementById("artical_sth");
            // node holding the news content
            Element contentDiv = articleDiv.getElementById("main_content");
            if (headDiv != null) {
                // publish time
                String publishTime = headDiv.getElementsByClass("ss01").text();
                // news source
                String origin = headDiv.getElementsByClass("ss03").text();
                // convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // populate the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // remove img tags
                contentDiv.select("img").remove();
                // keep the content as HTML (not text()) so it can be split into paragraphs later
                String content = contentDiv.html();
                // populate the News object
                news.setContent(content);
            }
        }
        return news;
    }

    /**
     * @param article2Div the div closest to the news content
     * @param title article title
     * @return a News object
     * Inspect the page source with the browser dev tools, locate the relevant DOM nodes, and extract the data with Jsoup.
     */
    private News parseTwo(Element article2Div, String title) throws ParseException {
        News news = new News();
        news.setTitle(title);
        if (article2Div != null) {
            // node holding the news source and publish time
            Element headDiv = article2Div.getElementsByClass("yc_tit").first();
            // node holding the news content
            Element contentDiv = article2Div.getElementById("yc_con_txt");
            if (headDiv != null) {
                // publish time
                String publishTime = headDiv.getElementsByTag("span").text();
                // news source
                String origin = headDiv.getElementsByTag("a").first().text();
                // convert String -> Date
                SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                Date date = sdf.parse(publishTime);
                // populate the News object
                news.setPublishTime(date);
                news.setOrigin(origin);
            }
            if (contentDiv != null) {
                // drop useless nested divs and scripts
                contentDiv.select("div").remove();
                contentDiv.select("script").remove();
                // keep the content as HTML (not text()) so it can be split into paragraphs later
                String content = contentDiv.html();
                // populate the News object
                news.setContent(content);
            }
        }
        return news;
    }
}
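The News DTO and the two abstract base classes are not listed in the post. The following is a minimal sketch inferred from how they are used above; any detail beyond the referenced methods and the HTTP_RESPONSE_CODE_SUCCESS constant (presumably HTTP 200) is an assumption.

package com.kimt.newsdrawler.dto;

import java.util.Date;

/** Sketch of the News DTO implied by the setters used above (not the original source; getters omitted). */
public class News {
    private String title;
    private String origin;
    private Date publishTime;
    private String content;

    public void setTitle(String title) { this.title = title; }
    public void setOrigin(String origin) { this.origin = origin; }
    public void setPublishTime(Date publishTime) { this.publishTime = publishTime; }
    public void setContent(String content) { this.content = content; }
}

package com.kimt.newsdrawler.crawler;

import com.kimt.newsdrawler.dto.News;
import java.util.List;

/** Sketch of the abstract crawler base class referenced above (assumed shape). */
public abstract class AbstractCrawler {
    /** assumed to be HTTP 200 OK, as checked in parserForNews */
    public static final int HTTP_RESPONSE_CODE_SUCCESS = 200;

    public abstract List<News> parserForNews(String seedUrl);
}

package com.kimt.newsdrawler.urlcatcher;

/** Sketch of the abstract URL catcher base class referenced above (assumed shape). */
public abstract class AbstractUrlCatcher {
    public abstract void urlCatch(String seedUrl);
}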

