java爬虫(Jsoup)爬取某站点评论

来源:互联网 发布:淘宝装修设计教程 编辑:程序博客网 时间:2024/05/19 14:53

本文是基于这一篇的:http://blog.csdn.net/disiwei1012/article/details/51614492

在上一篇中,我们抓取到了新闻的标题,超链接和摘要,这次我们通过新闻的超链接,进入新闻的评论页,然后爬取评论!

注:http://www.wumaow.com 这个网站的标签写得太混乱了,而且还有 js 报错,到处都是广告。要不是它的外国评论翻译得及时,我就去看龙腾网(http://www.ltaaa.com)了。

先看下评论页的标签:
主要是寻找 id 为“art_content”的标签下、id 为“text”的标签里面的“div”标签。
这里写图片描述

代码:

/**
 * Simple mutable holder for one scraped news entry: its headline, the link to
 * the article page, and a short summary text.
 */
public class News {

    private String title;
    private String href;
    private String content;

    /** Creates an entry with every field left as {@code null}. */
    public News() {
    }

    /**
     * Creates a fully populated entry.
     *
     * @param title   headline of the news item
     * @param href    link to the article page
     * @param content summary text of the news item
     */
    public News(String title, String href, String content) {
        this.title = title;
        this.href = href;
        this.content = content;
    }

    // ----- title -----

    /** @return the headline, or {@code null} if none has been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the headline to store */
    public void setTitle(String title) {
        this.title = title;
    }

    // ----- href -----

    /** @return the article link, or {@code null} if none has been set */
    public String getHref() {
        return this.href;
    }

    /** @param href the article link to store */
    public void setHref(String href) {
        this.href = href;
    }

    // ----- content -----

    /** @return the summary text, or {@code null} if none has been set */
    public String getContent() {
        return this.content;
    }

    /** @param content the summary text to store */
    public void setContent(String content) {
        this.content = content;
    }
}
public class JsoupTest {    static String url="http://www.cnblogs.com/zyw-205520/archive/2012/12/20/2826402.html";    /**     * @param args     * @throws Exception      */    public static void main(String[] args) throws Exception {//      BolgBody();//      article();//      Blog();        ArrayList<News> newsList = getWuMaoW();        getComments(newsList);    }    //=======================begin=======================================    //获取5毛网上的文章标题和超链接    public static ArrayList getWuMaoW(){        String url = "http://www.wumaow.com";        Document doc = null;        ArrayList<News> newsList = new ArrayList<News>();        try {            doc = Jsoup.connect(url).get();            Elements listDiv = doc.getElementsByAttributeValue("class", "post");            for(Element element : listDiv){                News news = new News();                Comments comment = new Comments();                Elements texts = element.getElementsByTag("h4");                Elements summerys = element.getElementsByTag("p");                for(Element text:texts){                    String ptext = text.text();                    news.setTitle(ptext);                    Elements hrefs = text.getElementsByTag("a");                    for(Element href:hrefs){                        String phref = href.attr("href");                        news.setHref("http://www.wumaow.com"+phref);                    }                }                for(Element summery:summerys){                    String psummery = summery.text();                    news.setContent(psummery);                }                newsList.add(news);            }            /*for(News news:newsList){                System.out.println(news.getTitle());                System.out.println(news.getHref());                System.out.println(news.getContent());                System.out.println("=============================================");            }*/        } catch (IOException e) {            e.printStackTrace();    
    }        return newsList;    }    //获取五毛网的评论    public static ArrayList getComments(ArrayList<News> newsList) throws IOException{        for(News news:newsList){            Document doc = Jsoup.connect(news.getHref()).get();            Element art_content = doc.getElementById("art_content");            Element text = art_content.getElementById("text");            Elements  commentsList = text.getElementsByTag("div");            for(Element element:commentsList){                String nr = element.text();//              String _shared = element.attr("class");//              System.out.println(_shared);                if(nr!=""){                    System.out.println(nr);                }            }        }        return null;    } }   //============================end=========================================

结果:
这里写图片描述

0 0
原创粉丝点击