java新闻爬取

来源:互联网 发布:方便面行业数据 编辑:程序博客网 时间:2024/06/07 18:29

我是爬虫新手,本来想爬今日头条,在网上找了很多方法,走了不少弯路,始终没能解决异步刷新的问题;后来发现它的 JSON 数据还和本地 cookie 有关,感觉前路艰难,于是果断换成了网易新闻。网易新闻的数据相对容易获取:通过 Chrome 开发者工具(F12)分析请求包,发现网易异步刷新的请求只和访问路径有关,把返回的数据放到在线 JSON 解析工具里也能正常解析,这让我欣喜不已。


json数据:

废话不多说,直接上代码
//网易新闻类型String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"};String type = typeArray[width];//网易新闻列表urlString url1 = "http://3g.163.com/touch/reconstruct/article/list/";//网易新闻内容urlString url2 = "http://3g.163.com/news/article/";

//根据新闻列表url,获取新闻docid,并把docid存储到list中private static List<String> getDocid(String url,int num,String type) {    String json = null;    List<String> id=new ArrayList<>();    Map map=null;    JSONArray parseArray=null;    String jsonStrM="";    json = JSONUtils.loadJson(url+type+"/"+num+"-10.html");    String jsonStr = StringUtils.substringBeforeLast(json, ")");    String jsonStrO = StringUtils.substringAfter(jsonStr,"artiList(");    Map parse = (Map) JSONObject.parse(jsonStrO);    parseArray = (JSONArray) parse.get(type);    for(int j=0;j<parseArray.size();j++){        map = (Map)parseArray.get(j);        id.add((String) map.get("docid"));    }    return id;}

//根据内容url2获取新闻信息并进行存储private static void getContent(String url2, List<String> ids) {    System.out.println("存储开始!!");    String url = null;    Connection connection = Jsoup.connect(url2);    int i = 1;    for (;i<ids.size();i++){        url = url2+ids.get(i)+".html";        connection = Jsoup.connect(url);        try {            Document document = connection.get();            //获取新闻标题            Elements title = document.select("[class=title]");            //获取新闻来源和文章发布时间            Elements articleInfo = document.select("[class=info]");            Elements src = articleInfo.select("[class=source js-source]");            Elements time = articleInfo.select("[class=time js-time]");            //获取新闻内容            Elements contentEle = document.select("[class=page js-page on]");            DBCollection dbCollection= null;            try {                dbCollection = MongoDBUtils.connMongoDB();            } catch (Exception e) {                e.printStackTrace();            }            BasicDBObject obj = new BasicDBObject();            obj.put("title", src.html());            obj.put("srcFrom", src.html());            obj.put("time", time.html());            obj.put("content", contentEle.html());            dbCollection.insert(obj);            DBCursor dbCursor = dbCollection.find();            while(dbCursor.hasNext()){                Map map = (Map)dbCursor.next();            }        } catch (IOException e) {            e.printStackTrace();        }    }    System.out.println("本次共计存储"+i*0.8+"条数据");}

//设置爬取深度,循环多次获取docidprivate static List<String> getIds(String url1,int num,String type) {    List<String> id = new ArrayList<>();    List<String> ids = new ArrayList<>();    for (int i=0;i<=num;i+=10){        id = getDocid(url1,i,type);        ids.addAll(id);    }    return ids;}

public static void main(String[] args) throws Exception {    //爬取条数,10的倍数,网易新闻每10条预留大约2个广告位,所以爬取新闻的真实条数大约为80%    int deep = 30;    //爬取宽度,0:首页,1:社会,2:国内,3:国际,4:历史    int width = 1;    //网易新闻类型    String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"};    String type = typeArray[width];    //网易新闻列表url    String url1 = "http://3g.163.com/touch/reconstruct/article/list/";    //网易新闻内容url    String url2 = "http://3g.163.com/news/article/";    List<String> ids = new ArrayList<>();    //根据url1,爬取条数,新闻类型获取新闻docid    ids = getIds(url1,deep,type);    //根据url2,新闻docid获取内容并存储到MongoDB    getContent(url2,ids);}

为了方便存取比较大的数据量,这里使用 MongoDB 数据库进行存储。
列表

内容