Java 爬虫爬取酷我音乐(kuwo.cn)歌手数据

来源:互联网 发布:中航锂电研究院 知乎 编辑:程序博客网 时间:2024/06/06 03:34

记录防止忘记

包:

jsoup-1.4.1 html解析

httpcore-4.0.1_1

httpclient-4.0.1

代码:

已经访问的url队列

// Queue (set) of URLs that have already been visited.
/**
 * Process-wide registry of URLs the crawler has already requested.
 *
 * <p>All accessor methods are static and synchronized on the class, so they can be
 * called from several crawler threads. The backing set is still exposed as a public
 * field for backward compatibility; code touching the field directly bypasses that lock.
 */
public class VisitedUrlQueue {

    /** Backing set of visited URLs. Prefer the static accessors over direct access. */
    public static HashSet<String> visitedUrlQueue = new HashSet<String>();

    /**
     * Records a URL as visited. Adding the same URL twice has no further effect.
     *
     * @param url the URL to record
     */
    public static synchronized void addElem(String url) {
        visitedUrlQueue.add(url);
    }

    /**
     * Tells whether a URL has already been recorded.
     *
     * @param url the URL to look up
     * @return {@code true} if the URL is in the visited set
     */
    public static synchronized boolean isContains(String url) {
        return visitedUrlQueue.contains(url);
    }

    /**
     * @return the number of distinct URLs recorded so far
     */
    public static synchronized int size() {
        return visitedUrlQueue.size();
    }
}
未访问的队列

// Queue of URLs that have not been visited yet.
/**
 * Process-wide FIFO queue of URLs waiting to be crawled.
 *
 * <p>Every accessor is static and synchronized on the class so that readers
 * ({@link #size()}, {@link #isContains(String)}) never observe the linked list while
 * another thread is modifying it. The backing list is still exposed as a public field
 * for backward compatibility; code touching the field directly bypasses that lock.
 */
public class UrlQueue {

    /** Pending hyperlinks, oldest first. Prefer the static accessors over direct access. */
    public static LinkedList<String> urlQueue = new LinkedList<String>();

    /**
     * Intended upper bound on the number of queued links.
     * NOTE: this class does not enforce the bound itself; callers that care must
     * compare {@link #size()} against it before adding.
     */
    public static final int MAX_SIZE = 10000;

    /**
     * Appends a URL to the tail of the queue.
     *
     * @param url the URL to enqueue
     */
    public static synchronized void addElem(String url) {
        urlQueue.add(url);
    }

    /**
     * Removes and returns the URL at the head of the queue.
     *
     * @return the oldest queued URL
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public static synchronized String outElem() {
        return urlQueue.removeFirst();
    }

    /**
     * @return {@code true} if no URL is waiting
     */
    public static synchronized boolean isEmpty() {
        return urlQueue.isEmpty();
    }

    /**
     * @return the number of URLs currently waiting
     */
    public static synchronized int size() {
        return urlQueue.size();
    }

    /**
     * Tells whether a URL is already waiting in the queue (linear scan).
     *
     * @param url the URL to look up
     * @return {@code true} if the URL is queued
     */
    public static synchronized boolean isContains(String url) {
        return urlQueue.contains(url);
    }
}
通过url得到页面html代码

public class DownloadPage {public static String getContentFormUrl(String url) throws Exception {  HttpClient client = new DefaultHttpClient();          HttpGet getHttp = new HttpGet(url);            String content = null;            HttpResponse response;          try {              /* 获得信息载体 */              response = client.execute(getHttp);              HttpEntity entity = response.getEntity();                          //已经访问url            VisitedUrlQueue.addElem(url);                if (entity != null) {                  /* 转化为文本信息 */                  content = EntityUtils.toString(entity);              }            } catch (ClientProtocolException e) {              e.printStackTrace();          } catch (IOException e) {              e.printStackTrace();          } finally {              client.getConnectionManager().shutdown();          }            return content;      }  }

页面解析

public class ParseOfPage { /**      * 获得url页面源代码中超链接  * @throws Exception      */      public static void getHrefOfContent(String content) throws Exception {     Document doc = Jsoup.parse(content);    for(Element e:doc.getElementsByTag("a")){    String linkHref = e.attr("href");    if(linkHref.startsWith("/album")){ //进行链接筛选    linkHref = "http://www.kuwo.cn"+linkHref;//进行链接补充    }    if(linkHref.startsWith("http://www.kuwo.cn/album")){ //链接筛选,队列判断重复后加入队列    if (!UrlQueue.isContains(linkHref)                        && !VisitedUrlQueue.isContains(linkHref)) {     String urlNew = linkHref.replace(" ","%20");    //System.out.println(urlNew);                 UrlQueue.addElem(urlNew);              }    }        }    }           //进行自定义解析    public static void getDataOfContentForSinger(String content) throws Exception {    SingerPo po = new SingerPo();        Document doc = Jsoup.parse(content);    for(Element e:doc.getElementsByClass("artistTop")){    po.setPhotourl(e.childNode(1).attr("data-src")); //设置图片    }                    }}




原创粉丝点击