网易云音乐爬虫--评论爬取以及Top Music统计

来源:互联网 发布:淘宝组装机为什么便宜 编辑:程序博客网 时间:2024/06/06 18:20

        网易云音乐的评论十分有趣，于是就想写个爬虫爬取评论。但是不熟悉Python，就用Java写了一个。

        主要使用了HttpClient、Jsoup、队列、线程、log4j，并用poi生成Excel保存结果。书写过程中的主要问题是评论的获取：网易对其进行了加密，进行了好一番搜索才找到解决方法。爬取的歌单数、Top歌曲数都可以动态进行配置。

        目录结构

       

主程序

       

package personal.mario.main;

import java.io.IOException;
import java.util.List;

import org.apache.http.client.ClientProtocolException;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;

import personal.mario.bean.MusicCommentMessage;
import personal.mario.service.HtmlFetcherService;
import personal.mario.service.HtmlParserService;
import personal.mario.service.MusicListQueueService;
import personal.mario.service.MusicQueueService;
import personal.mario.service.TopMusicCalculateService;
import personal.mario.utils.Constants;
import personal.mario.utils.GenerateExcelUtils;

/**
 * Main crawl logic: walks the queue of playlists, fetches the comment data of
 * every not-yet-crawled song, feeds each result to the Top-music ranking and
 * hands it to the Excel helpers.
 *
 * author timeless.li
 * 2016-10-26
 */
public class NetEaseCrawler implements Runnable {
    private int totalMusicList = Constants.MUSIC_LIST_COUNT;
    private int limit = Constants.PER_PAGE;
    private int offset = Constants.OFFSET;
    private HSSFWorkbook commentMessageWorkbook = new HSSFWorkbook();
    private List<MusicCommentMessage> ms = null;
    private static final Logger logger = Logger.getLogger(NetEaseCrawler.class);

    /**
     * Runs one complete crawl. Any exception aborts the crawl and is logged;
     * nothing is rethrown to the thread that runs this task.
     */
    @Override
    public void run() {
        try {
            // Initialise the queue of playlist URLs waiting to be crawled.
            initUncrawledMusicListQueue();

            // Incremented once per song that passed the already-crawled check below.
            int count = 0;

            // Initialise the song-information sheet.
            HSSFSheet commentMessageSheet =
                    GenerateExcelUtils.generateCommentMessageExcelInit(commentMessageWorkbook);

            // Crawl playlist by playlist.
            while (!MusicListQueueService.isUncrawledMusicListEmpty()) {
                // Fill the pending-song queue from the next playlist.
                fillUncrawledMusicQueue(MusicListQueueService.getTopMusicList());

                // When the song queue runs dry, fall back to the outer loop to refill it.
                while (!MusicQueueService.isUncrawledMusicQueueEmpty()) {
                    String songId = MusicQueueService.getTopMusicUrl();

                    // Skip songs that were already crawled via another playlist.
                    if (!MusicQueueService.isMusicCrawled(songId)) {
                        MusicCommentMessage mcm = getCommentMessage(songId);

                        // Let the ranking decide whether this song enters the Top list.
                        ms = TopMusicCalculateService.getTopMusic(mcm);

                        // Add the song to the song-information sheet.
                        GenerateExcelUtils.generateCommentMessageExcelProcess(
                                commentMessageWorkbook, commentMessageSheet, mcm, count);

                        // Produce the per-song comments Excel file.
                        GenerateExcelUtils.generateCommentsExcel(mcm);

                        // Remember the song so later playlists do not crawl it again.
                        MusicQueueService.addCrawledMusic(songId);
                        count++;
                    }
                }
            }

            // Produce the song-information Excel file.
            GenerateExcelUtils.generateCommentMessageExcelWrite(commentMessageWorkbook);

            // Produce the Top-music Excel file.
            GenerateExcelUtils.generateTopMusicExcel(ms);

            logger.info("count : " + count);
            // Number of songs recorded as crawled.
            logger.info("size : " + MusicQueueService.getCrawledMusicSize());
        } catch (Exception e) {
            // Route the failure through the logger so it lands in the configured
            // log output together with its stack trace.
            logger.error("crawl aborted", e);
        }
    }

    /**
     * Requests the playlist index page by page and passes every fetched page to
     * {@code HtmlParserService.parseAndSaveMusicListUrl}.
     *
     * @throws ClientProtocolException propagated from the fetch/parse helpers
     * @throws IOException             propagated from the fetch/parse helpers
     */
    public void initUncrawledMusicListQueue() throws ClientProtocolException, IOException {
        if (totalMusicList > limit) {
            int tmpLimit = limit;
            int tmpOffset = offset;

            while (totalMusicList > tmpOffset) {
                String suffix = "limit=" + tmpLimit + "&offset=" + tmpOffset;
                tmpOffset += tmpLimit;

                // Shrink the next page so it does not run past totalMusicList.
                if (tmpOffset + tmpLimit > totalMusicList) {
                    tmpLimit = totalMusicList - tmpOffset;
                }

                HtmlParserService.parseAndSaveMusicListUrl(
                        HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix));
            }
        } else {
            // Everything fits into a single request.
            String suffix = "limit=" + totalMusicList + "&offset=" + offset;
            HtmlParserService.parseAndSaveMusicListUrl(
                    HtmlFetcherService.fetch(Constants.SOURCE_URL + suffix));
        }
    }

    /**
     * Fills the queue of songs to crawl from one playlist.
     *
     * @param musicListUrl playlist URL handed to
     *                     {@code HtmlParserService.parseMusicListAndGetMusics}
     * @throws IOException propagated from the parser
     */
    public void fillUncrawledMusicQueue(String musicListUrl) throws IOException {
        HtmlParserService.parseMusicListAndGetMusics(musicListUrl);
    }

    /**
     * Fetches the comment data of one song, retrying until a non-null result is
     * obtained. Because of the server's anti-crawling measures a request may
     * come back empty or fail; in both cases the crawler sleeps for a random
     * period of up to 30 seconds and tries again.
     *
     * <p>The retry is an iterative loop, so a long ban cannot exhaust the call
     * stack. The number of attempts is deliberately unbounded.
     *
     * @param songId id of the song to fetch
     * @return the parsed comment data, never {@code null}
     * @throws IllegalStateException if the thread is interrupted while waiting
     *                               between attempts; the interrupt flag is restored
     */
    public MusicCommentMessage getCommentMessage(String songId) {
        while (true) {
            try {
                MusicCommentMessage mc = HtmlParserService.parseCommentMessage(songId);
                if (mc != null) {
                    return mc;
                }
                logger.info("warining: be interceptted by net ease music server..");
            } catch (Exception e) {
                // Keep the cause in the log instead of dropping it.
                logger.info("error: be refused by net ease music server..", e);
            }

            // Back off before the next attempt on either failure path.
            try {
                Thread.sleep((long) (Math.random() * 30000));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException(
                        "interrupted while waiting to retry song " + songId, e);
            }
        }
    }
}

计算Top歌曲

package personal.mario.service;

import java.util.ArrayList;
import java.util.List;

import personal.mario.bean.MusicCommentMessage;
import personal.mario.utils.Constants;

/**
 * Maintains the running list of Top songs, ordered by comment count from
 * highest to lowest.
 */
public class TopMusicCalculateService {
    private static List<MusicCommentMessage> ms = new ArrayList<MusicCommentMessage>();

    /**
     * Offers one song to the Top list and returns the (shared, live) list.
     *
     * <p>The song is inserted in front of the first entry it strictly beats.
     * If the list already held {@code Constants.TOP_MUSIC_COUNT} or more
     * entries, the last entry is dropped after the insertion. A song that beats
     * nobody is appended only while the list is empty or below that count.
     *
     * @param mcm the song to offer
     * @return the internal Top list after the update
     */
    public static List<MusicCommentMessage> getTopMusic(MusicCommentMessage mcm) {
        final int sizeBefore = ms.size();
        final boolean alreadyFull = sizeBefore >= Constants.TOP_MUSIC_COUNT;

        // Locate the first entry the new song strictly outranks.
        int insertAt = -1;
        for (int i = 0; i < sizeBefore; i++) {
            if (mcm.getCommentCount() > ms.get(i).getCommentCount()) {
                insertAt = i;
                break;
            }
        }

        if (insertAt >= 0) {
            ms.add(insertAt, mcm);
            if (alreadyFull) {
                // After the insertion the old last entry sits at index sizeBefore.
                ms.remove(sizeBefore);
            }
        } else if (sizeBefore == 0 || !alreadyFull) {
            // Nothing outranked: take the tail slot if there is still room
            // (an empty list always accepts its first song).
            ms.add(mcm);
        }

        return ms;
    }
}

生成评论Excel表

//歌曲评论Excel生成public static void generateCommentsExcel(MusicCommentMessage musicCommentMessage) throws IOException {HSSFWorkbook workbook = new HSSFWorkbook();HSSFSheet sheet = workbook.createSheet("歌曲评论");        sheet.setDefaultColumnWidth(15);                HSSFRow rowHead = sheet.createRow(0);                HSSFCellStyle style = workbook.createCellStyle();        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);                HSSFFont font = workbook.createFont();        font.setColor(HSSFColor.LIGHT_BLUE.index);        font.setFontHeightInPoints((short) 8);        font.setBoldweight(HSSFFont.BOLDWEIGHT_BOLD);        style.setFont(font);        HSSFCell cellHead = rowHead.createCell(0);        cellHead.setCellValue("歌名");        cellHead.setCellStyle(style);                cellHead = rowHead.createCell(1);        cellHead.setCellValue("评论类型");        cellHead.setCellStyle(style);                cellHead = rowHead.createCell(2);        cellHead.setCellValue("评论用户昵称");        cellHead.setCellStyle(style);                cellHead = rowHead.createCell(3);        cellHead.setCellValue("评论时间");        cellHead.setCellStyle(style);                cellHead = rowHead.createCell(4);        cellHead.setCellValue("评论内容");        cellHead.setCellStyle(style);                cellHead = rowHead.createCell(5);        cellHead.setCellValue("获赞数");        cellHead.setCellStyle(style);                HSSFCellStyle cellStyle = workbook.createCellStyle();        cellStyle.setAlignment(HSSFCellStyle.ALIGN_CENTER);                List<MusicComment> comments = musicCommentMessage.getComments();            for (int i = 0; i < comments.size(); i++) {    MusicComment comment = comments.get(i);    HSSFRow row = sheet.createRow(i + 1);                HSSFCell cell = row.createCell(0);        cell.setCellValue(musicCommentMessage.getSongTitle());        cell.setCellStyle(cellStyle);                cell = row.createCell(1);        cell.setCellValue(comment.getType());        
cell.setCellStyle(cellStyle);                cell = row.createCell(2);        cell.setCellValue(comment.getNickname());        cell.setCellStyle(cellStyle);                cell = row.createCell(3);        cell.setCellValue(comment.getCommentDate());        cell.setCellStyle(cellStyle);                cell = row.createCell(4);        cell.setCellValue(comment.getContent());        cell.setCellStyle(cellStyle);                cell = row.createCell(5);        cell.setCellValue(comment.getAppreciation());        cell.setCellStyle(cellStyle);    }            String path = Constants.COMMENTS_PATH + StringUtils.dealWithFilename(musicCommentMessage.getSongTitle()) + Constants.COMMENTS_SUFFIX;    logger.info(path);        FileOutputStream fos = new FileOutputStream(path);        workbook.write(fos);        fos.close();}

歌曲队列

package personal.mario.service;

import java.util.Collections;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Song queues: a FIFO queue of song ids still waiting to be crawled and a
 * record of the ids that were already crawled.
 */
public class MusicQueueService {
    private static Queue<String> uncrawledMusics = new ConcurrentLinkedQueue<String>();

    // Only membership and size are ever asked of the crawled ids, so a
    // concurrent set is used: lookups avoid a linear scan and an id added
    // twice is counted once.
    private static Set<String> crawledMusics =
            Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

    /**
     * Appends a song id to the pending queue.
     *
     * @param e song id to enqueue
     */
    public static void addUncrawledMusic(String e) {
        uncrawledMusics.offer(e);
    }

    /**
     * Removes and returns the head of the pending queue.
     *
     * @return the next song id, or {@code null} when the queue is empty
     */
    public static String getTopMusicUrl() {
        // poll() already yields null on an empty queue, and doing it in one
        // call avoids a check-then-act gap.
        return uncrawledMusics.poll();
    }

    /**
     * Records a song id as crawled.
     *
     * @param e song id to record
     */
    public static void addCrawledMusic(String e) {
        crawledMusics.add(e);
    }

    /**
     * Tells whether a song id was already recorded as crawled.
     *
     * @param id song id to look up; {@code null} is never considered crawled
     * @return {@code true} if the id was recorded via {@link #addCrawledMusic}
     */
    public static boolean isMusicCrawled(String id) {
        // The backing ConcurrentHashMap rejects null keys, so answer here.
        return id != null && crawledMusics.contains(id);
    }

    /**
     * @return {@code true} when no song id is waiting to be crawled
     */
    public static boolean isUncrawledMusicQueueEmpty() {
        return uncrawledMusics.isEmpty();
    }

    /**
     * Prints every pending song id to standard output, in queue order, without
     * removing anything from the queue.
     */
    public static void printAll() {
        for (String id : uncrawledMusics) {
            System.out.println(id);
        }
    }

    /**
     * @return the number of distinct song ids recorded as crawled
     */
    public static int getCrawledMusicSize() {
        return crawledMusics.size();
    }
}
爬取结果


1 0
原创粉丝点击