某票务平台的信息采集

来源:互联网 发布:手机淘宝类目怎么写 编辑:程序博客网 时间:2024/04/29 10:40
package com.crawler.maoyan.age.sex.index;import java.io.IOException;import java.io.UnsupportedEncodingException;import java.text.SimpleDateFormat;import java.util.Date;import java.util.Iterator;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.json.simple.JSONArray;import org.json.simple.JSONObject;import org.json.simple.parser.JSONParser;import org.json.simple.parser.ParseException;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.select.Elements;import com.maoyan.movie.contents.MContents;import com.maoyan.movie.html.MovieHtml;import com.maoyan.movie.html.PrecessHtml;import com.maoyan.movie.ttf.encode.DownParseTTF;import com.maoyan.mysql.configure.DBConfig;import com.maoyan.mysql.configure.DbAttribute;import com.maoyan.mysql.manage.ContentToMySQL;import com.maoyan.mysql.manage.IFRepetition;import com.maoyan.mysql.manage.UpdateData;/** * @author 作者 E-mail: ZH519080@163.com* @date 创建时间:2017年1月17日 上午11:46:10 * @jdk 版本:jdk1.7.0_79** @类说明:受众性别占比和受众年龄占比*/public class AgeSexIndex {public static void main(String[] args) {AgeSexIndex ageSexIndex = new AgeSexIndex();DbAttribute dbAttribute = new DbAttribute();String branchURL = "http://piaofang.maoyan.com/movie/";String ageSexURL = "";String movieIdKey;try {movieIdKey = new String(dbAttribute.maoyanMovieID.getBytes("ISO-8859-1"), "utf-8");String[] splitMovieID = movieIdKey.split("#");for (int i = 0; i < splitMovieID.length; i++) {ageSexURL = branchURL+splitMovieID[i]+"/wantindex";System.out.println(ageSexURL);String movieNameId = ageSexIndex.getMovieNameId(ageSexURL);Document document = ageSexIndex.getDocument(ageSexURL);ageSexIndex.exeSexIndex(document, movieNameId);ageSexIndex.exeAgeIndex(document,movieNameId);}} catch (UnsupportedEncodingException e) {// TODO Auto-generated catch blocke.printStackTrace();}System.out.println("over");}//获取电影的性别占比public void exeSexIndex(Document document,String movieNameId){MContents mContents = new MContents();ContentToMySQL contentToMySQL = new ContentToMySQL();IFRepetition ifRepetition = new IFRepetition();UpdateData updateData = new UpdateData();SimpleDateFormat sDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss SSS");mContents.setTaskTime(sDateFormat.format(new Date()));String[] split = movieNameId.split(",");//电影名称mContents.setMovieName(split[0]);//猫眼电影的id号mContents.setPlatformID(split[1]);//男性占比Elements meles = document.select("section div.stackcolumn div.stackcolumn-desc i.cs");String male = meles.eq(0).text();mContents.setMaleRate(male+"%");//女性占比Elements feeles = document.select("div.stackcolumn div.stackcolumn-desc p.stackcolumn-desc-right i.cs");String female = feeles.eq(0).text();mContents.setFemaRate(female+"%");if (!ifRepetition.sexRepetition(mContents)) {contentToMySQL.saveGenderRate(mContents);}else {updateData.updateSex(mContents);}}//获取电影的年龄占比public void exeAgeIndex(Document document,String movieNameId){MContents mContents = new MContents();ContentToMySQL contentToMySQL = new ContentToMySQL();IFRepetition ifRepetition = new IFRepetition();UpdateData updateData = new UpdateData();SimpleDateFormat sDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss SSS");mContents.setTaskTime(sDateFormat.format(new Date()));JSONParser jsonParser = new JSONParser();Elements eles = document.select("body script#pageData");int beginIndex = eles.toString().indexOf("{");int endIndex = eles.toString().lastIndexOf("}");String ageJson = eles.toString().substring(beginIndex, endIndex+1);String[] split = movieNameId.split(",");//电影名称mContents.setMovieName(split[0]);//猫眼电影的id号mContents.setPlatformID(split[1]);;String ageRate = "";//年龄占比String age = "";//年龄段String ageAgeRate = "";try {JSONObject jsObjectRoot = (JSONObject)jsonParser.parse(ageJson);//获取年龄占比的相关数据JSONObject ageJsonObject = (JSONObject)jsObjectRoot.get("ageRatesChart");JSONArray jsonArray = (JSONArray) ageJsonObject.get("series");Iterator iterator = jsonArray.iterator();while(iterator.hasNext()){JSONObject seriesJsonObject = (JSONObject)iterator.next();//得到年龄占比精确数据JSONArray pointsJsonArray = (JSONArray) seriesJsonObject.get("points");//此处的for循环和while循环是一样的for(int i = 0 ,length = pointsJsonArray.size();i < length;i++){JSONObject xyValue = (JSONObject)pointsJsonArray.get(i);ageRate = xyValue.get("yValue").toString();age = xyValue.get("xValue").toString();ageAgeRate = age+ageRate;if (ageAgeRate.contains("20岁以下")) {mContents.setF16to20(ageAgeRate.replace("20岁以下", ""));}else if (ageAgeRate.contains("20~24")) {mContents.setF21to25(ageAgeRate.replace("20~24", ""));}else if (ageAgeRate.contains("25~29")) {mContents.setF26to30(ageAgeRate.replace("25~29", ""));}else if (ageAgeRate.contains("30~34")) {mContents.setF31to35(ageAgeRate.replace("30~34", ""));}else if (ageAgeRate.contains("35~39")) {mContents.setF36to40(ageAgeRate.replace("35~39", ""));}else {mContents.setF41to45(ageAgeRate.replace("40岁以上", ""));}}}if (!ifRepetition.ageRepetition(mContents)) {contentToMySQL.saveAgeRate(mContents);}else {updateData.updateAge(mContents);}} catch (ParseException e) {// TODO Auto-generated catch blocke.printStackTrace();}}//获取电影的名称和所对应的猫眼电影id号public String getMovieNameId(String ageSexIndexURL){//获取电影的名称,电影名称的获取和性别占比不是同一个链接String movieURL = ageSexIndexURL.substring(0, ageSexIndexURL.indexOf("/wantindex"));String movieNameId = "";String platformId = "";String regex = "[^0-9]";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(ageSexIndexURL);//猫眼电影的id号platformId = matcher.replaceAll("");try {Document movieNameDocu = Jsoup.connect(movieURL).get();movieNameId = movieNameDocu.title()+","+platformId;} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}return movieNameId;}//获取电影性别和年龄的占比的document数据public Document getDocument(String ageSexIndexURL){MovieHtml movieHtml = new MovieHtml();DownParseTTF downParseTTF = new DownParseTTF();PrecessHtml precessHtml = new PrecessHtml();String sourceHtml = movieHtml.getHtml(ageSexIndexURL).toString();String ttfCode = downParseTTF.parseTTF(sourceHtml);//下载ttf文件并解析String precSourceHtml = precessHtml.precSourceHtml(sourceHtml, ttfCode);Document document = Jsoup.parse(precSourceHtml);return document;}static{DBConfig.initPropertis("./config/config.properties");}} 

原创粉丝点击