webmagic实战使用

来源:互联网 发布:南京黑马程序员培训班 编辑:程序博客网 时间:2024/06/08 03:11

一.引入依赖包

<!-- WebMagic core and extension artifacts, both at version 0.4.3 -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.4.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.4.3</version>
</dependency>

二.代码

package com.pz998.quartz.spider;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import org.apache.commons.collections.CollectionUtils;import org.apache.commons.lang.StringUtils;import org.eclipse.jetty.util.MultiMap;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import com.pz998.rpc.model.entity.BdDepartmentDiseaseRelaRpc;import com.pz998.rpc.model.entity.BdDepartmentRpc;import com.pz998.rpc.model.entity.BdDiseaseDoctorRelaRpc;import com.pz998.rpc.model.entity.BdDoctorRpc;import com.pz998.rpc.model.entity.BdHospitalRpc;import net.minidev.json.JSONArray;import net.minidev.json.JSONObject;import net.minidev.json.parser.JSONParser;import net.minidev.json.parser.ParseException;import us.codecraft.webmagic.Page;import us.codecraft.webmagic.Site;import us.codecraft.webmagic.Spider;import us.codecraft.webmagic.processor.PageProcessor;import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;import us.codecraft.webmagic.selector.JsonPathSelector;import us.codecraft.xsoup.Xsoup;public class YiBaiduProcessor implements PageProcessor{private static final String START_URL = "https://yi.baidu.com/pc/hospital/list?cityId=371&pageSize=10&page=1";private static final String HOSPITAL_DETAIL_URL = "https://yi\\.baidu\\.com/pc/hospital/index\\?zt=pcpinzhuan&zt_ext=&pvid=\\d+&key=\\S+";private static final String HOSPITAL_LIST_URL = "https://yi\\.baidu\\.com/pc/hospital/list\\?cityId=\\d++&pageSize=10&page=\\d++";private static final String HOSPITAL_INFO_URL ="https://yi\\.baidu\\.com/pc/hospital/info\\?key=\\S+";private static final String DEPT_INFO_URL = "https://yi\\.baidu\\.com/pc/admindepartment/detail\\?zt=\\w+&zt_ext=&pvid=\\d+&hosId=\\d+&adminDepartId=\\d+";private static final String HOSPITAL_DEPT_URL ="https://yi\\.baidu\\.com/pc/hospital/alldep\\?key=\\S+";private static final String DOCTOR_LIST_URL = 
"https://yi\\.baidu\\.com/pc/admindepartment/doctorlist\\?diseaseId=0&medTitle=0&serviceType=0&page=\\d+&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=\\d+&hosId=\\d+";private static final String DOCTOR_INFO_URL = "https://yi\\.baidu\\.com/pc/doctor/detailpage\\?zt=\\w+&zt_ext=&pvid=0&doctorId=\\d+";//https://yi.baidu.com/pc/hospital/info?key=%E6%AD%A6%E6%B1%89%E5%B8%82%E5%A6%87%E5%A5%B3%E5%84%BF%E7%AB%A5%E5%8C%BB%E7%96%97%E4%BF%9D%E5%81%A5%E4%B8%AD%E5%BF%83//https://yi.baidu.com/pc/hospital/alldep?key=private Site site = Site.me();public static final String STATE_SUCCESS = "0";public static final Map<String,String> CITY_MAP = new HashMap<String,String>();static{CITY_MAP.put("371","武汉");CITY_MAP.put("1", "北京");CITY_MAP.put("2", "上海");CITY_MAP.put("84","广州");}@Overridepublic void process(Page page) {String url=page.getUrl().toString();if(page.getUrl().regex(HOSPITAL_LIST_URL).match()){try{String state = new JsonPathSelector("$.status").select(page.getRawText());if(STATE_SUCCESS.equals(state)){List hospitalList = new JsonPathSelector("$.data.hospitalList[*]").selectList(page.getRawText());MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);String cityId = resultMap.getString("cityId");if(CollectionUtils.isNotEmpty(hospitalList)){List<BdHospitalRpc> bdHospitalList = new ArrayList<BdHospitalRpc>();for(Object obj:hospitalList){JSONObject jsonObj = (JSONObject)obj;String name = (String)jsonObj.get("name");System.out.println("name:"+name);String address = (String)jsonObj.get("address");String level = (String)jsonObj.get("level");Integer insurance = (Integer)jsonObj.get("insurance");String phone = (String)jsonObj.get("phone");String grade = (String)jsonObj.get("grade");Integer doctorNum = (Integer)jsonObj.get("doctorNum");String imageUrl = (String)jsonObj.get("logo");Integer serveNum = (Integer)jsonObj.get("serveNum");Integer commentNum = (Integer)jsonObj.get("commentNum");String routeLink = 
(String)jsonObj.get("routeLink");MultiMap<String> routeLinkMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(routeLink);String location = routeLinkMap.getString("location");String latitude = "";String longitude = "";if(StringUtils.isNotEmpty(location)){String[] locationArray = location.split(",");latitude = locationArray.length>0?locationArray[0]:"";longitude = locationArray.length>1?locationArray[1]:"";}BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();bdHospitalRpc.setSourceId(name);bdHospitalRpc.setName(name);bdHospitalRpc.setAddress(address);bdHospitalRpc.setLevel(level);bdHospitalRpc.setPhone(phone);bdHospitalRpc.setImageUrl(imageUrl);bdHospitalRpc.setLatitude(latitude);bdHospitalRpc.setLongitude(longitude);bdHospitalRpc.setScore(grade);String city = CITY_MAP.get(cityId);bdHospitalRpc.setCity(city);String insuranceStr = insurance==null?"":insurance.toString();bdHospitalRpc.setIsMedicalInsurance(insuranceStr);String doctorNumStr = doctorNum==null?"":doctorNum.toString();bdHospitalRpc.setHighQualityDoctorNum(doctorNumStr);String serveNumStr = serveNum==null?"":serveNum.toString();bdHospitalRpc.setFinishedServiceNum(serveNumStr);String commentNumStr=commentNum==null?"":commentNum.toString();bdHospitalRpc.setPatientCommentNum(commentNumStr);bdHospitalList.add(bdHospitalRpc);String infoUrl = "https://yi.baidu.com/pc/hospital/info?key="+name;String allDeptUrl = "https://yi.baidu.com/pc/hospital/alldep?key="+name;page.addTargetRequest(infoUrl);page.addTargetRequest(allDeptUrl);}page.putField("bdHospitalList", bdHospitalList);}}}catch(Exception e){e.printStackTrace();}}elseif(page.getUrl().regex(HOSPITAL_INFO_URL).match()){try{MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);  String hosName = resultMap.getString("key");  BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();List<String> contextList = page.getHtml().xpath("ul[@class='container-list-info']/li[@class='ys-util-margin-b35']/p[@class='ys-util-text-smaller 
ys-util-margin-t9 ys-util-margin-b30']/text()").all();if(CollectionUtils.isNotEmpty(contextList)){String context1 = contextList.size()>=1?contextList.get(0):"";String context2 = contextList.size()>=2?contextList.get(1):"";String context3 = contextList.size()>=3?contextList.get(2):"";String context4 = contextList.size()>=4?contextList.get(3):"";String context5 = contextList.size()>=5?contextList.get(4):"";bdHospitalRpc.setContent(context1);bdHospitalRpc.setHistory(context2);bdHospitalRpc.setCharacteristicDept(context3);bdHospitalRpc.setTeam(context4);bdHospitalRpc.setHonor(context5);// System.out.println("医院概况:"+context1);// System.out.println("历史沿革:"+context2);// System.out.println("特色科室:"+context3);// System.out.println("医护团队:"+context4);// System.out.println("医院荣誉:"+context5);}bdHospitalRpc.setSourceId(hosName);page.putField("bdHospitalRpc", bdHospitalRpc);}catch(Exception e){e.printStackTrace();}}else if(page.getUrl().regex(HOSPITAL_DEPT_URL).match()){try{  MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);  String hosName = resultMap.getString("key");  String topDepts = "";  List<String> tableHtml = page.getHtml().xpath("div[@class='container-common-office']/table[@class='ys-util-margin-b15 list-office ys-util-border-big']").all();List<BdDepartmentRpc> departmentList = new ArrayList<BdDepartmentRpc>();for(String html:tableHtml){Document document = Jsoup.parse(html);String platDept = Xsoup.select(document, "td[@class='primary-office']/h4/text()").get();List<String> hospitalDepts = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']").list();List<String> hospitalDeptNames = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']/text()").list();//重点科室信息if(StringUtils.isEmpty(platDept)){topDepts = com.pz998.quartz.utils.StringUtils.listToString(hospitalDeptNames);//医院科室信息}else{for(String d:hospitalDepts){Document deptDocument = 
Jsoup.parse(d);String deptName = Xsoup.select(deptDocument, "a/text()").get();String deptHref = Xsoup.select(deptDocument, "a/@href").get();MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(deptHref);String deptId = deptResultMap.getString("adminDepartId");String hosId = deptResultMap.getString("hosId");BdDepartmentRpc bdDepart = new BdDepartmentRpc();bdDepart.setSourceId(deptId);bdDepart.setName(deptName);bdDepart.setParentSource(platDept);bdDepart.setHospitalSource(hosName);departmentList.add(bdDepart);//将科室详情地址放入目标采集队列page.addTargetRequest(deptHref);//将科室下医生列表链接放入队列for(int i=1;i<6;i++){String doctorUrl = "https://yi.baidu.com/pc/admindepartment/doctorlist?diseaseId=0&medTitle=0&serviceType=0&page="+i+"&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId="+deptId+"&hosId="+hosId;page.addTargetRequest(doctorUrl);}}}}BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();bdHospitalRpc.setSourceId(hosName);System.out.println("重点科室:"+topDepts);bdHospitalRpc.setCharacteristicFaculty(topDepts);page.putField("hosTopDept", bdHospitalRpc);page.putField("departmentList", departmentList);// System.out.println(page.getHtml().toString());}catch(Exception e){e.printStackTrace();}//采集科室信息}else if(page.getUrl().regex(DEPT_INFO_URL).match()){String deptPhone = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-normal-height']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();String deptAddress = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t8 ys-util-text-normal']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();String content = page.getHtml().xpath("div[@class='office-info']/p[@class='ys-util-text-smaller ys-util-margin-t15 office-info-total']/text()").toString();String titleDescr = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 
ys-util-text-min-height']/h3[@class='ys-util-text-min ys-util-margin-r12']/text()").toString();MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);  String deptId = deptResultMap.getString("adminDepartId");String hosId = deptResultMap.getString("hosId");BdDepartmentRpc bdDepartmentRpc = new BdDepartmentRpc();bdDepartmentRpc.setAddress(deptAddress);bdDepartmentRpc.setPhone(deptPhone);bdDepartmentRpc.setContent(content);bdDepartmentRpc.setSourceId(deptId);bdDepartmentRpc.setTitleDescr(titleDescr);page.putField("bdDepartmentRpc", bdDepartmentRpc);}else if(page.getUrl().regex(DOCTOR_LIST_URL).match()){String status = new JsonPathSelector("$.status").select(page.getRawText());if(STATE_SUCCESS.equals(status)){String data = new JsonPathSelector("$.data[*]").select(page.getRawText());if(data!=null){MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);String deptId = deptResultMap.getString("adminDepartId");String hosId = deptResultMap.getString("hosId");String pageNum = deptResultMap.getString("page");List<BdDepartmentDiseaseRelaRpc> BdDepartmentDiseaseRelaRpcList = new ArrayList<BdDepartmentDiseaseRelaRpc>();JSONParser jsonParser = new JSONParser();JSONObject dataJo = null;try {dataJo = (JSONObject)jsonParser.parse(data);} catch (ParseException e) {e.printStackTrace();}if("1".equals(pageNum)){JSONArray diseaseArray= dataJo==null?null:(JSONArray)dataJo.get("selectorList");if(CollectionUtils.isNotEmpty(diseaseArray)){JSONObject obj = (JSONObject)diseaseArray.get(0);JSONArray diseaseList = (JSONArray)obj.get("list");if(CollectionUtils.isNotEmpty(diseaseList)){for(Object disease:diseaseList){JSONObject diseaseJo=(JSONObject)disease;String itemName = (String)diseaseJo.get("itemName");if("全部".equals(itemName)){continue;}BdDepartmentDiseaseRelaRpc bdDepartmentDiseaseRelaRpc = new 
BdDepartmentDiseaseRelaRpc();bdDepartmentDiseaseRelaRpc.setHospitalSourceId(hosId);bdDepartmentDiseaseRelaRpc.setDepartmentSourceId(deptId);bdDepartmentDiseaseRelaRpc.setDiseaseSource(itemName);BdDepartmentDiseaseRelaRpcList.add(bdDepartmentDiseaseRelaRpc);}}}}page.putField("bdDepartmentDiseaseRelaRpcList", BdDepartmentDiseaseRelaRpcList);if(dataJo.containsKey("doctorList")){List doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());if(CollectionUtils.isNotEmpty(doctorList)){//收集医生信息List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>();//收集医生与疾病关系信息List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>();for(Object o:doctorList){JSONObject doctorJo = (JSONObject)o;//医生认证信息String identifyMarkStr = "";if(doctorJo.containsKey("doctorIdentify")){List<String> identifyMarkList = new JsonPathSelector("$.doctorIdentify[*].identifyMark").selectList(doctorJo.toJSONString());identifyMarkStr = com.pz998.quartz.utils.StringUtils.listToString(identifyMarkList);}String doctorName = (String)doctorJo.get("doctorName");String doctorTitle= (String)doctorJo.get("doctorTitle");Object commentScore = doctorJo.get("commentScore");String doctorSkill = (String)doctorJo.get("doctorSkill");String allTimeHref = (String)doctorJo.get("allTimeHref");String doctorPhoto = (String)doctorJo.get("doctorPhoto");//医生详情页加入目标采集page.addTargetRequest(allTimeHref);MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(allTimeHref);String doctorId = resultMap.getString("doctorId");BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();bdDoctorRpc.setHospitalSourceId(hosId);bdDoctorRpc.setDepartmentSourceId(deptId);bdDoctorRpc.setSourceId(doctorId);bdDoctorRpc.setName(doctorName);bdDoctorRpc.setPracticeTitle(doctorTitle);String commentScoreStr = 
commentScore==null?"":commentScore.toString();bdDoctorRpc.setRecommendScore(commentScoreStr);bdDoctorRpc.setDiseaseTag(doctorSkill);bdDoctorRpc.setImageUrl(doctorPhoto);bdDoctorRpc.setIdentifyMark(identifyMarkStr);bdDoctorList.add(bdDoctorRpc);JSONArray treatPatientArray = (JSONArray)doctorJo.get("treatPatient");if(CollectionUtils.isNotEmpty(treatPatientArray)){for(Object treatPatient:treatPatientArray){JSONObject treatPatientJo = (JSONObject)treatPatient;String diseaseName = (String)treatPatientJo.get("diseaseName");BdDiseaseDoctorRelaRpc bdDiseaseDoctorRelaRpc = new BdDiseaseDoctorRelaRpc();bdDiseaseDoctorRelaRpc.setDiseaseSourceId(diseaseName);bdDiseaseDoctorRelaRpc.setDoctorSourceId(doctorId);bdDiseaseDoctorRelaList.add(bdDiseaseDoctorRelaRpc);}}}page.putField("bdDiseaseDoctorRelaList", bdDiseaseDoctorRelaList);page.putField("bdDoctorList", bdDoctorList);}}}}}else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);  String doctorId = deptResultMap.getString("doctorId");  BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();  bdDoctorRpc.setSourceId(doctorId);  String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString();  bdDoctorRpc.setIntro(experience);  List<String> commentList = page.getHtml().xpath("ul[@class='summary-comment']/li/p[@class='ys-util-text-default ys-util-text-smaller']/i[@class='comment-score ys-util-text-primary ys-util-text-big']/text()").all();  if(CollectionUtils.isNotEmpty(commentList)){  String recommendScore = commentList.size()>=1?commentList.get(0):"";  String treatmentEffectScore = commentList.size()>=2?commentList.get(1):"";  String attitudeScore = commentList.size()>=3?commentList.get(2):"";  bdDoctorRpc.setRecommendScore(recommendScore);  bdDoctorRpc.setTreatmentEffectScore(treatmentEffectScore);  bdDoctorRpc.setAttitudeScore(attitudeScore);  }  
page.putField("bdDoctorRpc", bdDoctorRpc);}}@Overridepublic Site getSite() {return site;}public static void main(String[] args) {Spider.create(new YiBaiduProcessor()).addUrl(START_URL).thread(10).run();}}
  • 上述代码采集百度医生数据,采集线路进入医院列表-->医院详情-->科室列表-->科室详情-->医生列表-->医生详情

  • 每个 else if 分支匹配一类页面地址,即上面所说采集链路上的一个采集节点

  • 采集相应数据时,会把网站上原有的关联关系一并采集过来;构建本地存储对象时,用于关联的 id 值(如医院 id、医生 id)从当前采集链接的参数中获取

    如下代码:    }else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){

    MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId");

    BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId);

    String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); if(StringUtils.isEmpty(experience)){ experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10 doctor-info-total']/text()").toString(); } bdDoctorRpc.setIntro(experience); System.out.println("experience:"+experience);

  • 解析Ajax json结果

    List<String> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());

if(CollectionUtils.isNotEmpty(doctorList)){ //收集医生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集医生与疾病关系信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(String o:doctorList){ JSONObject doctorJo = JSON.parseObject(o);

  • 针对元素特征一样的元素集 如li 列表 table 表格 需要依次获取其中的内容
0 0