webmagic实战使用
来源:互联网 发布:南京黑马程序员培训班 编辑:程序博客网 时间:2024/06/08 03:11
一.引入依赖包
<!-- WebMagic core module -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.4.3</version>
</dependency>
<!-- WebMagic extension module -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.4.3</version>
</dependency>
二.代码
package com.pz998.quartz.spider;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.eclipse.jetty.util.MultiMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.pz998.rpc.model.entity.BdDepartmentDiseaseRelaRpc;
import com.pz998.rpc.model.entity.BdDepartmentRpc;
import com.pz998.rpc.model.entity.BdDiseaseDoctorRelaRpc;
import com.pz998.rpc.model.entity.BdDoctorRpc;
import com.pz998.rpc.model.entity.BdHospitalRpc;

import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;
import net.minidev.json.parser.JSONParser;
import net.minidev.json.parser.ParseException;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.xsoup.Xsoup;

/**
 * WebMagic page processor that crawls yi.baidu.com.
 *
 * <p>The crawl walks: hospital list -> hospital detail -> department list ->
 * department detail -> doctor list -> doctor detail. Each branch of
 * {@link #process(Page)} matches one URL pattern (one node of that chain),
 * extracts entities into the page's result fields via {@code page.putField},
 * and enqueues the follow-up URLs via {@code page.addTargetRequest}.
 *
 * <p>Relations between entities (hospital / department / doctor ids) are read
 * from the query parameters of the URL being processed.
 */
public class YiBaiduProcessor implements PageProcessor {

    private static final String START_URL =
            "https://yi.baidu.com/pc/hospital/list?cityId=371&pageSize=10&page=1";

    // NOTE(review): not referenced anywhere in this class; kept as in the original.
    private static final String HOSPITAL_DETAIL_URL =
            "https://yi\\.baidu\\.com/pc/hospital/index\\?zt=pcpinzhuan&zt_ext=&pvid=\\d+&key=\\S+";

    private static final String HOSPITAL_LIST_URL =
            "https://yi\\.baidu\\.com/pc/hospital/list\\?cityId=\\d++&pageSize=10&page=\\d++";

    // Example: https://yi.baidu.com/pc/hospital/info?key=%E6%AD%A6%E6%B1%89%E5%B8%82%E5%A6%87%E5%A5%B3%E5%84%BF%E7%AB%A5%E5%8C%BB%E7%96%97%E4%BF%9D%E5%81%A5%E4%B8%AD%E5%BF%83
    private static final String HOSPITAL_INFO_URL =
            "https://yi\\.baidu\\.com/pc/hospital/info\\?key=\\S+";

    private static final String DEPT_INFO_URL =
            "https://yi\\.baidu\\.com/pc/admindepartment/detail\\?zt=\\w+&zt_ext=&pvid=\\d+&hosId=\\d+&adminDepartId=\\d+";

    // Example: https://yi.baidu.com/pc/hospital/alldep?key=
    private static final String HOSPITAL_DEPT_URL =
            "https://yi\\.baidu\\.com/pc/hospital/alldep\\?key=\\S+";

    // "&regionId" had been corrupted to "®ionId" (HTML entity "&reg" rendered as a
    // symbol); restored here and in the URL built by processHospitalDepartments.
    private static final String DOCTOR_LIST_URL =
            "https://yi\\.baidu\\.com/pc/admindepartment/doctorlist\\?diseaseId=0&medTitle=0&serviceType=0&page=\\d+&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=\\d+&hosId=\\d+";

    private static final String DOCTOR_INFO_URL =
            "https://yi\\.baidu\\.com/pc/doctor/detailpage\\?zt=\\w+&zt_ext=&pvid=0&doctorId=\\d+";

    /** Number of doctor-list pages (1..N) enqueued for every department. */
    private static final int DOCTOR_LIST_PAGES = 5;

    private final Site site = Site.me();

    /** Value of the {@code status} field in JSON responses that are processed. */
    public static final String STATE_SUCCESS = "0";

    /** Maps the {@code cityId} query parameter to the city name stored on hospitals. */
    public static final Map<String, String> CITY_MAP = new HashMap<String, String>();

    static {
        CITY_MAP.put("371", "武汉");
        CITY_MAP.put("1", "北京");
        CITY_MAP.put("2", "上海");
        CITY_MAP.put("84", "广州");
    }

    /**
     * Dispatches the page to the handler whose URL pattern matches it.
     * Pages matching none of the patterns are ignored.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();
        if (page.getUrl().regex(HOSPITAL_LIST_URL).match()) {
            try {
                processHospitalList(page, url);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if (page.getUrl().regex(HOSPITAL_INFO_URL).match()) {
            try {
                processHospitalInfo(page, url);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if (page.getUrl().regex(HOSPITAL_DEPT_URL).match()) {
            try {
                processHospitalDepartments(page, url);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if (page.getUrl().regex(DEPT_INFO_URL).match()) {
            processDepartmentInfo(page, url);
        } else if (page.getUrl().regex(DOCTOR_LIST_URL).match()) {
            processDoctorList(page, url);
        } else if (page.getUrl().regex(DOCTOR_INFO_URL).match()) {
            processDoctorInfo(page, url);
        }
    }

    /** Hospital list (JSON): collects hospitals and enqueues their info and department pages. */
    private void processHospitalList(Page page, String url) {
        String rawText = page.getRawText();
        String state = new JsonPathSelector("$.status").select(rawText);
        if (!STATE_SUCCESS.equals(state)) {
            return;
        }

        // selectList yields the matched nodes as strings, so every element has to
        // be parsed before it can be used as a JSON object.
        List<String> hospitalList = new JsonPathSelector("$.data.hospitalList[*]").selectList(rawText);
        MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String cityId = resultMap.getString("cityId");
        if (CollectionUtils.isEmpty(hospitalList)) {
            return;
        }

        List<BdHospitalRpc> bdHospitalList = new ArrayList<BdHospitalRpc>();
        for (String hospitalJson : hospitalList) {
            JSONObject jsonObj = parseJsonObject(hospitalJson);
            if (jsonObj == null) {
                continue;
            }
            String name = (String) jsonObj.get("name");
            System.out.println("name:" + name);
            String address = (String) jsonObj.get("address");
            String level = (String) jsonObj.get("level");
            String phone = (String) jsonObj.get("phone");
            String grade = (String) jsonObj.get("grade");
            String imageUrl = (String) jsonObj.get("logo");
            String routeLink = (String) jsonObj.get("routeLink");

            // The "location" parameter of routeLink is split on "," into
            // latitude (first part) and longitude (second part).
            MultiMap<String> routeLinkMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(routeLink);
            String location = routeLinkMap.getString("location");
            String latitude = "";
            String longitude = "";
            if (StringUtils.isNotEmpty(location)) {
                String[] locationArray = location.split(",");
                latitude = locationArray.length > 0 ? locationArray[0] : "";
                longitude = locationArray.length > 1 ? locationArray[1] : "";
            }

            BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
            bdHospitalRpc.setSourceId(name);
            bdHospitalRpc.setName(name);
            bdHospitalRpc.setAddress(address);
            bdHospitalRpc.setLevel(level);
            bdHospitalRpc.setPhone(phone);
            bdHospitalRpc.setImageUrl(imageUrl);
            bdHospitalRpc.setLatitude(latitude);
            bdHospitalRpc.setLongitude(longitude);
            bdHospitalRpc.setScore(grade);
            bdHospitalRpc.setCity(CITY_MAP.get(cityId));
            // Numeric fields are read as Object: the concrete boxed type produced
            // by the JSON parser (Integer vs Long) is not guaranteed.
            bdHospitalRpc.setIsMedicalInsurance(toStringOrEmpty(jsonObj.get("insurance")));
            bdHospitalRpc.setHighQualityDoctorNum(toStringOrEmpty(jsonObj.get("doctorNum")));
            bdHospitalRpc.setFinishedServiceNum(toStringOrEmpty(jsonObj.get("serveNum")));
            bdHospitalRpc.setPatientCommentNum(toStringOrEmpty(jsonObj.get("commentNum")));
            bdHospitalList.add(bdHospitalRpc);

            String infoUrl = "https://yi.baidu.com/pc/hospital/info?key=" + name;
            String allDeptUrl = "https://yi.baidu.com/pc/hospital/alldep?key=" + name;
            page.addTargetRequest(infoUrl);
            page.addTargetRequest(allDeptUrl);
        }
        page.putField("bdHospitalList", bdHospitalList);
    }

    /** Hospital info (HTML): maps the first five description paragraphs onto a hospital. */
    private void processHospitalInfo(Page page, String url) {
        MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String hosName = resultMap.getString("key");
        BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();

        List<String> contextList = page.getHtml().xpath(
                "ul[@class='container-list-info']/li[@class='ys-util-margin-b35']/p[@class='ys-util-text-smaller ys-util-margin-t9 ys-util-margin-b30']/text()").all();
        if (CollectionUtils.isNotEmpty(contextList)) {
            // Paragraph order: overview, history, featured departments, medical team, honors.
            bdHospitalRpc.setContent(elementOrEmpty(contextList, 0));
            bdHospitalRpc.setHistory(elementOrEmpty(contextList, 1));
            bdHospitalRpc.setCharacteristicDept(elementOrEmpty(contextList, 2));
            bdHospitalRpc.setTeam(elementOrEmpty(contextList, 3));
            bdHospitalRpc.setHonor(elementOrEmpty(contextList, 4));
        }
        bdHospitalRpc.setSourceId(hosName);
        page.putField("bdHospitalRpc", bdHospitalRpc);
    }

    /**
     * Department list of a hospital (HTML): collects departments, the hospital's
     * key departments, and enqueues department detail and doctor-list pages.
     */
    private void processHospitalDepartments(Page page, String url) {
        MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String hosName = resultMap.getString("key");
        String topDepts = "";

        List<String> tableHtml = page.getHtml().xpath(
                "div[@class='container-common-office']/table[@class='ys-util-margin-b15 list-office ys-util-border-big']").all();
        List<BdDepartmentRpc> departmentList = new ArrayList<BdDepartmentRpc>();
        for (String html : tableHtml) {
            Document document = Jsoup.parse(html);
            String platDept = Xsoup.select(document, "td[@class='primary-office']/h4/text()").get();
            List<String> hospitalDepts = Xsoup.select(document,
                    "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']").list();
            List<String> hospitalDeptNames = Xsoup.select(document,
                    "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']/text()").list();

            if (StringUtils.isEmpty(platDept)) {
                // A table without a primary-department heading lists the key departments.
                topDepts = com.pz998.quartz.utils.StringUtils.listToString(hospitalDeptNames);
                continue;
            }

            // Otherwise the table lists the hospital departments under platDept.
            for (String deptHtml : hospitalDepts) {
                Document deptDocument = Jsoup.parse(deptHtml);
                String deptName = Xsoup.select(deptDocument, "a/text()").get();
                String deptHref = Xsoup.select(deptDocument, "a/@href").get();
                MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(deptHref);
                String deptId = deptResultMap.getString("adminDepartId");
                String hosId = deptResultMap.getString("hosId");

                BdDepartmentRpc bdDepart = new BdDepartmentRpc();
                bdDepart.setSourceId(deptId);
                bdDepart.setName(deptName);
                bdDepart.setParentSource(platDept);
                bdDepart.setHospitalSource(hosName);
                departmentList.add(bdDepart);

                // Enqueue the department detail page.
                page.addTargetRequest(deptHref);
                // Enqueue the first pages of the department's doctor list.
                for (int i = 1; i <= DOCTOR_LIST_PAGES; i++) {
                    String doctorUrl = "https://yi.baidu.com/pc/admindepartment/doctorlist?diseaseId=0&medTitle=0&serviceType=0&page="
                            + i + "&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=" + deptId + "&hosId=" + hosId;
                    page.addTargetRequest(doctorUrl);
                }
            }
        }

        BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
        bdHospitalRpc.setSourceId(hosName);
        System.out.println("重点科室:" + topDepts);
        bdHospitalRpc.setCharacteristicFaculty(topDepts);
        page.putField("hosTopDept", bdHospitalRpc);
        page.putField("departmentList", departmentList);
    }

    /** Department detail (HTML): phone, address, description and title of one department. */
    private void processDepartmentInfo(Page page, String url) {
        String deptPhone = page.getHtml().xpath(
                "div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-normal-height']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
        String deptAddress = page.getHtml().xpath(
                "div[@class='summary-left']/div[@class='summary-row ys-util-margin-t8 ys-util-text-normal']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
        String content = page.getHtml().xpath(
                "div[@class='office-info']/p[@class='ys-util-text-smaller ys-util-margin-t15 office-info-total']/text()").toString();
        String titleDescr = page.getHtml().xpath(
                "div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-min-height']/h3[@class='ys-util-text-min ys-util-margin-r12']/text()").toString();

        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String deptId = deptResultMap.getString("adminDepartId");

        BdDepartmentRpc bdDepartmentRpc = new BdDepartmentRpc();
        bdDepartmentRpc.setAddress(deptAddress);
        bdDepartmentRpc.setPhone(deptPhone);
        bdDepartmentRpc.setContent(content);
        bdDepartmentRpc.setSourceId(deptId);
        bdDepartmentRpc.setTitleDescr(titleDescr);
        page.putField("bdDepartmentRpc", bdDepartmentRpc);
    }

    /**
     * Doctor list (JSON): collects department-disease relations (first page only),
     * doctors, doctor-disease relations, and enqueues the doctor detail pages.
     */
    private void processDoctorList(Page page, String url) {
        String rawText = page.getRawText();
        String status = new JsonPathSelector("$.status").select(rawText);
        if (!STATE_SUCCESS.equals(status)) {
            return;
        }
        // NOTE(review): the code below treats the result of "$.data[*]" as the data
        // object itself (it reads "selectorList"/"doctorList" from it) - confirm this
        // against the JsonPath version in use.
        String data = new JsonPathSelector("$.data[*]").select(rawText);
        if (data == null) {
            return;
        }

        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String deptId = deptResultMap.getString("adminDepartId");
        String hosId = deptResultMap.getString("hosId");
        String pageNum = deptResultMap.getString("page");

        // null when the payload cannot be parsed; handled explicitly below.
        JSONObject dataJo = parseJsonObject(data);

        List<BdDepartmentDiseaseRelaRpc> departmentDiseaseRelaList = new ArrayList<BdDepartmentDiseaseRelaRpc>();
        if ("1".equals(pageNum)) {
            // The disease filter is only read from page 1.
            JSONArray diseaseArray = dataJo == null ? null : (JSONArray) dataJo.get("selectorList");
            if (CollectionUtils.isNotEmpty(diseaseArray)) {
                JSONObject firstSelector = (JSONObject) diseaseArray.get(0);
                JSONArray diseaseList = (JSONArray) firstSelector.get("list");
                if (CollectionUtils.isNotEmpty(diseaseList)) {
                    for (Object disease : diseaseList) {
                        JSONObject diseaseJo = (JSONObject) disease;
                        String itemName = (String) diseaseJo.get("itemName");
                        // "全部" is skipped rather than stored as a disease.
                        if ("全部".equals(itemName)) {
                            continue;
                        }
                        BdDepartmentDiseaseRelaRpc rela = new BdDepartmentDiseaseRelaRpc();
                        rela.setHospitalSourceId(hosId);
                        rela.setDepartmentSourceId(deptId);
                        rela.setDiseaseSource(itemName);
                        departmentDiseaseRelaList.add(rela);
                    }
                }
            }
        }
        page.putField("bdDepartmentDiseaseRelaRpcList", departmentDiseaseRelaList);

        // dataJo is null after a parse failure; there is nothing more to read then.
        if (dataJo == null || !dataJo.containsKey("doctorList")) {
            return;
        }
        List<String> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(rawText);
        if (CollectionUtils.isEmpty(doctorList)) {
            return;
        }

        // Doctors found on this page.
        List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>();
        // Doctor-to-disease relations found on this page.
        List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>();
        for (String doctorJson : doctorList) {
            JSONObject doctorJo = parseJsonObject(doctorJson);
            if (doctorJo == null) {
                continue;
            }

            // Doctor certification marks.
            String identifyMarkStr = "";
            if (doctorJo.containsKey("doctorIdentify")) {
                List<String> identifyMarkList =
                        new JsonPathSelector("$.doctorIdentify[*].identifyMark").selectList(doctorJo.toJSONString());
                identifyMarkStr = com.pz998.quartz.utils.StringUtils.listToString(identifyMarkList);
            }

            String doctorName = (String) doctorJo.get("doctorName");
            String doctorTitle = (String) doctorJo.get("doctorTitle");
            Object commentScore = doctorJo.get("commentScore");
            String doctorSkill = (String) doctorJo.get("doctorSkill");
            String allTimeHref = (String) doctorJo.get("allTimeHref");
            String doctorPhoto = (String) doctorJo.get("doctorPhoto");

            // Enqueue the doctor detail page.
            page.addTargetRequest(allTimeHref);
            MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(allTimeHref);
            String doctorId = resultMap.getString("doctorId");

            BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
            bdDoctorRpc.setHospitalSourceId(hosId);
            bdDoctorRpc.setDepartmentSourceId(deptId);
            bdDoctorRpc.setSourceId(doctorId);
            bdDoctorRpc.setName(doctorName);
            bdDoctorRpc.setPracticeTitle(doctorTitle);
            bdDoctorRpc.setRecommendScore(toStringOrEmpty(commentScore));
            bdDoctorRpc.setDiseaseTag(doctorSkill);
            bdDoctorRpc.setImageUrl(doctorPhoto);
            bdDoctorRpc.setIdentifyMark(identifyMarkStr);
            bdDoctorList.add(bdDoctorRpc);

            JSONArray treatPatientArray = (JSONArray) doctorJo.get("treatPatient");
            if (CollectionUtils.isNotEmpty(treatPatientArray)) {
                for (Object treatPatient : treatPatientArray) {
                    JSONObject treatPatientJo = (JSONObject) treatPatient;
                    String diseaseName = (String) treatPatientJo.get("diseaseName");
                    BdDiseaseDoctorRelaRpc rela = new BdDiseaseDoctorRelaRpc();
                    rela.setDiseaseSourceId(diseaseName);
                    rela.setDoctorSourceId(doctorId);
                    bdDiseaseDoctorRelaList.add(rela);
                }
            }
        }
        page.putField("bdDiseaseDoctorRelaList", bdDiseaseDoctorRelaList);
        page.putField("bdDoctorList", bdDoctorList);
    }

    /** Doctor detail (HTML): introduction text and the three comment scores of one doctor. */
    private void processDoctorInfo(Page page, String url) {
        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String doctorId = deptResultMap.getString("doctorId");
        BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
        bdDoctorRpc.setSourceId(doctorId);

        String experience = page.getHtml().xpath(
                "div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString();
        if (StringUtils.isEmpty(experience)) {
            // Some pages carry the text in a block with the extra "doctor-info-total" class.
            experience = page.getHtml().xpath(
                    "div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10 doctor-info-total']/text()").toString();
        }
        bdDoctorRpc.setIntro(experience);

        List<String> commentList = page.getHtml().xpath(
                "ul[@class='summary-comment']/li/p[@class='ys-util-text-default ys-util-text-smaller']/i[@class='comment-score ys-util-text-primary ys-util-text-big']/text()").all();
        if (CollectionUtils.isNotEmpty(commentList)) {
            bdDoctorRpc.setRecommendScore(elementOrEmpty(commentList, 0));
            bdDoctorRpc.setTreatmentEffectScore(elementOrEmpty(commentList, 1));
            bdDoctorRpc.setAttitudeScore(elementOrEmpty(commentList, 2));
        }
        page.putField("bdDoctorRpc", bdDoctorRpc);
    }

    /**
     * Parses a JSON object.
     *
     * @param json JSON text, may be {@code null}
     * @return the parsed object, or {@code null} if {@code json} is {@code null},
     *         is not valid JSON, or is not a JSON object
     */
    private static JSONObject parseJsonObject(String json) {
        if (json == null) {
            return null;
        }
        try {
            // A parser instance is created per call so that nothing is shared
            // between the spider's worker threads.
            Object parsed = new JSONParser().parse(json);
            return parsed instanceof JSONObject ? (JSONObject) parsed : null;
        } catch (ParseException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Returns {@code value.toString()}, or the empty string for {@code null}. */
    private static String toStringOrEmpty(Object value) {
        return value == null ? "" : value.toString();
    }

    /** Returns the element at {@code index}, or the empty string if the list is shorter. */
    private static String elementOrEmpty(List<String> values, int index) {
        return values.size() > index ? values.get(index) : "";
    }

    @Override
    public Site getSite() {
        return site;
    }

    /** Starts the crawl at {@code START_URL} with 10 threads. */
    public static void main(String[] args) {
        Spider.create(new YiBaiduProcessor()).addUrl(START_URL).thread(10).run();
    }
}
上述代码采集百度医生数据,采集线路进入医院列表-->医院详情-->科室列表-->科室详情-->医生列表-->医生详情
每个 else if 分支匹配一类页面地址,即上面所说采集链路上的一个采集节点
采集相应数据时,会把网站上的原始关联关系一并采集过来;在构建本地存储对象时,从采集链接的参数中获取关联字段的值,例如医院 id、医生 id
如下代码 }else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){
MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId");
BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId);
String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); if(StringUtils.isEmpty(experience)){ experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10 doctor-info-total']/text()").toString(); } bdDoctorRpc.setIntro(experience); System.out.println("experience:"+experience);
解析Ajax json结果
List<String> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());
if(CollectionUtils.isNotEmpty(doctorList)){ //收集医生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集医生与疾病关系信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(String o:doctorList){ JSONObject doctorJo = JSON.parseObject(o);
- 针对元素特征一样的元素集 如li 列表 table 表格 需要依次获取其中的内容
- webmagic实战使用
- WebMagic-使用入门
- webmagic爬虫使用
- webmagic使用疑问
- WebMagic-使用入门
- webmagic使用总结
- 基于WebMagic的java爬虫实战
- webmagic
- WebMagic
- webmagic
- 使用WebMagic爬新浪博客
- 使用注解编写WebMagic爬虫
- java 爬虫 WebMagic-使用入门
- 使用WebMagic爬虫框架爬取暴走漫画
- Java爬虫框架WebMagic的使用总结
- 使用WebMagic爬CSDN上的文章
- 使用webmagic 爬取天气网站
- 使用webmagic 爬取中关村评论
- jQuery tab切换收集
- LeetCode-Hash-242. Valid Anagram
- 第一次写博客
- Chapter 5: 关联式容器之 set 和 multiset
- 欢迎使用CSDN-markdown编辑器
- webmagic实战使用
- 洛谷 1196 银河英雄传说 并查集
- 实现卡片布局,左右滑动
- 洛谷P1147 连续自然数和
- 洛谷 P3366 【模板】最小生成树
- HTTP Request 和Response
- Python 设置系统默认编码
- HBase 学习笔记
- 插入排序