通过搜索词获取百度健康排位

来源:互联网 发布:知乎最近浏览 编辑:程序博客网 时间:2024/04/26 01:30

一、需求

       最近要为公司做百度健康排位抓取工具,就是通过一批搜索词得到百度健康展现出来的医院列表及其位置

  比如说我在百度健康搜索“北京人流”,得到如下图所示的列表,需要取得搜索结果中每家医院的位置,例如“北京奥北医院”对应“左1”,“中国人民解放军第二炮兵总医院”对应“右1”,

以此类推,只需要第一页就行

二、代码实现

      我使用了 Jsoup 与 HtmlUnit 相结合的方式获取并解析页面,采用 Apache POI 读取和导出 Excel 文件

     

package com.huaxia.yanfa.export;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Reads search keywords from {@code searchkeys.xlsx}, queries the Baidu Health
 * search page for each keyword, and exports the hospitals found in the left and
 * right result columns (with their 1-based position) to an {@code .xls} file.
 */
public class Exports {

    /** Baidu Health search URL; the URL-encoded keyword is appended as the {@code wd} parameter. */
    private static final String SEARCH_URL_PREFIX =
            "http://jiankang.baidu.com/juhe?aType=2&source=0&sessionID=1380510305320866&provID=1&provName=%E5%8C%97%E4%BA%AC&avoid_enter_submit=&wd=";

    /** Map keys of one result record, in the same order as the exported columns. */
    private static final String[] RECORD_KEYS = { "searchKey", "weizhi", "hospitalName" };

    /**
     * Reads the search keywords from the first column of the first sheet of
     * {@code searchkeys.xlsx} in the working directory.
     *
     * <p>Row 0 is skipped (the loop starts at row 1), as are missing rows, missing
     * cells and blank cells. Keywords are returned trimmed.
     *
     * @return the keywords in sheet order; empty if the file is missing or cannot be read
     */
    public List<String> readSearchKeys() {
        List<String> keys = new ArrayList<String>();
        FileInputStream in = null;
        try {
            File inFile = new File("searchkeys.xlsx");
            // isFile() already implies that the path exists.
            if (inFile.isFile()) {
                in = new FileInputStream(inFile);
                System.out.println("开始读取搜索词");
                XSSFWorkbook wb = new XSSFWorkbook(in);
                XSSFSheet sheet = wb.getSheetAt(0);
                int lastRowNum = sheet.getLastRowNum();
                for (int i = 1; i <= lastRowNum; i++) {
                    XSSFRow row = sheet.getRow(i);
                    if (row == null) {
                        continue;
                    }
                    XSSFCell cell = row.getCell(0);
                    if (cell == null) {
                        continue;
                    }
                    // Store the trimmed value so stray whitespace does not end up in the query.
                    String key = cell.toString().trim();
                    if (key.length() > 0) {
                        keys.add(key);
                    }
                }
                System.out.println("读取搜索词完毕");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The original never closed the input stream.
            closeAndReport(in);
        }
        return keys;
    }

    /**
     * Fetches the Baidu Health result page for every keyword returned by
     * {@link #readSearchKeys()}, collects the hospitals shown in the left and right
     * columns, and writes them to {@code 百度健康排位.xls} in the working directory.
     *
     * <p>Any failure is printed to standard error; in that case no file is written.
     */
    public void writeExcel() {
        // NOTE(review): the WebClient is never released. The correct call depends on the
        // HtmlUnit version (closeAllWindows() vs close()), which is not visible here.
        WebClient webClient = new WebClient();
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        FileOutputStream out = null;
        try {
            List<Map<String, String>> records = new ArrayList<Map<String, String>>();
            for (String key : readSearchKeys()) {
                String url = SEARCH_URL_PREFIX + URLEncoder.encode(key, "UTF-8");
                HtmlPage htmlPage = webClient.getPage(url);
                Document doc = Jsoup.parse(htmlPage.asXml());

                // Left column results.
                collectRankings(doc.select("div.card-detail"), key, "左", records);

                // Right column results; the container may be absent for some keywords.
                Element rightColumn = doc.getElementById("health-right");
                if (rightColumn != null) {
                    collectRankings(rightColumn.select("div.card-info"), key, "右", records);
                }
            }

            // Build the export workbook.
            HSSFWorkbook hwb = new HSSFWorkbook();
            HSSFSheet sheet = hwb.createSheet("IP");
            HSSFCellStyle style = hwb.createCellStyle();
            style.setAlignment(HSSFCellStyle.ALIGN_CENTER);

            String[] head = { "搜索词", "位置", "医院名称" };
            HSSFRow headerRow = sheet.createRow(0);
            for (int j = 0; j < head.length; j++) {
                HSSFCell cell = headerRow.createCell(j);
                cell.setCellValue(head[j]);
                cell.setCellStyle(style);
            }

            int rowIndex = 1;
            for (Map<String, String> record : records) {
                HSSFRow row = sheet.createRow(rowIndex++);
                for (int j = 0; j < RECORD_KEYS.length; j++) {
                    String value = record.get(RECORD_KEYS[j]);
                    row.createCell(j).setCellValue(value == null ? "" : value);
                }
            }

            out = new FileOutputStream(new File("百度健康排位.xls"));
            hwb.write(out);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The original never closed the output stream.
            closeAndReport(out);
        }
    }

    /**
     * Appends one record per result card to {@code out}.
     *
     * <p>Cards without an {@code a.card-hospital-name} link are skipped and do not
     * consume a position number. The original code threw on such a card, which
     * aborted the whole export.
     *
     * @param cards      result cards of one column, in page order
     * @param searchKey  keyword that produced the page
     * @param sidePrefix column marker placed before the 1-based position ("左" or "右")
     * @param out        list receiving the records
     */
    private static void collectRankings(Elements cards, String searchKey, String sidePrefix,
            List<Map<String, String>> out) {
        int position = 1;
        for (Element card : cards) {
            Elements title = card.select("a.card-hospital-name");
            if (title.isEmpty()) {
                continue;
            }
            // text() already contains the text of nested <em> highlight tags, so no extra
            // handling is needed. The spaces removed here come from whitespace between the
            // text fragments of the parsed page.
            String hospitalName = title.get(0).text().replace(" ", "");

            Map<String, String> record = new HashMap<String, String>();
            record.put("searchKey", searchKey);
            record.put("weizhi", sidePrefix + position);
            record.put("hospitalName", hospitalName);
            out.add(record);
            position++;
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any failure. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        new Exports().writeExcel();
    }
}


 

0 0
原创粉丝点击