简单的网页内容抓取实例(携程酒店)

来源:互联网 发布:淘宝反作弊系统 知乎 编辑:程序博客网 时间:2024/04/29 02:05

网页抓取的方法有很多种,这里介绍一种简单的方法:做法虽然暴力,但可以快速提取结构有规律的网页内容。

比如,对于携程酒店的网页,我们希望得到以下基本信息:

酒店名称

英文名称

城市

省份

地址

纬度

经度

经纬度(String 类型)

电话

酒店星级


这里是代码(带测试样例)

package webTextGrabber;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.List;public class WebContent { // hotelId, hotelUrl, cityId can be obtained at CtripUtil class private String hotelName; private String hotelEname; private String cityName; private String provinceName; private String address; private double lat; private double lng; private String coordinates; private String tel; private int hotelStars; public String getUrlSource(String url) throws IOException {  URL webpage = new URL(url);  URLConnection yc = webpage.openConnection();  BufferedReader in = new BufferedReader(new InputStreamReader(yc.getInputStream(), "UTF-8"));  String inputLine;  StringBuilder a = new StringBuilder();  while ((inputLine = in.readLine()) != null)   a.append(inputLine);  in.close();  return a.toString(); } public void setAll(String str) throws Exception {  try {   setHotelName(str);   setHotelEname(str);   setCityName(str);   setProvinceName(str);   setAddress(str);   setLat(str);   setLng(str);   setCoordinates();   setTel(str);   setHotelStars(str);  } catch (Exception e) {   throw new Exception(e);  } } public String setHotelName(String str) throws Exception {  try {   int index = str.indexOf("cn_n");   hotelName = str.substring(str.indexOf(">", index) + 1, str.indexOf("<", index));  } catch (Exception e) {   throw new Exception(e);  }  return hotelName; } public String setHotelEname(String str) throws Exception {  try {   int index = str.indexOf("en_n");   if (index == -1) {    return hotelEname;   }   hotelEname = str.substring(str.indexOf(">", index) + 1, str.indexOf("<", index));  } catch (Exception e) {   throw new Exception(e);  }  return hotelEname; } public String setCityName(String str) {  int index = str.indexOf("city");  cityName = str.substring(index + 5, str.indexOf(">", index) - 1);  return cityName; } public String 
setProvinceName(String str) {  int index = str.indexOf("province");  provinceName = str.substring(index + 9, str.indexOf(";", index));  return provinceName; }  public String setAddress(String str) {  int index = str.lastIndexOf("酒店地址");  if (!provinceName.equals(cityName)) {   address = provinceName + cityName;  } else {   address = provinceName;  }  address += str.substring(index + 5, str.indexOf(";", index));  return address; } public double setLat(String str) {  int index = str.indexOf("latitude");  lat = Double.parseDouble(str.substring(str.indexOf("content=", index) + 9, str.indexOf("/>", index) - 2));  return lat; } public double setLng(String str) {  int index = str.indexOf("longitude");  lng = Double.parseDouble(str.substring(str.indexOf("content=", index) + 9, str.indexOf("/>", index) - 2));  return lng; } public String setCoordinates() {  coordinates = "" + lat + ", " + lng;  return coordinates; } public String setTel(String str) throws Exception {  try {   int index = str.indexOf("电话0");   if (index == -1) {    return tel;   }   tel = str.substring(index + 2, index + 14);  } catch (Exception e) {   throw new Exception(e);  }  return tel; } public int setHotelStars(String str) throws Exception {  try {   int index = str.indexOf("hotel_stars");   if (index == -1) {    return hotelStars;   }   hotelStars = Integer.parseInt(str.substring(index + 11, index + 13));  } catch (Exception e) {   throw new Exception(e);  }  return hotelStars; } public String getHotelName() {  return hotelName; } public String getHotelEname() {  return hotelEname; } public String getCityName() {  return cityName; } public String getProvinceName() {  return provinceName; } public String getAddress() {  return address; } public double getLat() {  return lat; } public double getLng() {  return lng; } public String getCoordinates() {  return coordinates; } public String getTel() {  return tel; } public int getHotelStars() {  return hotelStars; } /**  *   * @param args  *   * @throws 
IOException  */ public static void main(final String args[]) throws IOException   {  final List<String> list = new ArrayList<String>();  list.add("http://hotels.ctrip.com/hotel/427952.html");  list.add("http://hotels.ctrip.com/hotel/671.html");  list.add("http://hotels.ctrip.com/hotel/2005959.html");  list.add("http://hotels.ctrip.com/hotel/481810.html");  list.add("http://hotels.ctrip.com/hotel/2104633.html");  list.add("http://hotels.ctrip.com/hotel/1481502.html");  list.add("http://hotels.ctrip.com/hotel/1720124.html");  list.add("http://hotels.ctrip.com/hotel/2165407.html");  list.add("http://hotels.ctrip.com/hotel/1636803.html");  list.add("http://hotels.ctrip.com/hotel/371188.html");    final WebContent wc = new WebContent();  for (int i = 0; i < list.size(); i++) {   String webinfo = wc.getUrlSource(list.get(i));   if (webinfo == null || webinfo.length() == 0 || webinfo.indexOf("验证") != -1) {    continue;   }   try {    wc.setAll(webinfo);   } catch (Exception e) {    // TODO Auto-generated catch block    e.printStackTrace();   }   System.out.println(wc.getHotelName());   System.out.println(wc.getHotelEname());   System.out.println(wc.getCityName());   System.out.println(wc.getProvinceName());   System.out.println(wc.getAddress());   System.out.println(wc.getLat());   System.out.println(wc.getLng());   System.out.println(wc.getCoordinates());   System.out.println(wc.getTel());   System.out.println(wc.getHotelStars());  }   }}


这里是输出结果:

北京金隅喜来登酒店
Sheraton Beijing Dongcheng Hotel
北京
北京
北京北三环东路36号
39.97346873
116.4163028
39.97346873, 116.4163028
010-57988888
5
上海大厦
Broadway Mansions Hotel
上海
上海
上海北苏州路20号
31.250007605066
121.49663745291
31.250007605066, 121.49663745291
021-63246260
5
宁国伯爵王朝大酒店
Bojue Dynasty Hotel
宁国
安徽
安徽宁国宁阳西路155号
30.606036618238
118.97454952709
30.606036618238, 118.97454952709
0563-4188888
5
合肥天鹅湖大酒店
Swan Lake Hotel
合肥
安徽
安徽合肥政务文化新区东流路888号
31.823922092928
117.23604371154
31.823922092928, 117.23604371154
0551-6353666
5
宁国都市阳光酒店

宁国
安徽
安徽宁国中溪北路8号
30.633710023238
118.98375012382
30.633710023238, 118.98375012382
0563-4101788
3
达拉特旗东达假日酒店
Dongda Holiday Hotel
达拉特旗
内蒙古
内蒙古达拉特旗树林召西街南侧
40.402906010224
110.00925942075
40.402906010224, 110.00925942075
0477-3963888
3
大理和舀田园度假酒店

大理市
云南
云南大理市城北村(0872-2475995)
25.861913102242
100.14410073281
25.861913102242, 100.14410073281
0872-2475995
3
喀纳斯贾登峪回家休闲酒店(酒店区)
Connectedhome
布尔津
新疆
新疆布尔津喀纳斯贾登峪生活区游客接待基地一区
48.501779520655
87.157369327333
48.501779520655, 87.157369327333
0906-6327598
3
欣得酒店(北京石佛营店)

北京
北京
北京石佛营东里99号
39.941142006731
116.51243647991
39.941142006731, 116.51243647991
010-85814122
3
青岛颐中皇冠假日酒店
Crowne Plaza Qingdao
青岛
山东
山东青岛香港中路76号
36.070022690161
120.40615095949
36.070022690161, 120.40615095949
0532-8571888
3

当然,在具体的工作学习使用中,可以将数据存成相应的数据格式来保存在数据库中。

原创粉丝点击