简单的网页内容抓取实例(携程酒店)
来源:互联网 发布:淘宝反作弊系统 知乎 编辑:程序博客网 时间:2024/04/29 02:05
网页抓取有很多种,这里介绍一个简单方法,暴力但快速得到有规律的网页内容
比如携程酒店的网页内容,希望得到以下基本信息:
酒店名称
英文名称
城市
省份
地址
纬度
经度
经纬度(String 类型)
电话
酒店星级
这里是代码(带测试样例)
package webTextGrabber;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStreamReader;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.List;public class WebContent { // hotelId, hotelUrl, cityId can be obtained at CtripUtil class private String hotelName; private String hotelEname; private String cityName; private String provinceName; private String address; private double lat; private double lng; private String coordinates; private String tel; private int hotelStars; public String getUrlSource(String url) throws IOException { URL webpage = new URL(url); URLConnection yc = webpage.openConnection(); BufferedReader in = new BufferedReader(new InputStreamReader(yc.getInputStream(), "UTF-8")); String inputLine; StringBuilder a = new StringBuilder(); while ((inputLine = in.readLine()) != null) a.append(inputLine); in.close(); return a.toString(); } public void setAll(String str) throws Exception { try { setHotelName(str); setHotelEname(str); setCityName(str); setProvinceName(str); setAddress(str); setLat(str); setLng(str); setCoordinates(); setTel(str); setHotelStars(str); } catch (Exception e) { throw new Exception(e); } } public String setHotelName(String str) throws Exception { try { int index = str.indexOf("cn_n"); hotelName = str.substring(str.indexOf(">", index) + 1, str.indexOf("<", index)); } catch (Exception e) { throw new Exception(e); } return hotelName; } public String setHotelEname(String str) throws Exception { try { int index = str.indexOf("en_n"); if (index == -1) { return hotelEname; } hotelEname = str.substring(str.indexOf(">", index) + 1, str.indexOf("<", index)); } catch (Exception e) { throw new Exception(e); } return hotelEname; } public String setCityName(String str) { int index = str.indexOf("city"); cityName = str.substring(index + 5, str.indexOf(">", index) - 1); return cityName; } public String setProvinceName(String str) { int index = str.indexOf("province"); provinceName 
= str.substring(index + 9, str.indexOf(";", index)); return provinceName; } public String setAddress(String str) { int index = str.lastIndexOf("酒店地址"); if (!provinceName.equals(cityName)) { address = provinceName + cityName; } else { address = provinceName; } address += str.substring(index + 5, str.indexOf(";", index)); return address; } public double setLat(String str) { int index = str.indexOf("latitude"); lat = Double.parseDouble(str.substring(str.indexOf("content=", index) + 9, str.indexOf("/>", index) - 2)); return lat; } public double setLng(String str) { int index = str.indexOf("longitude"); lng = Double.parseDouble(str.substring(str.indexOf("content=", index) + 9, str.indexOf("/>", index) - 2)); return lng; } public String setCoordinates() { coordinates = "" + lat + ", " + lng; return coordinates; } public String setTel(String str) throws Exception { try { int index = str.indexOf("电话0"); if (index == -1) { return tel; } tel = str.substring(index + 2, index + 14); } catch (Exception e) { throw new Exception(e); } return tel; } public int setHotelStars(String str) throws Exception { try { int index = str.indexOf("hotel_stars"); if (index == -1) { return hotelStars; } hotelStars = Integer.parseInt(str.substring(index + 11, index + 13)); } catch (Exception e) { throw new Exception(e); } return hotelStars; } public String getHotelName() { return hotelName; } public String getHotelEname() { return hotelEname; } public String getCityName() { return cityName; } public String getProvinceName() { return provinceName; } public String getAddress() { return address; } public double getLat() { return lat; } public double getLng() { return lng; } public String getCoordinates() { return coordinates; } public String getTel() { return tel; } public int getHotelStars() { return hotelStars; } /** * * @param args * * @throws IOException */ public static void main(final String args[]) throws IOException { final List<String> list = new ArrayList<String>(); 
list.add("http://hotels.ctrip.com/hotel/427952.html"); list.add("http://hotels.ctrip.com/hotel/671.html"); list.add("http://hotels.ctrip.com/hotel/2005959.html"); list.add("http://hotels.ctrip.com/hotel/481810.html"); list.add("http://hotels.ctrip.com/hotel/2104633.html"); list.add("http://hotels.ctrip.com/hotel/1481502.html"); list.add("http://hotels.ctrip.com/hotel/1720124.html"); list.add("http://hotels.ctrip.com/hotel/2165407.html"); list.add("http://hotels.ctrip.com/hotel/1636803.html"); list.add("http://hotels.ctrip.com/hotel/371188.html"); final WebContent wc = new WebContent(); for (int i = 0; i < list.size(); i++) { String webinfo = wc.getUrlSource(list.get(i)); if (webinfo == null || webinfo.length() == 0 || webinfo.indexOf("验证") != -1) { continue; } try { wc.setAll(webinfo); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println(wc.getHotelName()); System.out.println(wc.getHotelEname()); System.out.println(wc.getCityName()); System.out.println(wc.getProvinceName()); System.out.println(wc.getAddress()); System.out.println(wc.getLat()); System.out.println(wc.getLng()); System.out.println(wc.getCoordinates()); System.out.println(wc.getTel()); System.out.println(wc.getHotelStars()); } }}
这里是输出结果:
北京金隅喜来登酒店Sheraton Beijing Dongcheng Hotel北京北京北京北三环东路36号39.97346873116.416302839.97346873, 116.4163028010-579888885上海大厦Broadway Mansions Hotel上海上海上海北苏州路20号31.250007605066121.4966374529131.250007605066, 121.49663745291021-632462605宁国伯爵王朝大酒店Bojue Dynasty Hotel宁国安徽安徽宁国宁阳西路155号30.606036618238118.9745495270930.606036618238, 118.974549527090563-41888885合肥天鹅湖大酒店Swan Lake Hotel合肥安徽安徽合肥政务文化新区东流路888号31.823922092928117.2360437115431.823922092928, 117.236043711540551-63536665宁国都市阳光酒店宁国安徽安徽宁国中溪北路8号30.633710023238118.9837501238230.633710023238, 118.983750123820563-41017883达拉特旗东达假日酒店Dongda Holiday Hotel达拉特旗内蒙古内蒙古达拉特旗树林召西街南侧40.402906010224110.0092594207540.402906010224, 110.009259420750477-39638883大理和舀田园度假酒店大理市云南云南大理市城北村(0872-2475995)25.861913102242100.1441007328125.861913102242, 100.144100732810872-24759953喀纳斯贾登峪回家休闲酒店(酒店区)Connectedhome布尔津新疆新疆布尔津喀纳斯贾登峪生活区游客接待基地一区48.50177952065587.15736932733348.501779520655, 87.1573693273330906-63275983欣得酒店(北京石佛营店)北京北京北京石佛营东里99号39.941142006731116.5124364799139.941142006731, 116.51243647991010-858141223青岛颐中皇冠假日酒店Crowne Plaza Qingdao青岛山东山东青岛香港中路76号36.070022690161120.4061509594936.070022690161, 120.406150959490532-85718883
当然,在具体的工作学习使用中,可以将数据存成相应的数据格式来保存在数据库中。
阅读全文
0 0
- 简单的网页内容抓取实例(携程酒店)
- 第一个爬虫实例-简单抓取网页内容
- c#关于网页内容抓取,简单爬虫的实现。(包括动态,静态的)
- c#关于网页内容抓取,简单爬虫的实现。(包括动态,静态的)
- 使用HAP抓取HTML网页内容实例
- 网页内容抓取 图片的抓取方法
- 抓取网页内容的函数
- [python]抓取网页的内容
- 有关网页抓取的内容
- HttpClient抓取网页内容简单介绍
- HttpClient抓取网页内容简单介绍
- HttpClient抓取网页内容简单介绍
- Python简单抓取在线网页内容
- 抓取网页萃取网页内容的代码
- php 网页数据抓取 简单实例
- 抓取网页中指定节点的内容(java版本)
- 【Day6】如何抓取一个网页的内容(多种方法)
- js print打印网页指定区域内容的简单实例
- css3典型动画集合
- python实现共轭梯度法求解方程组
- 第十一章:使用类
- Oracle 执行计划(Explain Plan) 说明
- 反射的工具类
- 简单的网页内容抓取实例(携程酒店)
- 【数论 && 找规律】UVA 11752 The Super Powers
- html5 api方法 的兼容处理(前缀处理)
- 【spring学习笔记一】使用xml文件配置bean
- request 字符串请求的工具类
- Django 查询数据库并返回页面
- 推荐两个不错的公众号
- 调用约定(pascal,fastcall,stdcall,thiscall,cdecl)区别等
- Go安装