抓取IP数据(一)

来源:互联网 发布:js设置滚动条位置 编辑:程序博客网 时间:2024/06/05 18:17

1、工作的背景及意义:

由于经常需要查询IP的运营商等相关属性,数据量不大时手动查询尚可,但当数据量达到几百、几千条时,手动查询就非常费力。这类机械性的工作最适合交给计算机处理,没有必要人工逐条查找。

2、ip抓取第一阶段需求:

从链接 http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest 抓取含有CN、ipv4的IP数据,再发送请求到数据查询中心 http://wq.apnic.net/apnic-bin/whois.pl,获取返回的数据,从中抓取IP相关属性并保存。

3、程序文档分析:

3.1 抓取(http://ftp……)链接下txt文本中含有CN、ipv4的数据,将抓取到的数据存进HTJF.txt。【通过程序中的getMail()和savetxt()方法完成】

3.2 发送请求到IP查询网站,并接收返回的html文本(通过程序中的readtxt()和testPost()方法完成)。

3.3对接收的文本进行解析、过滤

    第一次过滤:过滤完毕存进IpHTML.txt

    第二次过滤:过滤完毕存进Ip1.txt

    第三次过滤:过滤完毕存进Ip2.txt

最后入库:把IP的相关属性封装成对象,存入数据库前先查询该IP段是否已存在,

   存在:不执行

   不存在:执行JDBC操作

4、程序性能描述:

    从7万多条数据中抓取到3千多条数据,

   全程跑完历时:50分钟。期间抛出一次异常。


代码如下:

package com.htjf.ip;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;

/**
 * Looks up whois attributes for IP ranges and stores them in a database.
 *
 * <p>The pipeline is a chain of file-based stages, each stage calling the next:
 * <ol>
 * <li>{@link #getMail()} / {@link #savetxt(List)}: download the APNIC statistics file and write
 * the records matching {@code CN|ipv4} to {@code E://HTJF.txt};</li>
 * <li>{@link #readtxt()}: read that file and call {@link #testPost(String)} for every line;</li>
 * <li>{@link #testPost(String)}: POST the address to the whois form and write the tag-stripped
 * response to {@code E://IpHTML.txt};</li>
 * <li>{@link #saveLastIP()}: keep only the inetnum/netname/descr/country lines, written to
 * {@code E://Ip1.txt};</li>
 * <li>{@link #saveLastIP2()}: build an {@link IpModel}, write it to {@code E://Ip2.txt} and
 * insert it through {@link MySql#insertIp(IpModel)}.</li>
 * </ol>
 *
 * <p>The intermediate files are overwritten for every address, so the class is not safe for
 * concurrent use.
 *
 * @author Qixuan
 */
public class IpDemo {

    private static final String APNIC_STATS_URL =
            "http://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-latest";
    private static final String WHOIS_URL = "http://wq.apnic.net/apnic-bin/whois.pl";

    private static final String SEED_FILE = "E://HTJF.txt";
    private static final String HTML_FILE = "E://IpHTML.txt";
    private static final String FIELDS_FILE = "E://Ip1.txt";
    private static final String RECORD_FILE = "E://Ip2.txt";

    /** Selects the records of interest in the APNIC statistics file. */
    private static final Pattern CN_IPV4 = Pattern.compile("CN\\|ipv4");
    /** Matches one HTML tag; used to strip markup from the whois response. */
    private static final Pattern HTML_TAG = Pattern.compile("<(.[^>]*)>");
    /** Matches any run of whitespace, including whitespace inside a value. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    /** Whois attribute prefixes that are kept by {@link #saveLastIP()}. */
    private static final String[] WHOIS_KEYS = { "inetnum:", "netname:", "descr:", "country:" };
    /** Only the first this-many {@code descr} lines may set the operator. */
    private static final int MAX_DESCR_LINES = 4;

    /**
     * Program entry point.
     *
     * <p>Only the lookup stage is enabled: the download and save steps are left disabled exactly
     * as in the original program, so {@code E://HTJF.txt} must already exist. Any exception is
     * printed rather than propagated.
     *
     * @param args unused
     * @throws IOException declared for compatibility; failures are caught and printed
     */
    public static void main(String args[]) throws IOException {
        try {
            System.out.println("爬取");
            // List<String> list = getMail();
            System.out.println("保存");
            // savetxt(list);
            System.out.println("发送请求");
            readtxt();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the APNIC statistics file and collects every line containing {@code CN|ipv4}.
     *
     * @return the matching lines, in file order; each line appears once
     * @throws Exception if the URL cannot be opened or read
     */
    public static List<String> getMail() throws Exception {
        List<String> list = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new URL(APNIC_STATS_URL).openStream()));
            String line;
            while ((line = br.readLine()) != null) {
                if (CN_IPV4.matcher(line).find()) {
                    list.add(line);
                }
            }
        } finally {
            closeQuietly(br);
        }
        return list;
    }

    /**
     * Writes the collected records to {@code E://HTJF.txt}, one per line, as the fourth, third
     * and second {@code |}-separated fields joined by commas (the first of these is what
     * {@link #readtxt()} later submits as the address).
     *
     * <p>Records with fewer than four fields are reported and skipped instead of aborting the
     * whole save. Write failures are printed, not thrown.
     *
     * @param listarray the raw records; {@code null} is treated as empty
     * @throws FileNotFoundException if the output file cannot be created
     */
    public static void savetxt(List<String> listarray) throws FileNotFoundException {
        BufferedWriter bufw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(SEED_FILE)));
        try {
            int count = (listarray == null) ? 0 : listarray.size();
            System.out.println("有多少条记录:" + count);
            System.out.println("爬到的资源");
            if (listarray != null) {
                for (String mail : listarray) {
                    System.out.println("====>" + mail);
                    String[] fields = mail.split("\\|");
                    if (fields.length < 4) {
                        System.err.println("Skipping malformed record: " + mail);
                        continue;
                    }
                    bufw.write(fields[3]);
                    bufw.write(",");
                    bufw.write(fields[2]);
                    bufw.write(",");
                    bufw.write(fields[1]);
                    bufw.newLine();
                }
            }
            bufw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(bufw);
        }
    }

    /**
     * Reads {@code E://HTJF.txt} and runs the whois lookup chain for the first comma-separated
     * field of every non-blank line.
     *
     * <p>An {@link IOException} raised while processing one address is reported on
     * {@code System.err} and the loop continues, so a single failed request no longer ends a
     * run over thousands of addresses.
     *
     * @throws IOException if the seed file cannot be opened or read
     */
    public static void readtxt() throws IOException {
        BufferedReader bufr = new BufferedReader(
                new InputStreamReader(new FileInputStream(SEED_FILE)));
        try {
            String line;
            while ((line = bufr.readLine()) != null) {
                String searchtextIp = line.split("\\,")[0];
                if (searchtextIp.trim().length() == 0) {
                    continue;
                }
                System.out.println("ip地址:" + searchtextIp);
                try {
                    testPost(searchtextIp);
                } catch (IOException e) {
                    System.err.println("Lookup failed for " + searchtextIp + ": " + e);
                }
            }
        } finally {
            closeQuietly(bufr);
        }
    }

    /**
     * POSTs {@code searchtext=<ip>&form_type=advanced} to the whois form, writes the response to
     * {@code E://IpHTML.txt} with HTML tags removed and each line trimmed, then continues with
     * {@link #saveLastIP()}.
     *
     * @param iptest the address to query; sent as-is, without URL encoding
     * @throws IOException if the request, the response, or any later stage's file I/O fails
     */
    public static void testPost(String iptest) throws IOException {
        URLConnection connection = new URL(WHOIS_URL).openConnection();
        // A URLConnection is input-only by default; output must be enabled to send a body.
        connection.setDoOutput(true);

        OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "8859_1");
        try {
            out.write("searchtext=" + iptest + "&form_type=advanced");
            out.flush();
        } finally {
            out.close();
        }

        BufferedReader reader = null;
        BufferedWriter bufw = null;
        try {
            reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
            bufw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(HTML_FILE)));
            String line;
            while ((line = reader.readLine()) != null) {
                bufw.write(HTML_TAG.matcher(line).replaceAll("").trim());
                bufw.newLine();
            }
            bufw.flush();
        } finally {
            closeQuietly(bufw);
            closeQuietly(reader);
        }

        System.out.println("第一次过滤完毕,开始下一轮过滤");
        saveLastIP();
    }

    /**
     * Copies from {@code E://IpHTML.txt} to {@code E://Ip1.txt} every line that contains one of
     * {@code inetnum:}, {@code netname:}, {@code descr:} or {@code country:}, with all
     * whitespace removed, then continues with {@link #saveLastIP2()}.
     *
     * <p>A matching line is written once, even if it contains several of the keys.
     *
     * @throws IOException if either file cannot be read or written
     */
    public static void saveLastIP() throws IOException {
        BufferedReader bufr = null;
        BufferedWriter bufw = null;
        try {
            bufr = new BufferedReader(new InputStreamReader(new FileInputStream(HTML_FILE)));
            bufw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FIELDS_FILE)));
            String line;
            while ((line = bufr.readLine()) != null) {
                if (containsWhoisKey(line)) {
                    bufw.write(WHITESPACE.matcher(line).replaceAll(""));
                    bufw.newLine();
                }
            }
            bufw.flush();
        } finally {
            closeQuietly(bufw);
            closeQuietly(bufr);
        }

        System.out.println("第二次过滤完毕,开始下一轮过滤");
        saveLastIP2();
    }

    /**
     * Builds an {@link IpModel} from {@code E://Ip1.txt}, writes it as one line to
     * {@code E://Ip2.txt} and inserts it into the database.
     *
     * <p>Each line is split on {@code :} and the second piece is used as the value:
     * <ul>
     * <li>{@code inetnum}: the value is split on {@code -} into start and end address;</li>
     * <li>{@code netname}: if it contains {@code -} after the first character, the first two
     * pieces become province and attribution; otherwise the whole value is the attribution;</li>
     * <li>{@code descr}: each of the first four such lines overwrites the operator;</li>
     * <li>{@code country}: stored as-is.</li>
     * </ul>
     * When a key occurs more than once, the last occurrence wins. Lines without a value no
     * longer raise {@link ArrayIndexOutOfBoundsException}, and unset fields are written as empty
     * strings instead of causing a {@link NullPointerException}. The insert is skipped when no
     * complete inetnum range was found.
     *
     * @throws IOException if either file cannot be read or written
     */
    public static void saveLastIP2() throws IOException {
        IpModel ipModel = new IpModel();

        BufferedReader bufr = new BufferedReader(
                new InputStreamReader(new FileInputStream(FIELDS_FILE)));
        try {
            int descrSeen = 0;
            String line;
            while ((line = bufr.readLine()) != null) {
                String[] parts = line.split("\\:");
                if (parts.length == 0) {
                    continue; // the line consisted of colons only
                }
                String key = parts[0];
                boolean hasValue = parts.length > 1;
                String value = hasValue ? parts[1] : "";

                if ("inetnum".equals(key)) {
                    String[] range = value.split("\\-");
                    if (range.length >= 2) {
                        ipModel.setStartIp(range[0]);
                        ipModel.setEndIp(range[1]);
                    } else {
                        System.err.println("Unparseable inetnum line: " + line);
                    }
                } else if ("netname".equals(key)) {
                    String[] name = value.split("\\-");
                    if (value.indexOf("-") > 0 && name.length >= 2) {
                        ipModel.setProvince(name[0]);
                        ipModel.setAttribution(name[1]);
                    } else {
                        ipModel.setProvince("");
                        ipModel.setAttribution(value);
                    }
                } else if ("descr".equals(key)) {
                    descrSeen++;
                    if (descrSeen <= MAX_DESCR_LINES) {
                        if (hasValue) {
                            ipModel.setOperator(StringUtil.isBlank(value) ? "" : value);
                        } else {
                            // Same diagnostic the original printed; the operator is left as-is.
                            System.out.println("数组越界!");
                        }
                    }
                } else if ("country".equals(key)) {
                    ipModel.setCountry(value);
                }
            }
        } finally {
            closeQuietly(bufr);
        }

        BufferedWriter bufw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(RECORD_FILE)));
        try {
            bufw.write(nullToEmpty(ipModel.getCountry()));
            bufw.write("  ");
            bufw.write(nullToEmpty(ipModel.getAttribution()));
            bufw.write("  ");
            bufw.write(nullToEmpty(ipModel.getProvince()));
            bufw.write("  ");
            bufw.write(nullToEmpty(ipModel.getOperator()));
            bufw.write("  ");
            bufw.write(nullToEmpty(ipModel.getStartIp()));
            bufw.write("  ");
            bufw.write(nullToEmpty(ipModel.getEndIp()));
            bufw.newLine();
            bufw.flush();
        } finally {
            closeQuietly(bufw);
        }

        if (ipModel.getStartIp() == null || ipModel.getEndIp() == null) {
            System.err.println("No inetnum range found; nothing inserted");
            return;
        }

        // HH (24-hour clock): the original hh repeated the same digits every twelve hours.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
        ipModel.setIpId(getRandomString(14) + sdf.format(new Date()));

        System.out.println("匹配完毕保存到数据库");
        new MySql().insertIp(ipModel);
    }

    /**
     * Generates a random string of lower-case letters and digits.
     *
     * @param length number of characters to generate; values below one give an empty string
     * @return the generated string
     */
    public static String getRandomString(int length) {
        String base = "abcdefghijklmnopqrstuvwxyz0123456789";
        Random random = new Random();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < length; i++) {
            sb.append(base.charAt(random.nextInt(base.length())));
        }
        return sb.toString();
    }

    /** Returns whether the line contains any of the whois keys that are kept. */
    private static boolean containsWhoisKey(String line) {
        for (String key : WHOIS_KEYS) {
            if (line.indexOf(key) >= 0) {
                return true;
            }
        }
        return false;
    }

    /** Maps {@code null} to the empty string so that it can be passed to a writer. */
    private static String nullToEmpty(String s) {
        return (s == null) ? "" : s;
    }

    /** Closes a stream on a cleanup path, where a close failure must not mask the real error. */
    private static void closeQuietly(java.io.Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (IOException ignored) {
                // nothing useful can be done about a failed close during cleanup
            }
        }
    }
}

/**
 * JDBC access to the {@code iptable} table.
 *
 * <p>The connection is held in a static field and shared by all instances: constructing another
 * {@code MySql} reuses it while it is open, rather than opening a new connection each time.
 */
class MySql {

    private static final String JDBC_URL =
            "jdbc:mysql://127.0.0.1:3306/ipselect?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull";

    public static String username;
    public static String password;
    public static Connection connection;
    /** Retained for source compatibility; statements are now method-local and closed after use. */
    public static PreparedStatement ps;

    /**
     * Opens the shared connection if there is none or it has been closed. Failures are printed;
     * {@link #connection} may then remain {@code null}.
     */
    public MySql() {
        try {
            if (connection != null && !connection.isClosed()) {
                return;
            }
            username = "root";
            password = "";
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(JDBC_URL, username, password);
        } catch (ClassNotFoundException cnfex) {
            System.err.println("装载 JDBC/ODBC 驱动程序失败");
            cnfex.printStackTrace();
        } catch (SQLException sqlex) {
            System.err.println("无法连接数据库");
            sqlex.printStackTrace();
        }
    }

    /**
     * Inserts the record unless a row with the same start and end address already exists.
     *
     * @param ipModel the record to store; all seven properties are written
     */
    public void insertIp(IpModel ipModel) {
        if (connection == null) {
            System.err.println("无法连接数据库");
            return;
        }
        if (!findIp(ipModel).isEmpty()) {
            System.out.println("已存在有数据");
            return;
        }
        PreparedStatement insert = null;
        try {
            insert = connection.prepareStatement(
                    "insert into iptable (ip_id,country,province,operator,attribution,startIp,endIp) values (?,?,?,?,?,?,?)");
            insert.setString(1, ipModel.getIpId());
            insert.setString(2, ipModel.getCountry());
            insert.setString(3, ipModel.getProvince());
            insert.setString(4, ipModel.getOperator());
            insert.setString(5, ipModel.getAttribution());
            insert.setString(6, ipModel.getStartIp());
            insert.setString(7, ipModel.getEndIp());
            insert.executeUpdate();
            System.out.println("记录插入成功");
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(insert);
        }
    }

    /**
     * Updates every column of the row whose {@code ip_id} equals the model's id.
     *
     * @param ipModel the new values, identified by {@link IpModel#getIpId()}
     */
    public void updateIp(IpModel ipModel) {
        if (connection == null) {
            System.err.println("无法连接数据库");
            return;
        }
        PreparedStatement update = null;
        try {
            update = connection.prepareStatement(
                    "update iptable set country=?,province=?,operator=?,attribution=?,startIp=?,endIp=? where ip_id=?");
            update.setString(1, ipModel.getCountry());
            update.setString(2, ipModel.getProvince());
            update.setString(3, ipModel.getOperator());
            update.setString(4, ipModel.getAttribution());
            update.setString(5, ipModel.getStartIp());
            update.setString(6, ipModel.getEndIp());
            update.setString(7, ipModel.getIpId());
            update.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(update);
        }
    }

    /**
     * Finds the rows whose start and end address both equal those of the given model.
     *
     * @param ipModel supplies the start and end address to search for
     * @return one model per matching row with id, start and end address set; empty if nothing
     *         matches, the query fails, or there is no connection
     */
    public List<IpModel> findIp(IpModel ipModel) {
        List<IpModel> list = new ArrayList<IpModel>();
        if (connection == null) {
            return list;
        }
        PreparedStatement query = null;
        ResultSet rs = null;
        try {
            query = connection.prepareStatement(
                    "select * from iptable where startIp=? and endIp=?");
            query.setString(1, ipModel.getStartIp());
            query.setString(2, ipModel.getEndIp());
            rs = query.executeQuery();
            while (rs.next()) {
                IpModel row = new IpModel();
                row.setIpId(rs.getString("ip_id"));
                row.setStartIp(rs.getString("startIp"));
                row.setEndIp(rs.getString("endIp"));
                list.add(row);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(rs);
            closeQuietly(query);
        }
        return list;
    }

    /** Closes a statement on a cleanup path, ignoring a failure to close. */
    private static void closeQuietly(PreparedStatement statement) {
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException ignored) {
                // nothing useful can be done about a failed close during cleanup
            }
        }
    }

    /** Closes a result set on a cleanup path, ignoring a failure to close. */
    private static void closeQuietly(ResultSet resultSet) {
        if (resultSet != null) {
            try {
                resultSet.close();
            } catch (SQLException ignored) {
                // nothing useful can be done about a failed close during cleanup
            }
        }
    }
}


ip属性:

package com.htjf.ip;/** * @author Qixuan *  */public class IpModel {private String ipId;private String country;// 国家地区private String province;// 省份private String operator;// 运营商private String attribution;// 归属地private String startIp;// 起始Ipprivate String endIp;// 结束Ippublic String getIpId() {return ipId;}public void setIpId(String ipId) {this.ipId = ipId;}public String getCountry() {return country;}public void setCountry(String country) {this.country = country;}public String getProvince() {return province;}public void setProvince(String province) {this.province = province;}public String getOperator() {return operator;}public void setOperator(String operator) {this.operator = operator;}public String getAttribution() {return attribution;}public void setAttribution(String attribution) {this.attribution = attribution;}public String getStartIp() {return startIp;}public void setStartIp(String startIp) {this.startIp = startIp;}public String getEndIp() {return endIp;}public void setEndIp(String endIp) {this.endIp = endIp;}}



3 0