ip抓取(二)

来源:互联网 发布:乐高ev3机器人编程 编辑:程序博客网 时间:2024/06/15 09:30

2、ip抓取第二阶段需求:

根据第一阶段抓取到的数据,向另一个IP查询网站(数据查询中心 http://199604.com/ip/)发送请求,获取返回的数据,并抓取其中与ip相关的属性数据并保存。

 

3、程序文档分析:

3.1根据已有的数据去http://199604.com/ip/, ip查询网站发送请求方法为requestPost();

并将返回的html文本存放在IpHTML.txt;(利用分页查询SQL语句)每次取出100条,然后对这100条记录循环发送请求

 3.2对接收的文本进行解析、过滤

    第一次过滤:过滤完毕

    第二次过滤:过滤完毕存进IpOperator1.txt

3.3对已过滤过的IP数据进行处理

   3.3.1执行过程中遇到抓取数据不完整的IP,将它写入BugIp.txt文本

   3.3.2合法的Ip段,存入ipdata数据表

3.3.3有问题Ip(即开始ip查询的数据与结束IP的数据不一致)存放到ipspecial数据表。

 

4、程序性能描述:

    第一阶段抓到的数据:共3594条

合法ip:3143条

有问题Ip:450条

丢失数据:1条

   全程跑完历时:80分钟。期间抛出一次异常。

5、第二阶段工作已完毕(耗时2天)

IpDemo2.java

package com.htjf.ip2;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.io.StringReader;import java.net.URL;import java.net.URLConnection;import java.sql.Connection;import java.sql.DriverManager;import java.sql.PreparedStatement;import java.sql.ResultSet;import java.sql.SQLException;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Date;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.Random;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.jsoup.helper.StringUtil;import com.mysql.jdbc.StringUtils;import com.htjf.ip.IpModel;/** * @author Administrator *  */public class IpDemo2 {/** * @param args */public static void main(String args[]) {System.out.println("程序入口");SqlData sqlData = new SqlData();int num = sqlData.findIpCount();// int k=0;int k = 3339;while (k < num) {List<IpModel> iplist = sqlData.findIp(k, 100);int j = 0;for (IpModel ipmodel : iplist) {System.out.println("===" + j);IpData ipdata = new IpData();IpSpecial ipSpecial = new IpSpecial();j++;String startIp = ipmodel.getStartIp();System.out.println("startIp:" + startIp);try {Map<String, String> ipmap = requestPost(startIp);ipdata.setIpId(ipmap.get("ipId"));ipdata.setStartIp(ipmap.get("startIp"));ipdata.setCountry(ipmap.get("country"));ipdata.setProvince(ipmap.get("province"));// 省份ipdata.setCity(ipmap.get("city"));// 城市ipdata.setOperator(ipmap.get("operator"));// 运营商} catch (IOException e1) {// TODO Auto-generated catch blocke1.printStackTrace();}String endIp = ipmodel.getEndIp().trim();System.out.println("endIp:" + endIp);try {Map<String, String> ipmap2 = requestPost(endIp);ipSpecial.setIpId(ipmap2.get("ipId"));ipSpecial.setEndIp(ipmap2.get("startIp"));// 
/结束ipipSpecial.setCountry(ipmap2.get("country"));ipSpecial.setProvince(ipmap2.get("province"));// 省份ipSpecial.setCity(ipmap2.get("city"));// 城市ipSpecial.setOperator(ipmap2.get("operator"));// 运营商} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}/* * if(StringUtil.isBlank(ipdata.getCity())){ ipdata.setCity(""); * }else if (StringUtil.isBlank(ipSpecial.getCity())) { * ipdata.setCity(""); } */if (ipdata.getCity().equals(ipSpecial.getCity())) {ipdata.setEndIp(ipSpecial.getEndIp());sqlData.insertIp(ipdata);// /System.out.println("---数据添加---");} else {ipSpecial.setStartIp(ipdata.getStartIp());sqlData.insertIpSpecial(ipSpecial);System.out.println("---特殊IP---");}}k = k + iplist.size();}}/** * @param ipString * @throws IOException *             发送请求 */public static Map<String, String> requestPost(String ipString)throws IOException {URL url = new URL("http://199604.com/ip/");URLConnection connection = url.openConnection();/** * 然后把连接设为输出模式。URLConnection通常作为输入来使用,比如下载一个Web页。 * 通过把URLConnection设为输出,你可以把数据向你个Web页传送。下面是如何做: */connection.setConnectTimeout(500000);connection.setDoOutput(true); // 是否向服务器发送数据connection.setReadTimeout(300000);/** * 最后,为了得到OutputStream,简单起见,把它约束在Writer并且放入POST信息中,例如: ... 
*/OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "8859_1");out.write("ip=" + ipString + "&action=2"); // 向页面传递数据。post的关键所在!// out.write("username=kevin&password=*********"); //向页面传递数据。post的关键所在!// remember to clean upout.flush();out.close();/** * 这样就可以发送一个看起来象这样的POST: POST /jobsearch/jobsearch.cgi HTTP 1.0 ACCEPT: * text/plain Content-type: application/x-www-form-urlencoded * Content-length: 99 username=bob password=someword */// 一旦发送成功,用以下方法就可以得到服务器的回应:String sCurrentLine;String sTotalString;sCurrentLine = "";sTotalString = "";InputStream l_urlStream;l_urlStream = connection.getInputStream();// 获取返回的Html内容// 传说中的三层包装阿!BufferedReader l_reader = new BufferedReader(new InputStreamReader(l_urlStream));String html_regex = "<(.[^>]*)>";// /过滤标签的规则Pattern p = Pattern.compile(html_regex);// 将规则封装成对象BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E://IpHTML.txt")));while ((sCurrentLine = l_reader.readLine()) != null) {sCurrentLine = sCurrentLine.replaceAll("<tr>", "ipOperator:");sCurrentLine = sCurrentLine.replaceAll("</td>", ",");sCurrentLine = sCurrentLine.replaceAll(html_regex, "");bufw.write(sCurrentLine);bufw.newLine();// /换行bufw.flush();// 刷新}bufw.close();System.out.println("第一次过滤完毕,开始下一轮过滤");String ipstr = saveIPOperator();System.out.println("第一次过滤完毕,开始下一轮过滤");Map<String, String> ipmap = saveIPOperator2(ipstr, ipString);return ipmap;}/** * @return * @throws IOException *             过滤一 */public static String saveIPOperator() throws IOException {BufferedReader bufr = new BufferedReader(new InputStreamReader(new FileInputStream("E://IpHTML.txt")));BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E://IpOperator1.txt")));String ip_regex1 = "ipOperator:";// /IP的匹配规则int k = 1;String ipstr = "";String line = null;while ((line = bufr.readLine()) != null) {Pattern p = Pattern.compile(ip_regex1);// 将规则封装成对象Matcher m = p.matcher(line);// 一行一行地进行匹配while (m.find()) {if (k 
== 2) {Pattern p2 = Pattern.compile("\\s*|\t|\r|\n");Matcher m2 = p2.matcher(line);String line2 = m2.replaceAll("");ipstr = line2;bufw.write(line2);bufw.newLine();// /换行bufw.flush();// 刷新}k++;}}bufw.close();return ipstr;}/** * @param ipstr * @throws IOException */public static Map<String, String> saveIPOperator2(String ipstr,String ipString) throws IOException {String iparray[] = new String[2];// IpData ipdata=new IpData();iparray = ipstr.split("\\:");String ipos[] = iparray[1].split(",");/* * for(int i=0;i<ipos.length;i++){ System.out.println("===ip:"+ipos[i]); * } */BufferedWriter bufw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("E://BugIp.txt", true)));Map<String, String> ipmap2 = new HashMap<String, String>();ipmap2.put("Sip", "");SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddhhmmss");String ipId = getRandomString(14) + sdf.format(new Date());Map<String, String> ipmap = new HashMap<String, String>();ipmap.put("ipId", ipId);ipmap.put("startIp", ipString);try {ipmap.put("country", ipos[1]);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();ipmap.put("country", "");ipmap2.put("Sip", ipString);}try {ipmap.put("province", ipos[2]);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();ipmap.put("province", "");ipmap2.put("Sip", ipString);}try {ipmap.put("city", ipos[3]);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();ipmap.put("city", "");ipmap2.put("Sip", ipString);}try {ipmap.put("county", ipos[4]);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();ipmap.put("county", "");}try {ipmap.put("operator", ipos[5]);} catch (Exception e) {// TODO Auto-generated catch blocke.printStackTrace();ipmap.put("operator", "");ipmap2.put("Sip", ipString);}System.out.println("----------" + ipmap2.get("Sip"));bufw.write(ipmap2.get("Sip"));bufw.newLine();bufw.flush();bufw.close();/* * ipdata.setIpId(ipId); ipdata.setStartIp(ipos[0]); * 
ipdata.setCountry(ipos[1]); ipdata.setProvince(ipos[2]);//省份 * ipdata.setCity(ipos[3]);//城市 ipdata.setOperator(ipos[5]);//运营商 */return ipmap;}/** * @param length * @return 生成随机数 */public static String getRandomString(int length) { // length表示生成字符串的长度String base = "abcdefghijklmnopqrstuvwxyz0123456789";Random random = new Random();StringBuffer sb = new StringBuffer();for (int i = 0; i < length; i++) {int number = random.nextInt(base.length());sb.append(base.charAt(number));}return sb.toString();}}// //////////////class SqlData {public static String username;public static String password;public static Connection connection;public static PreparedStatement ps;// //构造函数public SqlData() {String url = "jdbc:mysql://127.0.0.1:3306/ipselect?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull";String username = "root";String password = "";// 加载驱动程序以连接数据库try {Class.forName("com.mysql.jdbc.Driver");connection = DriverManager.getConnection(url, username, password);}// 捕获加载驱动程序异常catch (ClassNotFoundException cnfex) {System.err.println("装载 JDBC/ODBC 驱动程序失败");cnfex.printStackTrace();}// 捕获连接数据库异常catch (SQLException sqlex) {System.err.println("无法连接数据库");sqlex.printStackTrace();}}/** * @param ipModel * @return 查询 数据 */public int findIpCount() {// java.util.List<IpModel> list = new ArrayList<IpModel>();int num = 0;try {ps = connection.prepareStatement("select count(*) from iptable");ResultSet rs = ps.executeQuery();rs.next();num = rs.getInt(1);// ps.close();} catch (SQLException e) {// TODO Auto-generated catch blocke.printStackTrace();}System.out.println("====count:" + num);return num;}/** * @param ipModel * @return 查询 数据 */public List<IpModel> findIp(int offset, int amount) {java.util.List<IpModel> listiptable = new ArrayList<IpModel>();try {ps = connection.prepareStatement("select * from iptable limit ?,?");ps.setInt(1, offset);ps.setInt(2, amount);ResultSet rs = ps.executeQuery();while (rs.next()) {IpModel ipmodel2 = new 
IpModel();ipmodel2.setIpId(rs.getString("ip_id"));ipmodel2.setStartIp(rs.getString("startIp"));System.out.println("iptable:" + rs.getString("startIp"));ipmodel2.setEndIp(rs.getString("endIp"));System.out.println("iptable:" + rs.getString("endIp"));listiptable.add(ipmodel2);}} catch (SQLException e) {// TODO Auto-generated catch blocke.printStackTrace();}return listiptable;}/** * @param ipModel *            添加数据到ipdata数据表,符合要求的Ip */public void insertIp(IpData ipData) {SqlData ipsql = new SqlData();List<IpData> list = ipsql.findIpData(ipData);if (list.size() > 0) {System.out.println("已存在有数据");} else {try {ps = connection.prepareStatement("insert into ipdata (ip_id,country,province,city,county,operator,startIp,endIp) values (?,?,?,?,?,?,?,?)");/* * SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMddhhmmss"); * String ipId=sdf.format(new Date()); */ps.setString(1, ipData.getIpId());ps.setString(2, ipData.getCountry());ps.setString(3, ipData.getProvince());ps.setString(4, ipData.getCity());ps.setString(5, ipData.getCounty());ps.setString(6, ipData.getOperator());ps.setString(7, ipData.getStartIp());ps.setString(8, ipData.getEndIp());ps.executeUpdate();System.out.println("------记录插入成功------");} catch (SQLException e) {// TODO Auto-generated catch blocke.printStackTrace();}}}/** * @return 查询IPData,符合要求的IP的数据表 */public List<IpData> findIpData(IpData ipData11) {java.util.List<IpData> list = new ArrayList<IpData>();try {ps = connection.prepareStatement("select * from ipdata where startIp=? 
and endIp=?");ps.setString(1, ipData11.getStartIp());ps.setString(2, ipData11.getEndIp());ResultSet rs = ps.executeQuery();IpData ipData21 = new IpData();while (rs.next()) {ipData21.setIpId(rs.getString("ip_id"));ipData21.setStartIp(rs.getString("startIp"));ipData21.setEndIp(rs.getString("endIp"));list.add(ipData21);}} catch (SQLException e) {// TODO Auto-generated catch blocke.printStackTrace();}return list;}/** * @param ipModel *            添加数据到ipdata数据表,符合要求的Ip */public void insertIpSpecial(IpSpecial ipData) {SqlData ipsql = new SqlData();List<IpSpecial> list = ipsql.findIpSpecial(ipData);if (list.size() > 0) {System.out.println("已存在有数据");} else {try {ps = connection.prepareStatement("insert into ipspecial (ip_id,country,province,city,county,operator,startIp,endIp) values (?,?,?,?,?,?,?,?)");/* * SimpleDateFormat sdf=new SimpleDateFormat("yyyyMMddhhmmss"); * String ipId=sdf.format(new Date()); */ps.setString(1, ipData.getIpId());ps.setString(2, ipData.getCountry());ps.setString(3, ipData.getProvince());ps.setString(4, ipData.getCity());ps.setString(5, ipData.getCounty());ps.setString(6, ipData.getOperator());ps.setString(7, ipData.getStartIp());ps.setString(8, ipData.getEndIp());ps.executeUpdate();System.out.println("-------特殊Ip插入成功------");} catch (SQLException e) {// TODO Auto-generated catch blocke.printStackTrace();}}}/** * @return 查询IpSpecial,特殊的IP */public List<IpSpecial> findIpSpecial(IpSpecial ipSpecial) {java.util.List<IpSpecial> list = new ArrayList<IpSpecial>();try {ps = connection.prepareStatement("select * from ipspecial where startIp=? 
and endIp=?");ps.setString(1, ipSpecial.getStartIp());ps.setString(2, ipSpecial.getEndIp());ResultSet rs = ps.executeQuery();IpSpecial ipSpecial2 = new IpSpecial();while (rs.next()) {ipSpecial2.setIpId(rs.getString("ip_id"));ipSpecial2.setStartIp(rs.getString("startIp"));ipSpecial2.setEndIp(rs.getString("endIp"));list.add(ipSpecial2);}} catch (SQLException e) {// TODO Auto-generated catch blocke.printStackTrace();}return list;}}




4 0
原创粉丝点击