远程抓取页面信息并解析XML

来源:互联网 发布:哈工大深圳知乎 编辑:程序博客网 时间:2024/05/01 12:33
 远程抓取页面信息并解析XML

XmlTransfer.java  负责连接对方服务器

package untitled1;

import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import org.w3c.dom.*;
import javax.xml.parsers.*;

/**
 * Minimal HTTP helper: sends a text payload to a URL and hands back the
 * response body.
 *
 * <p>An instance is bound to one target address and one request body. The
 * connection is created lazily by the first {@link #get()} call and kept in
 * {@link #urlCon}.</p>
 */
public class XmlTransfer{
  /** Address of the remote page. */
  private final String urlAddr;
  /** Text written to the request body before the response is read. */
  private final String xmlStr;
  /** Connection used by {@link #get()}; stays {@code null} until it could be set up. */
  HttpURLConnection urlCon = null;

  /**
   * @param _urlAddr address of the remote page
   * @param _xmlStr  text to send as the request body (may be empty)
   */
  public XmlTransfer(String _urlAddr,String _xmlStr) {
    this.urlAddr = _urlAddr;
    this.xmlStr = _xmlStr;
  }

  /**
   * Sends the request body and returns the stream of the server's response.
   *
   * <p>The caller owns the returned stream and must close it. The connection
   * is intentionally NOT disconnected here: disconnecting before the
   * response has been read tears down the socket the response travels on.</p>
   *
   * @return the response body stream
   * @throws Exception if the connection could not be set up or the exchange fails
   */
  public InputStream get() throws Exception
  {
    if (urlCon == null) {
      urlCon = getUrlConnection();
    }
    if (urlCon == null) {
      throw new Exception("连接失败");
    }

    // NOTE(review): the request method is configured as GET, but a body is
    // written; the JDK HTTP handler presumably upgrades such a request to
    // POST - confirm this is what the remote page expects.
    PrintWriter out = new PrintWriter(urlCon.getOutputStream());
    try {
      out.print(xmlStr);
      out.flush();
    } finally {
      out.close();
    }

    return urlCon.getInputStream();
  }

  /**
   * Builds and configures the HTTP connection for {@link #urlAddr}.
   *
   * @return the configured connection, or {@code null} when the address is
   *         malformed, is not an HTTP(S) address, or the setup fails
   */
  private HttpURLConnection getUrlConnection(){
    try {
      URL url = new URL(urlAddr);
      URLConnection conn = url.openConnection();
      if (!(conn instanceof HttpURLConnection)) {
        // e.g. file: or jar: URLs - nothing this class can talk HTTP to.
        return null;
      }
      HttpURLConnection http = (HttpURLConnection) conn;
      http.setRequestProperty("Content-type", "text/html;charset=gb2312");
      http.setDoOutput(true);
      http.setRequestMethod("GET");
      http.setUseCaches(false);
      return http;
    }
    catch (MalformedURLException mex) {
      mex.printStackTrace();
    }
    catch (ProtocolException pex) {
      pex.printStackTrace();
    }
    catch (IOException iex) {
      iex.printStackTrace();
    }

    // Never hand out a half-configured connection.
    return null;
  }


  /**
   * Fetches {@code strURL} with an empty request body and returns the
   * response as text, decoded with the platform default charset.
   *
   * <p>This method never throws. On failure the returned text is whatever
   * was received so far, followed by the line
   * {@code "An error occurs in XmlTransfer.getHttp"} and the exception message.</p>
   *
   * @param strURL address of the page to fetch
   * @return the response text, or an error description as described above
   */
  public static String getHttp( String strURL ){
      XmlTransfer xt = new XmlTransfer(strURL, "");
      StringBuffer sb = new StringBuffer();
      // Collect raw bytes first and decode once at the end: decoding each
      // 1024-byte chunk separately corrupts multi-byte characters that
      // straddle a chunk boundary.
      java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
      Exception failure = null;
      InputStream is = null;
      try {
          is = xt.get();
          byte[] b = new byte[1024];
          int iCount;
          while ((iCount = is.read(b)) != -1) {
              body.write(b, 0, iCount);
          }
      } catch (Exception e) {
          failure = e;
      } finally {
          if (is != null) {
              try {
                  is.close();
              } catch (IOException ignored) {
                  // Nothing useful can be done if closing the response fails.
              }
          }
          if (xt.urlCon != null) {
              // Safe to disconnect now that the response has been consumed.
              xt.urlCon.disconnect();
          }
      }

      sb.append(body.toString());
      if (failure != null) {
          sb.append("An error occurs in XmlTransfer.getHttp\n");
          sb.append(failure.getMessage());
      }
      return sb.toString();
  }

  /** Manual smoke test: prints the person feed of the hard-coded server. */
  public static void main(String[] args) throws Exception {
    System.out.println( XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person") );
                                   //http://192.168.0.110/testProvince.html","");
  }
}
 

UsrDataSync.java  负责抓取页面

package untitled1;

import java.util.Calendar;
import java.util.TimerTask;
import javax.servlet.ServletContext;
import java.io.File;
/**
 * <p>Title: </p>
 *
 * <p>Description: </p>
 *
 * <p>Copyright: Copyright (c) 2006</p>
 *
 * <p>Company: </p>
 *
 * @author not attributable
 * @version 1.0
 */
public class UsrDataSync {

    public UsrDataSync() {
    }

    public static boolean doSync(){
        String strXml;
        ParseXML px = new ParseXML();
        strXml = XmlTransfer.getHttp("http://215.117.110.81/yyoa/oainfo.jsp?comm=person");
        strXml = strXml.replaceAll("/r/n", "");
        px.doParse(strXml);

        return false;
    }
    public static void main(String[] args) throws Exception {
    UsrDataSync dd= new UsrDataSync();
    dd.doSync();
  }

}

ParseXML.java  解析XML(包括正则表达式)

//import java.awt.*;
//import javax.servlet.*;
//import javax.servlet.http.*;
//import javax.servlet.jsp.*;
//import org.apache.jasper.runtime.*;
package usersync;

import java.io.*;
import java.util.*;
import javax.xml.parsers.*;
import org.w3c.dom.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import javax.swing.*;
import java.sql.*;
/**
 * Parses the person XML feed and mirrors every {@code <person>} entry into
 * the {@code blogusers} table of either the running database ({@code blog})
 * or the history database ({@code blog_history}).
 *
 * <p>Per person the element attributes {@code id}, {@code name} and
 * {@code logname} and the text of the child elements {@code duty},
 * {@code department}, {@code station}, {@code state} and
 * {@code description} are read.</p>
 *
 * <p>NOTE(review): the XML comes from a remote server; consider disabling
 * DTDs / external entities on the parser factory (XXE) once it is confirmed
 * that the feed never carries a DOCTYPE.</p>
 *
 * <p>Copyright: Copyright (c) 2006</p>
 *
 * @author not attributable
 * @version 1.0
 */
public class ParseXML{

    private static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
    private static final String RUNNING_DB_URL = "jdbc:mysql://localhost:3306/blog";
    private static final String HISTORY_DB_URL = "jdbc:mysql://localhost:3306/blog_history";
    private static final String DB_USER = "root";
    private static final String DB_PASSWORD = "root";

    /** Value of {@code <state>} that marks a person as currently employed. */
    private static final String ACTIVE_STATE = "在职";
    /** Department names starting with this prefix get FLAG 1, all others FLAG 2. */
    private static final String CENTRAL_PREFIX = "中央";

    private static final String SQL_EXISTS =
            "select id from blogusers where id=?";
    private static final String SQL_UPDATE =
            "update blogusers set TRUENAME=?,DUTYNAME=?,FLAG=?,DEPMENT=? where id=?";
    private static final String SQL_INSERT =
            "insert into blogusers(id,TRUENAME,DUTYNAME,FLAG,DEPMENT) values(?,?,?,?,?)";
    private static final String SQL_DELETE =
            "delete from blogusers where id=?";

    /** Document produced by the most recent {@link #doParse(String)} call. */
    Document doc = null;
    /** Connection to the running database; open only while {@link #doParse(String)} runs. */
    public Connection con=null;
    /** Connection to the history database; open only while {@link #doParse(String)} runs. */
    public Connection con_history=null;

    /**
     * Parses {@code str} as XML and synchronises every {@code <person>}
     * element with the databases.
     *
     * <p>Failures are reported on the console and never propagate. A person
     * whose {@code id} attribute is not an integer is skipped; the remaining
     * persons are still processed.</p>
     *
     * @param str the XML text
     * @return always {@code null}
     */
    public String doParse(String str) {
        if (str == null) {
            System.out.println("ParseXML.doParse: no XML text supplied");
            return null;
        }
        try {
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            // Parse from characters: the text is already decoded, so it must
            // not be re-encoded with the platform charset first.
            doc = builder.parse(new org.xml.sax.InputSource(new StringReader(str)));

            NodeList people = doc.getElementsByTagName("person");
            int len = people.getLength();
            if (len > 0) {
                // One pair of connections for the whole batch instead of a
                // new, never-closed pair per person.
                openConnections();
            }
            for (int i = 0; i < len; i++) {
                Element person = (Element) people.item(i);
                person.normalize();

                int uid;
                try {
                    uid = Integer.parseInt(person.getAttribute("id"));
                } catch (NumberFormatException nfe) {
                    System.out.println("ParseXML.doParse: skipping person with invalid id '"
                            + person.getAttribute("id") + "'");
                    continue;
                }

                syncUser(uid,
                         person.getAttribute("name"),
                         person.getAttribute("logname"),
                         childText(person, "duty"),
                         childText(person, "department"),
                         childText(person, "station"),
                         childText(person, "state"),
                         childText(person, "description"));
            }
            System.out.println(doc.toString());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeConnections();
        }
        return null;
    }

    /**
     * Returns the text of the first direct child element of {@code parent}
     * named {@code tagName}, or an empty string when there is no such child
     * or it is empty.
     */
    private static String childText(Element parent, String tagName) {
        for (Node child = parent.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() == Node.ELEMENT_NODE && tagName.equals(child.getNodeName())) {
                return directText(child);
            }
        }
        return "";
    }

    /** Concatenates the text and CDATA nodes directly below {@code element}. */
    private static String directText(Node element) {
        StringBuffer text = new StringBuffer();
        for (Node part = element.getFirstChild(); part != null; part = part.getNextSibling()) {
            short type = part.getNodeType();
            if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
                text.append(part.getNodeValue());
            }
        }
        return text.toString();
    }

    /** Loads the JDBC driver and opens the running and history connections. */
    private void openConnections() throws ClassNotFoundException, SQLException {
        Class.forName(JDBC_DRIVER);
        con = DriverManager.getConnection(RUNNING_DB_URL, DB_USER, DB_PASSWORD);
        con_history = DriverManager.getConnection(HISTORY_DB_URL, DB_USER, DB_PASSWORD);
    }

    /** Closes both connections (if open) and resets the fields to {@code null}. */
    private void closeConnections() {
        closeQuietly(con);
        con = null;
        closeQuietly(con_history);
        con_history = null;
    }

    /** Closes {@code db}, reporting but not propagating a failure. */
    private static void closeQuietly(Connection db) {
        if (db == null) {
            return;
        }
        try {
            db.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Brings one person in line with the databases.
     *
     * <ul>
     *   <li>Found in the running database: the row is refreshed there.</li>
     *   <li>Otherwise found in the history database: the row is refreshed
     *       there, with FLAG 2.</li>
     *   <li>Employed ({@code state} equals "在职") and unknown: inserted into
     *       the running database; employed and only in history: moved from
     *       history to running.</li>
     *   <li>Not employed and unknown: inserted into the history database;
     *       not employed and in running: moved from running to history.</li>
     * </ul>
     *
     * @return {@code true} when all statements succeeded, {@code false} when
     *         an error was caught (it is printed to the console)
     */
    private boolean syncUser(int uid, String usrname, String logname, String duty, String department, String station, String state, String description ){
        try {
            int flag = flagFor(department);

            int pos = 0; // 0: unknown, 1: running database, 2: history database
            if (userExists(con, uid)) {
                updateUser(con, uid, usrname, duty, flag, department);
                pos = 1;
            } else if (userExists(con_history, uid)) {
                // The row lives in the history database, so that is where it
                // has to be updated.
                updateUser(con_history, uid, usrname, duty, 2, department);
                pos = 2;
            }

            if (ACTIVE_STATE.equals(state)) {
                if (pos == 0) {
                    insertUser(con, uid, usrname, duty, flag, department);
                } else if (pos == 2) {
                    // Employed but archived: bring back into the running database.
                    moveUser(con_history, con, uid, usrname, logname, duty, department, station, state, description);
                }
            } else {
                if (pos == 0) {
                    insertUser(con_history, uid, usrname, duty, flag, department);
                } else if (pos == 1) {
                    // No longer employed: archive into the history database.
                    moveUser(con, con_history, uid, usrname, logname, duty, department, station, state, description);
                }
            }
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Moves a person between databases: inserts the supplied values into
     * {@code dest}, then deletes the row with the same id from {@code src}.
     * The insert runs first so a failure cannot lose the person.
     */
    private void moveUser(Connection src, Connection dest, int uid, String usrname, String logname, String duty, String department, String station, String state, String description ) throws
            SQLException {
        insertUser(dest, uid, usrname, duty, flagFor(department), department);
        deleteUser(src, uid);
    }

    /** FLAG value stored with a person: 1 for departments starting with "中央", otherwise 2. */
    private static int flagFor(String department) {
        return department.startsWith(CENTRAL_PREFIX) ? 1 : 2;
    }

    /** Tells whether {@code blogusers} in {@code db} has a row with the given id. */
    private static boolean userExists(Connection db, int uid) throws SQLException {
        PreparedStatement ps = db.prepareStatement(SQL_EXISTS);
        try {
            ps.setInt(1, uid);
            ResultSet rs = ps.executeQuery();
            try {
                return rs.next();
            } finally {
                rs.close();
            }
        } finally {
            ps.close();
        }
    }

    /** Refreshes name, duty, flag and department of the row with the given id. */
    private static void updateUser(Connection db, int uid, String usrname, String duty, int flag, String department) throws SQLException {
        PreparedStatement ps = db.prepareStatement(SQL_UPDATE);
        try {
            ps.setString(1, usrname);
            ps.setString(2, duty);
            ps.setInt(3, flag);
            ps.setString(4, department);
            ps.setInt(5, uid);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }

    /** Inserts a new {@code blogusers} row. */
    private static void insertUser(Connection db, int uid, String usrname, String duty, int flag, String department) throws SQLException {
        PreparedStatement ps = db.prepareStatement(SQL_INSERT);
        try {
            ps.setInt(1, uid);
            ps.setString(2, usrname);
            ps.setString(3, duty);
            ps.setInt(4, flag);
            ps.setString(5, department);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }

    /** Deletes the {@code blogusers} row with the given id. */
    private static void deleteUser(Connection db, int uid) throws SQLException {
        PreparedStatement ps = db.prepareStatement(SQL_DELETE);
        try {
            ps.setInt(1, uid);
            ps.executeUpdate();
        } finally {
            ps.close();
        }
    }
}


 
原创粉丝点击