Java使用jsoup技术实现网站URL解析爬取|爬取网站登录后页面动态数据

来源:互联网 发布:淘宝联盟的优惠券在哪 编辑:程序博客网 时间:2024/06/17 17:12

下面是爬取51.la网站数据的示例代码,您可以注册该网站的账号进行测试参考。网址:51.la

package com.cc.api.test.user;


import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.ig.common.utils.*;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Created by Administrator on 2015/8/24.
 */
public class JsoupApiTest {



    @Test
    public void testGame () {
         String wd = "http: www.51.la";
         String url = "http: www.51.la/login.asp";
         SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
         String newDate = sdf.format(new Date());
         Connection.Response res = null;
         try {
             res = Jsoup.connect(url).data("uname", "betlog", "upass", "Betlogbiwin").method(Connection.Method.POST).execute();
         } catch (IOException e) {
             e.printStackTrace();
         }
 
 
          这儿的SESSIONID需要根据要登录的目标网站设置的session Cookie名字而定
         String sessionId = res.cookie("ajstat");
         try {
             Document objectDoc = Jsoup.connect("http: www.51.la/user/index.asp").cookie("ajstat", sessionId).get();
             System.out.print("统计ID报表" + "\r\n");
             Elements groups = objectDoc.getElementsByClass("bodys_zw").first().getElementsByTag("div").first().getElementsByClass("grplistlink");
             String groupInfo = "";
             for(Element gr : groups){
                 groupInfo = groupInfo+ gr.text()+"   ";
             }
             System.out.print(groupInfo + "\r\n\n");
             System.out.print("获取具体的平台的数据情况 : " + "\r\n");
 
 
             Elements ptElements = objectDoc.getElementsByClass("idlist_box");
             for (Element pt :ptElements){
                 String bcptName = pt.getElementsByClass("idlist_n").first().text();
                 System.out.print(bcptName + "\r\n");
             }
 

 
 
             Element gr = objectDoc.getElementsByClass("idlist_box").first();
             String bcptName = gr.getElementsByClass("idlist_n").first().text();
             System.out.print(bcptName + "\r\n");
 
             Elements bcreports  = gr.getElementsByClass("idlist_o").first().getElementsByTag("a");
             String agentlinks = bcreports.first().attr("href").substring(bcreports.first().attr("href").indexOf("/"), bcreports.first().attr("href").length());
             String agentId = agentlinks.substring(agentlinks.indexOf("id") + 3, agentlinks.length());
 
             String accInfo = "http: www.51.la/report/3_last.asp?id="+agentId+"&slailu=&skey=&spage=&sip=&d1="+newDate+"&t=kuai"; 访问明细
             String serchEngine = "http: www.51.la/report/3_SE.asp?id="+agentId+"&d1="+newDate+"&d2="+newDate; 搜索引擎
             String keyWords = "http: www.51.la/report/3_keyword.asp?id="+agentId+"&s=&d1="+newDate+"&d2="+newDate+"&ord=k_ci"; 关键词
             String route  = "http: www.51.la/report/3_Lailu.asp?id="+agentId+"&d1="+newDate+"&d2="+newDate+"&s=&ord=l_ci"; 来路
             String pageBrowser = "http: www.51.la/report/3_page.asp?id="+agentId+"&isdm=0&s=&d1="+newDate+"&d2="+newDate+"&ord=p_ci"; 页面浏览
             String domain = "http: www.51.la/report/3_page.asp?id="+agentId+"&isdm=1&s=&d1="+newDate+"&d2="+newDate+"&ord=p_ci"; 域名
             String returnedCust = "http: www.51.la/report/3_Client.asp?id="+agentId+"&t=huitou&d1="+newDate+"&d2="+newDate; 回头客
             String browsingDepth = "http: www.51.la/report/3_Shendu.asp?id="+agentId+"&d1="+newDate+"&d2="+newDate; 浏览深度
             String os = "http: www.51.la/report/3_Client.asp?id="+agentId+"&t=os&d1="+newDate+"&d2="+newDate; 操作系统
             String browser = "http: www.51.la/report/3_Client.asp?id="+agentId+"&t=soft&d1="+newDate+"&d2="+newDate; 浏览器
 
             System.out.print("IP         访问地址          上站时间         来路          入口网址\r\n\n");
             Document pageaccInfoDom = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
             int pageaccInfoSize = 1;
             if(null!=pageaccInfoDom.getElementById("pageslink")){
                 pageaccInfoSize = Integer.valueOf(pageaccInfoDom.getElementById("pageslink").getElementsByTag("a").last().text());
             }
             System.out.print("来路网站:总页数"+pageaccInfoSize+" \r\n");
 
 
             if(null!=pageaccInfoDom.getElementsByClass("bodys_zw").first().getElementsByTag("table")){
 
 
                 for (int i=0;i<pageaccInfoSize ;i++) {
                     String requestRoteUrl = accInfo + "&p=" + (i + 1);
                     Document pageaccInfoDoms = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
                     Elements accInfoTrs = pageaccInfoDoms.getElementsByClass("bodys_zw").first().getElementsByTag("table").first().getElementsByTag("tbody").first().getElementsByTag("tr");
 
                     int tri = 0;
                     for (Element trs : accInfoTrs){
                         ++tri;
                         if (tri>2 && tri<accInfoTrs.size()){
                             Elements accInfoTds = trs.getElementsByTag("td");
                             String accInfotdText = "";
 
                             String ip = accInfoTds.get(0).text(); ip
                             String address = accInfoTds.get(1).text(); 地址
                             String upperStationTime = newDate +" "+ accInfoTds.get(2).text(); 上站时间
                             String routes = accInfoTds.get(3).text(); 来路
                             String entranceSite = accInfoTds.get(4).getElementsByTag("a").first().attr("href"); 入口网址
                             accInfotdText = accInfotdText + ip + "       "+ address+"        "+upperStationTime+"        "+routes + "        "+entranceSite;
                             System.out.print("\r\n"+accInfotdText);
                         }
 
                     }
                 }
             }
             String todayFlowSum = objectDoc.getElementsByClass("sitelist_n").get(1).text();
             System.out.print("\r\n\n"+todayFlowSum + "\r\n");
 
 
         } catch (Exception e) {
             e.printStackTrace();
         }
    }

 
      浏览器、操作系统、浏览深度、回头客
     public String[] publicGrapMeht(String url, String sessionId)throws Exception{
         System.out.print("浏览器         访问量           比例  \r\n");
         Document publicGrapDom = Jsoup.connect(url).cookie("ajstat", sessionId).get();
         Elements publicGrapTrs  = publicGrapDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
         String returnGrapInfo[] = new String[3];
         int sertri = 0;
         for (Element graptrs : publicGrapTrs){
             ++sertri;
             if (sertri>2 && sertri<publicGrapTrs.size()){
                 Elements serTds = graptrs.getElementsByTag("td");
                 String se = serTds.get(0).text(); 浏览情况
                 String sumNum = serTds.get(1).text(); 总量
                 String proportion  = serTds.get(2).text(); 比例
 
                 returnGrapInfo[0] = se;
                 returnGrapInfo[1] = sumNum;
                 returnGrapInfo[2] = proportion;
 
                 String serEngineText = ""+se+"    "+sumNum + "     "+proportion;
                 System.out.print("\r\n"+serEngineText);
             }
         }
         return returnGrapInfo;
     }
 
 
      浏览器
     public void browserMeht(String browser,String sessionId)throws Exception{
         System.out.print("浏览器         访问量           比例  \r\n");
         Document browserDom = Jsoup.connect(browser).cookie("ajstat", sessionId).get();
         Elements browserTrs  = browserDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
         int sertri = 0;
         for (Element sertrs : browserTrs){
             ++sertri;
             if (sertri>2 && sertri<browserTrs.size()){
                 Elements serTds = sertrs.getElementsByTag("td");
                 String se = serTds.get(0).text(); 浏览情况
                 String sumNum = serTds.get(1).text(); 总量
                 String proportion  = serTds.get(2).text(); 比例
                 String serEngineText = ""+se+"    "+sumNum + "     "+proportion;
                 System.out.print("\r\n"+serEngineText);
             }
         }
     }
      操作系统
     public void osMeth(String os,String sessionId)throws  Exception{
         System.out.print("操作系统         访问量           比例  \r\n");
         Document osDom = Jsoup.connect(os).cookie("ajstat", sessionId).get();
         Elements osTrs  = osDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
         int sertri = 0;
         for (Element sertrs : osTrs){
             ++sertri;
             if (sertri>2 && sertri<osTrs.size()){
                 Elements serTds = sertrs.getElementsByTag("td");
                 String se = serTds.get(0).text(); 浏览情况
                 String sumNum = serTds.get(1).text(); 总量
                 String proportion  = serTds.get(2).text(); 比例
                 String serEngineText = ""+se+"    "+sumNum + "     "+proportion;
                 System.out.print("\r\n"+serEngineText);
             }
         }
     }
      浏览深度
     public void browsingDepthMeth(String browsingDepth,String sessionId)throws  Exception{
         System.out.print("浏览深度         访问量           比例  \r\n");
         Document browsingDepthDom = Jsoup.connect(browsingDepth).cookie("ajstat", sessionId).get();
         Elements browsingDeptTrs  = browsingDepthDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
         int sertri = 0;
         for (Element sertrs : browsingDeptTrs){
             ++sertri;
             if (sertri>2 && sertri<browsingDeptTrs.size()){
                 Elements serTds = sertrs.getElementsByTag("td");
                 String se = serTds.get(0).text(); 浏览情况
                 String sumNum = serTds.get(1).text(); 总量
                 String proportion  = serTds.get(2).text(); 比例
                 String serEngineText = ""+se+"    "+sumNum + "     "+proportion;
                 System.out.print("\r\n"+serEngineText);
             }
         }
     }
      回头客
     public void returnedCustMeth(String returnedCust,String sessionId)throws Exception{
         System.out.print("回头率分析         访问量           比例  \r\n");
         Document returnedCustDom = Jsoup.connect(returnedCust).cookie("ajstat", sessionId).get();
         Elements returnedCustTrs  = returnedCustDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
         int sertri = 0;
         for (Element sertrs : returnedCustTrs){
             ++sertri;
             if (sertri>2 && sertri<returnedCustTrs.size()){
                 Elements serTds = sertrs.getElementsByTag("td");
                 String se = serTds.get(0).text(); 搜索引擎
                 String sumNum = serTds.get(1).text(); 总量
                 String proportion  = serTds.get(2).text(); 比例
                 String serEngineText = ""+se+"    "+sumNum + "     "+proportion;
                 System.out.print("\r\n"+serEngineText);
             }
         }
     }
      域名
     public void domainMeth(String domain,String sessionId)throws  Exception{
         System.out.print("被访问域名     IP入口    UV入口      新UV     浏览量    比例  \r\n");
         Document domainDom = Jsoup.connect(domain).cookie("ajstat", sessionId).get();
         if(null!=domainDom.getElementById("tablist")){
             int domainPageSize = 1;
             if(null!=domainDom.getElementById("pageslink")){
                 domainPageSize = Integer.valueOf(domainDom.getElementById("pageslink").getElementsByTag("a").last().text());
             }
             System.out.print("域名:总页数"+domainPageSize+" \r\n");
 
             for (int i=0;i<domainPageSize ;i++){
                 String requestDomainUrl = domain + "&p="+(i+1);
                 Document pageDomiansDom = Jsoup.connect(requestDomainUrl).cookie("ajstat", sessionId).get();
                 Elements pageDomiansTrs  = pageDomiansDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
 
                 int routei = 0;
                 for (Element rttr : pageDomiansTrs){
                     ++routei;
                     if (routei>1){
                         Elements rtTds = rttr.getElementsByTag("td");
                         String routeText = "";
 
 
                         String textInfo = rtTds.get(0).text();
                         int msgLength = textInfo.length();
                         if(textInfo.indexOf("[细]")>0){
                             msgLength = textInfo.indexOf("[细]")-1;
                         }
                         String msginfo = textInfo.substring(textInfo.indexOf("[细]")+3,msgLength); 被访问的域名
                         String  entranceIP = rtTds.get(1).text();
                         String  entranceUV = rtTds.get(2).text();
                         String  newUV = rtTds.get(3).text();
                         String  views  = rtTds.get(4).text();
                         String  ratio  = rtTds.get(5).text();
 
                         routeText = routeText + msginfo + "       "+entranceIP+ "       "+entranceUV+ "       "+newUV+ "       "+views+"       "+ratio;
                         System.out.print("\r\n"+routeText);
                     }
                 }
             }
         }else {
             System.out.print("来路抓取   无数据");
         }
     }
      来路
     public void routeMeth(String route,String sessionId)throws  Exception{
           System.out.print("来路网站   贡献IP    贡献率   贡献UV   新UV \r\n");
             Document routeDom = Jsoup.connect(route).cookie("ajstat", sessionId).get();
              if(null!=routeDom.getElementById("tablist")){
                  int routePageSize = 1;
                  if(null!=routeDom.getElementById("pageslink")){
                      routePageSize = Integer.valueOf(routeDom.getElementById("pageslink").getElementsByTag("a").last().text());
                  }
                  System.out.print("来路网站:总页数"+routePageSize+" \r\n");
 
                 for (int i=0;i<routePageSize ;i++){
                     String requestRoteUrl = route + "&p="+(i+1);
                     Document pageRouteDom = Jsoup.connect(requestRoteUrl).cookie("ajstat", sessionId).get();
                     Elements pageRouteTrs  = pageRouteDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
 
 
                     int routei = 0;
                     for (Element rttr : pageRouteTrs){
                         ++routei;
                         if (routei>1){
                             Elements rtTds = rttr.getElementsByTag("td");
                             String routeText = "";
 
                             String textInfo = rtTds.get(0).text();
                             int msgLength = textInfo.length();
                             if(textInfo.indexOf("[GO]")>0){
                                 msgLength = textInfo.indexOf("[GO]")-1;
                             }
                             String msginfo = textInfo.substring(textInfo.indexOf("史]")+3,msgLength); 来路网站
                             String contributeIP = rtTds.get(1).text(); 贡献IP
                             String rateContribute = rtTds.get(2).text(); 贡献率
                             String contributionUV = rtTds.get(4).text(); 贡献UV
                             String newUV = rtTds.get(5).text(); 新UV
 
                             routeText = routeText + msginfo + "       "+contributeIP+ "       "+rateContribute+ "       "+contributionUV+ "       "+newUV;
                             System.out.print("\r\n"+routeText);
                         }
                     }
                 }
             }else {
                 System.out.print("来路抓取   无数据");
             }
 
     }
      关键词
     public void  keyWordsMeth(String keyWords,String sessionId)throws  Exception{
         System.out.print("关键词 1 ~ 2 ( 共 2 )    贡献IP     比例     百度     好搜     谷歌     搜搜      搜狗      雅虎    其它     贡献UV   新UV \r\n");
             Document keyWordsDom = Jsoup.connect(keyWords).cookie("ajstat", sessionId).get();
             if(null!=keyWordsDom.getElementById("tablist")){
                 Elements keyWordsTrs  = keyWordsDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
                 int kwi = 0;
                 for (Element kwts : keyWordsTrs){
                     ++kwi;
                     if (kwi>1){
                         Elements serTds = kwts.getElementsByTag("td");
                         String keyWordText = "";
 
                         String textInfo = serTds.get(0).text();
                         int msgLength = textInfo.length();
                         if(textInfo.indexOf("[GO]")>0){
                             msgLength = textInfo.indexOf("[GO]")-1;
                         }
                         String msginfo = textInfo.substring(textInfo.indexOf("史]")+3,msgLength); 关键词
                         String contributionIP = serTds.get(1).text(); 贡献IP
                         String proportion  = serTds.get(2).text(); 比例
                         String baidu  = serTds.get(4).text(); 百度
                         String haosou  = serTds.get(5).text(); 好搜
                         String google  = serTds.get(6).text(); 谷歌
                         String soso  = serTds.get(7).text(); 搜搜
                         String sogou  = serTds.get(8).text(); 搜狗
                         String yahoo  = serTds.get(9).text(); 雅虎
                         String other  = serTds.get(10).text(); 其它
                         String contributionUV  = serTds.get(11).text(); 贡献UV
                         String newUv  = serTds.get(12).text(); 新UV
                         keyWordText = keyWordText + "    "+msginfo+ "    "+contributionIP+ "    "+proportion+ "    "+baidu+ "    "+haosou+ "    "+google
                                 + "    "+soso+ "    "+sogou+ "    "+yahoo+ "    "+other+ "    "+contributionUV+"     "+newUv;
                         System.out.print("\r\n"+keyWordText);
                     }
                 }
             }else {
                 System.out.print("关键词搜索   无数据");
             }
     }
      搜索引擎
     public void serchEngineMeth(String serchEngine,String sessionId)throws  Exception{
          System.out.print("搜索引擎         总量           比例 \r\n");
             Document serchEngineDom = Jsoup.connect(serchEngine).cookie("ajstat", sessionId).get();
             Elements serchEngineTrs  = serchEngineDom.getElementById("gra_shu").getElementsByTag("tbody").first().getElementsByTag("tr");
             int sertri = 0;
             for (Element sertrs : serchEngineTrs){
                 ++sertri;
                 if (sertri>2 && sertri<serchEngineTrs.size()){
                     Elements serTds = sertrs.getElementsByTag("td");
                     String se = serTds.get(0).text(); 搜索引擎
                     String sumNum = serTds.get(1).text(); 总量
                     String proportion  = serTds.get(2).text(); 比例
                     String serEngineText = ""+se+"    "+sumNum + "     "+proportion;
                     System.out.print("\r\n"+serEngineText);
                 }
             }
     }
      访问明细数据查询
     public void  pageaccInfoMeth(String accInfo, String sessionId,String newDate)throws Exception{
         System.out.print("IP         访问地址          上站时间         来路          入口网址\r\n\n");
         Document pageaccInfoDom = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
         int pageaccInfoSize = 1;
         if(null!=pageaccInfoDom.getElementById("pageslink")){
             pageaccInfoSize = Integer.valueOf(pageaccInfoDom.getElementById("pageslink").getElementsByTag("a").last().text());
         }
         System.out.print("来路网站:总页数"+pageaccInfoSize+" \r\n");
 
 
         if(null!=pageaccInfoDom.getElementsByClass("bodys_zw").first().getElementsByTag("table")){
 
 
             for (int i=0;i<pageaccInfoSize ;i++) {
                 String requestRoteUrl = accInfo + "&p=" + (i + 1);
                 Document pageaccInfoDoms = Jsoup.connect(accInfo).cookie("ajstat", sessionId).get();
                 Elements accInfoTrs = pageaccInfoDoms.getElementsByClass("bodys_zw").first().getElementsByTag("table").first().getElementsByTag("tbody").first().getElementsByTag("tr");
 
                 int tri = 0;
                 for (Element trs : accInfoTrs){
                     ++tri;
                     if (tri>2 && tri<accInfoTrs.size()){
                         Elements accInfoTds = trs.getElementsByTag("td");
                         String accInfotdText = "";
 
                         String ip = accInfoTds.get(0).text(); ip
                         String address = accInfoTds.get(1).text(); 地址
                         String upperStationTime = newDate +" "+ accInfoTds.get(2).text(); 上站时间
                         String routes = accInfoTds.get(3).text(); 来路
                         String entranceSite = accInfoTds.get(4).getElementsByTag("a").first().attr("href"); 入口网址
                         accInfotdText = accInfotdText + ip + "       "+ address+"        "+upperStationTime+"        "+routes + "        "+entranceSite;
                         System.out.print("\r\n"+accInfotdText);
                     }
 
                 }
             }
         }
 
     }
      页面浏览
     public void  pageBrowserMeth(String pageBrowser, String sessionId)throws Exception{
             System.out.print("页面浏览:页面地址       IP入口      UV入口      新UV        浏览量     比例  \r\n");
             Document pageBrowserDom = Jsoup.connect(pageBrowser).cookie("ajstat", sessionId).get();
             if(null!=pageBrowserDom.getElementById("tablist")){
                 int pageBrowserSize = 1;
                 if(null!=pageBrowserDom.getElementById("pageslink")){
                     pageBrowserSize = Integer.valueOf(pageBrowserDom.getElementById("pageslink").getElementsByTag("a").last().text());
                 }
                 System.out.print("页面浏览:总页数"+pageBrowserSize+" \r\n");
 
                 for (int i=0;i<pageBrowserSize ;i++){
                     String requestpageBrowserUrl = pageBrowser + "&p="+(i+1);
 
                     Document pageBrowsersDom = Jsoup.connect(requestpageBrowserUrl).cookie("ajstat", sessionId).get();
                     Elements pageBrowsersTrs  = pageBrowsersDom.getElementById("tablist").getElementsByTag("tbody").first().getElementsByTag("tr");
 
                     int pbi = 0;
                     for (Element pbtr : pageBrowsersTrs){
                         ++pbi;
                         if (pbi>1){
                             Elements pbTds = pbtr.getElementsByTag("td");
                             String pageBrowserText = "";
                             String pblink = pbTds.get(0).getElementsByTag("a").last().attr("href"); 页面网址
                             String iprk =  pbTds.get(1).text(); IP入口
                             String uvrk =  pbTds.get(2).text(); UV
                             String newUv =  pbTds.get(3).text(); 新UV
                             String lll =  pbTds.get(4).text(); 浏览量
                             String ratio =  pbTds.get(5).text(); 比例
 
                             pageBrowserText = pageBrowserText+ ""+pblink+ "       "+iprk+ "       "+uvrk+ "       "+newUv+ "       "+lll+ "       "+ratio;
                             System.out.print("\r\n"+pageBrowserText);
                         }
                     }
                 }
             }
     }
   
}

0 0