学习笔记4

来源:互联网 发布:淘宝女装素材 编辑:程序博客网 时间:2024/04/27 23:49

获取请求链接的服务器头链接,无协议,给用户加协议,tomcat下获取webApp路径,传入文件名,删除不是同年月日的所有文件,判断网页的编码,爬虫完美绕过服务器反爬检查代码

获取请求链接的服务器头链接:String hostPort = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort();

// 无协议,给用户加协议        if (!url.contains("http")) {            try {                URL url0 = new URL("https://"+url);                url0.openStream();                url="https://"+url;            } catch (Exception e) {                    url="http://"+url;            }        }

tomcat下获取webApp路径:
nowpath=System.getProperty("user.dir");
tempdir=nowpath.replace("bin", "webapps"); //把bin 文件夹变到 webapps文件里面
删除文件夹下的文件

/**     * 传入文件名,删除不是同年月日的所有文件     *      * @param fileName     */    public void deleteOutDateJsp(String fileName) {        String nowpath; //当前tomcat的bin目录的路径 如 D:\java\software\apache-tomcat-6.0.14\bin        String tempdir;        nowpath=System.getProperty("user.dir");        tempdir=nowpath.replace("bin", "webapps"); //把bin 文件夹变到 webapps文件里面        String date = fileName.substring(0, 10);//      File folder = new File("src/main/webapp/static/temp/");//jetty        File folder = new File(tempdir+"/datacrawl/static/temp/");//tomcat        File[] files = folder.listFiles();        for (int i = 0; i < files.length; i++) {            if (files[i]!=null&&(files[i].getName().contains(date) || files[i].getName().equals("get_xpath.html"))) {                continue;            } else if (files[i]!=null){                files[i].delete();            }        }    }

判断网页的编码:

判断网页的编码:public String findCharset(String htmlfileName) {        BufferedReader bufReader = null;        try {            bufReader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(htmlfileName))));            for (String tmp1 = null; (tmp1 = bufReader.readLine()) != null; tmp1 = null) {                String tmp = new String(tmp1.toString());                if (tmp.contains("meta") && tmp.contains("charset") && (tmp.contains("gbk") || tmp.contains("GBK"))) {                    return "GBK";                }                if (tmp.contains("meta") && tmp.contains("charset")                        && (tmp.contains("utf-8") || tmp.contains("UTF-8"))) {                    return "UTF-8";                }                if (tmp.contains("meta") && tmp.contains("charset")                        && (tmp.contains("gb2312") || tmp.contains("GB2312"))) {                    return "GB2312";                }            }        } catch (IOException e) {            e.printStackTrace();        } finally {            try {                bufReader.close();            } catch (IOException e) {                e.printStackTrace();            }        }        return "";    }

爬虫完美绕过服务器反爬检查代码

// SECURITY WARNING: this verifier accepts every host name. Combined with
// trustAllHttpsCertificates() it switches off TLS peer verification, and both are installed
// as static defaults of HttpsURLConnection, so they affect every later HTTPS connection
// made through that class. Use only for crawling content that needs no authenticity guarantee.
HostnameVerifier hv = new HostnameVerifier() {
    public boolean verify(String urlHostName, SSLSession session) {
        System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                + session.getPeerHost());
        return true;
    }
};
trustAllHttpsCertificates();
HttpsURLConnection.setDefaultHostnameVerifier(hv);

// Fetch the HTML of the target page, sending a browser-like User-Agent header.
try {
    URL url = new URL(string);
    URLConnection connection = url.openConnection();
    connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
    connection.connect();
    // The response body is always decoded as UTF-8 here.
    isr = new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8"));
    bufr = new BufferedReader(isr);
} catch (Exception e) {
    e.printStackTrace();
}

/**
 * Installs an SSL socket factory backed by {@code miTM} as the default for
 * {@link javax.net.ssl.HttpsURLConnection}.
 *
 * <p>If the SSL context cannot be created or initialised, the error is printed and the
 * current default socket factory is left untouched.
 */
private void trustAllHttpsCertificates() {
    javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[] { new miTM() };
    try {
        // "TLS" is the standard protocol name; create, initialise and install in one guarded
        // sequence so a failed getInstance() can no longer lead to a null dereference.
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    } catch (NoSuchAlgorithmException e) {
        e.printStackTrace();
    } catch (KeyManagementException e) {
        e.printStackTrace();
    }
}
   /**
    * Trust manager that performs no certificate validation: both check methods return
    * without throwing, so every certificate chain is accepted.
    *
    * <p>SECURITY WARNING: installing this trust manager removes protection against
    * man-in-the-middle attacks.
    */
   static class miTM implements javax.net.ssl.TrustManager,
           javax.net.ssl.X509TrustManager {

       /**
        * @return an empty array; the X509TrustManager contract requires a non-null result,
        *         and a null value can cause a NullPointerException in callers
        */
       public java.security.cert.X509Certificate[] getAcceptedIssuers() {
           return new java.security.cert.X509Certificate[0];
       }

       /** Not part of X509TrustManager; always reports the server chain as trusted. */
       public boolean isServerTrusted(
               java.security.cert.X509Certificate[] certs) {
           return true;
       }

       /** Not part of X509TrustManager; always reports the client chain as trusted. */
       public boolean isClientTrusted(
               java.security.cert.X509Certificate[] certs) {
           return true;
       }

       /** Accepts any server certificate chain: never throws. */
       public void checkServerTrusted(
               java.security.cert.X509Certificate[] certs, String authType)
               throws java.security.cert.CertificateException {
           // Intentionally empty: no validation is performed.
       }

       /** Accepts any client certificate chain: never throws. */
       public void checkClientTrusted(
               java.security.cert.X509Certificate[] certs, String authType)
               throws java.security.cert.CertificateException {
           // Intentionally empty: no validation is performed.
       }
   }
原创粉丝点击