学习笔记4
来源:互联网 发布:淘宝女装素材 编辑:程序博客网 时间:2024/04/27 23:49
获取请求链接的服务器头链接,无协议,给用户加协议,tomcat下获取webApp路径,传入文件名,删除不是同年月日的所有文件,判断网页的编码,爬虫完美绕过服务器反爬检查代码
获取请求链接的服务器头链接:String hostPort = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort();
// 无协议,给用户加协议 if (!url.contains("http")) { try { URL url0 = new URL("https://"+url); url0.openStream(); url="https://"+url; } catch (Exception e) { url="http://"+url; } }
tomcat下获取webApp路径:
nowpath=System.getProperty(“user.dir”);
tempdir=nowpath.replace(“bin”, “webapps”); //把bin 文件夹变到 webapps文件里面
删除文件夹下的文件
/** * 传入文件名,删除不是同年月日的所有文件 * * @param fileName */ public void deleteOutDateJsp(String fileName) { String nowpath; //当前tomcat的bin目录的路径 如 D:\java\software\apache-tomcat-6.0.14\bin String tempdir; nowpath=System.getProperty("user.dir"); tempdir=nowpath.replace("bin", "webapps"); //把bin 文件夹变到 webapps文件里面 String date = fileName.substring(0, 10);// File folder = new File("src/main/webapp/static/temp/");//jetty File folder = new File(tempdir+"/datacrawl/static/temp/");//tomcat File[] files = folder.listFiles(); for (int i = 0; i < files.length; i++) { if (files[i]!=null&&(files[i].getName().contains(date) || files[i].getName().equals("get_xpath.html"))) { continue; } else if (files[i]!=null){ files[i].delete(); } } }
判断网页的编码:
判断网页的编码:public String findCharset(String htmlfileName) { BufferedReader bufReader = null; try { bufReader = new BufferedReader(new InputStreamReader(new FileInputStream(new File(htmlfileName)))); for (String tmp1 = null; (tmp1 = bufReader.readLine()) != null; tmp1 = null) { String tmp = new String(tmp1.toString()); if (tmp.contains("meta") && tmp.contains("charset") && (tmp.contains("gbk") || tmp.contains("GBK"))) { return "GBK"; } if (tmp.contains("meta") && tmp.contains("charset") && (tmp.contains("utf-8") || tmp.contains("UTF-8"))) { return "UTF-8"; } if (tmp.contains("meta") && tmp.contains("charset") && (tmp.contains("gb2312") || tmp.contains("GB2312"))) { return "GB2312"; } } } catch (IOException e) { e.printStackTrace(); } finally { try { bufReader.close(); } catch (IOException e) { e.printStackTrace(); } } return ""; }
爬虫完美绕过服务器反爬检查代码
// SECURITY NOTE(review): the verifier below accepts every host name and
// trustAllHttpsCertificates() accepts every certificate chain, so HTTPS connections made
// through HttpsURLConnection after this point are not authenticated. Confirm this is
// intended before reusing the snippet outside a crawler.
HostnameVerifier hv = new HostnameVerifier() {
    @Override
    public boolean verify(String urlHostName, SSLSession session) {
        System.out.println("Warning: URL Host: " + urlHostName + " vs. " + session.getPeerHost());
        return true;
    }
};
trustAllHttpsCertificates();
HttpsURLConnection.setDefaultHostnameVerifier(hv);

// Fetch the HTML of the page.
try {
    URL url = new URL(string);
    URLConnection connection = url.openConnection();
    // Send a browser-like User-Agent header with the request.
    connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
    connection.connect();
    isr = new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8"));
    bufr = new BufferedReader(isr);
} catch (Exception e) {
    e.printStackTrace();
}

/**
 * Installs an SSL socket factory that accepts every certificate as the default for
 * {@code HttpsURLConnection}.
 *
 * <p>If the SSL context cannot be created or initialised, the failure is printed and the
 * current default socket factory is left unchanged.
 */
private void trustAllHttpsCertificates() {
    javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[] { new miTM() };
    try {
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    } catch (NoSuchAlgorithmException | KeyManagementException e) {
        // Without a usable context there is nothing to install; continuing with a null
        // context would only fail later with a NullPointerException.
        e.printStackTrace();
    }
}

/**
 * Trust manager whose checks never reject a certificate chain.
 */
static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {

    /** Returns an empty array; the interface contract asks for a non-null result. */
    @Override
    public java.security.cert.X509Certificate[] getAcceptedIssuers() {
        return new java.security.cert.X509Certificate[0];
    }

    /** Always reports the given server certificates as trusted. */
    public boolean isServerTrusted(java.security.cert.X509Certificate[] certs) {
        return true;
    }

    /** Always reports the given client certificates as trusted. */
    public boolean isClientTrusted(java.security.cert.X509Certificate[] certs) {
        return true;
    }

    /** Accepts any server certificate chain: returning without throwing means "trusted". */
    @Override
    public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally empty.
    }

    /** Accepts any client certificate chain: returning without throwing means "trusted". */
    @Override
    public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally empty.
    }
}
阅读全文
0 0
- C++学习笔记(4)
- AD学习笔记4
- oracel学习笔记4
- JavaScript学习笔记4
- JavaScript 学习笔记 4
- EJB学习笔记(4)
- Solaris学习笔记(4)
- Java学习笔记4
- c#学习笔记4
- LSL学习笔记(4)
- stl学习笔记4
- JavaScript学习笔记4
- ASP+ 学习笔记 4
- HTML学习笔记(4)
- jsp学习笔记4
- java学习笔记4
- java学习笔记4
- Flex3学习笔记4
- 最多能喝多少瓶啤酒呢?
- IDEA集成Git版本控制工具—分享项目到GitHub上和从GitHub克隆项目
- 【并查集入门专题1】A+B+D 三道模板题 hdu1232 hdu1233 poj2524【并查集模板】
- 【noip 2009】最优贸易
- Leetcode 655. Print Binary Tree 打印二叉树 解题报告
- 学习笔记4
- ReentrantReadWriteLock深入分析
- HDOJ 2114 求和公式套用 简单数学题
- 如何用.net制作一个简易爬虫抓取华为应用市场数据
- 解决R/RStudio中安装包“无法与服务器建立连接”
- MTK
- Lambda表达式
- 线程1:线程的创建和启动
- 简易shell脚本打造终端字典