httpclient常用基本抓取类

来源:互联网 发布:防盗监控软件 编辑:程序博客网 时间:2024/05/21 17:08

以下是我常用的抓取类,直接调用其中的方法即可实现本机IP抓取、goagent代理抓取和代理IP抓取,以及文件下载、将页面内容保存到本地等功能。

package crawlMethodManager;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.DeflateDecompressingEntity;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.CharArrayBuffer;

/**
 * Page-fetching helper built on Apache HttpClient 4.x.
 *
 * <p>Pages can be fetched through three routes: a local goagent proxy
 * (127.0.0.1:8087), a rotating proxy whose address is obtained from
 * {@link #ipUrl}, or a direct connection. The class also contains small
 * helpers for saving text to disk and downloading a file.
 *
 * <p>The current rotating proxy is kept in the static fields {@link #ip} and
 * {@link #port}; they are read and written without any synchronization.
 */
@SuppressWarnings("deprecation")
public class CrawlMethodManager {

    /** Host of the rotating proxy currently in use; empty when none has been fetched yet. */
    static String ip = "";

    /** Port of the rotating proxy currently in use. */
    static int port = 0;

    /** Service that answers with a proxy address in {@code host:port} form. */
    static String ipUrl = "http://localhost:8080/ipFilter/getIp/getIp";

    /** Pooled client shared by the proxy-based POST helpers. */
    static HttpClient httpPostClient = new DefaultHttpClient(new ThreadSafeClientConnManager());

    private static final String GOAGENT_HOST = "127.0.0.1";
    private static final int GOAGENT_PORT = 8087;
    private static final String LANDCHINA_HOST = "http://www.landchina.com";
    private static final String FORM_CONTENT_TYPE = "application/x-www-form-urlencoded";

    /** Characters stripped from the proxy service answer: quotes, slashes and any whitespace. */
    private static final String PROXY_NOISE_REGEX = "[\"/\\s]";

    /** How many times the proxy service is asked before a refresh is given up. */
    private static final int PROXY_FETCH_ATTEMPTS = 5;

    /** How many times a goagent POST is attempted before giving up. */
    private static final int POST_ATTEMPTS = 3;

    private static final int DOWNLOAD_BUFFER_SIZE = 4096;

    /** Substrings that mark a downloaded page as an error/placeholder page. */
    private static final String[] ERROR_MARKERS = {
        "<title>blank", "Error Page Messages", "<title>404", "您的访问出错了", "302 Found",
        "出错页面", "没有找到这篇文章!", "特定于实例的错误", "错误 404", "Error report",
        "function JumpSelf", "refused", "网站防火墙", "无法解析服务器", "STATUS OK",
        "refresh", "DownloadError", "Not Found", "Runtime Error", "Service Unavailable"
    };

    /**
     * Fetches a page with GET, trying the enabled routes in the order
     * goagent, rotating proxy, direct connection, until one returns content.
     *
     * @param url         link to fetch
     * @param encode      charset used to decode the response
     * @param goagentFlag whether the goagent route is enabled
     * @param goagentNum  number of goagent attempts
     * @param companyFlag whether the rotating-proxy route is enabled
     * @param companyNum  number of rotating-proxy attempts
     * @param localFlag   whether the direct route is enabled
     * @param localNum    number of direct attempts
     * @return the page content, or an empty string when every attempt failed
     */
    public String crawlPageContentByGet(String url, String encode, boolean goagentFlag, int goagentNum,
            boolean companyFlag, int companyNum, boolean localFlag, int localNum)
            throws ClientProtocolException, IOException {
        String content = "";
        if (goagentFlag) {
            for (int attempt = 0; isEmpty(content) && attempt < goagentNum; attempt++) {
                try {
                    System.out.println("goagent正在请求");
                    content = doGetByGoagent(url, encode);
                } catch (Exception e) {
                    // Best effort: report the failure and move on to the next attempt/route.
                    System.out.println("goagent请求失败: " + e);
                }
            }
        }
        if (companyFlag) {
            for (int attempt = 0; isEmpty(content) && attempt < companyNum; attempt++) {
                try {
                    System.out.println("公司代理ip正在请求");
                    content = getByCompanyProxy(url, encode);
                } catch (Exception e) {
                    System.out.println("公司代理ip请求失败: " + e);
                }
            }
        }
        if (localFlag) {
            for (int attempt = 0; isEmpty(content) && attempt < localNum; attempt++) {
                try {
                    System.out.println("本机正在请求");
                    content = doGet(url, encode);
                } catch (Exception e) {
                    System.out.println("本机请求失败: " + e);
                }
            }
        }
        return content;
    }

    /**
     * Fetches a page with GET through goagent, falling back to the rotating
     * proxy when goagent fails or returns nothing.
     *
     * @param url    link to fetch
     * @param encode charset used to decode the response
     * @return the page content, or an empty string when nothing could be fetched
     */
    public String crawlPageContentByGet(String url, String encode)
            throws ClientProtocolException, IOException {
        try {
            String content = doGetByGoagent(url, encode);
            if (!isEmpty(content)) {
                return content;
            }
            System.out.println("启用公司代理");
            return getByCompanyProxy(url, encode);
        } catch (Exception e) {
            System.out.println("goagent连接失败,启用公司代理");
        }
        // Two further tries through the rotating proxy; only the last failure is reported.
        for (int attempt = 1; attempt <= 2; attempt++) {
            try {
                return getByCompanyProxy(url, encode);
            } catch (Exception e) {
                if (attempt == 2) {
                    e.printStackTrace();
                }
            }
        }
        return "";
    }

    /**
     * Fetches a page with GET through the rotating proxy, switching proxy
     * after a protocol error or after more than two bad answers from the
     * same proxy.
     *
     * @return the page content, or an empty string when all tries failed or no proxy is available
     */
    private String getByCompanyProxy(String url, String encode) {
        final int count = 10;
        String urlString = url;
        boolean newProxy = false;
        int oldProxyUsecount = 0;
        for (int i = 0; i <= count; i++) {
            if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
                if (refreshProxy()) {
                    oldProxyUsecount = 0;
                    newProxy = false;
                } else if (ip.equals("")) {
                    return ""; // no proxy at all: nothing to try with
                }
                // otherwise keep using the previous proxy and retry the refresh next round
            }
            System.out.println("正在使用代理" + ip + ":" + port);
            HttpGet httpRequst = new HttpGet(urlString);
            httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch");
            httpRequst.getParams().setParameter(CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
            DefaultHttpClient httpClient = newClient(9000, 9000, new HttpHost(ip, port));
            try {
                HttpResponse httpResponse = httpClient.execute(httpRequst);
                int status = httpResponse.getStatusLine().getStatusCode();
                if (status == 200) {
                    String result = readBody(httpResponse, encode);
                    if (resultTest(result)) {
                        System.out.println(ip + "公司代理成功抓取" + url);
                        return result;
                    }
                    if (result.contains("function JumpSelf")) {
                        // Challenge page: retry with the verification token appended when one is
                        // present, otherwise retry the original URL. The proxy itself is fine.
                        urlString = result.contains("WebShieldSessionVerify")
                                ? urlString + shieldVerifySuffix(result)
                                : url;
                        newProxy = false;
                    } else {
                        System.out.println("网页含有错误特殊字符" + urlString);
                        oldProxyUsecount++;
                        System.out.println(result);
                    }
                } else {
                    System.out.println(status + " " + urlString + " 状态不为200");
                    oldProxyUsecount++;
                    httpRequst.abort();
                }
            } catch (ClientProtocolException e) {
                newProxy = true;
                System.out.println(ip + "代理ip拒绝了");
            } catch (IOException e) {
                oldProxyUsecount++;
                System.out.println(ip + "代理读取超时");
            } finally {
                shutdown(httpClient);
            }
        }
        return "";
    }

    /**
     * Fetches a page with GET over a direct connection.
     *
     * @return the page content, or an empty string on a non-200 status or an I/O failure
     */
    private String doGet(String url, String encode) throws ClientProtocolException, IOException {
        HttpGet httpRequst = new HttpGet(url);
        try {
            return fetchOnce(newClient(8000, 8000, null), httpRequst, encode);
        } catch (IOException e) {
            // Callers treat an empty string as "failed", so the error is reported, not rethrown.
            System.out.println("doget代理读取超时: " + e);
            return "";
        }
    }

    /**
     * Fetches a page with GET through the local goagent proxy.
     *
     * @return the page content, or an empty string on a non-200 status
     */
    private String doGetByGoagent(String url, String encode) throws ClientProtocolException, IOException {
        HttpGet httpRequst = new HttpGet(url);
        httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch");
        httpRequst.getParams().setParameter(CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
        HttpHost proxyHost = new HttpHost(GOAGENT_HOST, GOAGENT_PORT);
        return fetchOnce(newClient(8000, 6000, proxyHost), httpRequst, encode);
    }

    /**
     * Sends a form POST through goagent, retrying up to {@value #POST_ATTEMPTS}
     * times while the request fails or returns nothing.
     *
     * @param url    target link
     * @param pram   url-encoded form body
     * @param encode charset for the request body and the response
     * @return the response content, or an empty string when every attempt failed
     */
    public String crawlPageContentByPost(String url, String pram, String encode)
            throws ClientProtocolException, IOException {
        String content = "";
        for (int attempt = 1; attempt <= POST_ATTEMPTS && isEmpty(content); attempt++) {
            try {
                content = doPostByGoagent(url, pram, encode);
            } catch (Exception e) {
                if (attempt == POST_ATTEMPTS) {
                    e.printStackTrace();
                }
            }
        }
        return content;
    }

    /**
     * Sends a form POST through the local goagent proxy.
     *
     * @return the response content, or an empty string on a non-200 status
     */
    private String doPostByGoagent(String url, String parm, String encode)
            throws ClientProtocolException, IOException {
        HttpPost httpRequst = new HttpPost(url);
        httpRequst.setEntity(formEntity(parm, encode));
        HttpHost proxy = new HttpHost(GOAGENT_HOST, GOAGENT_PORT);
        return fetchOnce(newClient(8000, 8000, proxy), httpRequst, encode);
    }

    /**
     * Sends a form POST over a direct connection.
     *
     * @param url    target link
     * @param parm   url-encoded form body
     * @param encode charset for the request body and the response
     * @return the response content, or an empty string on a non-200 status
     */
    public String doPost(String url, String parm, String encode) throws ClientProtocolException, IOException {
        HttpPost httpRequst = new HttpPost(url);
        httpRequst.setEntity(formEntity(parm, encode));
        return fetchOnce(newClient(8000, 8000, null), httpRequst, encode);
    }

    /**
     * Sends a form POST through the rotating proxy configured on
     * {@link #httpPostClient}, after the proxy has passed
     * {@link #postByCompanyProxyBoolean}.
     *
     * @return the response content, or an empty string when all tries failed
     */
    @SuppressWarnings("unused")
    private String postByCompanyProxy(String url, String parm, String encode)
            throws ClientProtocolException, IOException {
        final int count = 5;
        boolean okProxy = false;
        boolean newProxy = false;
        int oldProxyUsecount = 0;
        for (int i = 0; i <= count; i++) {
            try {
                if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
                    okProxy = postByCompanyProxyBoolean(url, parm, encode);
                }
                if (!okProxy) {
                    oldProxyUsecount++;
                    continue;
                }
                System.out.println("正在使用代理" + ip + ":" + port);
                HttpPost httpRequst = new HttpPost(url);
                httpRequst.setEntity(formEntity(parm, encode));
                httpRequst.getParams().setParameter(CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
                HttpResponse httpResponse = httpPostClient.execute(httpRequst);
                int status = httpResponse.getStatusLine().getStatusCode();
                if (status == 200) {
                    String result = readBody(httpResponse, encode);
                    if (resultTest(result)) {
                        return result;
                    }
                    if (result.contains("function JumpSelf")) {
                        // NOTE(review): the POST is always re-sent to the original url; the
                        // verification token of the challenge page is not applied here.
                        newProxy = false;
                    }
                } else if (status == 302) {
                    System.out.println("重定向了");
                    Header header = httpResponse.getFirstHeader("location");
                    // Give the pooled connection back before the follow-up GET on the same client.
                    httpRequst.abort();
                    if (header != null) {
                        String location = header.getValue();
                        System.out.println(location);
                        if (location.contains("tabid=26")) {
                            String result = getByHttpClient(toLandchinaUrl(location), encode, httpPostClient);
                            if (resultTest(result)) {
                                System.out.println(i + "公司代理成功抓取" + url);
                                return result;
                            }
                        }
                        newProxy = false;
                    }
                } else {
                    httpRequst.abort();
                }
            } catch (ClientProtocolException e) {
                newProxy = true;
                System.out.println(ip + "代理ip拒绝了");
            } catch (IOException e) {
                oldProxyUsecount++;
                System.out.println(ip + "代理读取超时");
            }
        }
        return "";
    }

    /**
     * Fetches a page with GET on the given client, following the challenge
     * page's verification token when one is returned.
     *
     * @return the page content, or an empty string when all tries failed
     */
    private String getByHttpClient(String url, String encode, HttpClient httpClient) {
        final int count = 2;
        String urlString = url;
        for (int i = 0; i <= count; i++) {
            HttpGet httpRequst = new HttpGet(urlString);
            httpRequst.setHeader("Content-Type", FORM_CONTENT_TYPE);
            try {
                HttpResponse httpResponse = httpClient.execute(httpRequst);
                if (httpResponse.getStatusLine().getStatusCode() == 200) {
                    String result = readBody(httpResponse, encode);
                    if (resultTest(result)) {
                        System.out.println(ip + "公司代理成功抓取" + url);
                        return result;
                    }
                    if (result.contains("function JumpSelf")) {
                        urlString = result.contains("WebShieldSessionVerify")
                                ? urlString + shieldVerifySuffix(result)
                                : url;
                    }
                } else {
                    httpRequst.abort();
                }
            } catch (ClientProtocolException e) {
                System.out.println(ip + "代理ip拒绝了");
            } catch (IOException e) {
                System.out.println(ip + "代理读取超时");
            }
        }
        return "";
    }

    /**
     * Checks that the rotating proxy can get past the site's security check.
     * A new proxy first has to pass that check and only receives the challenge
     * or home page, so one verification POST is made before the real one.
     * Configures the proxy and timeouts on the shared {@link #httpPostClient}.
     *
     * @return {@code true} when a usable page was obtained through the proxy
     */
    private boolean postByCompanyProxyBoolean(String url, String parm, String encode)
            throws ClientProtocolException, IOException {
        final int count = 10;
        String urlString = url;
        boolean newProxy = false;
        int oldProxyUsecount = 0;
        for (int i = 0; i <= count; i++) {
            try {
                if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
                    if (refreshProxy()) {
                        oldProxyUsecount = 0;
                        newProxy = false;
                    } else if (ip.equals("")) {
                        return false; // no proxy at all: nothing to verify
                    }
                }
                // Always derive the proxy from ip/port so an already-known proxy is really used.
                HttpHost proxyHost = new HttpHost(ip, port);
                System.out.println("正在使用代理" + ip + ":" + port);
                HttpPost httpRequst = new HttpPost(url);
                httpRequst.setEntity(formEntity(parm, encode));
                httpRequst.getParams().setParameter(CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
                httpPostClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 10000);
                httpPostClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 8000);
                httpPostClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxyHost);
                HttpResponse httpResponse = httpPostClient.execute(httpRequst);
                int status = httpResponse.getStatusLine().getStatusCode();
                if (status == 200) {
                    String result = readBody(httpResponse, encode);
                    if (resultTest(result)) {
                        return true;
                    }
                    if (result.contains("function JumpSelf")) {
                        if (result.contains("WebShieldSessionVerify")) {
                            urlString = urlString + shieldVerifySuffix(result);
                            if (urlString.contains("tabid=26")) {
                                urlString = toLandchinaUrl(urlString);
                                if (followUpSucceeds(urlString, encode, url)) {
                                    return true;
                                }
                            }
                        } else {
                            urlString = url;
                        }
                        newProxy = false;
                    }
                } else if (status == 302) {
                    System.out.println("重定向了");
                    Header header = httpResponse.getFirstHeader("location");
                    // Give the pooled connection back before the follow-up GET on the same client.
                    httpRequst.abort();
                    if (header != null) {
                        urlString = header.getValue();
                        System.out.println(urlString);
                        if (urlString.contains("tabid=26")) {
                            urlString = toLandchinaUrl(urlString);
                            if (followUpSucceeds(urlString, encode, url)) {
                                return true;
                            }
                        }
                        newProxy = false;
                    }
                } else {
                    httpRequst.abort();
                }
            } catch (ClientProtocolException e) {
                newProxy = true;
                System.out.println(ip + "代理ip拒绝了");
            } catch (IOException e) {
                oldProxyUsecount++;
                System.out.println(ip + "代理读取超时");
            }
        }
        return false;
    }

    /**
     * Tells whether a downloaded page looks like real content.
     *
     * @return {@code false} for null, empty, {@code "100"} or any page containing a known error marker
     */
    private boolean resultTest(String result) {
        if (result == null || result.equals("") || result.equals("100")) {
            return false;
        }
        for (String marker : ERROR_MARKERS) {
            if (result.contains(marker)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads an entity into a string using the named charset.
     *
     * @param entity         entity to read; must not be null
     * @param defaultCharset charset name, or null for the HTTP default
     * @return the entity content, or null when the entity has no content stream
     */
    public static String enCodetoString(final HttpEntity entity, final String defaultCharset)
            throws IOException, ParseException {
        return enCodetoStringDo(entity, defaultCharset != null ? Charset.forName(defaultCharset) : null);
    }

    /**
     * Reads an entity into a string. The charset passed by the caller is
     * always used; a charset declared by the entity itself is not consulted.
     *
     * @param entity         entity to read; must not be null
     * @param defaultCharset charset to decode with, or null for {@link HTTP#DEF_CONTENT_CHARSET}
     * @return the entity content, or null when the entity has no content stream
     * @throws IllegalArgumentException when the entity is null or too large to buffer
     */
    public static String enCodetoStringDo(final HttpEntity entity, Charset defaultCharset)
            throws IOException, ParseException {
        if (entity == null) {
            throw new IllegalArgumentException("HTTP entity may not be null");
        }
        InputStream instream = entity.getContent();
        if (instream == null) {
            return null;
        }
        try {
            if (entity.getContentLength() > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("HTTP entity too large to be buffered in memory");
            }
            int capacity = (int) entity.getContentLength();
            if (capacity < 0) {
                capacity = 4096; // length unknown: start with a reasonable buffer
            }
            Charset charset = defaultCharset != null ? defaultCharset : HTTP.DEF_CONTENT_CHARSET;
            Reader reader = new InputStreamReader(instream, charset);
            CharArrayBuffer buffer = new CharArrayBuffer(capacity);
            char[] tmp = new char[1024];
            int l;
            while ((l = reader.read(tmp)) != -1) {
                buffer.append(tmp, 0, l);
            }
            return buffer.toString();
        } finally {
            instream.close();
        }
    }

    /**
     * Appends content to {@code <topName>:\<tagName>/<fileName>.<type>},
     * creating the folder first when it does not exist.
     *
     * @param topName  drive name
     * @param fileName file name without extension
     * @param tagName  folder name
     * @param type     file extension
     * @param content  text to save
     */
    public static void writeToFile(String topName, String fileName, String tagName, String type,
            String content) {
        File dirFile = new File(topName + ":\\" + tagName);
        try {
            if (!dirFile.exists()) {
                if (dirFile.mkdirs()) {
                    System.out.println(" ok:创建文件夹成功! ");
                } else {
                    System.out.println(" err:创建文件夹失败! ");
                }
            }
        } catch (SecurityException e) {
            e.printStackTrace();
        }
        String fullPath = dirFile + "/" + fileName + "." + type;
        write(fullPath, content);
    }

    /**
     * Appends content to a file, creating the file when it does not exist.
     * The existing content is read back line by line (each line terminated
     * with {@code "\n"}) and rewritten together with the new content.
     *
     * @param path    file to write
     * @param content text to append
     * @return {@code true} on success, {@code false} when anything failed
     */
    public static boolean write(String path, String content) {
        BufferedReader input = null;
        BufferedWriter output = null;
        try {
            File f = new File(path);
            if (!f.exists()) {
                System.out.println("文件不存在,正在创建...");
                if (f.createNewFile()) {
                    System.out.println("文件创建成功!");
                } else {
                    System.out.println("文件创建失败!");
                }
            }
            StringBuilder text = new StringBuilder();
            input = new BufferedReader(new FileReader(f));
            String line;
            while ((line = input.readLine()) != null) {
                text.append(line).append("\n");
            }
            System.out.println("原文件内容:" + text);
            // Close the reader before the same file is reopened for writing.
            input.close();
            input = null;
            text.append(content);
            output = new BufferedWriter(new FileWriter(f));
            output.write(text.toString());
            output.flush();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            close(input);
            close(output);
        }
    }

    /**
     * Downloads a file to {@code <topName>:/<tagName>/<fileName>.<type>},
     * creating the folder first when it does not exist. Failures are printed,
     * not thrown.
     *
     * @param fileUrl  link of the file to download
     * @param topName  drive name
     * @param fileName file name without extension
     * @param tagName  folder name
     * @param type     file extension
     */
    public void downLoadFile(String fileUrl, String topName, String fileName, String tagName,
            String type) {
        InputStream inStream = null;
        FileOutputStream fs = null;
        try {
            URL url = new URL(fileUrl);
            URLConnection conn = url.openConnection();
            inStream = conn.getInputStream();
            File fileD = new File(topName + ":/" + tagName);
            if (!fileD.exists()) {
                System.out.println("正在新建目录");
                fileD.mkdirs();
            } else {
                System.out.println("目录存在");
            }
            // FileOutputStream creates the file itself when it is missing.
            fs = new FileOutputStream(topName + ":/" + tagName + "/" + fileName + "." + type);
            byte[] buffer = new byte[DOWNLOAD_BUFFER_SIZE];
            long bytesum = 0;
            int byteread;
            while ((byteread = inStream.read(buffer)) != -1) {
                bytesum += byteread;
                System.out.println(bytesum);
                fs.write(buffer, 0, byteread);
            }
            System.out.println("downloaded ok");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(fs);
            close(inStream);
        }
    }

    /** Entry point placeholder: only instantiates the manager. */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        CrawlMethodManager manager = new CrawlMethodManager();
    }

    /** Returns true for a null or zero-length string. */
    private static boolean isEmpty(String s) {
        return s == null || s.length() == 0;
    }

    /** Creates a single-use client with the given timeouts (milliseconds) and optional proxy. */
    private static DefaultHttpClient newClient(int connectTimeoutMs, int socketTimeoutMs, HttpHost proxy) {
        DefaultHttpClient client = new DefaultHttpClient();
        client.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, connectTimeoutMs);
        client.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, socketTimeoutMs);
        if (proxy != null) {
            client.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);
        }
        return client;
    }

    /** Releases every connection held by a single-use client. */
    private static void shutdown(HttpClient client) {
        client.getConnectionManager().shutdown();
    }

    /**
     * Executes the request on a single-use client and shuts the client down.
     * Returns the body for a 200 answer, otherwise aborts the request and
     * returns an empty string.
     */
    private static String fetchOnce(DefaultHttpClient client, HttpUriRequest request, String encode)
            throws IOException {
        try {
            HttpResponse response = client.execute(request);
            if (response.getStatusLine().getStatusCode() == 200) {
                return readBody(response, encode);
            }
            request.abort();
            return "";
        } finally {
            shutdown(client);
        }
    }

    /**
     * Reads the response body as text, undoing gzip/deflate content coding.
     * Never returns null: a missing entity or content stream yields "".
     */
    private static String readBody(HttpResponse response, String encode) throws IOException {
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return "";
        }
        Header contentEncoding = entity.getContentEncoding();
        if (contentEncoding != null) {
            String coding = contentEncoding.getValue();
            if ("gzip".equalsIgnoreCase(coding)) {
                entity = new GzipDecompressingEntity(entity);
            } else if ("deflate".equalsIgnoreCase(coding)) {
                entity = new DeflateDecompressingEntity(entity);
            }
        }
        String body = enCodetoString(entity, encode);
        return body == null ? "" : body;
    }

    /**
     * Builds a form body encoded with the given charset. The charset is
     * applied to the body bytes; it is not sent as a Content-Encoding header,
     * which is reserved for codings such as gzip.
     */
    private static StringEntity formEntity(String parm, String encode) throws IOException {
        StringEntity entity = new StringEntity(parm, encode);
        entity.setContentType(FORM_CONTENT_TYPE);
        return entity;
    }

    /**
     * Extracts the {@code &WebShieldSessionVerify...} token from a challenge
     * page, or returns an empty string when the page does not contain it in
     * the expected form.
     */
    private static String shieldVerifySuffix(String page) {
        int start = page.indexOf("&WebShieldSessionVerify");
        if (start < 0) {
            return "";
        }
        int end = page.indexOf("\";}</script>", start);
        return end < 0 ? "" : page.substring(start, end);
    }

    /** Prefixes the landchina host unless the link already mentions it. */
    private static String toLandchinaUrl(String location) {
        return location.contains("landchina") ? location : LANDCHINA_HOST + location;
    }

    /** Follows a link with GET on the shared client and reports whether a usable page came back. */
    private boolean followUpSucceeds(String followUrl, String encode, String originalUrl) {
        String result = getByHttpClient(followUrl, encode, httpPostClient);
        if (resultTest(result)) {
            System.out.println(ip + "公司代理成功抓取" + originalUrl);
            return true;
        }
        return false;
    }

    /**
     * Asks the proxy service for a fresh {@code host:port} and stores it in
     * {@link #ip}/{@link #port}. Gives up after {@value #PROXY_FETCH_ATTEMPTS}
     * unusable answers instead of spinning forever.
     *
     * @return {@code true} when a new proxy was stored
     */
    private boolean refreshProxy() {
        for (int attempt = 0; attempt < PROXY_FETCH_ATTEMPTS; attempt++) {
            System.out.println("ip为空,正在提取");
            String raw;
            try {
                raw = doGet(ipUrl, "gbk");
            } catch (IOException e) {
                continue;
            }
            if (raw == null || !raw.contains(":")) {
                continue;
            }
            String[] parts = raw.replaceAll(PROXY_NOISE_REGEX, "").split(":");
            if (parts.length < 2 || parts[0].length() == 0) {
                continue;
            }
            try {
                int parsedPort = Integer.parseInt(parts[1]);
                ip = parts[0];
                port = parsedPort;
                return true;
            } catch (NumberFormatException e) {
                // Malformed answer: ask the service again.
            }
        }
        System.out.println("代理ip提取失败: " + ipUrl);
        return false;
    }

    /** Closes a resource if it is open, printing (not throwing) any failure. */
    private static void close(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}



0 0
原创粉丝点击