HttpClient 自动检测页面编码并按该编码解码,使 HttpClient 抓取任意网页都不出现乱码

来源:互联网 发布:宇航数控仿真软件 编辑:程序博客网 时间:2024/05/10 15:32
               /**          * 获取页面html内容          * @param method          * @param methodType          * @return String          * @throws UnsupportedEncodingException          * @throws IOException          */          private static String readInputStream(HttpMethod method) throws Exception{              String charset = "UTF-8";              if(method instanceof PostMethod){                  charset = ((PostMethod)method).getResponseCharSet();              }else{                  charset = ((GetMethod)method).getResponseCharSet();              }              byte[] bytes = method.getResponseBody();              String body = new String(bytes,"UTF-8");              charset = getCharSetByBody(body,charset);              return new String(bytes,charset);          }                    /**          * 根据页面body获取字符编码          * @param html          * @param charset          * @return          */          private static String getCharSetByBody(String html,String charset){              Document document = parseJSoupDocumentFromHtml(html, Constants.parseBaseUri);              Elements elements = document.select("meta");              for(Element metaElement : elements){                  if(metaElement!=null && StringUtils.isNotBlank(metaElement.attr("http-equiv")) && metaElement.attr("http-equiv").toLowerCase().equals("content-type")){                      String content = metaElement.attr("content");                      charset = getCharSet(content);                      break;                  }              }              return charset;          }                    /**          * 正则获取字符编码          * @param content          * @return          */          private static String getCharSet(String content){              String regex = ".*charset=([^;]*).*";              Pattern pattern = Pattern.compile(regex);              Matcher matcher = pattern.matcher(content);              if(matcher.find())                  return matcher.group(1);              else               
   return null;          }  


原创粉丝点击