Extracting Web Page Keywords with HtmlParser and HttpClient, and Fixing Garbled-Text Problems

This post extracts keywords from web pages with HtmlParser and HttpClient, specifically each page's title, description, and keywords. HttpClient is used to fetch the page, while HtmlParser parses the returned HTML. During parsing, the text must be converted according to the page's encoding, otherwise the result is garbled (mojibake). Most pages declare their encoding in a <meta> tag, so we also parse that declaration out of the page and then perform the encoding conversion.
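For reference, pages usually declare their encoding in one of two <meta> forms. The hypothetical fragments below (made-up examples, not part of the project code) show both; the getCharSet() method in the listing that follows handles either form.

// Hypothetical page fragments illustrating the two common charset declarations:
// HTML 4.x style, charset embedded in the content attribute,
// and HTML5 style, a dedicated charset attribute.
String html4Meta = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=gb2312\">";
String html5Meta = "<meta charset=\"utf-8\">";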
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;

import util.Utility;

/**
 * Uses the HtmlParser and HttpClient open-source libraries to fetch and parse web pages.
 */
public class HtmlParser {

    public static void main(String[] args) throws Exception {
        getWebAddress();
        //getContent("http://www.sogou.com/sogou", "");
    }

    /**
     * Read web addresses from the database, extract each page's metadata,
     * and batch-insert the results into the content table.
     */
    public static void getWebAddress() throws Exception {
        Connection con = Utility.getConnection();
        Statement stmt = con.createStatement();
        PreparedStatement pstmt = con.prepareStatement(
                "insert into content(userid,webaddress,title,keywords,description) values(?,?,?,?,?)");
        List<String> list = getUserIds();
        for (int i = 162; i < list.size(); i++) {   // starts at 162, presumably resuming an earlier run
            String userid = list.get(i);
            System.out.println("User #" + i + ": " + userid);
            List<String> address = new ArrayList<String>();
            String sql = "select distinct U from behavior where userid='" + userid + "'";
            ResultSet rs = stmt.executeQuery(sql);
            while (rs.next()) {
                String webaddress = rs.getString("U");
                if (webaddress != null) {
                    // skip https pages and binary downloads
                    if (!webaddress.contains("https") && !webaddress.endsWith(".zip")
                            && !webaddress.endsWith(".jar") && !webaddress.endsWith(".exe")) {
                        address.add(webaddress);
                    }
                }
            }
            for (int j = 0; j < address.size(); j++) {
                System.out.println("Total " + address.size() + " records.");
                String webaddress = address.get(j);
                try {
                    String[] t = getContent(webaddress, userid);
                    pstmt.setString(1, userid);
                    pstmt.setString(2, webaddress);
                    pstmt.setString(3, t[0]);
                    pstmt.setString(4, t[1]);
                    pstmt.setString(5, t[2]);
                    pstmt.addBatch();
                } catch (Throwable e) {
                    System.out.println(e.toString());
                    writeToFile(userid, webaddress); // record the failing address in the log file
                    continue;                        // keep the crawl running: skip to the next address on any error
                }
                System.out.println(webaddress);
                System.out.println("Record #" + j);
            }
            pstmt.executeBatch();
            System.out.println("Insert finished for this user.");
        }
    }

    /**
     * Extract the title, keywords, and description from the given page.
     */
    @SuppressWarnings("serial")
    public static String[] getContent(String webAddress, String userid) throws Exception {
        String address = "";
        if (webAddress.contains("http")) {
            address = webAddress;
        } else {
            address = "http://" + webAddress;    // prepend http:// if the address has no scheme
        }
        String html = visitWeb(address);
        String charSet = getCharSet(html);
        Parser parse = new Parser();
        parse.setInputHTML(html);
        //parse.setEncoding(charSet);
        Map<String, String> map = new HashMap<String, String>();
        // filter that keeps <title> plus named <meta> tags (keywords/description)
        NodeFilter newFilter = new NodeClassFilter() {
            public boolean accept(Node node) {
                if (node instanceof MetaTag) {
                    MetaTag mt = (MetaTag) node;
                    return mt.getMetaTagName() != null;
                } else if (node instanceof TitleTag) {
                    return true;
                }
                return false;
            }
        };
        NodeList keywords = parse.extractAllNodesThatMatch(newFilter);
        for (int i = 0; i < keywords.size(); i++) {
            if (keywords.elementAt(i) instanceof TitleTag) {
                TitleTag tt = (TitleTag) keywords.elementAt(i);
                map.put("title", tt.getTitle());
            } else {
                MetaTag mt = (MetaTag) keywords.elementAt(i);
                if (mt.getMetaTagName().equalsIgnoreCase("keywords")) {
                    map.put("keywords", mt.getMetaContent());
                } else if (mt.getMetaTagName().equalsIgnoreCase("description")) {
                    map.put("description", mt.getMetaContent());
                }
            }
        }
        String title = "";
        String keyword = "";
        String description = "";
        // prevent mojibake: re-decode the extracted text using the page's own encoding
        if (map.containsKey("title")) {
            title = Utility.getCharset(map.get("title"), charSet);
        }
        if (map.containsKey("keywords")) {
            keyword = Utility.getCharset(map.get("keywords"), charSet);
        }
        if (map.containsKey("description")) {
            description = Utility.getCharset(map.get("description"), charSet);
        }
        String[] t = {title, keyword, description};
        System.out.println(charSet);
        keywords.removeAll();
        return t;
    }

    /**
     * Get all user ids from the demographic table.
     */
    public static List<String> getUserIds() throws Exception {
        List<String> list = new ArrayList<String>();
        String sql = "select userid from demographic";
        ResultSet rs = Utility.getResultSet(sql);
        while (rs.next()) {
            String userid = rs.getString("userid");
            list.add(userid);
        }
        return list;
    }

    /**
     * Get the page's character encoding from its <meta> declaration.
     */
    @SuppressWarnings("serial")
    public static String getCharSet(String html) throws Exception {
        String charSet = "utf-8";    // default if the page declares nothing
        Parser parser = new Parser();
        parser.setInputHTML(html);
        // filter that keeps <meta> tags carrying a charset declaration (HTML 4 or HTML5 form)
        NodeFilter newFilter = new NodeClassFilter() {
            public boolean accept(Node node) {
                if (node instanceof MetaTag) {
                    MetaTag mt = (MetaTag) node;
                    if (mt.getAttribute("http-equiv") != null) {
                        if (mt.getAttribute("content") != null
                                && mt.getAttribute("content").contains("charset")) {
                            return true;
                        }
                    } else if (mt.getAttribute("charset") != null) {
                        return true;
                    }
                }
                return false;
            }
        };
        NodeList keywords = parser.extractAllNodesThatMatch(newFilter);
        for (int i = 0; i < keywords.size(); i++) {
            MetaTag mt = (MetaTag) keywords.elementAt(i);
            if (mt.getAttribute("content") != null) {
                charSet = mt.getAttribute("content").toLowerCase().split("charset=")[1];
            } else {
                charSet = mt.getAttribute("charset");
            }
        }
        return charSet;
    }

    /**
     * Fetch a page via HttpClient.
     */
    public static String visitWeb(String address) throws Exception {
        HttpClient client = new HttpClient();
        client.getHttpConnectionManager().getParams().setConnectionTimeout(10000); // connection timeout: 10000 ms
        client.getHttpConnectionManager().getParams().setSoTimeout(10000);         // socket read timeout: 10000 ms
        GetMethod method = new GetMethod(address);
        System.out.print("Connect The Web...");
        client.executeMethod(method);
        System.out.println("Connect Successful!");
        System.out.print("Begin To Read The Source...");
        InputStream is = method.getResponseBodyAsStream();
        // read everything as ISO-8859-1 first (byte-preserving); convert to the real charset later
        InputStreamReader isr = new InputStreamReader(is, "ISO-8859-1");
        BufferedReader br = new BufferedReader(isr);
        StringBuffer resBuffer = new StringBuffer();
        String resTemp = "";
        while ((resTemp = br.readLine()) != null) {
            resBuffer.append(resTemp);
        }
        String html = resBuffer.toString();
        System.out.println("Source Read Successful!");
        br.close();
        isr.close();
        is.close();
        method.releaseConnection();
        return html;
    }

    /**
     * Append a failed userid/address pair to the log file.
     */
    public static void writeToFile(String userid, String address) throws Exception {
        FileWriter fw = new FileWriter("E:\\HtmlParser\\log.txt", true);
        BufferedWriter bw = new BufferedWriter(fw);
        String content = userid + " " + address;
        bw.write(content);
        bw.newLine();
        bw.close();
        fw.close();
        System.out.println("Logged failed address.");
    }
}

Because some pages cannot be reached at all, each fetch is wrapped in try/catch and the unreachable addresses are written to a log file, so the program keeps running. Initially I used HtmlParser for both fetching and parsing, but the program kept hanging at some point for reasons I never identified. After searching online I switched to HttpClient for fetching, with explicit timeouts, and it has run smoothly without stalling since.
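To see the hang-free behavior in isolation, here is a minimal, self-contained sketch of the fetch pattern used in visitWeb() and getWebAddress() above (the URL is only a placeholder): both timeouts are set explicitly, and any failure is caught and logged rather than aborting the run.

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;

public class FetchDemo {
    public static void main(String[] args) {
        HttpClient client = new HttpClient();
        // The connection timeout covers hosts that never accept the connection;
        // SO_TIMEOUT covers servers that accept but then stop responding.
        client.getHttpConnectionManager().getParams().setConnectionTimeout(10000);
        client.getHttpConnectionManager().getParams().setSoTimeout(10000);
        GetMethod method = new GetMethod("http://www.sogou.com/");  // placeholder URL
        try {
            int status = client.executeMethod(method);
            System.out.println("HTTP status: " + status);
        } catch (Throwable e) {
            // Log and move on, as getWebAddress() does; one bad URL must not stop the crawl.
            System.out.println("skip: " + e);
        } finally {
            method.releaseConnection();
        }
    }
}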

Below is the character-conversion part:

// Part of the util.Utility class referenced above.
// Re-decodes a string that was read as ISO-8859-1 into the page's actual charset.
public static String getCharset(String str, String charSet) {
    String newStr = "";
    try {
        newStr = new String(str.getBytes("ISO-8859-1"), charSet);
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
    return newStr;
}




Here charSet is the character set declared by the page itself, obtained by getCharSet(). After this conversion, the garbled text is essentially eliminated.
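Why does this round trip work? ISO-8859-1 maps every byte value 0x00–0xFF to exactly one character, so decoding the raw response as ISO-8859-1 loses no information; str.getBytes("ISO-8859-1") recovers the original bytes, which can then be decoded with the page's real charset. A minimal sketch (the sample string is made up):

import java.io.UnsupportedEncodingException;

public class CharsetDemo {
    public static void main(String[] args) throws UnsupportedEncodingException {
        // Simulate the bytes of a GBK-encoded page as they arrive over the network.
        byte[] rawBytes = "搜索引擎".getBytes("GBK");
        // What visitWeb() produces: mojibake, but lossless, because ISO-8859-1 keeps every byte.
        String asLatin1 = new String(rawBytes, "ISO-8859-1");
        // What Utility.getCharset() does: recover the bytes and decode with the page's real charset.
        String restored = new String(asLatin1.getBytes("ISO-8859-1"), "GBK");
        System.out.println(restored);   // prints 搜索引擎
    }
}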
