使用htmlparser抓取网页链接

来源:互联网 发布:备份软件哪个好 编辑:程序博客网 时间:2024/04/29 13:12
package chapter9;

import java.io.*;
import java.net.URLEncoder;
import org.htmlparser.util.*;
import org.htmlparser.Parser;
import org.htmlparser.filters.*;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.ParserException;

/**
 * Collects result links from Baidu searches using htmlparser.
 *
 * <p>Each line of a word-list file is used as a search term; the links found on
 * the first result page are printed to standard output and appended to
 * {@code <wordlist>_dsturl.txt}.
 */
public class RadarSpecialSearchEngine {

    /** Encoding announced to Baidu via {@code ie=gb2312}; also used to parse the result page. */
    private static final String PAGE_ENCODING = "GB2312";

    /**
     * Runs the crawl over the hard-coded word list. Failures are reported on
     * standard error instead of terminating the JVM with an uncaught exception.
     *
     * @param args unused
     * @throws ParserException declared for backward compatibility; not thrown in practice
     */
    public static void main(String[] args) throws ParserException {
        try {
            TravelWordTable("D:\\workshop\\docs\\wordlist.txt");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code filename} line by line, searches Baidu for every non-blank
     * line and writes the resulting links to {@code filename + "_dsturl.txt"}.
     *
     * @param filename path of the word list, one search term per line
     * @throws IOException if the word list cannot be read or the result file
     *                     cannot be created or written
     */
    public static void TravelWordTable(String filename) throws IOException {
        String dstfile = filename + "_dsturl.txt";

        // try-with-resources closes the reader and the writer on every path,
        // including when a search fails half-way through the list.
        try (BufferedReader reader = new BufferedReader(new FileReader(filename));
             PrintWriter myFile = new PrintWriter(new FileWriter(dstfile))) {

            String buffer;
            while ((buffer = reader.readLine()) != null) {
                String word = buffer.trim();
                if (word.isEmpty()) {
                    continue; // a blank line would only produce an empty query
                }

                // The word must be percent-encoded in the charset promised by
                // "ie=gb2312", otherwise spaces, '&' or Chinese text corrupt the query.
                String url = "http://www.baidu.com/s?lm=0&si=&rn=10&ie=gb2312&ct=0&wd="
                        + URLEncoder.encode(word, PAGE_ENCODING)
                        + "&pn=0&ver=0&cl=3";
                try {
                    getBaiduUrls(url, PAGE_ENCODING, myFile);
                } catch (ParserException e) {
                    // One failing query must not abort the remaining words.
                    e.printStackTrace();
                }
            }

            // PrintWriter never throws on write errors; surface them explicitly.
            if (myFile.checkError()) {
                throw new IOException("Failed to write results to " + dstfile);
            }
        }
    }

    /**
     * Fetches {@code url}, extracts every link tag that carries both a
     * {@code target} and an {@code href} attribute, and records the links whose
     * address starts with {@code "bnu"} or {@code "http"}.
     *
     * @param url          address of the result page to fetch
     * @param pageEncoding character encoding used to parse the page
     * @param writer       receives one accepted link per line
     * @throws ParserException declared for backward compatibility; parse failures
     *                         are reported on standard error and yield no links
     */
    public static void getBaiduUrls(String url, String pageEncoding,
            PrintWriter writer) throws ParserException {
        NodeList nodeList = null;
        try {
            Parser parser = new Parser(url);
            parser.setEncoding(pageEncoding); // set the encoding used for parsing
            // Links and titles of the Baidu search results
            nodeList = parser.parse(new AndFilter(
                    new HasAttributeFilter("target"), new HasAttributeFilter("href")));
        } catch (ParserException e) {
            e.printStackTrace();
        }

        if (nodeList == null) {
            return;
        }

        // Walk every candidate node
        for (int i = 0; i < nodeList.size(); i++) {
            // The attribute filter also matches non-anchor tags that have
            // target/href, so guard the cast instead of risking ClassCastException.
            if (!(nodeList.elementAt(i) instanceof LinkTag)) {
                continue;
            }
            LinkTag link = (LinkTag) nodeList.elementAt(i);
            String urlLink = link.extractLink();
            String linkName = link.getLinkText();

            // Braces are essential: without them only the first println was
            // conditional and every link was written regardless of the filter.
            if (urlLink.startsWith("bnu") || urlLink.startsWith("http")) {
                System.out.println("结果 " + i + " 标题:" + linkName);
                System.out.println("       链接:" + urlLink);
                writer.println(urlLink);
            }
        }
    }
}

0 0
原创粉丝点击