【网络爬虫】使用jsoup对dom树解析

来源:互联网 发布:csol2控制台fps优化 编辑:程序博客网 时间:2024/05/29 03:13

基于 jsoup 的二次封装，用于解析 DOM 树。下面的测试类演示了封装类 JsoupHtmlParser 的几种调用方式。

package test;import java.util.LinkedList;import java.util.List;import junit.framework.TestCase;import com.vaolan.parser.JsoupHtmlParser;import com.vaolan.status.DataFormatStatus;import com.vaolan.utils.IOUtil;import com.vaolan.utils.StaticValue;public class RandomTest extends TestCase {    public static void testCleanTest() {        String fileName = "d:/test_2.txt";        String htmlSource = IOUtil.readFile(fileName,                StaticValue.default_encoding);        String cleanTxt = JsoupHtmlParser.getCleanTxt(htmlSource);        System.out.println(cleanTxt);    }    public static void testgetTagCleanTxt() {        String fileName = "d:/test_2.txt";        String htmlSource = IOUtil.readFile(fileName,                StaticValue.default_encoding);        String cleanTxt = JsoupHtmlParser.getTagContent(htmlSource, "inner",                DataFormatStatus.CleanTxt);        System.out.println(cleanTxt);    }    public static void testGetNestTagContent() {        String fileName = "d:/test_2.txt";        String htmlSource = IOUtil.readFile(fileName,                StaticValue.default_encoding);        List<String> tagList = new LinkedList<String>();        tagList.add("outer");        // tagList.add("inner");        List<String> cleanTxt = JsoupHtmlParser.getNestTagContent(htmlSource,                tagList, DataFormatStatus.CleanTxt, true);        System.out.println(cleanTxt);    }    public static void testGetNodeContentBySelector() {        String fileName = "d:/test_2.txt";        String htmlSource = IOUtil.readFile(fileName,                StaticValue.default_encoding);        List<String> selList = new LinkedList<String>();        selList.add("outer");        // selList.add(".subBrand");        selList.add("inner");        List<String> cleanTxt = JsoupHtmlParser.getNodeContentBySelector(                htmlSource, selList, DataFormatStatus.TagAllContent, true);        if (cleanTxt != null) {            for (String item : cleanTxt) {                
System.out.println(item);            }        } else {            System.out.println("结果集为null");        }    }    public static void removeSelector() {        String fileName = "d:/test_2.txt";        String htmlSource = IOUtil.readFile(fileName,                StaticValue.default_encoding);        // List<String> list = JsoupHtmlParser.getAllHref(htmlSource);        String selector = "outer";        List<String> removeList = new LinkedList<String>();        removeList.add("inner");        // removeList.add("span");        String result = JsoupHtmlParser.removeInnerContent(htmlSource,                selector, removeList);        // for (String url : list) {        // System.out.println(url);        // }        System.out.println(result);    }    public static void main(String[] args) {        // 得到html source的纯文本        // testCleanTest();        // testgetTagCleanTxt();        // testGetNestTagContent();        // testGetNodeContentBySelector();        removeSelector();    }}

测试结果:
这里写图片描述

0 0