html页面数据抓取

来源:互联网 发布:linux shutdown命令 编辑:程序博客网 时间:2024/05/18 00:22
package com.myhitron.jlw.forum.util;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.myhitron.jlw.core.util.DataUtil;

/**
 * Regex-based helpers for pulling plain text and image URLs out of a
 * rich-text HTML fragment.
 *
 * <p>These helpers use regular expressions rather than an HTML parser, so they
 * only cope with simple markup (for example, a {@code >} inside an attribute
 * value will confuse the tag matching).
 *
 * @author xuye
 */
public class CatchHtmlUtil {

    /**
     * Matches either an HTML-escaped quoted value ({@code &quot;...&quot;}) or a
     * single {@code <img ... src=...>} tag. The tag branch is bounded by
     * {@code [^>]} so that one match never runs across several tags.
     */
    private static final String IMGURL_REGEX = "(&quot;.*?&quot;|<img\\b[^>]*?src=[^>]*>)";

    /**
     * Matches an http/https URL up to (and including) its terminator: a double
     * quote, a {@code >}, or a run of whitespace. Group 2 is the terminator.
     */
    private static final String IMGSRC_REGEX = "https?:\"?(.*?)(\"|>|\\s+)";

    // Patterns are immutable and thread-safe; compile them once.
    private static final Pattern IMGURL_PATTERN = Pattern.compile(IMGURL_REGEX);
    private static final Pattern IMGSRC_PATTERN = Pattern.compile(IMGSRC_REGEX);
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * Extracts the text content of a rich-text HTML fragment by removing all
     * whitespace and then all {@code <...>} tags.
     *
     * <p>Note that every whitespace character is removed, including spaces
     * between words.
     *
     * @param HTMLSource the HTML fragment
     * @return the fragment with whitespace and tags removed, or {@code null}
     *         when {@code DataUtil.isNotEmpty(HTMLSource)} is false
     *         (presumably for null/empty input - confirm against DataUtil)
     */
    public static String catchWord(String HTMLSource) {
        if (!DataUtil.isNotEmpty(HTMLSource)) {
            return null;
        }
        String withoutWhitespace = WHITESPACE_PATTERN.matcher(HTMLSource).replaceAll("");
        return TAG_PATTERN.matcher(withoutWhitespace).replaceAll("");
    }

    /**
     * Collects the image-carrying snippets of an HTML fragment: each
     * {@code <img ... src=...>} tag and each {@code &quot;...&quot;} escaped
     * quoted value (as used in inline {@code background-image: url(...)} styles).
     *
     * @param html the HTML fragment; may be {@code null}
     * @return the matched snippets in document order, with every
     *         {@code &quot;} replaced by a literal double quote; an empty list
     *         when {@code html} is {@code null} or nothing matches
     */
    public static List<String> getImageUrl(String html) {
        List<String> listImgUrl = new ArrayList<String>();
        if (html == null) {
            return listImgUrl;
        }
        Matcher matcher = IMGURL_PATTERN.matcher(html);
        while (matcher.find()) {
            listImgUrl.add(matcher.group().replaceAll("&quot;", "\""));
        }
        return listImgUrl;
    }

    /**
     * Extracts the http/https URLs contained in the snippets returned by
     * {@link #getImageUrl(String)}.
     *
     * @param listImageUrl snippets to scan; may be {@code null}, and
     *        {@code null} elements are skipped
     * @return every URL found, in order, without its terminating quote,
     *         {@code >} or whitespace; an empty list when nothing matches
     */
    public static List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        if (listImageUrl == null) {
            return listImgSrc;
        }
        for (String image : listImageUrl) {
            if (image == null) {
                continue;
            }
            Matcher matcher = IMGSRC_PATTERN.matcher(image);
            while (matcher.find()) {
                // Cut at the start of the terminator group so the whole
                // terminator is dropped, even a multi-character whitespace run.
                listImgSrc.add(image.substring(matcher.start(), matcher.start(2)));
            }
        }
        return listImgSrc;
    }

    /**
     * Manual smoke test: prints the extracted text, the image snippets and the
     * image URLs for a sample rich-text fragment.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String html = "<p style=\"text-indent:0em;margin:4px auto 0px auto;\"><br></p>"
                + "<img src=\"\" width=\"100%\">"
                + "<p style=\"text-indent:0em;margin:4px auto 0px auto;\"></p>"
                + "<div style=\"margin: 30px 0px; background-repeat: no-repeat; background-position: center center; background-size: cover; height: 800px; background-image: url(&quot;http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523656437.jpg&quot;);\" class=\"image\"></div>"
                + "<div style=\"display: flex\">"
                + "<div style=\"flex: 2;padding-left:100px;padding-right: 40px;\">"
                + "<h1 style=\"margin-bottom: 20px;font-size: 30px;height:45px\" class=\"text\">将来网论坛</h1>"
                + "<h3 style=\"font-size: 16px;min-height: 180px;\" class=\"text\">好棒的论坛!</h3>"
                + "</div>"
                + "<p style=\"flex: 1;font-size: 16px;min-height:200px;margin: 0 40px;\" class=\"text\">赞赞赞</p>"
                + "</div>"
                + "<div style=\"display: flex;margin-top: 30px;\">"
                + "<div style=\"flex: 1 1 0%; background-repeat: no-repeat; background-position: center center; background-size: cover; height: 800px; background-image: url(&quot;http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523686912.jpg&quot;);\" class=\"image\"></div>"
                + "<div style=\"flex: 2;margin-left: 30px\">"
                + "<div style=\"background-repeat: no-repeat; background-position: center center; background-size: cover; height: 390px; margin-bottom: 20px; background-image: url(&quot;http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523688911.jpg&quot;);\" class=\"image\"></div>"
                + "<div style=\"background-repeat: no-repeat; background-position: center center; background-size: cover; height: 390px; background-image: url(&quot;http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523693965.jpg&quot;);\" class=\"image\"></div>"
                + "</div>"
                + "</div>"
                + "<div style=\"display: flex;margin-top: 30px;\">"
                + "<p style=\"flex:1;font-size: 14px;margin:0 40px;min-height: 400px;\" class=\"text\">哇塞</p>"
                + "<p>哇塞</p>"
                + "<p>哇塞</p>"
                + "</div>"
                + "<div style=\"display: flex;margin: 30px 0;\">"
                + "<div style=\"flex: 3 1 0%; background-repeat: no-repeat; background-position: center center; background-size: cover; height: 1200px; background-image: url(&quot;http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523713183.jpg&quot;);\" class=\"image\"></div>"
                + "<div style=\"flex: 1;padding:0 40px\">"
                + "<p>哇塞</p>"
                + "<div style=\"background-repeat: no-repeat; background-position: center center; background-size: cover; height: 300px; margin: 60px 0px; background-image: url(&quot;http://jlw.myhitron.com/jlw-forum/img/thumb-86-1503523721282.jpg&quot;);\" class=\"image\"></div>"
                + "<p>哇塞</p>"
                + "</div>"
                + "</div>";
        // Alternative sample that uses real <img src="..."> tags instead of inline background images:
        //String html = "<p style=\"text-indent:0em;margin:4px auto 0px auto;\"><font style=\"font-size:20.000000;color:#000000\">yu</font></p><img src=\"http://jlw.myhitron.com/jlw-forum/headimg/thumb-87-1503759242787_750_485.jpg\" width=\"100%\"/><p style=\"text-indent:0em;margin:4px auto 0px auto;\"></p><img src=\"http://jlw.myhitron.com/jlw-forum/headimg/thumb-87-1503759251229_1280_992.jpg\" width=\"100%\"/><p style=\"text-indent:0em;margin:4px auto 0px auto;\"></p>";

        // Extract the plain text.
        System.out.println(catchWord(html));
        System.out.println("###############################################");

        // Extract the image snippets (img tags and escaped quoted values).
        List<String> imgUrl = getImageUrl(html);
        System.out.println("许晔抓图片" + imgUrl.toString());

        // Extract the image src addresses from those snippets.
        List<String> imgSrc = getImageSrc(imgUrl);
        System.out.println("许晔抓的图片" + imgSrc.toString());
    }
}
原创粉丝点击