JAVA抓取网页的图片,JAVA利用正则表达式抓取网站图片

来源:互联网 发布:java漏洞扫描 编辑:程序博客网 时间:2024/05/18 07:12

利用java抓取网页上的所有图片:

用两个正则表达式:

1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>

2、匹配img标签中的src中http路径的正则(以下为Java字符串字面量写法,引号和反斜杠已转义):http:\"?(.*?)(\"|>|\\s+)


实现:

package org.swinglife.main;import java.io.File;import java.io.FileOutputStream;import java.io.InputStream;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.List;import java.util.regex.Matcher;import java.util.regex.Pattern;/*** * java抓取网络图片 * @author swinglife * */public class CatchImage {// 地址private static final String URL = "http://www.csdn.net";// 编码private static final String ECODING = "UTF-8";// 获取img标签正则private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";// 获取src路径的正则private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";public static void main(String[] args) throws Exception {CatchImage cm = new CatchImage();//获得html文本内容String HTML = cm.getHTML(URL);//获取图片标签List<String> imgUrl = cm.getImageUrl(HTML);//获取图片src地址List<String> imgSrc = cm.getImageSrc(imgUrl);//下载图片cm.Download(imgSrc);}/*** * 获取HTML内容 *  * @param url * @return * @throws Exception */private String getHTML(String url) throws Exception {URL uri = new URL(url);URLConnection connection = uri.openConnection();InputStream in = connection.getInputStream();byte[] buf = new byte[1024];int length = 0;StringBuffer sb = new StringBuffer();while ((length = in.read(buf, 0, buf.length)) > 0) {sb.append(new String(buf, ECODING));}in.close();return sb.toString();}/*** * 获取ImageUrl地址 *  * @param HTML * @return */private List<String> getImageUrl(String HTML) {Matcher matcher = Pattern.compile(IMGURL_REG).matcher(HTML);List<String> listImgUrl = new ArrayList<String>();while (matcher.find()) {listImgUrl.add(matcher.group());}return listImgUrl;}/*** * 获取ImageSrc地址 *  * @param listImageUrl * @return */private List<String> getImageSrc(List<String> listImageUrl) {List<String> listImgSrc = new ArrayList<String>();for (String image : listImageUrl) {Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);while (matcher.find()) {listImgSrc.add(matcher.group().substring(0, matcher.group().length() - 1));}}return listImgSrc;}/*** * 下载图片 *  * @param 
listImgSrc */private void Download(List<String> listImgSrc) {try {for (String url : listImgSrc) {String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());URL uri = new URL(url);InputStream in = uri.openStream();FileOutputStream fo = new FileOutputStream(new File(imageName));byte[] buf = new byte[1024];int length = 0;System.out.println("开始下载:" + url);while ((length = in.read(buf, 0, buf.length)) != -1) {fo.write(buf, 0, length);}in.close();fo.close();System.out.println(imageName + "下载完成");}} catch (Exception e) {System.out.println("下载失败");}}}


0 0
原创粉丝点击