使用HtmlParser读取论坛图片
来源:互联网 发布:阿里云域名备案成功后 编辑:程序博客网 时间:2024/05/16 15:39
心血来潮,用HtmlParser组件写了一个读取论坛图片的程序,能够自动把图片保存到硬盘上。
http://hintcnuie.javaeye.com/blog/172132
- package com.chen;
- import java.io.BufferedInputStream;
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.HashSet;
- import java.util.Set;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.tags.ImageTag;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.tags.TitleTag;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- /
- public class HttpGet {
- private static int BUFFER_SIZE = 8096;// 缓冲区大小
- /**
- * 将HTTP资源另存为文件
- *
- * @param destUrl
- * String
- * @param title
- * @param fileName
- * String
- * @throws IOException
- * @throws Exception
- */
- public static void saveToFile(String destUrl, String title) {
- FileOutputStream fos = null;
- BufferedInputStream bis = null;
- HttpURLConnection httpUrl = null;
- URL url = null;
- byte[] buf = new byte[BUFFER_SIZE];
- int size = 0;
- int pos = destUrl.lastIndexOf('/');
- String fileName = "";
- if (pos != -1)
- fileName = destUrl.substring(pos + 1, destUrl.length());
- else
- fileName = destUrl.substring(destUrl.length() - 10, destUrl
- .length());
- String path = "D:" + File.separator + "temp" + File.separator
- + "images" + File.separator;
- System.out.println("title: " + title);
- if (null != title && !"".equals(title)) {
- File file = new File(path + title + File.separator);
- if (!file.exists()) {
- file.mkdirs();
- }
- path = file.getPath();
- }
- path = path + File.separator + fileName;
- System.out.print("/t" + path);
- // 建立链接
- try {
- url = new URL(destUrl);
- httpUrl = (HttpURLConnection) url.openConnection();
- // 连接指定的资源
- httpUrl.connect();
- // 获取网络输入流
- bis = new BufferedInputStream(httpUrl.getInputStream());
- // 建立文件
- fos = new FileOutputStream(path);
- // 保存文件
- while ((size = bis.read(buf)) != -1)
- fos.write(buf, 0, size);
- fos.close();
- bis.close();
- httpUrl.disconnect();
- } catch (MalformedURLException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException ex) {
- // TODO Auto-generated catch block
- ex.printStackTrace();
- }
- System.out.println(" /tsave completely");
- }
- /**
- * 主方法
- *
- * @param argv
- * String[]
- */
- public static void main(String argv[]) {
- String url = "http://xxx.com";
- // getImagesFromSinglePage(url);
- try {
- String page = null;
- for(int i=2;i<=105;i++){
- page="http://xx.com/html/13/13_"+i+".shtml";
- getPageLinks(page);
- }
- getPageLinks(url);
- // String title=getTitle(url);
- // getImages(url,title);
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- // getImagesByParser(url);
- }
- private static void getPageLinks(String page) throws ParserException {
- Parser myParser = new Parser(page);
- // 设置编码
- myParser.setEncoding("UTF-8");
- String filterStr = "a";
- NodeFilter filter = new TagNameFilter(filterStr);
- NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
- System.out.println("size: " + nodeList.size());
- for (int i = 0; i < nodeList.size(); i++) {
- LinkTag linkTag = (LinkTag) nodeList.elementAt(i);
- String link = linkTag.getLink();
- String text = linkTag.getLinkText();
- text = TextProcess(text);
- if (link.endsWith(".shtml") && text.length() > 2) {
- try {
- getImages(link,text);
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- }
- }
- private static String TextProcess(String text) {
- text = text.trim();
- text = text.replaceAll(">", "");
- text = text.replaceAll("<", "");
- text = text.replaceAll("/", "");
- text = text.replaceAll(">", "");
- text = text.replaceAll(" ", "");
- int pos = 0;
- if ((pos = text.indexOf(":-")) != -1)
- text = text.substring(pos + 2);
- pos = text.indexOf("-");
- if (pos != -1)
- text = text.substring(0, pos);
- pos = text.indexOf("-");
- if (pos != -1)
- text = text.substring(0, pos);
- text = text.replace(".", "");
- text = text.replaceAll(",", "");
- text = text.replaceAll(",", "");
- return text;
- }
- private static String getTitle(String url) throws ParserException {
- Parser myParser = new Parser(url);
- // 设置编码
- myParser.setEncoding("UTF-8");
- String titleTag = "title";
- NodeFilter titleFilter = new TagNameFilter(titleTag);
- NodeList titleList = myParser.extractAllNodesThatMatch(titleFilter);
- int size = titleList.size();
- String title = null;
- if (size == 1) {
- TitleTag titleT = (TitleTag) titleList.elementAt(0);
- title = titleT.getTitle();
- }
- return title;
- }
- public static void getImages(String resource, String title)
- throws Exception {
- // Set
- Set<String> imagesSet = new HashSet<String>();
- Parser myParser = new Parser(resource);
- // 设置编码
- myParser.setEncoding("UTF-8");
- String filterStr = "img";
- NodeFilter filter = new TagNameFilter(filterStr);
- NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
- System.out.println("size: " + nodeList.size());
- for (int i = 0; i < nodeList.size(); i++) {
- ImageTag imageTag = (ImageTag) nodeList.elementAt(i);
- String imageUrl = imageTag.getImageURL();
- System.out.println("iamge " + i + ": " + imageUrl);
- if (!imagesSet.contains(imageUrl)) {
- System.out.print("/t saving ...");
- saveToFile(imageTag.getImageURL(), title);
- } else {
- System.out.print("/t exist already,no need to save");
- }
- }
- }
- }
- 使用HtmlParser读取论坛图片
- python使用HTMLParser保存网页图片
- HTMLParser 使用
- HTMLParser使用
- HTMLParser使用
- HtmlParser使用
- HTMLParser使用
- HTMLParser使用
- htmlparser使用
- HTMLParser使用
- HTMLParser使用
- HTMLParser使用
- HTMLParser使用
- HTMLParser使用
- HTMLParser使用
- HTMLParser使用
- 使用HtmlParser 提取百度贴吧中的图片
- 使用HtmlParser使用心得
- 当你优柔寡断和胡思乱想的时候需要看的17条哲理
- WebService 接口调用指南
- 记团队【性能优化】主题会
- Windows 语音编程初步
- 在windows xp环境下如何完全卸载 oracle9i
- 使用HtmlParser读取论坛图片
- 在ActionScript Virtual Machine 上运行C/C++代码
- 有些路总得一个人走
- winxp下secure crt登录上linux机器,显示中文乱码的问题
- 使用LaTeX
- 惯例
- Platform Builder 5.0的Build OS菜单详解:
- VC 进程间通信
- OPENFILENAME 结构体定义