网络爬虫之获取图片到本地
来源:互联网 发布:防范电信网络诈骗宣传 编辑:程序博客网 时间:2024/04/30 13:09
/*
* Created on Aug 26, 2011 2:41:26 PM
*
* HtmlSourceGetter.java
*
* NOTICE OF PROPRIETARY RIGHTS
*
* This program is a confidential trade secret and the property of author. Use, examination,
* reproduction, disassembly, decompiling, transfer and/or disclosure to others of
* all or any part of this software program are strictly prohibited except by express
* written agreement with author.
*
* --------------------------------------------------------------------------------------
* Modification History
* Date Author Version Description
* Aug 26, 2011 Cross 1.0 New
* --------------------------------------------------------------------------------------
*/
package com.cross.tools;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
public class HtmlSourceGetter {
// Class-wide (static) handles for an HTTP connection, a buffered input stream and an
// output stream. All three are initialised to null.
// NOTE(review): these are mutable static fields, so they are shared by every caller of this
// class; confirm whether they need to be fields at all rather than method locals.
private static HttpURLConnection con = null;
private static BufferedInputStream bis = null;
private static OutputStream out = null;
public static void getSource(String url) {
public static void parseHTML(String url, String keyword) {
private static void processNodeList(NodeList list, String keyword) {
/**
 * Downloads every image referenced by an {@code <img>} tag on the given page.
 *
 * <p>The page at {@code url} is parsed as UTF-8, all {@link ImageTag} nodes are collected, and
 * each image is fetched over HTTP and written to
 * {@code c:/cross/<index>_<currentTimeMillis><extension>}. The target directory is created when
 * it does not exist yet.
 *
 * <p>Errors are reported on {@code System.err} and never propagated: a failure while parsing the
 * page ends the call, while a failure on a single image only skips that image.
 *
 * @param url address of the HTML page to scan for images
 */
public static void extractLinks(String url) {
    NodeList images;
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("UTF-8");
        images = parser.extractAllNodesThatMatch(new NodeClassFilter(ImageTag.class));
    } catch (Exception e) {
        // printStackTrace() prints the real trace; println(e.getStackTrace()) only printed
        // the array's identity string.
        e.printStackTrace();
        return;
    }

    File targetDir = new File("c:/cross/");
    if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
        System.err.println("Cannot create target directory: " + targetDir);
        return;
    }

    for (int i = 0; i < images.size(); i++) {
        ImageTag image = (ImageTag) images.elementAt(i);
        String imageURL = image.getImageURL();
        String imageText = image.getText();
        System.out.println("imageURL:" + imageURL);
        System.out.println("imageText:" + imageText);
        try {
            String fileName = i + "_" + System.currentTimeMillis() + extensionOf(imageURL);
            download(imageURL, new File(targetDir, fileName));
        } catch (Exception e) {
            // One broken image must not prevent the remaining ones from being saved.
            e.printStackTrace();
        }
    }
}

/**
 * Copies the resource behind {@code imageURL} into {@code target}, always releasing the
 * connection and both streams, including when the transfer fails half-way.
 *
 * @param imageURL absolute HTTP(S) address of the image
 * @param target   file to (over)write with the downloaded bytes
 * @throws IOException if connecting, reading or writing fails
 */
private static void download(String imageURL, File target) throws IOException {
    HttpURLConnection connection = (HttpURLConnection) new URL(imageURL).openConnection();
    BufferedInputStream in = null;
    OutputStream sink = null;
    try {
        connection.connect();
        in = new BufferedInputStream(connection.getInputStream());
        sink = new FileOutputStream(target);
        byte[] buf = new byte[1024];
        int size;
        while ((size = in.read(buf)) != -1) {
            sink.write(buf, 0, size);
        }
    } finally {
        // Nested finally blocks: a failure closing one resource must not leak the others.
        try {
            if (sink != null) {
                sink.close();
            }
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } finally {
                connection.disconnect();
            }
        }
    }
}

/**
 * Returns the file extension (including the leading dot) of the last path segment of
 * {@code imageURL}, or an empty string when that segment has none. Query string and fragment
 * are ignored so the result is always usable as a file-name suffix.
 *
 * @param imageURL address of the image
 * @return e.g. {@code ".png"} for {@code http://host/a/b.png?x=1}; {@code ""} for
 *         {@code http://host/image}
 */
private static String extensionOf(String imageURL) {
    String path = imageURL;
    int cut = path.indexOf('?');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    cut = path.indexOf('#');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    String lastSegment = path.substring(path.lastIndexOf('/') + 1);
    int dot = lastSegment.lastIndexOf('.');
    return dot < 0 ? "" : lastSegment.substring(dot);
}
/**
 * Command-line entry point: runs {@code extractLinks} against the local server root.
 *
 * <p>Alternative invocations previously kept here as commented-out code:
 * {@code parseHTML("http://localhost:8080/test/", "@")},
 * {@code parseHTML("http://localhost:8080/test/", "img")} and
 * {@code extractLinks("http://localhost:8080/test/")}.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    final String startPage = "http://localhost:8080/";
    extractLinks(startPage);
}
}
/*
* Created on Aug 26, 2011 2:41:26 PM
*
* HtmlSourceGetter.java
*
* NOTICE OF PROPRIETARY RIGHTS
*
* This program is a confidential trade secret and the property of author. Use, examination,
* reproduction, disassembly, decompiling, transfer and/or disclosure to others of
* all or any part of this software program are strictly prohibited except by express
* written agreement with author.
*
* --------------------------------------------------------------------------------------
* Modification History
* Date Author Version Description
* Aug 26, 2011 Cross 1.0 New
* --------------------------------------------------------------------------------------
*/
package com.cross.tools;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
public class HtmlSourceGetter {
// Class-wide (static) handles for an HTTP connection, a buffered input stream and an
// output stream. All three are initialised to null.
// NOTE(review): these are mutable static fields, so they are shared by every caller of this
// class; confirm whether they need to be fields at all rather than method locals.
private static HttpURLConnection con = null;
private static BufferedInputStream bis = null;
private static OutputStream out = null;
public static void getSource(String url) {
public static void parseHTML(String url, String keyword) {
private static void processNodeList(NodeList list, String keyword) {
/**
 * Downloads every image referenced by an {@code <img>} tag on the given page.
 *
 * <p>The page at {@code url} is parsed as UTF-8, all {@link ImageTag} nodes are collected, and
 * each image is fetched over HTTP and written to
 * {@code c:/cross/<index>_<currentTimeMillis><extension>}. The target directory is created when
 * it does not exist yet.
 *
 * <p>Errors are reported on {@code System.err} and never propagated: a failure while parsing the
 * page ends the call, while a failure on a single image only skips that image.
 *
 * @param url address of the HTML page to scan for images
 */
public static void extractLinks(String url) {
    NodeList images;
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("UTF-8");
        images = parser.extractAllNodesThatMatch(new NodeClassFilter(ImageTag.class));
    } catch (Exception e) {
        // printStackTrace() prints the real trace; println(e.getStackTrace()) only printed
        // the array's identity string.
        e.printStackTrace();
        return;
    }

    File targetDir = new File("c:/cross/");
    if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
        System.err.println("Cannot create target directory: " + targetDir);
        return;
    }

    for (int i = 0; i < images.size(); i++) {
        ImageTag image = (ImageTag) images.elementAt(i);
        String imageURL = image.getImageURL();
        String imageText = image.getText();
        System.out.println("imageURL:" + imageURL);
        System.out.println("imageText:" + imageText);
        try {
            String fileName = i + "_" + System.currentTimeMillis() + extensionOf(imageURL);
            download(imageURL, new File(targetDir, fileName));
        } catch (Exception e) {
            // One broken image must not prevent the remaining ones from being saved.
            e.printStackTrace();
        }
    }
}

/**
 * Copies the resource behind {@code imageURL} into {@code target}, always releasing the
 * connection and both streams, including when the transfer fails half-way.
 *
 * @param imageURL absolute HTTP(S) address of the image
 * @param target   file to (over)write with the downloaded bytes
 * @throws IOException if connecting, reading or writing fails
 */
private static void download(String imageURL, File target) throws IOException {
    HttpURLConnection connection = (HttpURLConnection) new URL(imageURL).openConnection();
    BufferedInputStream in = null;
    OutputStream sink = null;
    try {
        connection.connect();
        in = new BufferedInputStream(connection.getInputStream());
        sink = new FileOutputStream(target);
        byte[] buf = new byte[1024];
        int size;
        while ((size = in.read(buf)) != -1) {
            sink.write(buf, 0, size);
        }
    } finally {
        // Nested finally blocks: a failure closing one resource must not leak the others.
        try {
            if (sink != null) {
                sink.close();
            }
        } finally {
            try {
                if (in != null) {
                    in.close();
                }
            } finally {
                connection.disconnect();
            }
        }
    }
}

/**
 * Returns the file extension (including the leading dot) of the last path segment of
 * {@code imageURL}, or an empty string when that segment has none. Query string and fragment
 * are ignored so the result is always usable as a file-name suffix.
 *
 * @param imageURL address of the image
 * @return e.g. {@code ".png"} for {@code http://host/a/b.png?x=1}; {@code ""} for
 *         {@code http://host/image}
 */
private static String extensionOf(String imageURL) {
    String path = imageURL;
    int cut = path.indexOf('?');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    cut = path.indexOf('#');
    if (cut >= 0) {
        path = path.substring(0, cut);
    }
    String lastSegment = path.substring(path.lastIndexOf('/') + 1);
    int dot = lastSegment.lastIndexOf('.');
    return dot < 0 ? "" : lastSegment.substring(dot);
}
/**
 * Command-line entry point: runs {@code extractLinks} against the local server root.
 *
 * <p>Alternative invocations previously kept here as commented-out code:
 * {@code parseHTML("http://localhost:8080/test/", "@")},
 * {@code parseHTML("http://localhost:8080/test/", "img")} and
 * {@code extractLinks("http://localhost:8080/test/")}.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    final String startPage = "http://localhost:8080/";
    extractLinks(startPage);
}
}
- 网络爬虫之获取图片到本地
- 获取网络图片,并保存到本地
- python爬虫之抓取网页中的图片到本地
- 获取网络图片并下载到本地相册
- Qt 获取网络gif图片并保存到本地显示
- 网络爬虫,获取页面图片
- python爬虫抓取图片到本地
- python爬虫-下载图片到本地目录
- 保存网络图片到本地
- 下载网络图片到本地
- 从网络获取图片资源缓存到本地,第二次进入直接从本地加载。
- volley获取网络图片,生成本地图片
- Android网络获取图片保存到本地,并在ViewPager中显示图片
- python 获取网络图片并下载到本地(由网络源码改编)
- android 将网络获取的图片保存到本地的Sqlite数据库中(包括json获取,解析,获取网络图片,创建本地数据库)
- Listview网络获取图片并保存到本地及取出显示
- 2、跑马灯效果-获取网络图片并缓存到本地
- 如何从网络上获取图片转换成字节流或者保存到本地
- 00040.Oracle数据库编码格式查看及乱码解决思路
- tomcat 出问题的解决思路
- SSH 和 JSP+servlet+JavaBean到底谁更优秀
- 利用Flex 4.5 SDK和Flash Builder 4.5开发的web和移动参考应用(一)
- PHP后台守护进程的实现
- 网络爬虫之获取图片到本地
- Android 2.2 和2.3 的音频驱动
- ComboBoxEdit 设置选项值(单选——多选)
- 今天开博客了,记录下以后在大数据处理方面的点点滴滴
- 七种寻址方式
- Demo系统开发思考
- 00041.Oracle 内存缓冲区信息获取
- Android的触摸屏和触摸按键的支持
- 00042.Oracle静态参数文件创建