java jsoup 网络爬虫
来源:互联网 发布:闪电购入驻淘宝了吗 编辑:程序博客网 时间:2024/05/15 23:54
- package com.iteye.injavawetrust.gethtml;
- import java.util.Map;
- import java.util.Set;
- /**
- *
- * @author InJavaWeTrust
- *
- */
- public class GetHtml {
- private static JsoupUtil ju = JsoupUtil.getInstance();
- public void getHtml(String url){
- ju.initUnvisitedUrl(url);
- //待访问队列不为空,已访问<10000
- while(!HtmlQueue.unVisitedUrlsEmpty() && HtmlQueue.getVisitedUrlNum() < 10000){
- String visitUrl = (String) HtmlQueue.unVisitedUrlDeQueue();
- if(null == visitUrl){
- continue;
- }
- Map<String, String> map = ju.getHtml(visitUrl);
- if(0 == map.size()){
- continue;
- }
- ju.getHtmlToLocal(map); //将html写如本地文件
- HtmlQueue.addVisitedUrl(visitUrl); //将该URL放入到已访问的URL队列中
- Set<String> links = ju.getAllUrl(visitUrl); //提取出下载网页中的URL
- for(String link :links){
- if(!link.startsWith(Constants.URL)){
- continue;
- }
- if(!ju.checkURL(link)){
- continue;
- }
- // 新的未访问的 URL加入队待访问的 URL队列
- HtmlQueue.addUnvisitedUrl(link);
- }
- }
- }
- public static void main(String[] args) {
- GetHtml gh = new GetHtml();
- long starTime = System.currentTimeMillis();
- gh.getHtml(Constants.URL);
- long endTime = System.currentTimeMillis();
- System.out.println("共下载 [" + HtmlQueue.getVisitedUrlNum() + "]");
- System.out.println("用时 [" + ju.msToss(endTime - starTime) + "]");
- }
- }
- 下载
- package com.iteye.injavawetrust.gethtml;
- import java.util.HashSet;
- import java.util.PriorityQueue;
- import java.util.Queue;
- import java.util.Set;
/**
 * Process-wide crawl state: the set of URLs already visited and the FIFO
 * queue of URLs still waiting to be visited.
 *
 * <p>All state is static and unsynchronized; the class is intended for a
 * single-threaded crawl.
 *
 * @author InJavaWeTrust
 */
public class HtmlQueue {

    /** URLs that have already been visited. */
    private static final Set<String> visitedUrl = new HashSet<String>();

    /**
     * URLs waiting to be visited, in discovery (first-in, first-out) order.
     * A PriorityQueue would hand URLs back in lexicographic order instead,
     * which skews the crawl towards alphabetically early paths. The class name
     * is fully qualified so the file's import list does not need to change.
     */
    private static final Queue<String> unVisitedUrl = new java.util.ArrayDeque<String>();

    /**
     * Returns the live queue of pending URLs (not a copy).
     *
     * @return the pending-URL queue
     */
    public static Queue<String> getUnVisitedUrl() {
        return unVisitedUrl;
    }

    /**
     * Records a URL as visited.
     *
     * @param url the visited URL
     */
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    /**
     * Removes a URL from the visited set.
     *
     * @param url the URL to forget
     */
    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    /**
     * Removes and returns the oldest pending URL.
     *
     * @return the next URL to visit (a {@code String}), or {@code null} when
     *         the queue is empty
     */
    public static Object unVisitedUrlDeQueue() {
        return unVisitedUrl.poll();
    }

    /**
     * Enqueues a URL for visiting unless it is {@code null}, blank, already
     * visited, or already pending, so that each URL is visited at most once.
     *
     * @param url candidate URL
     */
    public static void addUnvisitedUrl(String url) {
        if (url == null || url.trim().equals("")) {
            return;
        }
        if (visitedUrl.contains(url) || unVisitedUrl.contains(url)) {
            return;
        }
        unVisitedUrl.add(url);
    }

    /**
     * Returns how many URLs have been visited so far.
     *
     * @return size of the visited set
     */
    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    /**
     * Tells whether there are no more pending URLs.
     *
     * @return {@code true} if the pending queue is empty, {@code false} otherwise
     */
    public static boolean unVisitedUrlsEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
- package com.iteye.injavawetrust.gethtml;
/**
 * Crawl configuration shared by the crawler classes.
 *
 * @author InJavaWeTrust
 */
public class Constants {

    /** Seed URL of the crawl; also the prefix a link must have to be followed. */
    public static final String URL = "http://www.jqu.net.cn";

    /** Path prefix (a Windows directory, trailing separator included) for stored pages. */
    public static final String HTMLPATH = "E:\\InJavaWeTrust\\jsoup\\html\\";
}
- package com.iteye.injavawetrust.gethtml;
- import java.io.File;
- import java.io.FileNotFoundException;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.OutputStreamWriter;
- import java.io.Writer;
- import java.text.SimpleDateFormat;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.Iterator;
- import java.util.Map;
- import java.util.Set;
- import java.util.TimeZone;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /** 下载
- *
- * @author InJavaWeTrust
- *
- */
- public class JsoupUtil {
- private JsoupUtil() {
- }
- private static final JsoupUtil instance = new JsoupUtil();
- public static JsoupUtil getInstance() {
- return instance;
- }
- /**
- * 初始化待访问URL队列
- * @param url URL
- */
- public void initUnvisitedUrl(String url) {
- HtmlQueue.addUnvisitedUrl(url);
- }
- /**
- * 获取URL
- * @param url URL
- * @return URL
- */
- public Set<String> getAllUrl(String url){
- Set<String> urls = new HashSet<String>();
- try {
- Document document = Jsoup.connect(url).timeout(5000).get();
- Elements hrefs = document.select("a[href]");
- Iterator<Element> hrefIter = hrefs.iterator();
- while (hrefIter.hasNext()) {
- Element href = hrefIter.next();
- urls.add(href.attr("href"));
- }
- Elements srcs = document.select("img[src]");
- Iterator<Element> srcIter = srcs.iterator();
- while(srcIter.hasNext()){
- Element src = srcIter.next();
- urls.add(src.attr("src"));
- }
- Elements opts = document.select("option[value]");
- Iterator<Element> optIter = opts.iterator();
- while(optIter.hasNext()){
- Element opt = optIter.next();
- urls.add(opt.attr("value"));
- }
- Elements links = document.select("link[href]");
- Iterator<Element> linkIter = links.iterator();
- while(linkIter.hasNext()){
- Element li = linkIter.next();
- urls.add(li.attr("href"));
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- return urls;
- }
- /**
- * 得到html内容和html名称
- * @param url URL
- * @return map[html-内容;title-名称]
- */
- public Map<String, String> getHtml(String url){
- Map<String, String> map = new HashMap<String, String>();
- try {
- Document document = Jsoup.connect(url).timeout(5000).get();
- map.put("html", document.html());
- map.put("title", url.replaceAll("/", "").replaceAll(":", ""));
- } catch (IOException e) {
- System.out.println("This is html has exception [" + url + "]");
- System.out.println(e.getMessage());
- }
- return map;
- }
- /**
- * URL是否以html结尾
- * @param url
- * @return true-是;false-否
- */
- public boolean checkURL(String url) {
- String html = url.substring(url.lastIndexOf(".") + 1);
- return "html".equals(html) ? true : false;
- }
- /**
- * 将html写入本地文件
- * @param htmlText html内容
- * @param htmlName html名称
- */
- public void getHtmlToLocal(Map<String, String> map){
- Writer writer = null;
- try {
- writer = new OutputStreamWriter(new FileOutputStream(new File(
- Constants.HTMLPATH + map.get("title"))), "UTF-8");
- writer.write(map.get("html"));
- writer.flush();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- if (writer != null) {
- try {
- writer.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
- }
- /**
- * 毫秒转换成hhmmss
- * @param ms 毫秒
- * @return hh:mm:ss
- */
- public String msToss(long ms) {
- SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
- formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
- String ss = formatter.format(ms);
- return ss;
- }
- }
0 0
- java jsoup 网络爬虫
- Jsoup网络爬虫
- htmlunit+jsoup 网络爬虫
- Jsoup之网络爬虫
- Java爬虫之Jsoup
- java爬虫:JSOUP
- Java编写网络爬虫笔记(第三部分:Jsoup的强大)
- 【正完成】Java基于Jsoup的网络爬虫工具实现
- [置顶]Java丨jsoup网络爬虫模拟登录思路解析
- Jsoup做的网络爬虫
- jsoup 的用法网络爬虫
- Jsoup 网络爬虫 学习例子
- 使用JSOUP实现网络爬虫
- Jsoup 爬虫 抓取网络图片
- Jsoup -- 网络爬虫解析器
- 使用JSOUP实现网络爬虫
- Java实现爬虫给App提供数据(Jsoup 网络爬虫)
- Java实现爬虫给App提供数据(Jsoup 网络爬虫)
- Exception in thread "main" java.lang.ClassNotFoundException
- 【ejabberd 16.09】【windows】安装完conf、database、logs文件夹在什么地方?
- 程序员经验总结
- 网络请求之-NSURLSession
- Android资源文件目录assets和raw
- java jsoup 网络爬虫
- 手摸手教你写Slack的Loading动画
- matlab如何打包生成exe
- 微信浏览器内打开App Store链接,并跳转到对应的app页面
- android.view.WindowManager$BadTokenException: Unable to add window -- token android.os.BinderProxy@2
- 大话程序猿眼里的高并发架构
- BZOJ4712 洪水
- 再谈JS各种高度命令
- Spring 常用注解