图片爬虫程序 JAVA
来源:互联网 发布:淮南网络宾馆 编辑:程序博客网 时间:2024/06/05 12:25
package com.liyiwen.Crawler;import java.io.*;import java.net.MalformedURLException;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.Collection;import java.util.SplittableRandom;import java.util.concurrent.*;import java.util.regex.Matcher;import java.util.regex.Pattern;/** * Created by dell on 2015/7/18. *//** * 正则表达式中\表示转义,java字符出串\也表示转义 */public class ImgCrawler implements Runnable { public static void main(String[] urls) throws FileNotFoundException, InterruptedException, IOException, ExecutionException{ ImgCrawler imgCrawler = new ImgCrawler(); ArrayList<Thread> threads = new ArrayList<Thread>(); for (int i = 0; i < ImgCrawler.threads; ++i){ Thread thread = new Thread(imgCrawler); thread.start(); threads.add(thread); } Thread.currentThread().sleep(10000); for (Thread t : threads){ t.interrupt(); } System.out.println(imgCrawler.getWantedUrls().toString() + "李意文"); printToFile(imgCrawler.getWantedUrls()); System.out.println("succeed"); } public static void printToFile(Collection<String> strs) throws FileNotFoundException, IOException{ File file = new File("crawler.html"); if (!file.exists()){ file.createNewFile(); } Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8")); writer.write(1); for (String str : strs){ writer.append(str + "\r\n"); } writer.flush(); } static private int threads; static { threads = Runtime.getRuntime().availableProcessors(); } public BlockingQueue<String> getWantedUrls() { return wantedUrls; } private BlockingQueue<String> wantedUrls; private BlockingQueue<String> handledUrls; public ImgCrawler(){ wantedUrls = new LinkedBlockingQueue<String>(); handledUrls = new LinkedBlockingQueue<String>(); handledUrls.add("http://www.dedeshe.com/html/article/2015-7/index26785.html"); } @Override public void run() { try{ for (int i = 0; i < 1000; i++ ){ String url = null; url = handledUrls.take(); crawl(url); if (Thread.currentThread().isInterrupted()){ break; } } } catch (Throwable throwable){ System.out.println(throwable.getMessage()); } } private void crawl(String url) throws InterruptedException{ try { URL crawledUrl = new URL(url); URLConnection urlConnection = crawledUrl.openConnection(); BufferedReader reader = new BufferedReader(new InputStreamReader(urlConnection.getInputStream(), "utf-8")); String line = null; ArrayList<String> matchUrls = null; ArrayList<String> hrefs = null; while ((line = reader.readLine()) != null) { // System.out.println(line); matchUrls = matchWantedUrl(line); if (null != matchUrls && !matchUrls.isEmpty()) { for (String matchUrl : matchUrls) { wantedUrls.put(matchUrl); } } hrefs = matchHref(line, url); if (null != hrefs && !hrefs.isEmpty()) { for (String href : hrefs) { handledUrls.put(href); } } } } catch (MalformedURLException e){ System.out.println(e.getMessage()); System.out.println("url 错误"); } catch (IOException e){ System.out.println("不能打开连接"); System.out.println(e.getMessage()); } } private ArrayList<String> matchWantedUrl(String line){ ArrayList<String> wantedUrsl = new ArrayList<String>(); Pattern pattern = Pattern.compile("<img .*?src=\"http.*?\".*?>"); Matcher matcher = pattern.matcher(line); while (matcher.find()){ wantedUrsl.add(matcher.group(0)); } return wantedUrsl; } private ArrayList<String> matchHref(String line, String url){ String rootURL = null; Pattern rootURLPattern = Pattern.compile("(http.+com).*"); Matcher rootURLMatcher = rootURLPattern.matcher(url); if (rootURLMatcher.find()){ rootURL = rootURLMatcher.group(1); System.out.println("根目录: " + rootURL); } ArrayList<String> wantedHrefs = new ArrayList<String>(); Pattern pattern = Pattern.compile("<a href=\"(.+?)\""); Matcher matcher = pattern.matcher(line); while (matcher.find()){ if (!matcher.group(1).startsWith("http") && rootURL != null){ wantedHrefs.add(rootURL + matcher.group(1)); System.out.println(rootURL + matcher.group(1)); }else{ wantedHrefs.add(matcher.group(1)); System.out.println(matcher.group(1)); } } return wantedHrefs; }}
0 0
- 图片爬虫程序 JAVA
- 图片爬虫程序
- 图片爬虫程序
- java小爬虫程序
- java网络爬虫程序
- Java爬虫小程序
- Java爬虫网页抓取图片
- 简单的java爬虫程序
- java实现网络爬虫程序
- 爬虫程序开发指南(java)
- 简单的java爬虫程序
- java实现爬虫爬网站图片
- java爬虫抓取网络上的图片
- 【初学】java爬虫并抓取图片保存
- java爬虫爬取百度图片
- Java爬虫爬取网站图片
- java爬虫爬取美女图片
- 爬虫实战:一个简易 Java 爬虫程序的实现
- zoj 3430(ac自动机)
- Trie树的java实现
- hdu 5442 Favorite Donut 后缀数组
- 安卓控件使用系列2:TextView实现图文(图片和文字)混排
- 【bzoj4292】 [PA2015]Równanie 乱搞
- 图片爬虫程序 JAVA
- 2015弱校联盟(1) -A. Easy Math
- 黑马程序员-----IO流
- 单链表顺序存储相关操作的c语言实现
- Android:动画效果translate、scale、alpha、rotate详解
- springmvc整合mybatis出现Could not autowire field:No matching bean of type错误
- JAVA实现旋转数组的最小数字问题(《剑指offer》)
- CSS列表
- Windows7 平台下Python+NLTK环境搭建