Java多线程爬虫和存储
来源:互联网 发布:我是淘宝骗保师 编辑:程序博客网 时间:2024/06/05 08:57
import org.apache.http.HttpHeaders;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.util.EntityUtils;import org.dom4j.Element;import java.io.IOException;import java.util.regex.Matcher;import java.util.regex.Pattern;public class GetBookInfoThread extends Thread{ private CloseableHttpClient httpClient; private String webAddress; private Element rootElement; private Pattern bookAuthorRegex; private Pattern bookPublishRegex; private Pattern bookIsbnRegex; private Pattern bookImgRegex; private String bookName; /** * * @param httpClient 用这个操作抓取 * @param webAddress 这个是抓取的网址 * @param rootElement 这个是一个xml文档的根节点,用这个来操作加入新的子节点 */ public GetBookInfoThread(CloseableHttpClient httpClient,String webAddress,String bookName,Element rootElement,Pattern bookAuthorRegex,Pattern bookPublishRegex,Pattern bookIsbnRegex,Pattern bookImgRegex) { this.httpClient = httpClient; this.webAddress = webAddress; this.rootElement = rootElement; this.bookAuthorRegex = bookAuthorRegex; this.bookPublishRegex = bookPublishRegex; this.bookIsbnRegex = bookIsbnRegex; this.bookName = bookName; this.bookImgRegex = bookImgRegex; } @Override public void run() { HttpGet getBookInfo = new HttpGet(webAddress); getBookInfo.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30"); CloseableHttpResponse bookInfoResponse; String bookInfoCode = null;//书籍具体信息网页源码 try { bookInfoResponse = httpClient.execute(getBookInfo); if (bookInfoResponse.getStatusLine().getStatusCode() != 200) { System.out.println("获取书本具体信息时出错,页面地址:" + webAddress + "错误信息" + bookInfoResponse.getStatusLine()); return; } bookInfoCode = EntityUtils.toString(bookInfoResponse.getEntity()); } catch (IOException e) { e.printStackTrace(); } Matcher bookAuthorMatcher = bookAuthorRegex.matcher(bookInfoCode); //匹配作者 Matcher bookPublishMatcher = 
bookPublishRegex.matcher(bookInfoCode); //匹配出版商 Matcher bookIsbnMatcher = bookIsbnRegex.matcher(bookInfoCode); //匹配isbn Matcher bookImgMatcher = bookImgRegex.matcher(bookInfoCode); //匹配图片地址 String bookName = this.bookName; String bookAuthor = ""; String bookPublish = ""; String bookIsbn = ""; String bookLink = webAddress; String bookImg = ""; if (bookAuthorMatcher.find()) { bookAuthor = bookAuthorMatcher.group(1); } if (bookPublishMatcher.find()) { bookPublish = bookPublishMatcher.group(1); } if (bookIsbnMatcher.find()) { bookIsbn = bookIsbnMatcher.group(1); } if (bookImgMatcher.find()) { bookImg = bookImgMatcher.group(1); }// System.out.println(bookName + "-" + bookAuthor + "-" + bookPublish + "-" + bookIsbn); Element bookElement = rootElement.addElement("book");//新建一个书的标签 bookElement.addAttribute("id",String.valueOf(Main.bookId++)); bookElement.addElement("name").setText(bookName); bookElement.addElement("author").setText(bookAuthor); bookElement.addElement("publish").setText(bookPublish); bookElement.addElement("isbn").setText(bookIsbn); bookElement.addElement("count").setText(String.valueOf((int)(Math.random() * 10) + 3)); bookElement.addElement("link").setText(bookLink); bookElement.addElement("img").setText(bookImg); System.out.println("抓取了:" + webAddress + " " + bookName); }}
import org.apache.http.HttpHeaders;import org.apache.http.HttpHost;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.dom4j.Document;import org.dom4j.DocumentHelper;import org.dom4j.Element;import java.io.IOException;import java.util.*;import java.util.regex.Matcher;import java.util.regex.Pattern;public class Main { CloseableHttpClient httpClient; static int bookId = 496; Map<String,Integer> proxyMap;//ip->端口 List<String> ipList;//从这个list中读出ip,再由ip从map中读出端口 int i = 0;//根据这个从list中取出ip,换上对应的代理 public static void main(String[] args) { Main m = new Main();// List<String> tagList = m.getTagList(); List<String> tagList = new LinkedList<String>();// tagList.add("经典");// tagList.add("日本文学");// tagList.add("散文");// tagList.add("中国文学");// tagList.add("算法");// tagList.add("童话");// tagList.add("外国文学");// tagList.add("文学");// tagList.add("小说");// tagList.add("漫画");// tagList.add("诗词");// tagList.add("心理学"); tagList.add("摄影"); tagList.add("理财"); tagList.add("经济学"); m.pullAndWrite(tagList,10); } public Main() {// HttpHost proxy = new HttpHost("122.225.106.35",80);// httpClient = HttpClients.custom().setProxy(proxy).build(); httpClient = HttpClients.createDefault(); setProxyMap(); } public void setProxyMap() { proxyMap = new HashMap<String, Integer>(); ipList = new LinkedList<String>(); proxyMap.put("211.68.122.171",80);ipList.add("211.68.122.171"); } public List<String> getTagList() { HttpGet getTag = new HttpGet("http://book.douban.com/tag/"); getTag.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30"); CloseableHttpResponse tagPageResponse = null; String tagPageCode = null;//网页源码 try { tagPageResponse = httpClient.execute(getTag); tagPageCode = 
EntityUtils.toString(tagPageResponse.getEntity()); tagPageResponse.close(); } catch (IOException e) { e.printStackTrace(); } finally { try { tagPageResponse.close(); } catch (IOException e) { e.printStackTrace(); } } Pattern p = Pattern.compile("class=\"tag\">(.*?)</a>"); Matcher m = p.matcher(tagPageCode); List<String> resultTagList = new LinkedList<String>(); while (m.find()) { resultTagList.add(m.group(1)); } return resultTagList; } /** * * @param tagList 要抓的图书的类别 * @param maxPageNum 每种图书最多抓取的页数 */ public void pullAndWrite(List<String> tagList,int maxPageNum) { Pattern bookAddressRegex = Pattern.compile("href=\"(.*?)\" class=\"title\" target=\"_blank\">(.*?)</a>"); //获取具体书籍网址的正则 Pattern bookAuthorRegex = Pattern.compile("(?s)<span class=\"pl\"> 作者</span>:.*?>(.*?)</a>");//匹配作者 Pattern bookPublishRegex = Pattern.compile("<span class=\"pl\">出版社:</span> (.*?)<br/>"); Pattern bookIsbnRegex = Pattern.compile("<span class=\"pl\">ISBN:</span> (.*?)<br/>"); Pattern bookImgRegex = Pattern.compile("<img src=\"(.*?)\" title=\"点击看大图\""); //分别抓取每一种类别的书籍 for (String tag:tagList) { int nowPageNum = 0;//目前正在抓取的页数 Document newDocument = DocumentHelper.createDocument(); Element rootElement = newDocument.addElement("root"); while (nowPageNum < maxPageNum) { System.out.println(1); String nowPageAddress = "http://www.douban.com/tag/" + tag + "/book?start=" + nowPageNum * 15;//当前页的网址 HttpGet getBooksPage = new HttpGet(nowPageAddress); getBooksPage.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30"); CloseableHttpResponse booksPageResponse; Matcher m = null; try { System.out.println(2); booksPageResponse = httpClient.execute(getBooksPage); System.out.println(3); m = bookAddressRegex.matcher(EntityUtils.toString(booksPageResponse.getEntity())); booksPageResponse.close(); if (booksPageResponse.getStatusLine().getStatusCode() != 200) { System.out.println("抓 " + nowPageAddress + " 时出错:"); 
System.out.println("错误信息:" + booksPageResponse.getStatusLine()); changeProxy(); continue;//换个代理继续爬当前页 } } catch (IOException e) { e.printStackTrace(); } //具体每一本书,具体抓取 int findCount = 0;//找到的书籍的数目 List<Thread> threadList = new LinkedList<Thread>(); while (m.find()) { threadList.add(new GetBookInfoThread(httpClient, m.group(1), m.group(2), rootElement, bookAuthorRegex, bookPublishRegex, bookIsbnRegex,bookImgRegex)); findCount++; } //没有知道到代表这种类别的书都找完了,那么直接退出此类书籍的查找 if (findCount == 0) { break; } for (Thread thread:threadList) { thread.start(); } for (Thread thread:threadList) { try { thread.join(); } catch (InterruptedException e) { e.printStackTrace(); } } nowPageNum++; } //一个类别爬完了再写入 new WriteBookInfoToFile(rootElement,"/home/geekgao/book/" + tag + ".xml").start(); //另开一个线程写入文件 } } private void changeProxy() { if (i >= ipList.size()) { System.out.println("代理用完了,退出"); System.exit(0); } String ip = ipList.get(i++); httpClient = HttpClients.custom().setProxy(new HttpHost(ip,proxyMap.get(ip))).build(); System.out.println("换代理啦,使用代理:" + ip + ",端口:" + proxyMap.get(ip)); }}
import org.dom4j.Element;
import org.dom4j.io.XMLWriter;

import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Thread that serialises an XML element (and its subtree) to a file.
 *
 * <p>The file is always written as UTF-8, independent of the platform default
 * charset, so that non-ASCII book titles survive a round trip through an XML
 * parser. I/O errors are printed and otherwise ignored.
 */
public class WriteBookInfoToFile extends Thread {
    private final Element root;
    private final String fileAddress;

    /**
     * @param root        element to write
     * @param fileAddress path of the file to create or overwrite
     */
    public WriteBookInfoToFile(Element root, String fileAddress) {
        this.root = root;
        this.fileAddress = fileAddress;
    }

    @Override
    public void run() {
        try {
            Path target = Paths.get(fileAddress);
            Path parent = target.getParent();
            if (parent != null && !Files.isDirectory(parent)) {
                Files.createDirectories(parent);
            }

            // try-with-resources closes the file even when write() fails half way.
            try (Writer fileWriter = Files.newBufferedWriter(target, StandardCharsets.UTF_8)) {
                XMLWriter xmlWriter = new XMLWriter(fileWriter);
                xmlWriter.write(root);
                xmlWriter.flush();
            }
            System.out.println("[" + fileAddress + "]写入成功");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
import org.dom4j.Document;import org.dom4j.DocumentException;import org.dom4j.Element;import org.dom4j.io.SAXReader;import java.io.File;import java.sql.DriverManager;import java.sql.SQLException;import java.sql.Statement;import java.util.List;public class WriteInfoToDB { public static void main(String[] args) { File folder = new File("/home/geekgao/book"); File[] XMLS = folder.listFiles(); SAXReader reader = new SAXReader(); Statement statement = null; //用这个执行sql语句 try { Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动 statement = DriverManager.getConnection("jdbc:mysql://localhost:3306/BookManage?user=root&password=root").createStatement(); } catch (SQLException e) { e.printStackTrace(); } catch (ClassNotFoundException e) { e.printStackTrace(); } for (File f:XMLS) { if (f.isDirectory()) { continue; } Document document = null; try { document = reader.read(f); } catch (DocumentException e) { e.printStackTrace(); } Element root = document.getRootElement(); List<Element> books = root.elements(); for (Element book:books) { String name = null; String author = null; String publish = null; String isbn = null; String count = null; String link = null; String img = null; List<Element> b = book.elements(); for (Element info:b) { if (info.getName().equals("name")) { name = info.getText(); } else if (info.getName().equals("author")) { author = info.getText(); } else if (info.getName().equals("publish")) { publish = info.getText(); } else if (info.getName().equals("isbn")) { isbn = info.getText(); } else if (info.getName().equals("count")) { count = info.getText(); } else if (info.getName().equals("link")) { link = info.getText(); } else if (info.getName().equals("img")) { img = info.getText(); }// System.out.println(info.getName() + ": " + info.getText()); } String sql = "INSERT INTO Book(bookPublish,bookName,bookAuthor,bookTag,bookIsbn,bookCount,bookRestCount,bookLink,bookImg) VALUES ('" + publish + "','" + name + "','" + author + "','" + f.getName().split("\\.")[0] + 
"','" + isbn + "','" + count + "','" + count + "','" + link + "','" + img + "');"; try { statement.execute(sql); } catch (SQLException e) { System.err.println("sql语句处错误:" + e.getMessage()); System.err.println("sql语句:" + sql); } } } }}
0 0
- Java多线程爬虫和存储
- java多线程爬虫实例
- java多线程爬虫实例
- java多线程爬虫实现
- java多线程爬虫
- Java多线程爬虫
- crawler4j java多线程网页爬虫
- 关于JAVA的多线程爬虫
- 百度百科多线程爬虫(Java)
- JAVA 多线程爬虫实例详解
- Java多线程爬虫爬取京东商品信息
- java 多线程实现 爬虫京东搜索商品爬虫
- Python高级爬虫(三):数据存储以及多线程
- 网络爬虫开发技术——数据存储以及多线程
- 多线程实现的Java爬虫程序
- 多线程实现的Java爬虫程序
- 用多线程实现的Java爬虫程序
- 用JAVA实现简单爬虫多线程抓取
- 如何用查询语句还原SQL的备份数据库?
- 【HDU5857】Median(方法)
- RabbitMQ的四种ExChange
- 新兴趣
- Android6.0权限适配
- Java多线程爬虫和存储
- 使用Python的Dataframe取两列时间值相差一年的所有行
- 简单的Qt网络通讯
- Linux下安装QT
- 一些关键字
- Hive JOIN使用详解
- JAVA 枚举使用详解
- Toolbar自定义样式,别致的布局,可复用
- 消除由于使用gcc编译选项-Wunused-parameter而导致的 warning: "unused parameter xxxx"警告