百度搜索-爬虫保存结果
来源:互联网 发布:mysql启动服务命令 编辑:程序博客网 时间:2024/04/26 06:39
百度搜索-爬虫保存结果
BaiduResult.java
package com.reptileBaidu.domain;public class BaiduResult { /** 标题 */ private String title; /** 地址 */ private String url; /** 概述 */ private String assumably; /** 关键字 */ private String searchContent; public BaiduResult(){ } public BaiduResult(String searchContent) { this.searchContent = searchContent; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getAssumably() { return assumably; } public void setAssumably(String assumably) { this.assumably = assumably; } public String getSearchContent() { return searchContent; } public void setSearchContent(String searchContent) { this.searchContent = searchContent; } @Override public String toString() { return "[title=" + title + ", url=" + url + ", assumably=" + assumably + "]"; }}
ConnnectionManager.java
package com.reptileBaidu.sql.util;

import java.sql.DriverManager;

import com.mysql.jdbc.Connection;

/**
 * Hands out one MySQL connection per thread through a ThreadLocal cache.
 * A cached connection that is found closed (or broken) is transparently
 * replaced with a fresh one.
 */
public class ConnnectionManager {

    /** Per-thread cached connection. */
    private static final ThreadLocal<Connection> connectionHolder = new ThreadLocal<Connection>();

    // NOTE(review): credentials are hard-coded in the JDBC URL — move user/password
    // to external configuration before this leaves a test environment.
    private static final String BETADBURL =
            "jdbc:mysql://192.168.1.10:3306/reptilebaidu?useUnicode=true&characterEncoding=utf8&autoReconnect=true&user=root&password=pass4you";

    /**
     * Returns this thread's cached connection, opening (and caching) a fresh
     * one when no usable connection exists yet.
     *
     * @return an open connection, or null when the database is unreachable
     */
    public static Connection getConnectionFromThreadLocal() {
        Connection conn = connectionHolder.get();
        boolean usable = false;
        if (conn != null) {
            try {
                usable = !conn.isClosed();
            } catch (Exception e) {
                // BUGFIX: the original aborted and returned null here; a connection
                // that fails even isClosed() is dead — treat it as closed and
                // fall through to open a replacement instead.
                System.out.println("[ThreadLocal Get Connection Error]" + e.getMessage());
            }
        }
        if (usable) {
            return conn;
        }
        Connection fresh = getConnection();
        connectionHolder.set(fresh);
        System.out.println("[Thread]" + Thread.currentThread().getName());
        return fresh;
    }

    /**
     * Opens a brand-new connection to the crawler database.
     *
     * @return the new connection, or null when it cannot be established
     */
    public static Connection getConnection() {
        Connection conn = null;
        try {
            // Explicit driver registration — only needed for pre-JDBC-4 drivers,
            // kept for compatibility with the bundled mysql driver.
            Class.forName("com.mysql.jdbc.Driver");
            conn = (Connection) DriverManager.getConnection(BETADBURL);
        } catch (Exception e) {
            System.out.println("[Get Connection Error]" + e.getMessage());
        }
        return conn;
    }
}
DataUpdater.java
package com.reptileBaidu.sql.util;

import java.sql.SQLException;
import java.util.List;

import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;

import com.reptileBaidu.domain.BaiduResult;

/**
 * Runnable that batch-inserts a list of crawl results into the
 * {@code reptilebaidu} table using a single parameterized statement.
 */
public class DataUpdater implements Runnable {

    private PreparedStatement pst;

    /** Rows to persist; assigned once in the constructor. */
    private List<BaiduResult> baiduResults;

    /** Parameterized insert — values are bound, never concatenated (no SQL injection). */
    private final String SQL = "insert into reptilebaidu (`title` ,`url` , `assumably` , searchContent) VALUES (?, ? ,?,?)";

    public DataUpdater(List<BaiduResult> baiduResults) {
        this.baiduResults = baiduResults;
    }

    /** Binds every result into one batch and executes it on this thread's connection. */
    public void run() {
        try {
            Connection conn = ConnnectionManager.getConnectionFromThreadLocal();
            if (conn == null) {
                // BUGFIX: getConnectionFromThreadLocal() returns null when the DB is
                // unreachable; the original dereferenced it unconditionally and died
                // with an opaque NullPointerException.
                System.err.println("[SQL ERROR MESSAGE]no database connection available");
                return;
            }
            pst = (PreparedStatement) conn.prepareStatement(SQL);
            for (BaiduResult baiduResult : baiduResults) {
                pst.setString(1, baiduResult.getTitle());
                pst.setString(2, baiduResult.getUrl());
                pst.setString(3, baiduResult.getAssumably());
                pst.setString(4, baiduResult.getSearchContent());
                pst.addBatch();
            }
            pst.executeBatch();
        } catch (Exception e) {
            System.err.println("[SQL ERROR MESSAGE]" + e.getMessage());
        } finally {
            close(pst);
        }
    }

    /**
     * Quietly closes the statement; safe to call with null.
     *
     * @param pst statement to close, may be null
     */
    public void close(PreparedStatement pst) {
        if (pst != null) {
            try {
                pst.close();
            } catch (SQLException e) {
                System.err.println("[Close Statement Error]" + e.getMessage());
            }
        }
    }
}
QunarThreadPoolExecutor.java
package com.reptileBaidu.sql.util;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

/**
 * ThreadPoolExecutor subclass that measures per-task execution time and
 * reports the completed-task count and average latency when the pool
 * terminates.
 */
public class QunarThreadPoolExecutor extends ThreadPoolExecutor {

    /** Start timestamp of the task currently running on each worker thread. */
    private ThreadLocal<Long> start = new ThreadLocal<Long>();

    /** Total wall time (ms) accumulated over all completed tasks. */
    private AtomicLong totals = new AtomicLong();

    /** Number of completed tasks. */
    private AtomicInteger tasks = new AtomicInteger();

    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler);
    }

    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler);
    }

    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory);
    }

    /**
     * @param corePoolSize    core pool size
     * @param maximumPoolSize maximum pool size
     * @param keepAliveTime   idle keep-alive for threads above the core size;
     *                        applies to core threads too if allowCoreThreadTimeOut(true)
     * @param unit            unit of keepAliveTime
     * @param workQueue       queue holding tasks waiting for a worker
     */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
    }

    /** Records the start time on the worker thread just before each task runs. */
    @Override
    protected void beforeExecute(Thread t, Runnable r) {
        super.beforeExecute(t, r);
        start.set(System.currentTimeMillis());
    }

    /** Accumulates the elapsed time of each finished task. */
    @Override
    protected void afterExecute(Runnable r, Throwable t) {
        super.afterExecute(r, t);
        tasks.incrementAndGet();
        Long begin = start.get();
        // BUGFIX: tolerate a missing start timestamp instead of NPE-ing, and
        // clear the ThreadLocal so pooled worker threads don't retain stale values.
        if (begin != null) {
            totals.addAndGet(System.currentTimeMillis() - begin.longValue());
            start.remove();
        }
    }

    /** Prints the task count and average latency once the pool has fully terminated. */
    @Override
    protected void terminated() {
        super.terminated();
        int done = tasks.get();
        // BUGFIX: the original divided by tasks.get() unconditionally and threw
        // ArithmeticException when the pool shut down without completing any task.
        long avg = (done == 0) ? 0 : totals.get() / done;
        System.out.println("完成" + done + "个任务,平均耗时: [" + avg + "] ms");
    }
}
DataUpdaterMain.java
package com.reptileBaidu.sql.util;

import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import com.reptileBaidu.domain.BaiduResult;

/**
 * Saves crawl results asynchronously on a small instrumented thread pool.
 */
public class DataUpdaterMain {

    /** Unbounded task queue feeding the pool. */
    private LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();

    /** 5 core / 8 max workers; spare threads idle out after 5 minutes. */
    private QunarThreadPoolExecutor qunarThreadPoolExecutor =
            new QunarThreadPoolExecutor(5, 8, 5, TimeUnit.MINUTES, queue);

    /**
     * Shuts the pool down, waiting up to 20 minutes for queued inserts to
     * finish before forcing termination.
     */
    public void shutThreadPool() {
        if (qunarThreadPoolExecutor != null) {
            qunarThreadPoolExecutor.shutdown();
            try {
                if (!qunarThreadPoolExecutor.awaitTermination(20, TimeUnit.MINUTES)) {
                    qunarThreadPoolExecutor.shutdownNow();
                }
            } catch (InterruptedException e) {
                // BUGFIX: the original swallowed the interrupt; force shutdown and
                // restore the interrupted status so callers can observe it.
                qunarThreadPoolExecutor.shutdownNow();
                Thread.currentThread().interrupt();
                System.err.println("[ThreadPool Close Error]" + e.getMessage());
            }
        }
    }

    /**
     * Queues one batch of results for asynchronous insertion.
     *
     * @param baiduResults rows to save
     * @return always true — the task is merely enqueued, not yet executed
     */
    public boolean update(List<BaiduResult> baiduResults) {
        qunarThreadPoolExecutor.execute(new DataUpdater(baiduResults));
        return true;
    }
}
ReptileBaidu.java
package com.reptileBaidu.sql.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;

import com.reptileBaidu.domain.BaiduResult;

public class ReptileBaidu {

    /**
     * Runs a Baidu search and parses the result page into BaiduResult objects.
     * Pairs each abstract div ({@code c-abstract}) with the following tools div
     * ({@code c-tools}) whose data-tools JSON holds the title and URL.
     *
     * @param searchContent search keyword
     * @param startNum      offset of the first result (default 0)
     * @param onePageNum    results per page (Baidu caps this at 50)
     * @return parsed results; empty when the page cannot be fetched or parsed
     * @throws JSONException when a result's data-tools JSON is malformed
     */
    public static List<BaiduResult> baiduSearch(String searchContent, int startNum, int onePageNum)
            throws JSONException {
        // BUGFIX: the keyword must be URL-encoded — the original concatenated raw
        // (typically Chinese) text into the query string, producing an invalid URL.
        String encoded;
        try {
            encoded = URLEncoder.encode(searchContent, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            encoded = searchContent; // UTF-8 is always available; raw fallback only for form's sake
        }
        String url = "http://www.baidu.com/s?word=" + encoded + "&cl=3&pn=" + startNum + "&rn=" + onePageNum;
        List<BaiduResult> baiduList = new ArrayList<BaiduResult>();
        try {
            Parser parser = new Parser(url);
            // parser.setEncoding("utf-8");
            // Keep only <div class="c-abstract"> (summary) and <div class="c-tools"> (title/url JSON).
            NodeList list = parser.extractAllNodesThatMatch(new OrFilter(
                    new AndFilter(new HasAttributeFilter("class", "c-abstract"), new NodeClassFilter(Div.class)),
                    new AndFilter(new HasAttributeFilter("class", "c-tools"), new NodeClassFilter(Div.class))));
            // Result under construction; carries the keyword.
            BaiduResult baidu = new BaiduResult(searchContent);
            for (int i = 0; i < list.size(); i++) {
                Div div = (Div) list.elementAt(i);
                if (div.getAttribute("id") == null) {
                    // c-abstract div has no id: remember the summary for the next c-tools div.
                    baidu.setAssumably(div.getStringText());
                } else if (baidu.getAssumably() != null) {
                    // BUGFIX: the original tested getAssumably() != "" — a String
                    // reference comparison that is effectively always true. The
                    // intent is: only emit a result once a summary has been seen.
                    String data = div.getAttribute("data-tools");
                    if (data != null) {
                        JSONObject json = new JSONObject(data);
                        baidu.setTitle(json.getString("title"));
                        baidu.setUrl(json.getString("url"));
                        System.out.println(baidu.toString());
                        baiduList.add(baidu);
                        baidu = new BaiduResult(searchContent);
                    }
                }
            }
        } catch (ParserException e) {
            System.out.println(url + "-->不存在");
            System.out.println(e.getMessage());
        }
        return baiduList;
    }

    public static void main(String[] args) {
        // Partition the crawl into ten ranges of 1000 results, one thread each.
        for (int i = 1; i < 11; i++) {
            SearchRunnable r0 = new SearchRunnable("阿拉善", 1000 * (i - 1), 1000 * i);
            Thread t0 = new Thread(r0);
            t0.start();
        }
    }
}

/**
 * Crawls one range of Baidu results for a keyword, 50 per request, and
 * persists them in batches of at least 100 via DataUpdaterMain.
 */
class SearchRunnable implements Runnable {

    private static Logger logger = Logger.getLogger(SearchRunnable.class);

    /** Search keyword. */
    private String searchContent;
    /** First result offset (inclusive). */
    private int startNum = 0;
    /** Last result offset (exclusive). */
    private int endNum = 10000;

    public SearchRunnable(String searchContent, int startNum, int endNum) {
        this.searchContent = searchContent;
        this.startNum = startNum;
        this.endNum = endNum;
    }

    public void run() {
        System.out.println("开启线程");
        long start = System.currentTimeMillis();
        List<BaiduResult> baiduList = new ArrayList<BaiduResult>();
        DataUpdaterMain dataUpdaterMain = new DataUpdaterMain();
        int size = 0;
        try {
            for (int i = startNum; i < endNum; i += 50) {
                // Crawl one page of up to 50 results.
                baiduList.addAll(ReptileBaidu.baiduSearch(searchContent, i, 50));
                if (baiduList.size() >= 100) {
                    // Flush a full batch to the database pool.
                    dataUpdaterMain.update(baiduList);
                    size += baiduList.size();
                    baiduList = new ArrayList<BaiduResult>();
                }
            }
            // BUGFIX: only flush the tail when something is left; the original
            // enqueued an empty batch at the end of every run.
            if (!baiduList.isEmpty()) {
                dataUpdaterMain.update(baiduList);
                size += baiduList.size();
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            dataUpdaterMain.shutThreadPool();
            logger.info("耗时[" + (System.currentTimeMillis() - start) + "]ms,保存" + size + "条数据");
        }
    }
}
0 0
- 百度搜索-爬虫保存结果
- 百度搜索结果爬虫
- python 爬虫百度搜索结果
- 【爬虫】爬取百度搜索结果页面
- python爬虫爬取百度搜索结果
- 爬取百度搜索结果的爬虫
- Python爬虫爬取百度搜索结果——邮箱地址
- 【百度爬虫系列 I】多关键字图片搜索结果汇总
- 屏蔽百度爬虫搜索
- python爬虫(11)身边的搜索专家——获取百度搜索结果
- 抓取百度搜索结果
- python使用get在百度搜索并保存第一页搜索结果
- 17.6.5 如何用python爬虫百度图片里面可加关键词的搜索结果
- 【百度爬虫系列 II】关键字搜索url结果汇总(给定关键字和页数)
- 百度搜索结果的调整
- php截取百度搜索结果
- 百度搜索结果URL解密
- python抓取百度搜索结果
- sql2005全库查找关键字
- iOS开发UI篇—Quartz2D简单使用(三)
- go语言string、int、int64互相转换
- 头疼的Maven 下载
- Openlayers之测量距离与面积
- 百度搜索-爬虫保存结果
- linux系统中如何进入退出vim编辑器,方法及区别
- MFC学习笔记——剪贴板通信
- 二分思路题Anton and Fairy Tale
- 图片流量节省大杀器:基于腾讯云CDN的sharpP自适应图片技术实践
- JavaScript事件
- 树莓派源的更新
- CalendarListview
- viewpager入门