百度搜索-爬虫保存结果

来源:互联网 发布:mysql启动服务命令 编辑:程序博客网 时间:2024/04/26 06:39

百度搜索-爬虫保存结果
BaiduResult.java

package com.reptileBaidu.domain;public class BaiduResult {    /** 标题 */    private String title;    /** 地址 */    private String url;    /** 概述 */    private String assumably;    /** 关键字 */    private String searchContent;    public BaiduResult(){    }    public BaiduResult(String searchContent) {        this.searchContent = searchContent;    }    public String getTitle() {        return title;    }    public void setTitle(String title) {        this.title = title;    }    public String getUrl() {        return url;    }    public void setUrl(String url) {        this.url = url;    }    public String getAssumably() {        return assumably;    }    public void setAssumably(String assumably) {        this.assumably = assumably;    }    public String getSearchContent() {        return searchContent;    }    public void setSearchContent(String searchContent) {        this.searchContent = searchContent;    }    @Override    public String toString() {        return "[title=" + title + ", url=" + url + ", assumably="                + assumably + "]";    }}

ConnnectionManager.java

package com.reptileBaidu.sql.util;

import java.sql.DriverManager;

import com.mysql.jdbc.Connection;

/**
 * Opens MySQL connections and caches one open connection per thread.
 *
 * <p>Both accessors report failures on standard output and return
 * {@code null} instead of throwing, so callers must check the result.
 */
public class ConnnectionManager {

    /**
     * Name of the system property that, when set, replaces the built-in JDBC
     * URL (for example {@code -Dreptilebaidu.db.url=jdbc:mysql://...}).
     */
    private static final String DB_URL_PROPERTY = "reptilebaidu.db.url";

    /** Connection cached for the current thread. */
    private static final ThreadLocal<Connection> connectionHolder = new ThreadLocal<Connection>();

    /**
     * Default JDBC URL, used when {@link #DB_URL_PROPERTY} is not set.
     * NOTE(review): this embeds a host, user and password in source; supply the
     * URL through the system property in any shared environment.
     */
    private static final String BETADBURL = "jdbc:mysql://192.168.1.10:3306/reptilebaidu?useUnicode=true&characterEncoding=utf8&autoReconnect=true&user=root&password=pass4you";

    /**
     * Returns the calling thread's cached connection, opening and caching a
     * new one when none is cached or the cached one is closed.
     *
     * @return an open connection, or {@code null} when one could not be
     *         obtained
     */
    public static Connection getConnectionFromThreadLocal() {
        try {
            Connection cached = connectionHolder.get();
            if (cached != null && !cached.isClosed()) {
                return cached;
            }
            Connection fresh = ConnnectionManager.getConnection();
            if (fresh == null) {
                // Do not cache (or announce) a failed attempt; drop any stale
                // closed connection so the next call starts clean.
                connectionHolder.remove();
                return null;
            }
            connectionHolder.set(fresh);
            System.out.println("[Thread]" + Thread.currentThread().getName());
            return fresh;
        } catch (Exception e) {
            // Print the exception itself: its message alone may be null or
            // meaningless without the exception type.
            System.out.println("[ThreadLocal Get Connection Error]" + e);
        }
        return null;
    }

    /**
     * Opens a new, uncached connection using the configured JDBC URL.
     *
     * @return the new connection, or {@code null} when the driver could not be
     *         loaded or the connection could not be established
     */
    public static Connection getConnection() {
        Connection conn = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String jdbcUrl = System.getProperty(DB_URL_PROPERTY, BETADBURL);
            conn = (Connection) DriverManager.getConnection(jdbcUrl);
        } catch (Exception e) {
            System.out.println("[Get Connection Error]" + e);
        }
        return conn;
    }
}

DataUpdater.java

package com.reptileBaidu.sql.util;

import java.sql.SQLException;
import java.util.List;

import com.mysql.jdbc.PreparedStatement;
import com.reptileBaidu.domain.BaiduResult;

/**
 * Task that inserts a list of search results into the {@code reptilebaidu}
 * table as one JDBC batch, using the running thread's cached connection.
 *
 * <p>Failures are reported on standard error and never propagate out of
 * {@link #run()}.
 */
public class DataUpdater implements Runnable {

    /** Insert statement; the parameters are title, url, abstract and keyword. */
    private static final String SQL = "insert into reptilebaidu (`title` ,`url` , `assumably` , searchContent) VALUES (?, ? ,?,?)";

    /** Results to persist. */
    private final List<BaiduResult> baiduResults;

    /**
     * @param baiduResults results to persist; {@code null} or empty makes
     *                     {@link #run()} a no-op
     */
    public DataUpdater(List<BaiduResult> baiduResults) {
        this.baiduResults = baiduResults;
    }

    /**
     * Inserts all results in a single batch. Does nothing when there are no
     * results, and reports (rather than throws) when no connection is available
     * or the batch fails.
     */
    public void run() {
        if (baiduResults == null || baiduResults.isEmpty()) {
            // Nothing to insert: avoid taking a connection for an empty batch.
            return;
        }
        PreparedStatement statement = null;
        try {
            java.sql.Connection connection = ConnnectionManager.getConnectionFromThreadLocal();
            if (connection == null) {
                // The connection manager signals failure with null; say so
                // explicitly instead of failing with a NullPointerException.
                System.err.println("[SQL ERROR MESSAGE]no database connection available, "
                        + baiduResults.size() + " result(s) not saved");
                return;
            }
            statement = (PreparedStatement) connection.prepareStatement(SQL);
            for (BaiduResult baiduResult : baiduResults) {
                statement.setString(1, baiduResult.getTitle());
                statement.setString(2, baiduResult.getUrl());
                statement.setString(3, baiduResult.getAssumably());
                statement.setString(4, baiduResult.getSearchContent());
                statement.addBatch();
            }
            statement.executeBatch();
        } catch (Exception e) {
            // Print the exception itself so its type is visible even when the
            // message is null.
            System.err.println("[SQL ERROR MESSAGE]" + e);
        } finally {
            close(statement);
        }
    }

    /**
     * Closes the statement, reporting (not throwing) any failure.
     *
     * @param pst statement to close; {@code null} is ignored
     */
    public void close(PreparedStatement pst) {
        if (pst != null) {
            try {
                pst.close();
            } catch (SQLException e) {
                System.err.println("[Close Statement Error]" + e.getMessage());
            }
        }
    }
}

QunarThreadPoolExecutor.java

package com.reptileBaidu.sql.util;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

/**
 * {@link ThreadPoolExecutor} that times every task and, once the pool has
 * terminated, prints the number of tasks run and their average duration.
 *
 * @author 玮
 */
public class QunarThreadPoolExecutor extends ThreadPoolExecutor {

    /** Start time (ms) of the task currently running on each worker thread. */
    private final ThreadLocal<Long> start = new ThreadLocal<Long>();

    /** Sum of the durations (ms) of all finished tasks. */
    private final AtomicLong totals = new AtomicLong();

    /** Number of finished tasks. */
    private final AtomicInteger tasks = new AtomicInteger();

    /**
     * @param corePoolSize    core pool size
     * @param maximumPoolSize maximum pool size
     * @param keepAliveTime   how long idle threads beyond the core size survive
     * @param unit            unit of {@code keepAliveTime}
     * @param workQueue       queue holding tasks awaiting execution
     * @param threadFactory   factory used to create worker threads
     * @param handler         handler for tasks that cannot be accepted
     */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler);
    }

    /**
     * @param corePoolSize    core pool size
     * @param maximumPoolSize maximum pool size
     * @param keepAliveTime   how long idle threads beyond the core size survive
     * @param unit            unit of {@code keepAliveTime}
     * @param workQueue       queue holding tasks awaiting execution
     * @param handler         handler for tasks that cannot be accepted
     */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler);
    }

    /**
     * @param corePoolSize    core pool size
     * @param maximumPoolSize maximum pool size
     * @param keepAliveTime   how long idle threads beyond the core size survive
     * @param unit            unit of {@code keepAliveTime}
     * @param workQueue       queue holding tasks awaiting execution
     * @param threadFactory   factory used to create worker threads
     */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory);
    }

    /**
     * @param corePoolSize    core pool size
     * @param maximumPoolSize maximum pool size
     * @param keepAliveTime   how long idle threads beyond {@code corePoolSize}
     *                        survive; {@code allowCoreThreadTimeOut(true)}
     *                        extends this to core threads as well
     * @param unit            unit of {@code keepAliveTime}
     * @param workQueue       blocking queue holding tasks awaiting execution
     */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
    }

    /** Records the start time of the task about to run on this thread. */
    @Override
    protected void beforeExecute(Thread t, Runnable r) {
        super.beforeExecute(t, r);
        start.set(System.currentTimeMillis());
    }

    /** Counts the finished task and adds its duration to the running total. */
    @Override
    protected void afterExecute(Runnable r, Throwable t) {
        super.afterExecute(r, t);
        long finishedAt = System.currentTimeMillis();
        Long startedAt = start.get();
        // Clear the per-thread value so it does not outlive the task.
        start.remove();
        tasks.incrementAndGet();
        if (startedAt != null) {
            totals.addAndGet(finishedAt - startedAt);
        }
    }

    /**
     * Prints the task count and average duration once the pool has terminated.
     * The average is reported as 0 when no task ever ran, rather than dividing
     * by zero.
     */
    @Override
    protected void terminated() {
        super.terminated();
        int finished = tasks.get();
        long average = finished == 0 ? 0L : totals.get() / finished;
        System.out.println("完成"+ finished +"个任务,平均耗时: [" + average + "] ms");
    }
}

DataUpdaterMain.java

package com.reptileBaidu.sql.util;

import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import com.reptileBaidu.domain.BaiduResult;

/**
 * Saves search results through a private thread pool.
 *
 * @author 玮
 */
public class DataUpdaterMain {

   /** Queue of save tasks waiting for a pool thread. */
   private final LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();

   /** Pool that runs the save tasks. */
   private final QunarThreadPoolExecutor qunarThreadPoolExecutor = new QunarThreadPoolExecutor(5, 8, 5, TimeUnit.MINUTES, queue);

   /**
    * Shuts the pool down: stops accepting tasks, waits up to 20 minutes for
    * queued tasks to finish, then cancels whatever is still running. If the
    * calling thread is interrupted while waiting, the pool is cancelled
    * immediately and the interrupt flag is restored for the caller.
    */
   public void shutThreadPool() {
       if (qunarThreadPoolExecutor == null) {
           return;
       }
       try {
           qunarThreadPoolExecutor.shutdown();
           if (!qunarThreadPoolExecutor.awaitTermination(20 , TimeUnit.MINUTES)) {
               qunarThreadPoolExecutor.shutdownNow();
           }
       } catch (InterruptedException e) {
           // Stop the remaining tasks and keep the interruption visible to
           // the caller instead of swallowing it.
           qunarThreadPoolExecutor.shutdownNow();
           Thread.currentThread().interrupt();
           System.err.println("[ThreadPool Close Error]" + e.getMessage());
       } catch (Exception e) {
           System.err.println("[ThreadPool Close Error]" + e.getMessage());
       }
   }

   /**
    * Queues the given results for saving on a pool thread.
    *
    * @param baiduResults results to save
    * @return always {@code true}; the save itself happens asynchronously and
    *         its outcome is not reflected here
    */
   public boolean update(List<BaiduResult> baiduResults) {
       qunarThreadPoolExecutor.execute(new DataUpdater(baiduResults));
       return true;
   }
}

ReptileBaidu.java

package com.reptileBaidu.sql.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;

import com.reptileBaidu.domain.BaiduResult;

/**
 * Fetches Baidu result pages for a keyword and extracts title, URL and
 * abstract of every hit.
 */
public class ReptileBaidu {

    /**
     * Runs one Baidu search request and parses the returned page.
     *
     * @param searchContent keyword to search for; it is URL-encoded (UTF-8)
     *                      before being placed in the query string
     * @param startNum      index of the first hit to return (0 for the first page)
     * @param onePageNum    hits per page, at most 50
     * @return the parsed hits; empty when the page could not be fetched or parsed
     * @throws JSONException when a hit's {@code data-tools} attribute is not
     *                       valid JSON or lacks {@code title} / {@code url}
     */
    public static List<BaiduResult> baiduSearch(String searchContent,int startNum,int onePageNum) throws JSONException{
        String url = "http://www.baidu.com/s?word="+encodeQueryValue(searchContent)+"&cl=3&pn="+startNum+"&rn="+onePageNum;
        List<BaiduResult> baiduList = new ArrayList<BaiduResult>();
        try {
            Parser parser = new Parser(url);
            // parser.setEncoding("utf-8");
            // Keep only <div class="c-abstract"> (abstract text) and
            // <div class="c-tools"> (JSON metadata) nodes.
            AndFilter abstractDivs = new AndFilter(new HasAttributeFilter("class","c-abstract"),new NodeClassFilter(Div.class));
            AndFilter toolsDivs = new AndFilter(new HasAttributeFilter("class","c-tools"),new NodeClassFilter(Div.class));
            NodeList list = parser.extractAllNodesThatMatch(new OrFilter(abstractDivs, toolsDivs));
            // Result being assembled; it already carries the keyword.
            BaiduResult baidu = new BaiduResult(searchContent);
            for (int i = 0; i < list.size(); i++) {
                Div div = (Div)list.elementAt(i);
                if(div.getAttribute("id") == null){
                    // A div without an id supplies the abstract.
                    baidu.setAssumably(div.getStringText());
                }else if(!"".equals(baidu.getAssumably())){
                    // Value comparison (not reference comparison) against the
                    // empty string; a missing (null) abstract still passes.
                    String data = div.getAttribute("data-tools");
                    if(data != null){
                        JSONObject  json1 = new JSONObject(data);
                        baidu.setTitle(json1.getString("title"));
                        baidu.setUrl(json1.getString("url"));
                        System.out.println(baidu.toString());
                        baiduList.add(baidu);
                        // Start assembling the next hit.
                        baidu = new BaiduResult(searchContent);
                    }
                }
            }
        } catch (ParserException e) {
            System.out.println(url + "-->不存在");
            System.out.println(e.getMessage());
        }
        return baiduList;
    }

    /**
     * Percent-encodes a value for use in a URL query string. A {@code null}
     * value is encoded as the text {@code "null"}, matching what plain string
     * concatenation would have produced.
     */
    private static String encodeQueryValue(String value) {
        try {
            return URLEncoder.encode(String.valueOf(value), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is a charset every Java platform is required to support.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }

    /**
     * Starts ten crawler threads, each covering a consecutive window of 1000
     * hits for the same keyword.
     */
    public static void main(String[] args) {
        for(int i=1;i<11;i++){
            SearchRunnable r0 = new SearchRunnable("阿拉善", 1000*(i-1), 1000*i);
            Thread t0 = new Thread(r0);
            t0.start();
        }
    }
}

/**
 * Crawls the hits in {@code [startNum, endNum)} for one keyword, 50 per
 * request, and hands them to a {@link DataUpdaterMain} for saving.
 */
class SearchRunnable implements Runnable {

    private static final Logger logger = Logger.getLogger(SearchRunnable.class);

    /** Keyword to search for. */
    private String searchContent;

    /** Index of the first hit to fetch (inclusive). */
    private int startNum = 0;

    /** Index at which fetching stops (exclusive). */
    private int endNum = 10000;

    /**
     * @param searchContent keyword to search for
     * @param startNum      index of the first hit to fetch (inclusive)
     * @param endNum        index at which fetching stops (exclusive)
     */
    public SearchRunnable(String searchContent, int startNum, int endNum) {
        this.searchContent = searchContent;
        this.startNum = startNum;
        this.endNum = endNum;
    }

    /**
     * Fetches the configured window page by page, submitting the collected hits
     * for saving whenever at least 100 have accumulated and once more at the
     * end, then shuts the save pool down and logs elapsed time and hit count.
     */
    public void run() {
        System.out.println("开启线程");
        long start = System.currentTimeMillis();
        List<BaiduResult> baiduList = new ArrayList<BaiduResult>();
        DataUpdaterMain dataUpdaterMain = new DataUpdaterMain();
        int size = 0;
        try {
            for (int i = startNum; i < endNum; i += 50) {
                // Crawl one page of up to 50 hits.
                baiduList.addAll(ReptileBaidu.baiduSearch(searchContent, i,50));
                if(baiduList.size() >=100){
                    // Hand the batch over and start a fresh list, so the
                    // submitted one is never modified while being saved.
                    dataUpdaterMain.update(baiduList);
                    size+=baiduList.size();
                    baiduList = new ArrayList<BaiduResult>();
                }
            }
            // Submit whatever is left over from the last pages.
            dataUpdaterMain.update(baiduList);
            size+=baiduList.size();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            dataUpdaterMain.shutThreadPool();
            logger.info("耗时[" + (System.currentTimeMillis() - start) + "]ms,保存"+size+"条数据");
        }
    }
}
0 0
原创粉丝点击