Java多线程爬虫和存储

来源:互联网 发布:我是淘宝骗保师 编辑:程序博客网 时间:2024/06/05 08:57
import org.apache.http.HttpHeaders;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.util.EntityUtils;import org.dom4j.Element;import java.io.IOException;import java.util.regex.Matcher;import java.util.regex.Pattern;public class GetBookInfoThread extends Thread{    private CloseableHttpClient httpClient;    private String webAddress;    private Element rootElement;    private Pattern bookAuthorRegex;    private Pattern bookPublishRegex;    private Pattern bookIsbnRegex;    private Pattern bookImgRegex;    private String bookName;    /**     *     * @param httpClient    用这个操作抓取     * @param webAddress    这个是抓取的网址     * @param rootElement   这个是一个xml文档的根节点,用这个来操作加入新的子节点     */    public GetBookInfoThread(CloseableHttpClient httpClient,String webAddress,String bookName,Element rootElement,Pattern bookAuthorRegex,Pattern bookPublishRegex,Pattern bookIsbnRegex,Pattern bookImgRegex) {        this.httpClient = httpClient;        this.webAddress = webAddress;        this.rootElement = rootElement;        this.bookAuthorRegex = bookAuthorRegex;        this.bookPublishRegex = bookPublishRegex;        this.bookIsbnRegex = bookIsbnRegex;        this.bookName = bookName;        this.bookImgRegex = bookImgRegex;    }    @Override    public void run() {        HttpGet getBookInfo = new HttpGet(webAddress);        getBookInfo.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30");        CloseableHttpResponse bookInfoResponse;        String bookInfoCode = null;//书籍具体信息网页源码        try {            bookInfoResponse = httpClient.execute(getBookInfo);            if (bookInfoResponse.getStatusLine().getStatusCode() != 200) {                System.out.println("获取书本具体信息时出错,页面地址:" + webAddress + "错误信息" + bookInfoResponse.getStatusLine());                return;            }   
         bookInfoCode = EntityUtils.toString(bookInfoResponse.getEntity());        } catch (IOException e) {            e.printStackTrace();        }        Matcher bookAuthorMatcher = bookAuthorRegex.matcher(bookInfoCode); //匹配作者        Matcher bookPublishMatcher = bookPublishRegex.matcher(bookInfoCode);    //匹配出版商        Matcher bookIsbnMatcher = bookIsbnRegex.matcher(bookInfoCode);  //匹配isbn        Matcher bookImgMatcher = bookImgRegex.matcher(bookInfoCode);    //匹配图片地址        String bookName = this.bookName;        String bookAuthor = "";        String bookPublish = "";        String bookIsbn = "";        String bookLink = webAddress;        String bookImg = "";        if (bookAuthorMatcher.find()) {            bookAuthor = bookAuthorMatcher.group(1);        }        if (bookPublishMatcher.find()) {            bookPublish = bookPublishMatcher.group(1);        }        if (bookIsbnMatcher.find()) {            bookIsbn = bookIsbnMatcher.group(1);        }        if (bookImgMatcher.find()) {            bookImg = bookImgMatcher.group(1);        }//                    System.out.println(bookName + "-" + bookAuthor + "-" + bookPublish + "-" + bookIsbn);        Element bookElement = rootElement.addElement("book");//新建一个书的标签        bookElement.addAttribute("id",String.valueOf(Main.bookId++));        bookElement.addElement("name").setText(bookName);        bookElement.addElement("author").setText(bookAuthor);        bookElement.addElement("publish").setText(bookPublish);        bookElement.addElement("isbn").setText(bookIsbn);        bookElement.addElement("count").setText(String.valueOf((int)(Math.random() * 10) + 3));        bookElement.addElement("link").setText(bookLink);        bookElement.addElement("img").setText(bookImg);        System.out.println("抓取了:" + webAddress + " " + bookName);    }}
import org.apache.http.HttpHeaders;import org.apache.http.HttpHost;import org.apache.http.client.methods.CloseableHttpResponse;import org.apache.http.client.methods.HttpGet;import org.apache.http.impl.client.CloseableHttpClient;import org.apache.http.impl.client.HttpClients;import org.apache.http.util.EntityUtils;import org.dom4j.Document;import org.dom4j.DocumentHelper;import org.dom4j.Element;import java.io.IOException;import java.util.*;import java.util.regex.Matcher;import java.util.regex.Pattern;public class Main {    CloseableHttpClient httpClient;    static int bookId = 496;    Map<String,Integer> proxyMap;//ip->端口    List<String> ipList;//从这个list中读出ip,再由ip从map中读出端口    int i = 0;//根据这个从list中取出ip,换上对应的代理    public static void main(String[] args) {        Main m = new Main();//        List<String> tagList = m.getTagList();        List<String> tagList = new LinkedList<String>();//        tagList.add("经典");//        tagList.add("日本文学");//        tagList.add("散文");//        tagList.add("中国文学");//        tagList.add("算法");//        tagList.add("童话");//        tagList.add("外国文学");//        tagList.add("文学");//        tagList.add("小说");//        tagList.add("漫画");//        tagList.add("诗词");//        tagList.add("心理学");        tagList.add("摄影");        tagList.add("理财");        tagList.add("经济学");        m.pullAndWrite(tagList,10);    }    public Main() {//        HttpHost proxy = new HttpHost("122.225.106.35",80);//        httpClient = HttpClients.custom().setProxy(proxy).build();        httpClient = HttpClients.createDefault();        setProxyMap();    }    public void setProxyMap() {        proxyMap = new HashMap<String, Integer>();        ipList = new LinkedList<String>();        proxyMap.put("211.68.122.171",80);ipList.add("211.68.122.171");    }    public List<String> getTagList() {        HttpGet getTag = new HttpGet("http://book.douban.com/tag/");        getTag.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like 
Gecko) Chrome/12.0.742.122 Safari/534.30");        CloseableHttpResponse tagPageResponse = null;        String tagPageCode = null;//网页源码        try {            tagPageResponse = httpClient.execute(getTag);            tagPageCode = EntityUtils.toString(tagPageResponse.getEntity());            tagPageResponse.close();        } catch (IOException e) {            e.printStackTrace();        } finally {            try {                tagPageResponse.close();            } catch (IOException e) {                e.printStackTrace();            }        }        Pattern p = Pattern.compile("class=\"tag\">(.*?)</a>");        Matcher m = p.matcher(tagPageCode);        List<String> resultTagList = new LinkedList<String>();        while (m.find()) {            resultTagList.add(m.group(1));        }        return resultTagList;    }    /**     *     * @param tagList  要抓的图书的类别     * @param maxPageNum 每种图书最多抓取的页数     */    public void pullAndWrite(List<String> tagList,int maxPageNum) {        Pattern bookAddressRegex = Pattern.compile("href=\"(.*?)\" class=\"title\" target=\"_blank\">(.*?)</a>");   //获取具体书籍网址的正则        Pattern bookAuthorRegex = Pattern.compile("(?s)<span class=\"pl\"> 作者</span>:.*?>(.*?)</a>");//匹配作者        Pattern bookPublishRegex = Pattern.compile("<span class=\"pl\">出版社:</span> (.*?)<br/>");        Pattern bookIsbnRegex = Pattern.compile("<span class=\"pl\">ISBN:</span> (.*?)<br/>");        Pattern bookImgRegex = Pattern.compile("<img src=\"(.*?)\" title=\"点击看大图\"");        //分别抓取每一种类别的书籍        for (String tag:tagList) {            int nowPageNum = 0;//目前正在抓取的页数            Document newDocument = DocumentHelper.createDocument();            Element rootElement = newDocument.addElement("root");            while (nowPageNum < maxPageNum) {                System.out.println(1);                String nowPageAddress = "http://www.douban.com/tag/" + tag + "/book?start=" + nowPageNum * 15;//当前页的网址                HttpGet getBooksPage = new HttpGet(nowPageAddress);    
            getBooksPage.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30");                CloseableHttpResponse booksPageResponse;                Matcher m = null;                try {                    System.out.println(2);                    booksPageResponse = httpClient.execute(getBooksPage);                    System.out.println(3);                    m = bookAddressRegex.matcher(EntityUtils.toString(booksPageResponse.getEntity()));                    booksPageResponse.close();                    if (booksPageResponse.getStatusLine().getStatusCode() != 200) {                        System.out.println("抓 " + nowPageAddress + " 时出错:");                        System.out.println("错误信息:" + booksPageResponse.getStatusLine());                        changeProxy();                        continue;//换个代理继续爬当前页                    }                } catch (IOException e) {                    e.printStackTrace();                }                //具体每一本书,具体抓取                int findCount = 0;//找到的书籍的数目                List<Thread> threadList = new LinkedList<Thread>();                while (m.find()) {                    threadList.add(new GetBookInfoThread(httpClient, m.group(1), m.group(2), rootElement, bookAuthorRegex, bookPublishRegex, bookIsbnRegex,bookImgRegex));                    findCount++;                }                //没有知道到代表这种类别的书都找完了,那么直接退出此类书籍的查找                if (findCount == 0) {                    break;                }                for (Thread thread:threadList) {                    thread.start();                }                for (Thread thread:threadList) {                    try {                        thread.join();                    } catch (InterruptedException e) {                        e.printStackTrace();                    }                }                nowPageNum++;            }            //一个类别爬完了再写入            new 
WriteBookInfoToFile(rootElement,"/home/geekgao/book/" + tag + ".xml").start();  //另开一个线程写入文件        }    }    private void changeProxy() {        if (i >= ipList.size()) {            System.out.println("代理用完了,退出");            System.exit(0);        }        String ip = ipList.get(i++);        httpClient = HttpClients.custom().setProxy(new HttpHost(ip,proxyMap.get(ip))).build();        System.out.println("换代理啦,使用代理:" + ip + ",端口:" + proxyMap.get(ip));    }}
import org.dom4j.Element;
import org.dom4j.io.XMLWriter;

import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

/**
 * Thread that serializes an XML element tree to a file.
 *
 * <p>The file is always written as UTF-8 rather than in the platform default
 * charset: only the element is written (no XML declaration), so a reader will
 * assume UTF-8 when parsing the file again.
 */
public class WriteBookInfoToFile extends Thread {
    private final Element root;
    private final String fileAddress;

    /**
     * @param root        element to write, together with all of its descendants
     * @param fileAddress path of the file to create or overwrite
     */
    public WriteBookInfoToFile(Element root,String fileAddress) {
        this.root = root;
        this.fileAddress = fileAddress;
    }

    /** Writes the element tree; I/O failures are reported on stderr. */
    @Override
    public void run() {
        Writer fileWriter = null;
        try {
            fileWriter = new OutputStreamWriter(new FileOutputStream(fileAddress), "UTF-8");
            XMLWriter xmlWriter = new XMLWriter(fileWriter);
            xmlWriter.write(root);
            xmlWriter.close();
            System.out.println("[" + fileAddress + "]写入成功");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Releases the file handle when write() failed before close();
            // closing an already closed writer is harmless.
            if (fileWriter != null) {
                try {
                    fileWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
import org.dom4j.Document;import org.dom4j.DocumentException;import org.dom4j.Element;import org.dom4j.io.SAXReader;import java.io.File;import java.sql.DriverManager;import java.sql.SQLException;import java.sql.Statement;import java.util.List;public class WriteInfoToDB {    public static void main(String[] args) {        File folder = new File("/home/geekgao/book");        File[] XMLS = folder.listFiles();        SAXReader reader = new SAXReader();        Statement statement = null;    //用这个执行sql语句        try {            Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动            statement = DriverManager.getConnection("jdbc:mysql://localhost:3306/BookManage?user=root&password=root").createStatement();        } catch (SQLException e) {            e.printStackTrace();        } catch (ClassNotFoundException e) {            e.printStackTrace();        }        for (File f:XMLS) {            if (f.isDirectory()) {                continue;            }            Document document = null;            try {                document = reader.read(f);            } catch (DocumentException e) {                e.printStackTrace();            }            Element root = document.getRootElement();            List<Element> books = root.elements();            for (Element book:books) {                String name = null;                String author = null;                String publish = null;                String isbn = null;                String count = null;                String link = null;                String img = null;                List<Element> b = book.elements();                for (Element info:b) {                    if (info.getName().equals("name")) {                        name = info.getText();                    } else if (info.getName().equals("author")) {                        author = info.getText();                    } else if (info.getName().equals("publish")) {                        publish = info.getText();                    } else if 
(info.getName().equals("isbn")) {                        isbn = info.getText();                    } else if (info.getName().equals("count")) {                        count = info.getText();                    } else if (info.getName().equals("link")) {                        link = info.getText();                    } else if (info.getName().equals("img")) {                        img = info.getText();                    }//                    System.out.println(info.getName() + ": " + info.getText());                }                String sql = "INSERT INTO Book(bookPublish,bookName,bookAuthor,bookTag,bookIsbn,bookCount,bookRestCount,bookLink,bookImg) VALUES ('" + publish + "','" + name + "','" + author + "','" + f.getName().split("\\.")[0] + "','" + isbn + "','" + count + "','" + count + "','" + link + "','" + img + "');";                try {                    statement.execute(sql);                } catch (SQLException e) {                    System.err.println("sql语句处错误:" + e.getMessage());                    System.err.println("sql语句:" + sql);                }            }        }    }}
0 0