Java


pom


<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>peerslee</groupId>
  <artifactId>Onlylady</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>Onlylady</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5</version>
    </dependency>
    <!-- jsoup HTML parser library @ http://jsoup.org/ -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.2</version>
    </dependency>
    <!-- mongodb -->
    <dependency>
      <groupId>org.mongodb</groupId>
      <artifactId>mongo-java-driver</artifactId>
      <version>3.4.2</version>
    </dependency>
  </dependencies>
</project>
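The generated pom does not pin a compiler level, so the build may fall back to an old source/target default. If that matters, a maven-compiler-plugin block along these lines can be added inside <project> (the plugin version and Java level here are only examples, not part of the original project):

<build>
  <plugins>
    <!-- Pin the Java language level; adjust the versions to match the local JDK. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-compiler-plugin</artifactId>
      <version>3.6.1</version>
      <configuration>
        <source>1.8</source>
        <target>1.8</target>
      </configuration>
    </plugin>
  </plugins>
</build>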

Spider


package peerslee.Onlylady;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Spider {

    private static HttpClient client = HttpClients.createDefault();
    private static MongoUtil util = new MongoUtil();

    Pattern pattern = null;
    Matcher matcher = null;
    Integer Max_Value = 500; // crawl at most 500 comment pages per topic; keep a topic under ~3000 comments, or the single-document insert overflows the size limit

    // Utility: fetch a URL and return a jsoup Document
    Document get_doc(String url) {
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(5000)            // connect timeout, in milliseconds
                .setConnectionRequestTimeout(5000)  // timeout for getting a connection from the connection manager, in milliseconds
                .setSocketTimeout(5000)             // socket (data transfer) timeout, in milliseconds
                .build();
        HttpGet get = new HttpGet(url);
        get.setConfig(requestConfig);
        try {
            HttpResponse response = client.execute(get);
            HttpEntity entity = response.getEntity();
            String html = EntityUtils.toString(entity);
            return Jsoup.parse(html);
        } catch (Exception e) {
            System.out.println("------ timeout ------");
            return get_doc(url); // retry
        }
    }

    // Comments
    List<Map<String, String>> crawl_comment(String url) {
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        Document doc = get_doc(url);
        Elements elements = doc.select("#postlist > div");
        for (int i = 0; i < elements.size() - 1; i++) { // the last div is not a comment block
            Map<String, String> map = new HashMap<String, String>();
            Element element = elements.get(i);

            // comment id
            String comment_id = element.attr("id");
            pattern = Pattern.compile("\\d+");
            matcher = pattern.matcher(comment_id);
            matcher.find();
            comment_id = matcher.group(0);
            map.put("comment_id", comment_id);

            // comment author
            String comment_author = element.select("div.authi > a.xw1").text();
            map.put("comment_author", comment_author);

            // comment content
            String comment_content = element.select("td.t_f").text();
            map.put("comment_content", comment_content);

            // comment time; "发表于" means "posted at" and is stripped from the label
            String comment_time = doc.select("#authorposton" + comment_id).text().replace("发表于", "").trim();
            map.put("comment_time", comment_time);

            list.add(map);
            //System.out.println(comment_id + " " + comment_author + " " + comment_content + " " + comment_time);
        }
        System.out.println(list);
        return list;
    }

    // Topic
    void crawl_info(String url) {
        Document doc = get_doc(url);

        // info
        Map<String, Object> info_map = new HashMap<String, Object>();

        // topic title
        String title = doc.select("div.ts > a > h1").text();
        info_map.put("title", title);

        // topic id
        pattern = Pattern.compile("thread-(\\d+)-\\d+-\\d+");
        matcher = pattern.matcher(url);
        matcher.find();
        String topic_id = matcher.group(1);
        info_map.put("topic_id", topic_id);

        // author
        String author = doc.select("#threadstamp + table div.authi > a:eq(0)").text();
        info_map.put("author", author);

        // view count
        String look = doc.select("div.hm > span:eq(1)").text();
        info_map.put("look", look);

        // reply count
        String reply = doc.select("div.hm > span:eq(4)").text();
        info_map.put("reply", reply);

        // post time
        String first_div_id = doc.select("#postlist > div").attr("id");
        pattern = Pattern.compile("\\d+");
        matcher = pattern.matcher(first_div_id);
        matcher.find();
        first_div_id = matcher.group(0);
        String time = doc.select("#authorposton" + first_div_id).text().trim().replace("发表于", "").trim();
        info_map.put("time", time);

        // crawl timestamp
        SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String date = df.format(new Date());
        info_map.put("date", date);

        // comments on the first page
        List<Map<String, String>> comment_list = new ArrayList<Map<String, String>>();
        Elements elements = doc.select("#postlist > div");
        for (int i = 1; i < elements.size() - 1; i++) {
            Map<String, String> map = new HashMap<String, String>();
            Element element = elements.get(i);

            // comment id
            String comment_id = element.attr("id");
            pattern = Pattern.compile("\\d+");
            matcher = pattern.matcher(comment_id);
            matcher.find();
            comment_id = matcher.group(0);
            map.put("comment_id", comment_id);

            // comment author
            String comment_author = element.select("div.authi > a.xw1").text();
            map.put("comment_author", comment_author);

            // comment content
            String comment_content = element.select("td.t_f").text();
            map.put("comment_content", comment_content);

            // comment time
            String comment_time = element.select("#authorposton" + comment_id).text().trim().replace("发表于", "").trim();
            map.put("comment_time", comment_time);

            comment_list.add(map);
            //System.out.println(comment_id + " " + comment_author + " " + comment_content + " " + comment_time);
        }
        System.out.println(comment_list);

        // comment pagination
        String str;
        try {
            str = doc.select("a.last").first().text(); // 1. more than 10 pages
        } catch (java.lang.NullPointerException e) {
            Elements es = doc.select("div.pg > a");
            if (es.isEmpty()) {
                str = "1"; // 3. only one page of comments
            } else {
                str = es.get(es.size() - 2).text(); // 2. fewer than 10 pages
            }
        }
        pattern = Pattern.compile("\\d+");
        matcher = pattern.matcher(str);
        matcher.find();
        Integer total = Integer.parseInt(matcher.group(0)); // total number of pages
        System.out.println("Topic " + topic_id + ": " + total + " comment pages...");
        for (int i = 2; i <= (total < Max_Value ? total : Max_Value); i++) {
            String comment_url = "http://bbs.onlylady.com/thread-" + topic_id + "-" + i + "-1.html";
            System.out.println("crawl comment page " + i + "...");
            List<Map<String, String>> comment_per_list = crawl_comment(comment_url);
            comment_list.addAll(comment_per_list); // append this page's comments
        }
        info_map.put("comment_list", comment_list);

        // topic content, with the "... 上传 下载附件 (...)" attachment stamps stripped out
        String post_content = doc.select("td.t_f").text();
        pattern = Pattern.compile("(\\d+-){2}\\d+\\s(\\d+:){2}\\d+\\s上传\\s下载附件 \\(.*?\\)");
        matcher = pattern.matcher(post_content);
        post_content = matcher.replaceAll("").trim();
        info_map.put("post_content", post_content);

        util.insertCol("test", info_map); // persist to MongoDB
        System.out.println("==============");
    }

    // Page
    void crawl_topic_url(String url) {
        List<String> topic_list = new ArrayList<String>();
        Document doc = get_doc(url);
        Elements links = doc.select("#moderate tbody > tr > td.icn > a"); // all topic links on this page
        for (Element e : links) {
            String link = e.attr("href");
            System.out.println(link);
            crawl_info(link);
        }
    }

    // Category
    void crawl_category(Integer type) {
        String url = "http://bbs.onlylady.com/forum-" + type + "-1.html";
        Document doc = get_doc(url);
        String str = doc.select("a.last").first().text();
        Matcher matcher = Pattern.compile("\\d+").matcher(str);
        matcher.find();
        Integer total = Integer.parseInt(matcher.group(0));
        System.out.println("Category " + type + ": " + total + " pages...");
        for (int i = 1; i <= total; i++) {
            String topic_url = "http://bbs.onlylady.com/forum-" + type + "-" + i + ".html";
            System.out.println("crawl page " + i + "...");
            crawl_topic_url(topic_url);
        }
    }

    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.crawl_category(86);
        //spider.crawl_info("http://bbs.onlylady.com/thread-4030747-1-1.html");
        //spider.crawl_info("http://bbs.onlylady.com/thread-4031594-1-1.html");
    }
}
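One caveat with get_doc above: on any exception it calls itself again, so a URL that keeps failing recurses without bound. A bounded-retry variant is sketched below; the extra retriesLeft parameter and the choice to return an empty Document after the last attempt are assumptions, not part of the original code.

// Bounded-retry sketch: give up after a few attempts instead of recursing indefinitely.
Document get_doc(String url, int retriesLeft) {
    RequestConfig requestConfig = RequestConfig.custom()
            .setConnectTimeout(5000)
            .setConnectionRequestTimeout(5000)
            .setSocketTimeout(5000)
            .build();
    HttpGet get = new HttpGet(url);
    get.setConfig(requestConfig);
    try {
        HttpResponse response = client.execute(get);
        return Jsoup.parse(EntityUtils.toString(response.getEntity()));
    } catch (Exception e) {
        if (retriesLeft <= 0) {
            // Out of retries: return an empty document so callers' selects simply match nothing.
            return new Document(url);
        }
        System.out.println("------ timeout, retrying ------");
        return get_doc(url, retriesLeft - 1);
    }
}

A call such as get_doc(url, 3) would then attempt the request at most four times before giving up.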

util


package peerslee.Onlylady;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Created by PeersLee on 2017/1/30.
 */
public class MongoUtil {

    private MongoClient client = null;
    private String dbName = null;

    public MongoUtil() {
        dbName = "Onlylady";
        this.client = new MongoClient("127.0.0.1", 27017);
    }

    // Insert a crawled topic as one MongoDB document
    public void insertCol(String colName, Map<String, Object> msg) {
        try {
            MongoDatabase db = client.getDatabase(dbName);
            MongoCollection<Document> col = db.getCollection(colName);
            Document doc = new Document();
            for (Map.Entry<String, Object> entry : msg.entrySet()) {
                doc.append(entry.getKey(), entry.getValue());
            }
            List<Document> docs = new ArrayList<Document>();
            docs.add(doc);
            col.insertMany(docs);
            System.out.println("Doc insert " + colName + " ok...");
        } catch (Exception e) {
            System.out.println(e.getClass().getName() + ": " + e.getMessage());
        }
    }
}
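The original comment on insertCol describes it as de-duplicating, but as written it simply appends a new document on every call. One way to get actual de-duplication is an upsert keyed on topic_id; a minimal sketch of such a method follows (the upsertCol name and the use of topic_id as the key are assumptions, not part of the original code).

// Requires com.mongodb.client.model.Filters and com.mongodb.client.model.UpdateOptions.
public void upsertCol(String colName, Map<String, Object> msg) {
    try {
        MongoDatabase db = client.getDatabase(dbName);
        MongoCollection<Document> col = db.getCollection(colName);
        Document doc = new Document(msg);
        // Replace the document with the same topic_id, or insert it if none exists,
        // so re-crawling the same topic does not create duplicates.
        col.replaceOne(Filters.eq("topic_id", msg.get("topic_id")), doc, new UpdateOptions().upsert(true));
        System.out.println("Doc upsert " + colName + " ok...");
    } catch (Exception e) {
        System.out.println(e.getClass().getName() + ": " + e.getMessage());
    }
}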


Proxy


// Requires org.apache.http.HttpHost, org.apache.http.impl.conn.DefaultProxyRoutePlanner and java.util.concurrent.TimeUnit.
HttpClient client = HttpClients.createDefault();

// Set up the proxy (ip and port are supplied elsewhere)
HttpHost proxy = new HttpHost(ip, port);
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);

// Rebuild the client with HttpClientBuilder so every request is routed through the proxy
client = HttpClients.custom()
        .setRoutePlanner(routePlanner)
        .setConnectionTimeToLive(2, TimeUnit.SECONDS)
        .build();
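A proxy can also be attached per request rather than per client. A minimal sketch, assuming a reachable proxy; the 127.0.0.1:8118 address below is only a placeholder:

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;

// Per-request proxy: set it on the RequestConfig of a single HttpGet
// instead of building a proxied client.
HttpHost proxy = new HttpHost("127.0.0.1", 8118);
RequestConfig config = RequestConfig.custom()
        .setProxy(proxy)
        .setConnectTimeout(5000)
        .setSocketTimeout(5000)
        .build();
HttpGet get = new HttpGet("http://bbs.onlylady.com/forum-86-1.html");
get.setConfig(config);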


HttpPost official documentation
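The spider above only issues GET requests; for form submissions the same HttpClient can send a POST. A minimal HttpPost sketch, where the URL and form field names are placeholders rather than anything from the original project:

import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class PostExample {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClients.createDefault();

        // Build a URL-encoded form body (placeholder endpoint and fields).
        HttpPost post = new HttpPost("http://example.com/login");
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("username", "user"));
        params.add(new BasicNameValuePair("password", "pass"));
        post.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

        // Execute the POST and print the response body.
        HttpResponse response = client.execute(post);
        System.out.println(EntityUtils.toString(response.getEntity()));
    }
}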