A Comprehensive Crawler Case Study
A complete demo that crawls articles from Huxiu (虎嗅网).
Create a Maven project
- First, add the dependencies (note that fastjson is declared here, though the demo code below only uses Gson):
<dependencies>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.3</version>
    </dependency>
    <!-- jsoup HTML parser library @ https://jsoup.org/ -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.10.3</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.springframework/spring-jdbc -->
    <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-jdbc</artifactId>
        <version>4.2.6.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.41</version>
    </dependency>
    <dependency>
        <groupId>c3p0</groupId>
        <artifactId>c3p0</artifactId>
        <version>0.9.1.2</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.31</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.1</version>
    </dependency>
</dependencies>
The demo stores its data in MySQL.
Create a database named spider:
create database spider;
Create the table with the following SQL:
CREATE TABLE `huxiu_article` (
  `id` varchar(250) DEFAULT NULL,
  `title` varchar(250) DEFAULT NULL,
  `author` varchar(250) DEFAULT NULL,
  `createTime` varchar(250) DEFAULT NULL,
  `zan` varchar(250) DEFAULT NULL,
  `pl` varchar(250) DEFAULT NULL,
  `sc` varchar(250) DEFAULT NULL,
  `content` blob,
  `url` varchar(250) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
The full code follows.
The entity class Article:
public class Article {
    private String id;
    private String url;
    private String title;
    private String author;
    private String createTime;
    private String pl;
    private String zan;
    private String sc;
    private String content;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    // ... remaining getters and setters omitted in the original
}
ArticleDao, which handles database access:
import org.springframework.jdbc.core.JdbcTemplate;

import com.mchange.v2.c3p0.ComboPooledDataSource;

public class ArticleDao extends JdbcTemplate {

    public ArticleDao() {
        // Create the C3P0 DataSource (it can be set up via a config file or in code; here, in code)
        ComboPooledDataSource dataSource = new ComboPooledDataSource();
        // Configure: 1. url  2. driver  3. username & password
        dataSource.setUser("root");
        dataSource.setPassword("root");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf-8");
        setDataSource(dataSource);
    }

    public void save(Article article) {
        String sql = "INSERT INTO `spider`.`huxiu_article` (`id`, `title`, `author`, `createTime`, `zan`, `pl`, `sc`, `content`, `url`) VALUES (?,?,?,?,?,?,?,?,?)";
        update(sql, article.getId(), article.getTitle(), article.getAuthor(),
                article.getCreateTime(), article.getZan(), article.getPl(),
                article.getSc(), article.getContent(), article.getUrl());
    }
}
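Before wiring the DAO into the crawler it is worth a quick connectivity check. Below is a minimal smoke-test sketch, assuming the spider database and huxiu_article table above already exist; the class name and test values are invented for illustration:

public class ArticleDaoSmokeTest {
    public static void main(String[] args) {
        ArticleDao dao = new ArticleDao();
        // Invented test row; every column is varchar/blob, so plain strings are fine
        Article article = new Article();
        article.setId("test-0001");
        article.setTitle("smoke test");
        article.setAuthor("nobody");
        article.setCreateTime("2017-01-01");
        article.setZan("0");
        article.setPl("0");
        article.setSc("0");
        article.setContent("hello");
        article.setUrl("http://www.huxiu.com/article/test.html");
        dao.save(article);
        // JdbcTemplate's queryForObject confirms the row landed
        int count = dao.queryForObject(
                "SELECT COUNT(*) FROM huxiu_article WHERE id = ?", Integer.class, "test-0001");
        System.out.println("rows inserted: " + count);
    }
}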
HuxiuPagingResponse, the entity that models the paging API's JSON response:
public class HuxiuPagingResponse {
    private String data;
    private String last_dateline;
    private String msg;
    private String result;
    private String total_page;

    public String getData() {
        return data;
    }

    public void setData(String data) {
        this.data = data;
    }

    // ... remaining getters and setters omitted in the original
}
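Gson maps JSON keys to fields by name, which is why the fields mirror the API's snake_case keys (last_dateline, total_page). A minimal sketch of the round-trip, using an invented sample payload shaped like the paging response:

import com.google.gson.Gson;

public class PagingResponseDemo {
    public static void main(String[] args) {
        // Invented sample; the real endpoint's data field carries an HTML fragment
        String json = "{\"result\":\"1\",\"msg\":\"\",\"total_page\":\"1615\","
                + "\"last_dateline\":\"1500000000\","
                + "\"data\":\"<div data-aid=\\\"208001\\\"></div>\"}";
        HuxiuPagingResponse resp = new Gson().fromJson(json, HuxiuPagingResponse.class);
        System.out.println(resp.getLast_dateline()); // 1500000000
        System.out.println(resp.getData());          // the HTML fragment handed to Jsoup
    }
}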
The program's entry point, HuXiuSpider:
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.gson.Gson;

public class HuXiuSpider {

    // DAO used to persist the crawled articles
    public static ArticleDao articleDao = new ArticleDao();

    // dateLine drives the paging requests
    private static String dateLine = null;

    // Fixed-size thread pool shared by the download, parse and store workers
    private static ExecutorService threadPool = Executors.newFixedThreadPool(30);

    // Queue of article ids (aid) parsed from the index page and the paging API
    public static ArrayBlockingQueue<String> urlQueue = new ArrayBlockingQueue<String>(1000);

    // Queue of raw HTML documents, one per article
    public static ArrayBlockingQueue<String> articleHtmlQueue = new ArrayBlockingQueue<String>(1000);

    // Queue of parsed Article objects
    public static ArrayBlockingQueue<Article> articleContentQueue = new ArrayBlockingQueue<Article>(1000);

    public static void main(String[] args) throws Exception {
        // Submit the workers that fetch each article url over the network
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ProcessSinglePageRunnable());
        }
        // Workers that parse the downloaded pages
        for (int i = 0; i < 10; i++) {
            threadPool.execute(new ParseHtmlRunnable());
        }
        // Worker that saves the parsed articles
        threadPool.execute(new SaveDBRunnable());
        // Collect the article urls from the index page
        getIndexArticleUrlList();
        // Walk through the paging API
        processPaging();
    }

    /**
     * Collects the article list from the index page.
     */
    private static void getIndexArticleUrlList() throws IOException, ClientProtocolException {
        // 1. The index url: http://www.huxiu.com
        String indexUrl = "http://www.huxiu.com";
        // 2. Issue an HTTP GET request
        HttpGet indexHttpGet = new HttpGet(indexUrl);
        // Set the User-Agent header
        indexHttpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        String html = getHtmlByRequest(indexHttpGet);
        // 5. Parse with Jsoup to get the article list and each article's aid
        Document indexDocument = Jsoup.parse(html);
        // Grab the date_line used by the paging API
        Elements dateLines = indexDocument.select("[data-last_dateline]");
        dateLine = dateLines.get(0).attr("data-last_dateline");
        // 5.1 Select the divs carrying the data-aid attribute
        Elements aidElements = indexDocument.select("div[data-aid]");
        // 5.2 Extract each article's aid in turn
        for (Element element : aidElements) {
            String aid = element.attr("data-aid");
            try {
                urlQueue.put(aid);
            } catch (InterruptedException e) {
                System.out.println("Failed to put aid into urlQueue: " + e);
            }
        }
    }

    private static void processPaging() {
        for (int page = 2; page <= 1615; page++) {
            try {
                // Build the paging request
                String pagingUrl = "https://www.huxiu.com/v2_action/article_list";
                HttpPost httpPost = new HttpPost(pagingUrl);
                // Set the form parameters
                ArrayList<NameValuePair> arrayList = new ArrayList<NameValuePair>();
                arrayList.add(new BasicNameValuePair("huxiu_hash_code", "fb7f7403c58c3e8cb45aa47afc204c10"));
                arrayList.add(new BasicNameValuePair("page", page + ""));
                arrayList.add(new BasicNameValuePair("last_dateline", dateLine));
                httpPost.setEntity(new UrlEncodedFormEntity(arrayList));
                // Execute the request
                String jsonText = getHtmlByRequest(httpPost);
                // Deserialize the JSON string into an object
                Gson gson = new Gson();
                HuxiuPagingResponse huxiuPagingResponse = gson.fromJson(jsonText, HuxiuPagingResponse.class);
                // Every response carries the dateLine for the next request
                dateLine = huxiuPagingResponse.getLast_dateline();
                // The data field holds an HTML fragment
                String htmlData = huxiuPagingResponse.getData();
                Document doc = Jsoup.parse(htmlData);
                // Select the divs carrying the data-aid attribute
                Elements aidElements = doc.select("div[data-aid]");
                // Extract each article's aid in turn
                for (Element element : aidElements) {
                    String aid = element.attr("data-aid");
                    urlQueue.put(aid);
                }
            } catch (Exception e) {
                // should go to a proper logger
                System.out.println(page);
                System.out.println(e);
            }
            try {
                Thread.sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Fetches an HTML document.
     */
    public static String getHtml(String aidUrl) throws IOException, ClientProtocolException {
        // 2. Issue an HTTP GET request
        HttpGet indexHttpGet = new HttpGet(aidUrl);
        return getHtmlByRequest(indexHttpGet);
    }

    private static String getHtmlByRequest(HttpRequestBase request) throws IOException, ClientProtocolException {
        // Set the User-Agent request header
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        // 3. Execute the request with HttpClient to obtain an entity
        CloseableHttpClient indexHttpClient = HttpClients.createDefault();
        CloseableHttpResponse indexResponse = indexHttpClient.execute(request);
        String html = null;
        if (200 == indexResponse.getStatusLine().getStatusCode()) {
            HttpEntity indexEntity = indexResponse.getEntity();
            // 4. Convert the entity into a string (the html)
            html = EntityUtils.toString(indexEntity, Charset.forName("utf-8"));
        }
        return html;
    }
}
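One caveat in getHtmlByRequest: it builds a new CloseableHttpClient per call and never closes the client or the response, which can exhaust sockets under 30 worker threads. Below is a leak-free sketch sharing one client via try-with-resources (same HttpClient 4.5 API; the HttpHelper class and pool sizes are illustrative, not part of the original):

import java.io.IOException;
import java.nio.charset.Charset;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpHelper {

    // One shared, thread-safe client instead of one per request
    private static final CloseableHttpClient CLIENT = HttpClients.custom()
            .setMaxConnTotal(30)       // illustrative sizing, matching the thread pool
            .setMaxConnPerRoute(30)
            .build();

    public static String fetch(HttpRequestBase request) throws IOException {
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        // try-with-resources closes the response and releases the connection
        try (CloseableHttpResponse response = CLIENT.execute(request)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                return EntityUtils.toString(response.getEntity(), Charset.forName("utf-8"));
            }
            return null;
        }
    }
}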
ProcessSinglePageRunnable, which performs the network request for each article url:
public class ProcessSinglePageRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                processSingleUrl();
                Thread.sleep(3000);
            } catch (InterruptedException e) {
            }
        }
    }

    private void processSingleUrl() throws InterruptedException {
        String aid = HuXiuSpider.urlQueue.take();
        String aidUrl = "http://www.huxiu.com/article/" + aid + ".html";
        try {
            // Fetch the html of this single article page
            String aidHtml = HuXiuSpider.getHtml(aidUrl);
            HuXiuSpider.articleHtmlQueue.put(aidHtml);
        } catch (Exception e) {
            System.out.println(aidUrl);
            System.out.println(e);
        }
    }
}
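Note that run() swallows InterruptedException inside an endless loop, so these workers can never be stopped cleanly (threadPool.shutdownNow() would have no effect). A sketch of an interruption-friendly variant; only run() differs from the original:

public class ProcessSinglePageRunnable implements Runnable {

    public void run() {
        // Exit when the thread is interrupted, e.g. by threadPool.shutdownNow()
        while (!Thread.currentThread().isInterrupted()) {
            try {
                processSingleUrl();
                Thread.sleep(3000); // stay polite: one request per worker every 3s
            } catch (InterruptedException e) {
                // Restore the flag so the loop condition sees it and the loop ends
                Thread.currentThread().interrupt();
            }
        }
    }

    private void processSingleUrl() throws InterruptedException {
        // unchanged from the original above
    }
}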
ParseHtmlRunnable, which parses each article page:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class ParseHtmlRunnable implements Runnable {

    public void run() {
        while (true) {
            String html = null;
            try {
                html = HuXiuSpider.articleHtmlQueue.take();
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
            Article article = new Article();
            Document detailDocument = Jsoup.parse(html);
            // Parse the data-aid attribute out of the div
            Elements aidElements = detailDocument.select("div[data-aid]");
            String aid = aidElements.get(0).attr("data-aid");
            article.setId(aid);
            System.out.println(aid + ".........");
            // Parse the article title
            Elements titles = detailDocument.select(".t-h1");
            String title = titles.get(0).text();
            article.setTitle(title);
            // Parse the author (author-name)
            Elements names = detailDocument.select(".author-name");
            String name = names.get(0).text();
            article.setAuthor(name);
            // Parse the publish time
            Elements dates = detailDocument.select("[class^=article-time]");
            String date = dates.get(0).text();
            article.setCreateTime(date);
            // Parse the bookmark count
            Elements shares = detailDocument.select("[class^=article-share]");
            String share = shares.get(0).text();
            article.setSc(share);
            // Parse the comment count
            Elements pls = detailDocument.select("[class^=article-pl]");
            String pl = pls.get(0).text();
            article.setPl(pl);
            // Parse the like count (num)
            Elements nums = detailDocument.select(".num");
            String num = nums.get(0).text();
            article.setZan(num);
            // Parse the article body (article-content-wrap)
            Elements content = detailDocument.select(".article-content-wrap p");
            String contentText = content.text();
            article.setContent(contentText);
            // article.setUrl(aidUrl);
            try {
                HuXiuSpider.articleContentQueue.put(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
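Every selector above indexes get(0) unchecked, so a page missing any one element (a deleted or restructured article, say) throws IndexOutOfBoundsException and the article is lost. One way to hedge is a small defensive helper; SelectorUtil and firstText are hypothetical additions, not part of the original:

import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

// Hypothetical helper: returns an empty string instead of throwing when a selector misses
final class SelectorUtil {
    static String firstText(Document doc, String cssQuery) {
        Elements els = doc.select(cssQuery);
        return els.isEmpty() ? "" : els.first().text();
    }
}

With it, the title line becomes article.setTitle(SelectorUtil.firstText(detailDocument, ".t-h1")), and likewise for the other fields.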
SaveDBRunnable, which saves the data:
public class SaveDBRunnable implements Runnable {

    public void run() {
        while (true) {
            try {
                Article article = HuXiuSpider.articleContentQueue.take();
                HuXiuSpider.articleDao.save(article);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
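Like the other workers, this loop runs forever, so the JVM only stops when killed. If you want the pipeline to drain and exit once paging finishes, a common pattern is a poison-pill sentinel; the POISON constant below and the producer-side put are hypothetical additions, not part of the original:

public class SaveDBRunnable implements Runnable {

    // Hypothetical sentinel; the producer puts it on the queue after the last page
    public static final Article POISON = new Article();

    public void run() {
        while (true) {
            try {
                Article article = HuXiuSpider.articleContentQueue.take();
                if (article == POISON) {
                    break; // producer signalled there is nothing left to save
                }
                HuXiuSpider.articleDao.save(article);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }
    }
}

After processPaging() returns, main would put SaveDBRunnable.POISON on articleContentQueue once the upstream queues have drained.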