c#采集文章
来源:互联网 发布:后盾网ci框架源码 编辑:程序博客网 时间:2024/05/21 13:58
/*
*BLOGUSER 博主的名字
* http://blog.csdn.net
*/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;
namespace Collector
{
/// <summary>
/// Main form of the collector. Gathers the URLs of all articles of one blog user,
/// downloads every article and appends its URL, title, post date and content to a
/// text file named after the user.
/// </summary>
public partial class Form1 : Form
{
    private const string BLOGUSER = "jyb"; // blog user name
    private const string BLOGURL = "http://92jyb.com";
    private const string PAGECOUNTPATH = "//div[@id='papelist']/span[1]"; // XPath of the total page count
    private const string ARTICLEURLPATH = "//span[@class='link_title']/a[1]"; // XPath of an article link
    private const string ARTICLETITLEPATH = "//div[@class='article_title']/h3/span/a"; // XPath of the article title
    private const string POSTDATEPATH = "//span[@class='link_postdate']"; // XPath of the article post date
    private const string ARTICLECONTENTPATH = "//div[@id='article_content']"; // XPath of the article content
    private readonly List<string> articleUrlList = new List<string>(); // URLs of all articles
    private readonly object moniter = new object(); // guards appends to the output file
    private readonly Stopwatch stopwatch = new Stopwatch();
    private int cnt = 0; // number of articles whose collection has started

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Collects a single article: downloads the page, extracts title, post date and
    /// content, and appends them to the output file.
    /// </summary>
    /// <param name="state">The article URL as a string, passed in as the task state.</param>
    private void CollectArticle(object state)
    {
        // Use the value returned by Increment. Re-reading cnt afterwards could also
        // see increments made by other tasks and report the wrong ordinal.
        int current = Interlocked.Increment(ref cnt);
        SetStatuText(string.Format("总共{0}篇文章, 正在采集中第{1}篇.", articleUrlList.Count, current));

        string articleUrl = (string)state;
        string articleHtml = GetHtmlSource(articleUrl, Encoding.UTF8);
        if (string.IsNullOrEmpty(articleHtml))
            return;

        HtmlNode rootNode = GetHtmlNodeByHtml(articleHtml);
        string articleTitle = GetNodeInnerText(rootNode, ARTICLETITLEPATH);
        string postDate = GetNodeInnerText(rootNode, POSTDATEPATH);
        string articleContent = GetNodeInnerText(rootNode, ARTICLECONTENTPATH);

        // Result handling: save to a database or elsewhere...
        string blogFile = BLOGUSER + ".txt";

        // Only the shared output file needs mutual exclusion. Downloading and parsing
        // stay outside the lock so that the tasks can actually run in parallel.
        lock (moniter)
        {
            using (StreamWriter sw = new StreamWriter(blogFile, true))
            {
                sw.WriteLine(articleUrl);
                sw.WriteLine(articleTitle);
                sw.WriteLine(postDate);
                sw.WriteLine(articleContent);
            }
        }
    }

    /// <summary>
    /// Stops the timer and shows the elapsed time once collection has finished.
    /// </summary>
    /// <param name="task">The completed collection tasks (not inspected).</param>
    private void TaskEnded(Task[] task)
    {
        stopwatch.Stop();
        TimeSpan elapsed = stopwatch.Elapsed;
        // TotalMinutes rather than Minutes: the Minutes component wraps to 0 after one hour.
        SetStatuText("采集结束,耗时 " + (int)elapsed.TotalMinutes + "分" + elapsed.Seconds + "秒");
    }

    /// <summary>
    /// Reads the total number of list pages from the pager text, i.e. the number
    /// between the last "共" and the last "页".
    /// </summary>
    /// <param name="pageCountUrl">URL of the page that contains the pager.</param>
    /// <returns>The page count, or 0 when the page or a parsable pager is unavailable.</returns>
    private int GetPageCount(string pageCountUrl)
    {
        HtmlNode rootNode = GetHtmlNodeByUrl(pageCountUrl, Encoding.UTF8);
        if (rootNode == null)
            return 0;

        // GetNodeInnerText returns null when the pager node does not exist.
        string pageCountText = GetNodeInnerText(rootNode, PAGECOUNTPATH);
        if (string.IsNullOrEmpty(pageCountText))
            return 0;

        int firstIndex = pageCountText.LastIndexOf("共", StringComparison.Ordinal) + 1;
        int lastIndex = pageCountText.LastIndexOf("页", StringComparison.Ordinal);
        // firstIndex == 0 means "共" was not found; lastIndex < firstIndex means "页"
        // is missing or precedes "共". Either way there is nothing to parse.
        if (firstIndex == 0 || lastIndex < firstIndex)
            return 0;

        int pageCount;
        string digits = pageCountText.Substring(firstIndex, lastIndex - firstIndex);
        return int.TryParse(digits, out pageCount) ? pageCount : 0;
    }

    /// <summary>
    /// Downloads a page and returns the root node of the parsed document.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <param name="encoding">Encoding used to decode the response.</param>
    /// <returns>The document root, or null when the download produced no content.</returns>
    private HtmlNode GetHtmlNodeByUrl(string url, Encoding encoding)
    {
        string html = GetHtmlSource(url, encoding);
        if (string.IsNullOrEmpty(html))
            return null;
        return GetHtmlNodeByHtml(html);
    }

    /// <summary>
    /// Parses HTML source text and returns the root node of the document.
    /// </summary>
    /// <param name="htmlSource">The HTML source text.</param>
    private HtmlNode GetHtmlNodeByHtml(string htmlSource)
    {
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(htmlSource);
        return document.DocumentNode;
    }

    /// <summary>
    /// Downloads the source text of a web page.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <param name="encoding">Encoding used to decode the response stream.</param>
    /// <returns>The page source, or an empty string when the request fails.</returns>
    private string GetHtmlSource(string url, Encoding encoding)
    {
        try
        {
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best effort: callers treat "" as "page unavailable". Record the reason
            // instead of discarding it silently.
            Trace.WriteLine(string.Format("GetHtmlSource failed for {0}: {1}", url, ex.Message));
            return "";
        }
    }

    /// <summary>
    /// Returns the inner text of the first node matching an XPath expression.
    /// </summary>
    /// <param name="srcNode">Node the expression is evaluated against.</param>
    /// <param name="path">XPath expression.</param>
    /// <returns>The inner text, or null when no node matches.</returns>
    private string GetNodeInnerText(HtmlNode srcNode, string path)
    {
        HtmlNode match = srcNode.SelectSingleNode(path);
        return match == null ? null : match.InnerText;
    }

    /// <summary>
    /// Shows a status message in the status label via SafeCall.
    /// </summary>
    /// <param name="s">Text to display.</param>
    private void SetStatuText(string s)
    {
        this.SafeCall(() =>
        {
            lblStatusInfo.Text = s;
        });
    }

    /// <summary>
    /// "Start collecting" button handler: gathers all article URLs in a background
    /// task, then starts one task per article and reports when all have finished.
    /// </summary>
    // NOTE(review): nothing prevents a second click while a run is still in progress;
    // consider disabling the button for the duration of a run.
    private void button1_Click(object sender, EventArgs e)
    {
        stopwatch.Restart();
        Task.Factory.StartNew(() =>
        {
            cnt = 0;
            // Start from an empty list so a repeated run does not collect the
            // previous run's articles a second time.
            articleUrlList.Clear();

            int pageCount = GetPageCount(BLOGURL + "/" + BLOGUSER);
            if (pageCount == 0)
                return;

            // Gather the URLs of all articles, one list page at a time.
            for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
            {
                string pageIndexUrl = BLOGURL + "/" + BLOGUSER + "/article/list/" + pageIndex.ToString();
                HtmlNode rootNode = GetHtmlNodeByUrl(pageIndexUrl, Encoding.UTF8);
                if (rootNode == null)
                    continue;

                // SelectNodes returns null, not an empty collection, when nothing matches.
                HtmlNodeCollection articleLinks = rootNode.SelectNodes(ARTICLEURLPATH);
                if (articleLinks == null)
                    continue;

                foreach (HtmlNode articleUrlNode in articleLinks)
                {
                    HtmlAgilityPack.HtmlAttribute href = articleUrlNode.Attributes["href"];
                    if (href == null || string.IsNullOrEmpty(href.Value))
                        continue;
                    articleUrlList.Add(BLOGURL + href.Value);
                }
            }
        }).ContinueWith((x) =>
        {
            // Reading Exception marks a fault of the gathering task as observed.
            if (x.IsFaulted)
                Trace.WriteLine(x.Exception);

            // ContinueWhenAll rejects an empty task array, so finish right away
            // when there is nothing to collect.
            if (articleUrlList.Count == 0)
            {
                TaskEnded(new Task[0]);
                return;
            }

            Task[] tasks = new Task[articleUrlList.Count];
            for (int i = 0; i < articleUrlList.Count; i++)
            {
                tasks[i] = Task.Factory.StartNew(CollectArticle, articleUrlList[i]);
            }
            Task.Factory.ContinueWhenAll(tasks, TaskEnded);
        });
    }
}
/// <summary>
/// Extension helpers for WinForms controls.
/// </summary>
public static class Extenstions
{
    /// <summary>
    /// Runs <paramref name="callback"/> directly when the control reports that no
    /// invoke is required; otherwise hands it to the control's Invoke method.
    /// </summary>
    /// <param name="ctrl">Control whose InvokeRequired flag decides how the callback runs.</param>
    /// <param name="callback">Action to execute.</param>
    public static void SafeCall(this Control ctrl, Action callback)
    {
        if (!ctrl.InvokeRequired)
        {
            callback();
            return;
        }

        ctrl.Invoke(callback);
    }
}
}
/*
*BLOGUSER 博主的名字
* http://blog.csdn.net
*/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;
namespace Collector
{
/// <summary>
/// Main form of the collector. Gathers the URLs of all articles of one blog user,
/// downloads every article and appends its URL, title, post date and content to a
/// text file named after the user.
/// </summary>
public partial class Form1 : Form
{
    private const string BLOGUSER = "jyb"; // blog user name
    private const string BLOGURL = "http://92jyb.com";
    private const string PAGECOUNTPATH = "//div[@id='papelist']/span[1]"; // XPath of the total page count
    private const string ARTICLEURLPATH = "//span[@class='link_title']/a[1]"; // XPath of an article link
    private const string ARTICLETITLEPATH = "//div[@class='article_title']/h3/span/a"; // XPath of the article title
    private const string POSTDATEPATH = "//span[@class='link_postdate']"; // XPath of the article post date
    private const string ARTICLECONTENTPATH = "//div[@id='article_content']"; // XPath of the article content
    private readonly List<string> articleUrlList = new List<string>(); // URLs of all articles
    private readonly object moniter = new object(); // guards appends to the output file
    private readonly Stopwatch stopwatch = new Stopwatch();
    private int cnt = 0; // number of articles whose collection has started

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Collects a single article: downloads the page, extracts title, post date and
    /// content, and appends them to the output file.
    /// </summary>
    /// <param name="state">The article URL as a string, passed in as the task state.</param>
    private void CollectArticle(object state)
    {
        // Use the value returned by Increment. Re-reading cnt afterwards could also
        // see increments made by other tasks and report the wrong ordinal.
        int current = Interlocked.Increment(ref cnt);
        SetStatuText(string.Format("总共{0}篇文章, 正在采集中第{1}篇.", articleUrlList.Count, current));

        string articleUrl = (string)state;
        string articleHtml = GetHtmlSource(articleUrl, Encoding.UTF8);
        if (string.IsNullOrEmpty(articleHtml))
            return;

        HtmlNode rootNode = GetHtmlNodeByHtml(articleHtml);
        string articleTitle = GetNodeInnerText(rootNode, ARTICLETITLEPATH);
        string postDate = GetNodeInnerText(rootNode, POSTDATEPATH);
        string articleContent = GetNodeInnerText(rootNode, ARTICLECONTENTPATH);

        // Result handling: save to a database or elsewhere...
        string blogFile = BLOGUSER + ".txt";

        // Only the shared output file needs mutual exclusion. Downloading and parsing
        // stay outside the lock so that the tasks can actually run in parallel.
        lock (moniter)
        {
            using (StreamWriter sw = new StreamWriter(blogFile, true))
            {
                sw.WriteLine(articleUrl);
                sw.WriteLine(articleTitle);
                sw.WriteLine(postDate);
                sw.WriteLine(articleContent);
            }
        }
    }

    /// <summary>
    /// Stops the timer and shows the elapsed time once collection has finished.
    /// </summary>
    /// <param name="task">The completed collection tasks (not inspected).</param>
    private void TaskEnded(Task[] task)
    {
        stopwatch.Stop();
        TimeSpan elapsed = stopwatch.Elapsed;
        // TotalMinutes rather than Minutes: the Minutes component wraps to 0 after one hour.
        SetStatuText("采集结束,耗时 " + (int)elapsed.TotalMinutes + "分" + elapsed.Seconds + "秒");
    }

    /// <summary>
    /// Reads the total number of list pages from the pager text, i.e. the number
    /// between the last "共" and the last "页".
    /// </summary>
    /// <param name="pageCountUrl">URL of the page that contains the pager.</param>
    /// <returns>The page count, or 0 when the page or a parsable pager is unavailable.</returns>
    private int GetPageCount(string pageCountUrl)
    {
        HtmlNode rootNode = GetHtmlNodeByUrl(pageCountUrl, Encoding.UTF8);
        if (rootNode == null)
            return 0;

        // GetNodeInnerText returns null when the pager node does not exist.
        string pageCountText = GetNodeInnerText(rootNode, PAGECOUNTPATH);
        if (string.IsNullOrEmpty(pageCountText))
            return 0;

        int firstIndex = pageCountText.LastIndexOf("共", StringComparison.Ordinal) + 1;
        int lastIndex = pageCountText.LastIndexOf("页", StringComparison.Ordinal);
        // firstIndex == 0 means "共" was not found; lastIndex < firstIndex means "页"
        // is missing or precedes "共". Either way there is nothing to parse.
        if (firstIndex == 0 || lastIndex < firstIndex)
            return 0;

        int pageCount;
        string digits = pageCountText.Substring(firstIndex, lastIndex - firstIndex);
        return int.TryParse(digits, out pageCount) ? pageCount : 0;
    }

    /// <summary>
    /// Downloads a page and returns the root node of the parsed document.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <param name="encoding">Encoding used to decode the response.</param>
    /// <returns>The document root, or null when the download produced no content.</returns>
    private HtmlNode GetHtmlNodeByUrl(string url, Encoding encoding)
    {
        string html = GetHtmlSource(url, encoding);
        if (string.IsNullOrEmpty(html))
            return null;
        return GetHtmlNodeByHtml(html);
    }

    /// <summary>
    /// Parses HTML source text and returns the root node of the document.
    /// </summary>
    /// <param name="htmlSource">The HTML source text.</param>
    private HtmlNode GetHtmlNodeByHtml(string htmlSource)
    {
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        document.LoadHtml(htmlSource);
        return document.DocumentNode;
    }

    /// <summary>
    /// Downloads the source text of a web page.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <param name="encoding">Encoding used to decode the response stream.</param>
    /// <returns>The page source, or an empty string when the request fails.</returns>
    private string GetHtmlSource(string url, Encoding encoding)
    {
        try
        {
            WebRequest request = WebRequest.Create(url);
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best effort: callers treat "" as "page unavailable". Record the reason
            // instead of discarding it silently.
            Trace.WriteLine(string.Format("GetHtmlSource failed for {0}: {1}", url, ex.Message));
            return "";
        }
    }

    /// <summary>
    /// Returns the inner text of the first node matching an XPath expression.
    /// </summary>
    /// <param name="srcNode">Node the expression is evaluated against.</param>
    /// <param name="path">XPath expression.</param>
    /// <returns>The inner text, or null when no node matches.</returns>
    private string GetNodeInnerText(HtmlNode srcNode, string path)
    {
        HtmlNode match = srcNode.SelectSingleNode(path);
        return match == null ? null : match.InnerText;
    }

    /// <summary>
    /// Shows a status message in the status label via SafeCall.
    /// </summary>
    /// <param name="s">Text to display.</param>
    private void SetStatuText(string s)
    {
        this.SafeCall(() =>
        {
            lblStatusInfo.Text = s;
        });
    }

    /// <summary>
    /// "Start collecting" button handler: gathers all article URLs in a background
    /// task, then starts one task per article and reports when all have finished.
    /// </summary>
    // NOTE(review): nothing prevents a second click while a run is still in progress;
    // consider disabling the button for the duration of a run.
    private void button1_Click(object sender, EventArgs e)
    {
        stopwatch.Restart();
        Task.Factory.StartNew(() =>
        {
            cnt = 0;
            // Start from an empty list so a repeated run does not collect the
            // previous run's articles a second time.
            articleUrlList.Clear();

            int pageCount = GetPageCount(BLOGURL + "/" + BLOGUSER);
            if (pageCount == 0)
                return;

            // Gather the URLs of all articles, one list page at a time.
            for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
            {
                string pageIndexUrl = BLOGURL + "/" + BLOGUSER + "/article/list/" + pageIndex.ToString();
                HtmlNode rootNode = GetHtmlNodeByUrl(pageIndexUrl, Encoding.UTF8);
                if (rootNode == null)
                    continue;

                // SelectNodes returns null, not an empty collection, when nothing matches.
                HtmlNodeCollection articleLinks = rootNode.SelectNodes(ARTICLEURLPATH);
                if (articleLinks == null)
                    continue;

                foreach (HtmlNode articleUrlNode in articleLinks)
                {
                    HtmlAgilityPack.HtmlAttribute href = articleUrlNode.Attributes["href"];
                    if (href == null || string.IsNullOrEmpty(href.Value))
                        continue;
                    articleUrlList.Add(BLOGURL + href.Value);
                }
            }
        }).ContinueWith((x) =>
        {
            // Reading Exception marks a fault of the gathering task as observed.
            if (x.IsFaulted)
                Trace.WriteLine(x.Exception);

            // ContinueWhenAll rejects an empty task array, so finish right away
            // when there is nothing to collect.
            if (articleUrlList.Count == 0)
            {
                TaskEnded(new Task[0]);
                return;
            }

            Task[] tasks = new Task[articleUrlList.Count];
            for (int i = 0; i < articleUrlList.Count; i++)
            {
                tasks[i] = Task.Factory.StartNew(CollectArticle, articleUrlList[i]);
            }
            Task.Factory.ContinueWhenAll(tasks, TaskEnded);
        });
    }
}
/// <summary>
/// Extension helpers for WinForms controls.
/// </summary>
public static class Extenstions
{
    /// <summary>
    /// Runs <paramref name="callback"/> directly when the control reports that no
    /// invoke is required; otherwise hands it to the control's Invoke method.
    /// </summary>
    /// <param name="ctrl">Control whose InvokeRequired flag decides how the callback runs.</param>
    /// <param name="callback">Action to execute.</param>
    public static void SafeCall(this Control ctrl, Action callback)
    {
        if (!ctrl.InvokeRequired)
        {
            callback();
            return;
        }

        ctrl.Invoke(callback);
    }
}
}
0 0
- C#文章采集浅析
- C#文章采集浅析。
- c#采集文章
- C#采集CSDN单个博客所有文章
- c#采集
- 天涯文章采集器
- PHPCMS v9 文章采集
- PHP-文章简单采集
- php文章采集的实现
- 采集卡的一些文章
- PHP正则过滤采集文章
- C# 采集代码
- C# 采集程序
- C#采集数据类
- C#采集数据类
- C#采集数据类
- C# 信息采集器
- C# 网页图片采集
- wcf与silverlight跨区域问题
- Linux 快速操作IO端口
- 设置android SurfaceView背景透明
- C/C++学习开始第一天
- 有符号数和无符号数负数
- c#采集文章
- HDU3058Nightmare Ⅱ( 双向广搜 )
- [转]Redis有序集内部实现原理分析(二)
- 程序员面试笔试宝典学习记录(二)(程序设计相关知识)
- 应该知道的Linux技巧
- linux服务器 修改 图片上传upload目录 权限
- 程序员面试笔试宝典学习记录(三)(数据库相关知识)
- iOS开发工具-网络封包分析工具Charles
- LeetCode 23 Merge k Sorted Lists 高空间消耗解法与堆的解法,还需要学习堆的做法