CSDN爬虫
来源:互联网 发布:自考 函授 电大 网络 编辑:程序博客网 时间:2024/05/14 17:06
仅做技术交流。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using DotNet.Utilities;
using System.Xml;
using System.Net;

namespace CSDNEpt
{
    /// <summary>
    /// Main form of the exporter. Given a blog home URL typed into <c>textBox1</c>, it reads the
    /// category panel, walks every category's list pages, downloads each article page and writes
    /// one XML file per article into the "article" folder next to the executable.
    /// Progress messages are pushed to the top of <c>listBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Number of articles assumed per category list page when computing the page count.</summary>
        private const int ArticlesPerPage = 20;

        public Form1()
        {
            InitializeComponent();
        }

        // Regular expressions used to cut the interesting pieces out of the downloaded HTML.
        // They are plain lookbehind/lookahead patterns, so each match value is the bare text.

        /// <summary>The whole category panel (<c>div#panel_Category</c>) of the blog home page.</summary>
        public const string Category = "<div\\s*id=\"panel_Category\"\\s*class=\"panel\">[\\w\\W]*?</div>";

        /// <summary>Numeric category id inside a category list item.</summary>
        public const string CategoryId = "(?<=<li(.*)\\s*<a(.*))\\d+(?=\"\\sonclick=(.*)\\s*</li>)";

        /// <summary>Display name of a category inside a category list item.</summary>
        public const string CategoryName = "(?<=<li(.*)\\s*<a(.*)\">)(.*)(?=</a><span>(.*)\\s*</li>)";

        /// <summary>Article count shown in parentheses after a category name (may match an empty string).</summary>
        public const string ArticleCount = "(?<=<li(.*)\\s*<a(.*)</a><span>\\()\\d*(?=\\)</span>\\s*</li>)";

        /// <summary>Article title inside <c>div.article_title</c>.</summary>
        public const string ArticleName = "(?<=<div\\s*class=\"article_title\">[\\w\\W]*\">\\s*)(.*)(?=\\s*</a></span>\\s*(.*)\\s*</div>)";

        /// <summary>Text of <c>span.link_postdate</c>.</summary>
        public const string PostDate = "(?<=<span\\s*class=\"link_postdate\">)(.*)(?=</span>)";

        /// <summary>Read counter taken from <c>span.link_view</c>.</summary>
        public const string ReadCount = "(?<=<span\\s*class=\"link_view\"\\s*title=\"阅读次数\">)(.*)(?=人阅读</span>)";

        /// <summary>Inner HTML of <c>div#article_content</c>, up to the first closing <c>div</c>.</summary>
        public const string ArticleContent = "(?<=<div\\s*id=\"article_content\"\\s*class=\"article_content\">)[\\w\\W]*?(?=</div>)";

        /// <summary>Numeric article id taken from the "details/NNN" link inside <c>span.link_title</c>.</summary>
        public const string ArticleId = "(?<=<span\\s*class=\"link_title\">(.*)details/)\\d*(?=\">[\\w\\W]*?</a></span>)";

        /// <summary>CSS class suffix of the <c>span.ico</c> marker; used to tell whether the article is original.</summary>
        public const string IsOriginal = "(?<=<span\\s*class=\"ico\\s*)(.*)(?=\"></span>)";

        /// <summary>
        /// Returns every match of <paramref name="regexStr"/> in <paramref name="matchStr"/>,
        /// in order of appearance, matching case-insensitively.
        /// </summary>
        /// <param name="regexStr">Regular expression pattern.</param>
        /// <param name="matchStr">Text to search; <c>null</c> is treated as "no matches".</param>
        /// <returns>The matched substrings; an empty list when nothing matches.</returns>
        public List<string> MatchStr(string regexStr, string matchStr)
        {
            List<string> lt = new List<string>();
            if (matchStr == null)
            {
                // A failed download may leave us without any HTML; report "nothing found" instead of throwing.
                return lt;
            }

            Regex reg = new Regex(regexStr, RegexOptions.IgnoreCase);
            Match m = reg.Match(matchStr);
            while (m.Success)
            {
                lt.Add(m.ToString());
                m = m.NextMatch();
            }
            return lt;
        }

        /// <summary>
        /// Click handler: exports every article of every category of the blog whose URL is in <c>textBox1</c>.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string homeUrl = textBox1.Text.Trim();
            if (homeUrl.Length == 0)
            {
                listBox1.Items.Insert(0, "请输入博客地址!");
                return;
            }

            // Strip trailing slashes once so that appended paths never contain "//".
            string baseUrl = homeUrl.TrimEnd('/');

            string outputDir = Path.Combine(Application.StartupPath, "article");
            if (!Directory.Exists(outputDir))
            {
                Directory.CreateDirectory(outputDir);
            }

            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem();
            item.URL = homeUrl;
            item.Referer = homeUrl;
            item.ProxyIp = "ieproxy";
            item.Encoding = Encoding.GetEncoding("utf-8");

            string html = http.GetHtml(item).Html;
            string categoryHtml = FirstMatchOrEmpty(Category, html);
            if (categoryHtml.Length == 0)
            {
                // Without the category panel there is nothing to iterate over.
                listBox1.Items.Insert(0, "未找到分类信息,请检查博客地址!");
                return;
            }

            List<string> categoryIds = MatchStr(CategoryId, categoryHtml);
            List<string> categoryNames = MatchStr(CategoryName, categoryHtml);
            List<string> articleCounts = MatchStr(ArticleCount, categoryHtml);

            // The three lists are produced by independent patterns; only walk the part all of them cover.
            int categoryCount = Math.Min(categoryIds.Count, Math.Min(categoryNames.Count, articleCounts.Count));

            // Loop over every category and fetch the articles that belong to it.
            for (int i = 0; i < categoryCount; i++)
            {
                listBox1.Items.Insert(0, "正在获取【" + categoryNames[i] + "】分类...");

                int count;
                if (!int.TryParse(articleCounts[i], out count) || count < 0)
                {
                    // The count pattern can match an empty string; treat that as an empty category.
                    count = 0;
                }
                int pages = (count + ArticlesPerPage - 1) / ArticlesPerPage;

                List<string> articleIds = FetchArticleIds(http, item, baseUrl, categoryIds[i], pages);
                for (int j = 0; j < articleIds.Count; j++)
                {
                    ExportArticle(http, item, baseUrl, articleIds[j], outputDir);
                    // Let the UI repaint between articles, as this runs on the UI thread.
                    Application.DoEvents();
                }

                listBox1.Items.Insert(0, "【" + categoryNames[i] + "】分类获取完毕!");
            }
        }

        /// <summary>Returns the first match of <paramref name="pattern"/> in <paramref name="input"/>, or an empty string.</summary>
        private string FirstMatchOrEmpty(string pattern, string input)
        {
            List<string> matches = MatchStr(pattern, input);
            return matches.Count > 0 ? matches[0] : string.Empty;
        }

        /// <summary>Downloads list pages 1..<paramref name="pages"/> of a category and collects the article ids found on them.</summary>
        private List<string> FetchArticleIds(HttpHelper http, HttpItem item, string baseUrl, string categoryId, int pages)
        {
            List<string> ids = new List<string>();
            for (int k = 1; k <= pages; k++)
            {
                item.URL = baseUrl + "/article/category/" + categoryId + "/" + k;
                string pageHtml = http.GetHtml(item).Html;
                ids.AddRange(MatchStr(ArticleId, pageHtml));
            }
            return ids;
        }

        /// <summary>Downloads one article page, extracts its fields and saves them as an XML file in <paramref name="outputDir"/>.</summary>
        private void ExportArticle(HttpHelper http, HttpItem item, string baseUrl, string articleId, string outputDir)
        {
            string articleUrl = baseUrl + "/article/details/" + articleId;
            item.URL = articleUrl;
            string articleHtml = http.GetHtml(item).Html;

            string articleName_txt = FirstMatchOrEmpty(ArticleName, articleHtml).Trim().Replace("\r", "");
            if (articleName_txt.Length == 0)
            {
                // The title doubles as the file name, so an article without one cannot be saved.
                listBox1.Items.Insert(0, "文章【" + articleId + "】解析失败,已跳过!");
                return;
            }

            string postDate_txt = FirstMatchOrEmpty(PostDate, articleHtml).Trim();
            string readCount_txt = FirstMatchOrEmpty(ReadCount, articleHtml).Trim();
            string articleContent_txt = FirstMatchOrEmpty(ArticleContent, articleHtml);
            string isOriginal_txt = FirstMatchOrEmpty(IsOriginal, articleHtml).Trim();

            listBox1.Items.Insert(0, "正在抓取【" + articleName_txt + "】文章...");

            // Build the XML document that stores the article.
            XmlDocument xml = new XmlDocument();
            XmlDeclaration xmldecl = xml.CreateXmlDeclaration("1.0", "gb2312", null);
            XmlElement root = xml.CreateElement("Article");

            AppendTextElement(xml, root, "Name", articleName_txt);
            AppendTextElement(xml, root, "URL", articleUrl);
            AppendTextElement(xml, root, "IsOriginal", isOriginal_txt == "ico_type_Original" ? "Y" : "N");
            AppendTextElement(xml, root, "PostDate", postDate_txt);
            AppendTextElement(xml, root, "ReadCount", readCount_txt);
            AppendTextElement(xml, root, "ArticleContent", articleContent_txt);

            xml.AppendChild(xmldecl);
            xml.AppendChild(root);

            // Titles may contain characters such as '/', ':' or '?' that are illegal in file names.
            xml.Save(Path.Combine(outputDir, ToSafeFileName(articleName_txt) + ".xml"));

            listBox1.Items.Insert(0, "【" + articleName_txt + "】文章抓取成功!");
        }

        /// <summary>Creates element <paramref name="name"/> with the given inner text and appends it to <paramref name="parent"/>.</summary>
        private static void AppendTextElement(XmlDocument xml, XmlElement parent, string name, string text)
        {
            XmlElement element = xml.CreateElement(name);
            element.InnerText = text;
            parent.AppendChild(element);
        }

        /// <summary>Replaces every character that is not allowed in a file name with an underscore.</summary>
        private static string ToSafeFileName(string name)
        {
            char[] invalid = Path.GetInvalidFileNameChars();
            StringBuilder safe = new StringBuilder(name.Length);
            foreach (char c in name)
            {
                safe.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
            }
            return safe.ToString();
        }
    }
}
0 0
- CSDN爬虫
- csdn站内搜索结果爬虫
- Python爬虫 - 登录csdn
- csdn博客爬虫更新
- Python爬虫实例- CSDN博客爬虫
- 鄙人自制csdn博客爬虫
- python爬虫CSDN文章抓取
- Python爬虫抓取csdn博客
- Python爬虫Csdn系列I
- Python爬虫Csdn系列II
- Python爬虫Csdn系列III
- Python网络爬虫模拟CSDN
- Csdn开源爬虫项目
- 通过爬虫迁移CSDN博客
- python爬虫爬取csdn
- CSDN爬虫(一)——爬虫入门+数据总览
- 超简单Nsoup版Csdn博客爬虫
- python爬虫之模拟登陆csdn
- iOS学习笔记(十五)——数据库操作(SQLite)
- java持久层框架分析
- C语言陷阱---数据溢出和类型转换
- 定制个性化App 引领企业跨入互联网时代
- Android之触摸手势检测GestureDetector使用详解
- CSDN爬虫
- HDU 4059解题报告
- iOS学习笔记(十四)——打电话、发短信
- C++: 类单例的宏实现(方便调用)
- 社説 20150226 ギリシャ改革案 実効性のある具体策が肝心だ
- Ubuntu下编译GCC
- 分数拆分(Fractions Again?!)
- Insertion Sort List
- iOS学习笔记(十三)——获取手机信息(UIDevice、NSBundle、NSLocale)