CSDN爬虫

来源:互联网 发布:自考 函授 电大 网络 编辑:程序博客网 时间:2024/05/14 17:06

仅做技术交流。


using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using DotNet.Utilities;
using System.Xml;
using System.Net;

namespace CSDNEpt
{
    /// <summary>
    /// Main form of the exporter. Starting from the blog URL typed into
    /// <c>textBox1</c>, it walks every article category, downloads each article
    /// page and stores the extracted fields as one XML file per article in an
    /// "article" folder under the application start-up path. Progress messages
    /// are inserted at the top of <c>listBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        // Regular expressions used to scrape the blog HTML. They are tied to the
        // page markup and are applied with RegexOptions.IgnoreCase.

        // The div with id "panel_Category" and class "panel", up to the first closing div.
        public const string Category = "<div\\s*id=\"panel_Category\"\\s*class=\"panel\">[\\w\\W]*?</div>";
        // Digits that directly precede the "onclick" attribute of a link inside a list item.
        public const string CategoryId = "(?<=<li(.*)\\s*<a(.*))\\d+(?=\"\\sonclick=(.*)\\s*</li>)";
        // Link text of a list item that is followed by a span.
        public const string CategoryName = "(?<=<li(.*)\\s*<a(.*)\">)(.*)(?=</a><span>(.*)\\s*</li>)";
        // Digits between the parentheses of the span that follows a list item's link (may be empty).
        public const string ArticleCount = "(?<=<li(.*)\\s*<a(.*)</a><span>\\()\\d*(?=\\)</span>\\s*</li>)";
        // Link text inside the div with class "article_title".
        public const string ArticleName = "(?<=<div\\s*class=\"article_title\">[\\w\\W]*\">\\s*)(.*)(?=\\s*</a></span>\\s*(.*)\\s*</div>)";
        // Content of the span with class "link_postdate".
        public const string PostDate = "(?<=<span\\s*class=\"link_postdate\">)(.*)(?=</span>)";
        // Text of the span with class "link_view" that precedes the "人阅读" suffix.
        public const string ReadCount = "(?<=<span\\s*class=\"link_view\"\\s*title=\"阅读次数\">)(.*)(?=人阅读</span>)";
        // Content of the div with id "article_content", up to the first closing div.
        public const string ArticleContent = "(?<=<div\\s*id=\"article_content\"\\s*class=\"article_content\">)[\\w\\W]*?(?=</div>)";
        // Digits following "details/" in the link inside the span with class "link_title" (may be empty).
        public const string ArticleId = "(?<=<span\\s*class=\"link_title\">(.*)details/)\\d*(?=\">[\\w\\W]*?</a></span>)";
        // Remainder of the class attribute of a span whose class starts with "ico" (whether the article is original).
        public const string IsOriginal = "(?<=<span\\s*class=\"ico\\s*)(.*)(?=\"></span>)";

        // Number of articles assumed per category listing page when computing the page count.
        private const int ArticlesPerPage = 20;

        /// <summary>
        /// Returns the text of every match of <paramref name="regexStr"/> in
        /// <paramref name="matchStr"/>, in order of occurrence.
        /// </summary>
        /// <param name="regexStr">Regular expression pattern; matched case-insensitively.</param>
        /// <param name="matchStr">Text to search.</param>
        /// <returns>A new list with the matched substrings; empty when nothing matches.</returns>
        public List<string> MatchStr(string regexStr, string matchStr)
        {
            List<string> results = new List<string>();
            Regex reg = new Regex(regexStr, RegexOptions.IgnoreCase);
            foreach (Match m in reg.Matches(matchStr))
            {
                results.Add(m.Value);
            }
            return results;
        }

        /// <summary>
        /// Returns the first match of <paramref name="regexStr"/> in
        /// <paramref name="input"/>, or an empty string when there is none.
        /// Stops at the first hit instead of enumerating every match.
        /// </summary>
        private static string FirstMatchOrEmpty(string regexStr, string input)
        {
            Match m = Regex.Match(input, regexStr, RegexOptions.IgnoreCase);
            return m.Success ? m.Value : string.Empty;
        }

        /// <summary>
        /// Points <paramref name="item"/> at <paramref name="url"/>, performs the
        /// request and returns the response HTML, never null.
        /// </summary>
        private static string FetchHtml(HttpHelper http, HttpItem item, string url)
        {
            item.URL = url;
            // NOTE(review): assumes Html may be null on a failed request - confirm against HttpHelper.
            return http.GetHtml(item).Html ?? string.Empty;
        }

        /// <summary>
        /// Replaces characters that are not allowed in file names with '_' and
        /// strips trailing dots/spaces. Falls back to <paramref name="fallback"/>
        /// when nothing usable is left.
        /// </summary>
        private static string ToSafeFileName(string name, string fallback)
        {
            char[] invalid = Path.GetInvalidFileNameChars();
            StringBuilder sb = new StringBuilder(name.Length);
            foreach (char c in name)
            {
                sb.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
            }
            string safe = sb.ToString().Trim().TrimEnd('.', ' ');
            return safe.Length == 0 ? fallback : safe;
        }

        /// <summary>
        /// Click handler that runs the export. The clicked control is disabled
        /// for the duration of the run because the crawl pumps messages with
        /// Application.DoEvents(), which would otherwise allow a second click
        /// to re-enter this handler while the first run is still in progress.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            Control trigger = sender as Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }
            try
            {
                CrawlBlog();
            }
            finally
            {
                if (trigger != null)
                {
                    trigger.Enabled = true;
                }
            }
        }

        /// <summary>
        /// Downloads the blog front page, extracts the category list and exports
        /// every article of every category.
        /// </summary>
        private void CrawlBlog()
        {
            // CreateDirectory is a no-op when the folder already exists.
            string outputDir = Path.Combine(Application.StartupPath, "article");
            Directory.CreateDirectory(outputDir);

            // Normalise once so that every request uses the same base and no
            // "//article/..." URL is produced when the user typed a trailing slash.
            string baseUrl = textBox1.Text.Trim().TrimEnd('/');
            if (baseUrl.Length == 0)
            {
                listBox1.Items.Insert(0, "请先输入博客地址!");
                return;
            }

            HttpHelper http = new HttpHelper();
            HttpItem item = new HttpItem();
            item.URL = baseUrl;
            item.Referer = baseUrl;
            item.ProxyIp = "ieproxy";
            item.Encoding = Encoding.GetEncoding("utf-8");

            string html = FetchHtml(http, item, baseUrl);
            string categoryHtml = FirstMatchOrEmpty(Category, html);
            if (categoryHtml.Length == 0)
            {
                // Wrong URL, failed request or changed markup: nothing to crawl.
                listBox1.Items.Insert(0, "未找到文章分类,请检查博客地址!");
                return;
            }

            List<string> categoryIds = MatchStr(CategoryId, categoryHtml);
            List<string> categoryNames = MatchStr(CategoryName, categoryHtml);
            List<string> articleCounts = MatchStr(ArticleCount, categoryHtml);

            // Loop over every category and collect the articles that belong to it.
            for (int i = 0; i < categoryIds.Count; i++)
            {
                // The three lists come from independent regexes and may differ in
                // length; fall back instead of indexing out of range.
                string categoryName = i < categoryNames.Count ? categoryNames[i] : categoryIds[i];
                listBox1.Items.Insert(0, "正在获取【" + categoryName + "】分类...");

                // The count pattern is \d*, so the match may be empty; treat
                // anything unparseable as zero articles.
                int count;
                if (i >= articleCounts.Count ||
                    !int.TryParse(articleCounts[i],
                                  System.Globalization.NumberStyles.None,
                                  System.Globalization.CultureInfo.InvariantCulture,
                                  out count))
                {
                    count = 0;
                }

                int pageCount = (count % ArticlesPerPage == 0)
                    ? (count / ArticlesPerPage)
                    : (count / ArticlesPerPage + 1);

                List<string> articleIds = new List<string>();
                for (int page = 1; page <= pageCount; page++)
                {
                    string pageUrl = baseUrl + "/article/category/" + categoryIds[i] + "/" + page;
                    string pageHtml = FetchHtml(http, item, pageUrl);
                    articleIds.AddRange(MatchStr(ArticleId, pageHtml));
                }

                foreach (string articleId in articleIds)
                {
                    // The id pattern is \d*, so an empty match is possible; it
                    // would not identify an article.
                    if (articleId.Length == 0)
                    {
                        continue;
                    }

                    ExportArticle(http, item, baseUrl, articleId, outputDir);
                    // Let the UI repaint the progress list between articles.
                    Application.DoEvents();
                }

                listBox1.Items.Insert(0, "【" + categoryName + "】分类获取完毕!");
            }
        }

        /// <summary>
        /// Downloads one article page, extracts its fields and saves them as an
        /// XML file in <paramref name="outputDir"/>. Fields that cannot be found
        /// in the page are written as empty strings.
        /// </summary>
        private void ExportArticle(HttpHelper http, HttpItem item, string baseUrl, string articleId, string outputDir)
        {
            string articleUrl = baseUrl + "/article/details/" + articleId;
            string articleHtml = FetchHtml(http, item, articleUrl);

            string articleNameText = FirstMatchOrEmpty(ArticleName, articleHtml).Trim().Replace("\r", "");
            string postDateText = FirstMatchOrEmpty(PostDate, articleHtml).Trim();
            string readCountText = FirstMatchOrEmpty(ReadCount, articleHtml).Trim();
            string articleContentText = FirstMatchOrEmpty(ArticleContent, articleHtml);
            string isOriginalText = FirstMatchOrEmpty(IsOriginal, articleHtml).Trim();

            listBox1.Items.Insert(0, "正在抓取【" + articleNameText + "】文章...");

            // Build the XML document that stores the article.
            XmlDocument xml = new XmlDocument();
            XmlDeclaration xmldecl = xml.CreateXmlDeclaration("1.0", "gb2312", null);
            XmlElement root = xml.CreateElement("Article");

            AppendTextElement(xml, root, "Name", articleNameText);
            AppendTextElement(xml, root, "URL", articleUrl);
            AppendTextElement(xml, root, "IsOriginal", isOriginalText == "ico_type_Original" ? "Y" : "N");
            AppendTextElement(xml, root, "PostDate", postDateText);
            AppendTextElement(xml, root, "ReadCount", readCountText);
            AppendTextElement(xml, root, "ArticleContent", articleContentText);

            xml.AppendChild(xmldecl);
            xml.AppendChild(root);

            // Titles may contain characters that are illegal in file names; an
            // unusable or missing title falls back to the article id.
            string fileName = ToSafeFileName(articleNameText, articleId) + ".xml";
            xml.Save(Path.Combine(outputDir, fileName));

            listBox1.Items.Insert(0, "【" + articleNameText + "】文章抓取成功!");
        }

        /// <summary>
        /// Creates an element called <paramref name="elementName"/> with the given
        /// text and appends it to <paramref name="parent"/>.
        /// </summary>
        private static void AppendTextElement(XmlDocument xml, XmlElement parent, string elementName, string text)
        {
            XmlElement element = xml.CreateElement(elementName);
            element.InnerText = text;
            parent.AppendChild(element);
        }
    }
}


0 0
原创粉丝点击