业余写的一个小工具_XPathTool(C#源码)

来源:互联网 发布:ubuntu与centos的区别 编辑:程序博客网 时间:2024/05/02 09:18

     因为前段时间对爬虫有些兴趣,所以研究了一下 HtmlAgilityPack.dll,它可以基于 XPath 来解析 HTML 文档。

     XPath 的相关资料可以查看这里: http://www.w3school.com.cn/xpath/index.asp

     网上找了半天没找到几个 XPath 工具。后来找到一份源代码,在它的基础上自己做了一个 XPath 工具。

如图,这里是通过 XPath 获取百度音乐的歌曲名。



源代码



using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using HtmlAgilityPack;

namespace XPathTools
{
    /// <summary>
    /// Main window of the XPath tool. It loads HTML from a local file or a URL,
    /// derives an indexed XPath for each opening tag found in the markup, lists
    /// the paths that resolve to a node, and lets the user evaluate XPath
    /// expressions against the loaded document.
    /// </summary>
    public partial class Form1 : Form
    {
        // The patterns below are unchanged from the original inline literals; they are
        // hoisted into static fields so they are built once instead of once per tag.

        /// <summary>Matches a complete script element, including its content.</summary>
        private static readonly Regex ScriptBlockRegex =
            new Regex("<script[\\s\\S]*?</script>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>Matches any single tag (opening, closing, or declaration).</summary>
        private static readonly Regex TagRegex =
            new Regex("<([^<>]*?)>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>Captures the element name of a tag, with or without attributes, in group "body".</summary>
        private static readonly Regex TagNameRegex =
            new Regex("(<(?<body>[^>]*?) [^>]*?>)|(<(?<body>[^>]*?)>)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>Captures the text between angle brackets of a stack entry such as "&lt;div&gt;" in group "body".</summary>
        private static readonly Regex BracketBodyRegex =
            new Regex("<(?<body>[^>]*?)>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        /// <summary>
        /// Element names that are never pushed onto the path stack.
        /// NOTE(review): presumably because they have no closing tag (link, meta, img,
        /// input), were already stripped (script), or are handled specially by
        /// HtmlAgilityPack (form) - confirm before changing the list.
        /// </summary>
        private static readonly HashSet<string> SkippedTags =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            {
                "link", "meta", "script", "img", "input", "form"
            };

        /// <summary>The document that XPath expressions typed or selected in the UI are evaluated against.</summary>
        private HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();

        /// <summary>
        /// Maps each discovered XPath to the inner HTML of its first matching node.
        /// Only touched on the UI thread.
        /// </summary>
        private Dictionary<string, string> D = new Dictionary<string, string>();

        /// <summary>
        /// Incremented on every analysis start. A worker whose number no longer
        /// matches has been superseded and must stop publishing results.
        /// </summary>
        private volatile int analysisVersion;

        /// <summary>Signature of the UI-thread callback used by the worker to publish one result.</summary>
        private delegate void Dg(string str, string innerHtml, int version);

        /// <summary>Builds the designer-generated controls and wires the combo box selection handlers.</summary>
        public Form1()
        {
            InitializeComponent();
            comboBox1.SelectedIndexChanged += comboBox1_SelectedIndexChanged;
            comboBox2.SelectedIndexChanged += comboBox2_SelectedIndexChanged;
        }

        /// <summary>
        /// Evaluates <paramref name="xpath"/> against the loaded document and shows the
        /// inner HTML of the first match in textBox3. Shows a message box when the
        /// expression is rejected or matches nothing.
        /// </summary>
        /// <param name="xpath">The XPath expression to evaluate.</param>
        /// <returns>true when a node was found and displayed; otherwise false.</returns>
        private bool TryShowFirstMatch(string xpath)
        {
            try
            {
                // SelectNodes returns null (not an empty collection) when nothing matches;
                // check explicitly instead of reporting a NullReferenceException.
                HtmlNodeCollection nodes = hd.DocumentNode.SelectNodes(xpath);
                if (nodes == null || nodes.Count == 0)
                {
                    MessageBox.Show("没有匹配的节点");
                    return false;
                }

                textBox3.Text = nodes[0].InnerHtml;
                return true;
            }
            catch (System.Exception ex)
            {
                MessageBox.Show("表达式有误" + ex.ToString());
                return false;
            }
        }

        /// <summary>Shows the first node matched by the XPath selected in comboBox2.</summary>
        private void comboBox2_SelectedIndexChanged(object sender, EventArgs e)
        {
            TryShowFirstMatch(comboBox2.Text);
        }

        /// <summary>
        /// Shows the first node matched by the XPath selected in comboBox1 and, on
        /// success, copies that expression into comboBox2.
        /// </summary>
        private void comboBox1_SelectedIndexChanged(object sender, EventArgs e)
        {
            if (!TryShowFirstMatch(comboBox1.Text))
            {
                return;
            }

            comboBox2.Text = comboBox1.Text;
        }

        /// <summary>Lets the user pick a local file, loads its text into textBox2 and analyses it.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            textBox3.Text = textBox2.Text = null;

            string path;
            using (OpenFileDialog ofg = new OpenFileDialog())
            {
                ofg.Filter = "网页文件(*.html)|*.html;*.xml;*.htm;*.txt";
                ofg.Multiselect = false;
                if (ofg.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                path = ofg.FileName;
            }

            textBox1.Text = path;
            if (string.IsNullOrEmpty(path))
            {
                return;
            }

            string content;
            try
            {
                // File.ReadAllText opens the file read-only and never creates it, unlike
                // the previous FileMode.OpenOrCreate, and always releases the handle.
                // The file is decoded as GB2312, as before.
                content = File.ReadAllText(path, Encoding.GetEncoding("GB2312"));
            }
            catch (IOException ex)
            {
                MessageBox.Show(ex.ToString());
                return;
            }
            catch (UnauthorizedAccessException ex)
            {
                MessageBox.Show(ex.ToString());
                return;
            }

            textBox1.ReadOnly = true;
            textBox2.ReadOnly = true;

            // Load the selected file into the source text box, then analyse it.
            textBox2.AppendText(content);
            StartAnalyse();
        }

        /// <summary>
        /// Resets the result lists, parses the text of textBox2 into the shared
        /// document and starts a background worker that discovers XPath expressions.
        /// </summary>
        private void StartAnalyse()
        {
            comboBox1.Items.Clear();
            comboBox2.Items.Clear();

            // Drop results of the previous document so keyword search cannot return stale paths.
            D.Clear();

            // Capture the markup on the UI thread; the worker must not read controls.
            string html = textBox2.Text;
            hd.LoadHtml(html);

            int version = ++analysisVersion;
            Thread th = new Thread(() => NewMethod(html, version));

            // A background thread does not keep the process alive after the form is closed.
            th.IsBackground = true;
            th.Start();
        }

        /// <summary>
        /// UI-thread callback: records one discovered XPath and makes it the current
        /// entry of both combo boxes. comboBox2 can also be filled by keyword search.
        /// </summary>
        /// <param name="str">The discovered XPath.</param>
        /// <param name="innerHtml">Inner HTML of the first node matched by <paramref name="str"/>.</param>
        /// <param name="version">Analysis number of the worker that produced the result.</param>
        private void UIContorol(string str, string innerHtml, int version)
        {
            if (version != analysisVersion)
            {
                // Result of a superseded analysis; the lists now belong to a newer run.
                return;
            }

            // Indexer instead of Add: a repeated path overwrites instead of throwing.
            D[str] = innerHtml;

            comboBox1.Items.Add(str);
            comboBox1.Text = str;
            comboBox2.Text = str;
        }

        /// <summary>
        /// Worker-thread body. Scans the markup tag by tag, maintains a stack of open
        /// elements, builds an indexed XPath for every opening tag and publishes each
        /// path that resolves to at least one node.
        /// </summary>
        /// <param name="html">The markup to analyse, captured on the UI thread.</param>
        /// <param name="version">Analysis number assigned by <see cref="StartAnalyse"/>.</param>
        private void NewMethod(string html, int version)
        {
            Dg publish = new Dg(UIContorol);

            // Private copy of the document so the worker never touches the instance
            // the UI thread may reload or replace while this run is in progress.
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // Script bodies can contain '<' and '>' that would be mistaken for tags.
            string markup = ScriptBlockRegex.Replace(html, "");

            // Occurrence counters, keyed by "parent path/element name".
            Dictionary<string, int> dic = new Dictionary<string, int>();

            // Stack of currently open elements; "." is the document root.
            List<string> strList = new List<string>();
            strList.Add(".");

            foreach (Match tagMatch in TagRegex.Matches(markup))
            {
                if (version != analysisVersion)
                {
                    return;
                }

                string tag = tagMatch.Value;

                // Self-closing tags and declarations/comments neither open nor close an element.
                if (tag.EndsWith("/ >", StringComparison.Ordinal)
                    || tag.EndsWith("/>", StringComparison.Ordinal)
                    || tag.StartsWith("<!", StringComparison.Ordinal))
                {
                    continue;
                }

                if (tag.StartsWith("</", StringComparison.Ordinal))
                {
                    // Pop only when the closing tag matches the innermost open element.
                    string opening = tag.Replace("</", "<");
                    if (string.Equals(opening, strList[strList.Count - 1], StringComparison.OrdinalIgnoreCase))
                    {
                        strList.RemoveAt(strList.Count - 1);
                    }

                    continue;
                }

                string parentPath = XpathRow(strList, dic);

                // Invariant casing: culture-sensitive ToLower would corrupt names
                // such as "TITLE" under cultures with special casing rules.
                string name = TagNameRegex.Match(tag).Groups["body"].Value.ToLowerInvariant();
                if (SkippedTags.Contains(name))
                {
                    continue;
                }

                string element = "<" + name + ">";
                AddRowNumber(parentPath, element, dic);
                strList.Add(element);
                string xpath = XpathRow(strList, dic);

                string innerHtml;
                try
                {
                    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
                    if (nodes == null || nodes.Count == 0)
                    {
                        continue;
                    }

                    innerHtml = nodes[0].InnerHtml;
                }
                catch (System.Exception)
                {
                    // Best effort: the path is synthesised from raw tag text and may not
                    // be a valid expression for malformed markup; skip such paths.
                    continue;
                }

                try
                {
                    textBox3.Invoke(publish, new object[] { xpath, innerHtml, version });
                }
                catch (InvalidOperationException)
                {
                    // The form was closed or its handle is gone; nothing left to update.
                    return;
                }
            }
        }

        /// <summary>
        /// Joins the open-element stack into an XPath string, appending "[n]" to every
        /// element that has an occurrence counter.
        /// </summary>
        /// <param name="strList">Stack of open elements, outermost first.</param>
        /// <param name="dic">Occurrence counters keyed by "parent path/element name".</param>
        /// <returns>The path without a trailing slash.</returns>
        private string XpathRow(List<string> strList, Dictionary<string, int> dic)
        {
            StringBuilder sb = new StringBuilder();
            foreach (string str in strList)
            {
                // Entries that are not "<name>" (the "." root) or that have no counter
                // are emitted verbatim.
                string segment = str;

                Match m = BracketBodyRegex.Match(str);
                if (m.Success)
                {
                    string name = m.Groups["body"].Value;
                    int index;
                    if (dic.TryGetValue(sb.ToString() + name, out index))
                    {
                        segment = name + "[" + index + "]";
                    }
                }

                sb.Append(segment).Append('/');
            }

            return sb.ToString().TrimEnd('/');
        }

        /// <summary>
        /// Increments the occurrence counter of <paramref name="NewNode"/> beneath
        /// <paramref name="strOldXpatch"/>, starting at 1.
        /// </summary>
        /// <param name="strOldXpatch">XPath of the parent element; an empty string resets the root counter.</param>
        /// <param name="NewNode">The child element in "&lt;name&gt;" form.</param>
        /// <param name="dic">Occurrence counters to update.</param>
        private void AddRowNumber(string strOldXpatch, string NewNode, Dictionary<string, int> dic)
        {
            if (strOldXpatch == "")
            {
                dic["."] = 0;
                return;
            }

            string name = BracketBodyRegex.Match(NewNode).Groups["body"].Value;
            string key = strOldXpatch + "/" + name;

            int count;
            dic.TryGetValue(key, out count); // leaves count at 0 when the key is absent
            dic[key] = count + 1;
        }

        /// <summary>Empty handler, kept because the designer file may still reference it.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
        }

        /// <summary>Empty handler, kept because the designer file may still reference it.</summary>
        private void Form1_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Fills comboBox2 with every discovered XPath whose inner HTML contains the
        /// keyword in textBox5 (case-insensitive) and selects the last one.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            comboBox2.Items.Clear();

            string keyword = textBox5.Text;
            string[] matches = D
                .Where(pair => pair.Value.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0)
                .Select(pair => pair.Key)
                .ToArray();

            comboBox2.Items.AddRange(matches);
            if (matches.Length > 0)
            {
                // Same end state as assigning Text after every single add.
                comboBox2.Text = matches[matches.Length - 1];
            }
        }

        /// <summary>Downloads the page at the address in textBox1, shows its HTML and analyses it.</summary>
        private void button2_Click_1(object sender, EventArgs e)
        {
            textBox3.Text = textBox2.Text = null;

            // TextBox.Text is never null, so the guard must test for empty/blank input.
            if (string.IsNullOrWhiteSpace(textBox1.Text))
            {
                MessageBox.Show("地址不能为空!");
                return;
            }

            string url = textBox1.Text.Trim();
            HtmlWeb hw = new HtmlWeb();
            try
            {
                hd = hw.Load(url);
            }
            catch (System.Exception ex)
            {
                MessageBox.Show(ex.ToString());
                return;
            }

            textBox2.Text = hd.DocumentNode.InnerHtml;

            // Start analysing the tags.
            StartAnalyse();
        }

        /// <summary>Re-analyses the markup currently shown in textBox2.</summary>
        private void button4_Click(object sender, EventArgs e)
        {
            StartAnalyse();
        }

        /// <summary>Empty handler, kept because the designer file may still reference it.</summary>
        private void textBox2_TextChanged(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Executes the XPath in comboBox1 and copies the expression into comboBox2,
        /// whether or not the evaluation succeeded.
        /// </summary>
        private void OnXPath(object sender, EventArgs e)
        {
            TryShowFirstMatch(comboBox1.Text);
            comboBox2.Text = comboBox1.Text;
        }

        /// <summary>
        /// Below the node selected by the XPath in comboBox2, collects the attribute
        /// named in textBox7 from every descendant matched by textBox6 and shows the
        /// values in a new <see cref="Form2"/> window.
        /// </summary>
        private void button6_Click(object sender, EventArgs e)
        {
            // Element name to look for.
            string strLabel = textBox6.Text;
            // Attribute whose value is collected.
            string strValue = textBox7.Text;
            // Relative axis: the search is rooted at the selected node, not the whole document.
            string strXPathLabel_Val = "descendant::" + strLabel;
            string strXPath = comboBox2.Text;

            HtmlNode node;
            try
            {
                // An empty or malformed expression throws here; report it instead of
                // letting the exception escape the event handler.
                node = hd.DocumentNode.SelectSingleNode(strXPath);
            }
            catch (System.Exception ex)
            {
                MessageBox.Show("表达式有误" + ex.ToString());
                return;
            }

            if (node == null)
            {
                return;
            }

            Form2 f2 = new Form2();
            try
            {
                HtmlNodeCollection hrefs = node.SelectNodes(strXPathLabel_Val);
                if (hrefs != null)
                {
                    foreach (HtmlNode href in hrefs)
                    {
                        HtmlAttribute attribute = href.Attributes[strValue];
                        if (attribute == null)
                        {
                            continue;
                        }

                        f2.AddData2ListView(strLabel, strValue, attribute.Value);
                    }
                }
            }
            catch (System.Exception ex)
            {
                MessageBox.Show(ex.ToString());
            }
            finally
            {
                // The result window is shown even when it is empty, as before.
                f2.Show();
            }
        }
    }
}


源代码下载地址:

http://download.csdn.net/detail/witch_soya/4978587


原创粉丝点击