在C#中使用HtmlParser.NET的例子。

来源:互联网 发布:造价预算软件 编辑:程序博客网 时间:2024/05/21 05:06
 
using System;
using System.IO;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;

/// <summary>
/// Parses the HTML typed into <c>textBox1</c> and displays its tag structure
/// in <c>treeView1</c> under a single "root" node.
/// </summary>
/// <param name="sender">The control that raised the click event (not used).</param>
/// <param name="e">The event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Alternative input sources: a stream can be used to load an HTML file from
    // the local disk, or a URI to load a web page from the internet, e.g.:
    //   byte[] htmlBytes = Encoding.UTF8.GetBytes(this.textBox1.Text);
    //   MemoryStream memsteam = new MemoryStream(htmlBytes);
    //   InputStreamSource input = new InputStreamSource(memsteam, "utf-8");
    //   Page page = new Page(input);
    //   Lexer lex = new Lexer(page);

    // Nothing to parse: leave the tree view untouched.
    if (this.textBox1.Text.Length <= 0)
    {
        return;
    }

    // Here the HTML is read directly from the text box.
    Lexer lexer = new Lexer(this.textBox1.Text);
    Parser parser = new Parser(lexer);
    // NOTE(review): null presumably means "no node filter" - confirm against the parser API.
    NodeList htmlNodes = parser.Parse(null);

    // Suspend repainting while the tree is rebuilt so the control is redrawn
    // once at the end instead of after every inserted node.
    this.treeView1.BeginUpdate();
    try
    {
        this.treeView1.Nodes.Clear();
        TreeNode treeRoot = this.treeView1.Nodes.Add("root");

        for (int i = 0; i < htmlNodes.Count; i++)
        {
            // Top-level nodes are enumerated by this loop, so the helper must
            // not walk their siblings again.
            this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
        }
    }
    finally
    {
        // Always resume painting, even if building the tree throws.
        this.treeView1.EndUpdate();
    }
}

/// <summary>
/// Adds a tree node for <paramref name="htmlNode"/> under <paramref name="treeNode"/>
/// and then recurses into its children and, optionally, its following siblings.
/// </summary>
/// <remarks>
/// Only nodes that implement <c>ITag</c> and are not end tags get a tree node of
/// their own. For any other node no tree node is created, and its children (if any)
/// are attached directly to <paramref name="treeNode"/>.
/// </remarks>
/// <param name="treeNode">The tree node that receives the new entry. Nothing happens when null.</param>
/// <param name="htmlNode">The parsed HTML node to display. Nothing happens when null.</param>
/// <param name="siblingRequired">
/// When true, every node reachable through <c>NextSibling</c> from
/// <paramref name="htmlNode"/> is also added under <paramref name="treeNode"/>.
/// </param>
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    if (htmlNode == null || treeNode == null)
    {
        return;
    }

    // Parent for the children of htmlNode; stays at treeNode unless a new
    // tree node is created for htmlNode below.
    TreeNode current = treeNode;

    // The current node.
    ITag tag = htmlNode as ITag;
    if (tag != null && !tag.IsEndTag())
    {
        string nodeString = tag.TagName;
        if (tag.Attributes != null && tag.Attributes.Count > 0)
        {
            // Attribute keys are looked up in upper case, as in the original example.
            nodeString += DescribeHtmlAttribute(tag, "ID", "id");
            nodeString += DescribeHtmlAttribute(tag, "CLASS", "class");
            nodeString += DescribeHtmlAttribute(tag, "STYLE", "style");
            nodeString += DescribeHtmlAttribute(tag, "HREF", "href");
        }

        current = new TreeNode(nodeString);
        treeNode.Nodes.Add(current);
    }

    // The child nodes: start at the first child and let the recursive call
    // walk the remaining children through the sibling chain.
    if (htmlNode.Children != null && htmlNode.Children.Count > 0)
    {
        this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
    }

    // The sibling nodes: this loop already visits each one, so the recursive
    // calls are told not to follow siblings again.
    if (siblingRequired)
    {
        INode sibling = htmlNode.NextSibling;
        while (sibling != null)
        {
            this.RecursionHtmlNode(treeNode, sibling, false);
            sibling = sibling.NextSibling;
        }
    }
}

/// <summary>
/// Formats one attribute of <paramref name="tag"/> as <c> { label="value" }</c>,
/// or returns an empty string when the attribute is absent.
/// </summary>
/// <param name="tag">The tag whose attribute table is read; its <c>Attributes</c> must not be null.</param>
/// <param name="key">The key used to look the attribute up in the tag's attribute table.</param>
/// <param name="label">The attribute name to show in the formatted text.</param>
/// <returns>The formatted attribute text, or an empty string.</returns>
private static string DescribeHtmlAttribute(ITag tag, string key, string label)
{
    // Single lookup instead of one for the null check and another for the value.
    object value = tag.Attributes[key];
    if (value == null)
    {
        return string.Empty;
    }

    return " { " + label + "=\"" + value.ToString() + "\" }";
}

    Screenshot of the example:
   
    The parser is very tolerant of malformed HTML, as the picture below shows (it could handle such input more intelligently, but I think this is good enough for practical use):