html 标签解嵌套

来源:互联网 发布:eve A族女性捏脸数据 编辑:程序博客网 时间:2024/05/16 10:35

一:前言:网页源码中有很多嵌套的标签

例如div标签嵌套如:bUTP<DIV>finally<div>aurora</div>@126.com</div><div class="Cited1">ggff</div>

我们的网页解析工作中有时候需要解嵌套。通俗的讲就是把嵌套的标签以线性表的形式表示出来。还拿上面的例子来说明。即解嵌套为

<div>aurora</div>

 <DIV>finally<div>aurora</div>@126.com</div>

<div class="Cited1">ggff</div>

核心代码如下:

 


 class ThemeIRAssist
    {
        
public static void  GetNodesByTags( ref string rawtext,string tags,ref  List<string>result )
       {   
//储存开始标签的位置
           List<Position > beginTagPos = new List<Position >();
            
//储存结束标签的位置
           List<Position> endTagPos = new List<Position>();
            
//匹配开始标签的正则表达式
           string sBeginTagPattern = "<" + tags;
           Regex regexBeginTag 
= new Regex(sBeginTagPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
           
//匹配结束标签的正则表达式。
           string sEndTagPattern = "</" + tags + ">";
           Regex regexEndTag 
= new Regex(sEndTagPattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
            
//获得开始标签的集合
           MatchCollection beginTagCollection = regexBeginTag.Matches(rawtext);
            
//获得结束标签的集合
           MatchCollection endTagCollection = regexEndTag.Matches(rawtext);
           
foreach (Match mymatch in beginTagCollection)
           {
              Position pos
=new Position();
               pos.nPos
=mymatch.Index;
               pos.VistStatus
=false;
               beginTagPos.Add(pos);
               
           }

           
foreach (Match mymatch in endTagCollection)
           {
               Position pos 
= new Position();
               pos.nPos 
= mymatch.Index;
               pos.VistStatus 
= false;
               endTagPos.Add(pos);


           }
           
for (int i = 0; i < endTagPos.Count; i++)
           {
               
for (int j = beginTagPos.Count - 1; j >= 0; j--)
               {
                   
if(endTagPos[i].nPos<beginTagPos[j].nPos)
                       
continue;
                   
else
                   {
                       
if (beginTagPos[j].VistStatus)
                           
continue;
                       
else
                       {
                           result.Add(rawtext.Substring(beginTagPos[j].nPos,endTagPos[i].nPos
-beginTagPos[j].nPos+6));
                           beginTagPos[j].VistStatus
=true;
                           
break;
                       }

                   }
               }
           }
            






 
       }
    }
    
class Position
    {
        /// <summary>
        /// Character offset of a tag within the scanned text.
        /// </summary>
        public int nPos { get; set; }

        /// <summary>
        /// Flag recording whether this tag has already been visited (paired) during matching.
        /// Defaults to false.
        /// </summary>
        public bool VistStatus { get; set; }
    }

Main函数测试如下:

 


 static void Main(string[] args)
        {
            
            StreamReader sr 
= new StreamReader("D://finally.txt", Encoding.GetEncoding("utf-8"));
            
string rawtext = sr.ReadToEnd();
           
// string rawtext = "bUTP<DIV>finally<div>aurora</div>@126.com</div><div class=/"Cited1/">ggff</div>";
            List<string> result = new List<string>();
            ThemeIRAssist.GetNodesByTags(
ref rawtext, "div"ref result);
            Console.WriteLine(result.Count);
            Regex regexStadard 
= new Regex(@"^<div/s+class=""Cited1""", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            
for (int i = result.Count - 1; i >= 0; i--)
            {
                
if(!regexStadard.IsMatch(result[i]))
                    result.RemoveAt(i);
            }
            Console.WriteLine(result.Count);
            
foreach(string s in result)
            {
                Console.WriteLine(
"***************************************************");
                Console.Write(s);
                Console.WriteLine(
"***************************************************");
            }
            Console.Read();
           
        }
原创粉丝点击