C# 几个正则表达式

来源:互联网 发布:网络口碑营销特点 编辑:程序博客网 时间:2024/05/01 17:07

    #region 几个比较实用比较N的正则表达式
    /// <summary>
    /// 几个比较实用比较N的正则表达式[思归写出来的方法]
    /// </summary>
    /// <param name="strHtml"></param>
    /// <returns></returns>
    public static string StripHtml(string strHtml)
    {
        // 比较牛的处理文章中的图片,根据这个可推出好多类似有用的正则
        //将<img>转化为标准的<img src="" border="1" alt="">
        strHtml = Regex.Replace(strHtml, @"<img/s+(((?<alt>alt=('[^']*'|""[^""]*""|[^/s>]*))|(?<src>src=('[^']*'|""[^""]*""|[^/s>]*))|(?<border>border=('[^']*'|""[^""]*""|[^/s>]*))|(?<others>[^=<>]+=('[^']*'|""[^""]*""|[^/s>]*)))/s*)*[^>]*>", "[img ${src} ${border} ${alt}]", RegexOptions.IgnoreCase);

        //不能显示大图片的时候用(并且实现lightbox效果)
        string imgicon = "<img src='images/imgicon.jpg' width='16' height='12' border='0' alt='点击查看大图'>";
        strHtml = Regex.Replace(strHtml, @"<img/s+((src=(?<src>'[^']*'|""[^""]*""|[^/s>]*))/s*)*[^>]*>", @"<a href=${src} rel='lightbox'>" + imgicon + "</a>", RegexOptions.IgnoreCase);

        //图片的一般处理
        strHtml = Regex.Replace(strHtml, @"<img/s+((src=(?<src>'[^']*'|""[^""]*""|[^/s>]*))/s*)*[^>]*>", @"<img src= ${src}>", RegexOptions.IgnoreCase);

        //将<strhtml str="str" str="str">整理成<strhtml>
        strHtml = Regex.Replace(strHtml, @"<div[^>]+>|]+>", "<div>", RegexOptions.IgnoreCase);

        //所有带<>的标签都去掉
        strHtml = Regex.Replace(strHtml, @"<[^>]+>|]+>", "", RegexOptions.IgnoreCase);

        return strHtml;
    }
    #endregion

    #region // 清理Word产生的垃圾代码(不是很理想,但...)
    /// <summary>
    /// Removes all FONT and SPAN tags, and all Class and Style attributes.
    /// Designed to get rid of non-standard Microsoft Word HTML tags.
    /// http://tim.mackey.ie/CleanWordHTMLUsingRegularExpressions.aspx
    /// http://article.pchome.net/content-425187.html
    /// </summary>
    private string CleanWord(string strHtml)
    {
        // start by completely removing all unwanted tags    
        strHtml = Regex.Replace(strHtml, @"<[/]?(font|span|xml|del|ins|[ovwxp]:/w+)[^>]*?>", "", RegexOptions.IgnoreCase);
       
        // then run another pass over the html (twice), removing unwanted attributes    
        strHtml = Regex.Replace(strHtml, @"<([^>]*)(?:class|lang|style|size|face|[ovwxp]:/w+)=(?:'[^']*'|""[^""]*""|[^/s>]+)([^>]*)>", "<$1$2>", RegexOptions.IgnoreCase);
        strHtml = Regex.Replace(strHtml, @"<([^>]*)(?:class|lang|style|size|face|[ovwxp]:/w+)=(?:'[^']*'|""[^""]*""|[^/s>]+)([^>]*)>", "<$1$2>", RegexOptions.IgnoreCase);
       
        // [20080323]
        strHtml = Regex.Replace(strHtml, @"%", "%", RegexOptions.IgnoreCase);

        // [20080323]去掉<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /></
        strHtml = Regex.Replace(strHtml, @"<[?]xml[^>]+>|]+>", "", RegexOptions.IgnoreCase);

        // [20080323]自动加载图片(自动替换M_IMG[i])
        for (int i = 1; i < 100; i++)
        {
            strHtml = Regex.Replace(strHtml, @"#M_IMG" + i + "#", "<img src='images/" + i + ".jpg'>", RegexOptions.IgnoreCase);
        }
        return strHtml;
    }
    #endregion