VB.NET 获取HTML中的文字

来源:互联网 发布:淘宝直通车计划表格 编辑:程序博客网 时间:2024/05/22 00:21
  ''' <summary>
  ''' Strips HTML markup from a string and decodes a small set of common
  ''' character entities, returning the remaining text.
  ''' </summary>
  ''' <param name="strHtml">Source text that may contain HTML markup.</param>
  ''' <returns>
  ''' The text with script blocks, tags and comment delimiters removed and the
  ''' supported entities decoded; an empty string when
  ''' <paramref name="strHtml"/> is Nothing or empty.
  ''' </returns>
  Public Shared Function GetStrfromHTML(ByVal strHtml As String) As String
      ' Regex.Replace throws on a Nothing input; treat it like an empty document.
      If String.IsNullOrEmpty(strHtml) Then
          Return String.Empty
      End If

      ' Patterns are applied in order; aryRep(i) is the replacement for aryReg(i).
      '   0      script blocks, including their body. [\s\S] is used instead of "."
      '          so that a script spanning several lines is matched as well.
      '   1      any remaining opening/closing tag, with optional attributes.
      '   2      a line break followed by further whitespace.
      '   3-11   named entities and their numeric forms:
      '          quot(34) amp(38) lt(60) gt(62) nbsp(160) iexcl(161) cent(162)
      '          pound(163) copy(169).
      '   12     every other numeric entity is dropped.
      '   13-14  comment end marker becomes a line break; a comment start and the
      '          rest of its line are removed.
      ' NOTE(review): "&amp;" is decoded before the other entities, so an escaped
      ' entity such as "&amp;lt;" ends up fully decoded to "<".
      Dim aryReg As String() = { _
          "<script[^>]*?>[\s\S]*?</script>", _
          "<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", _
          "([\r\n])[\s]+", _
          "&(quot|#34);", _
          "&(amp|#38);", _
          "&(lt|#60);", _
          "&(gt|#62);", _
          "&(nbsp|#160);", _
          "&(iexcl|#161);", _
          "&(cent|#162);", _
          "&(pound|#163);", _
          "&(copy|#169);", _
          "&#(\d+);", _
          "-->", _
          "<!--.*\n"}

      Dim aryRep As String() = { _
          "", _
          "", _
          "", _
          """", _
          "&", _
          "<", _
          ">", _
          " ", _
          "¡", _
          "¢", _
          "£", _
          "©", _
          "", _
          vbCr & vbLf, _
          ""}

      Dim strOutput As String = strHtml
      For i As Integer = 0 To aryReg.Length - 1
          Dim regex As New Regex(aryReg(i), RegexOptions.IgnoreCase)
          strOutput = regex.Replace(strOutput, aryRep(i))
      Next

      ' The original ended with three String.Replace calls ("<", ">" and CR/LF)
      ' whose return values were discarded, so they never changed the result.
      ' They are omitted on purpose: applying them would delete the "<" / ">"
      ' characters just decoded from "&lt;" / "&gt;" and every line break.
      Return strOutput
  End Function


	
				
		
原创粉丝点击