爬虫程序
来源:互联网 发布:口才与演讲软件 编辑:程序博客网 时间:2024/05/10 23:42
// Extracts the charset token from a Content-Type header value, e.g. "text/html; charset=utf-8".
private static readonly Regex HeaderCharsetPattern =
    new Regex(@"charset\s*=\s*[""']?([^\s;""']+)", RegexOptions.IgnoreCase);

// Extracts the charset token from a <meta ...> tag (both the `charset="x"` and `content="...; charset=x"` forms).
private static readonly Regex MetaCharsetPattern =
    new Regex(@"<meta[^>]*?charset\s*=\s*[""']?([^\s;""'/>]+)", RegexOptions.IgnoreCase);

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded to a string.
/// </summary>
/// <remarks>
/// The encoding is chosen in this order:
/// 1. the charset named in the response's Content-Type header, when it names a known encoding;
/// 2. otherwise the body is decoded with <paramref name="charSet"/> (or "gb2312" when none is given),
///    and if the decoded text contains a meta charset that names a different known encoding,
///    the body is decoded again with that encoding.
/// Charset names declared by the page that are missing or unknown are ignored rather than treated as errors.
/// </remarks>
/// <param name="url">Address to download.</param>
/// <param name="charSet">Fallback encoding name; null or "" selects "gb2312".</param>
/// <returns>
/// The decoded body; an empty string when the response has no body stream;
/// or "Error:" followed by the exception message when anything fails.
/// </returns>
public string GetHttpSource(string url, string charSet = "")
{
    HttpWebRequest request = null;
    try
    {
        request = (HttpWebRequest)WebRequest.Create(url);
        request.Proxy = null;
        request.Timeout = 15 * 1000; // connection timeout, in milliseconds
        request.Accept = "*/*";
        request.UserAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/536.1";
        // Transparently decompress gzip/deflate bodies.
        // NOTE(review): the original author observed that stream.ReadTimeout cannot be set once this is enabled.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

        using (var response = (HttpWebResponse)request.GetResponse())
        using (var stream = response.GetResponseStream())
        {
            if (stream == null)
            {
                // No body to read; the original code looped forever in this case.
                return string.Empty;
            }

            // Buffer the raw bytes once so they can be decoded more than once.
            byte[] body;
            using (var buffer = new MemoryStream())
            {
                stream.CopyTo(buffer);
                body = buffer.ToArray();
            }

            // 1. Prefer the charset announced in the HTTP header.
            var headerEncoding = TryGetEncoding(ExtractCharset(HeaderCharsetPattern, response.Headers["Content-Type"]));
            if (headerEncoding != null)
            {
                return Decode(body, headerEncoding);
            }

            // 2. Decode with the caller's (or default) encoding. An invalid caller-supplied
            //    name still surfaces as "Error:..." via the catch below.
            if (string.IsNullOrEmpty(charSet))
            {
                charSet = "gb2312";
            }
            var fallbackEncoding = Encoding.GetEncoding(charSet);
            var html = Decode(body, fallbackEncoding);

            // 3. If the page itself declares a different, known charset, decode again with it.
            var metaEncoding = TryGetEncoding(ExtractCharset(MetaCharsetPattern, html));
            if (metaEncoding != null && !metaEncoding.Equals(fallbackEncoding))
            {
                html = Decode(body, metaEncoding);
            }

            return html;
        }
    }
    catch (Exception ex)
    {
        // Callers of this method expect failures to be reported in-band as an "Error:" string.
        return "Error:" + ex.Message;
    }
    finally
    {
        if (request != null)
        {
            request.Abort();
        }
    }
}

// Returns the first capture group of pattern in text, or "" when text is null or nothing matches.
private static string ExtractCharset(Regex pattern, string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return string.Empty;
    }
    var match = pattern.Match(text);
    return match.Success ? match.Groups[1].Value : string.Empty;
}

// Resolves an encoding name, returning null (instead of throwing) for empty or unknown names.
private static Encoding TryGetEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (NotSupportedException)
    {
        return null;
    }
}

// Decodes bytes through a StreamReader so that a leading byte-order mark is honoured, as in the original code.
private static string Decode(byte[] bytes, Encoding encoding)
{
    using (var reader = new StreamReader(new MemoryStream(bytes), encoding))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Removes HTML markup from the given content by passing it through ReplaceHtml
/// once for every pattern in a fixed list, each time with an empty replacement.
/// </summary>
/// <param name="Content">The text to clean.</param>
/// <returns>The text after all patterns have been applied.</returns>
public string ClearHtml(string Content)
{
    // Applied strictly in this order. ReplaceHtml is defined elsewhere in the class;
    // presumably it performs a regex replace of (pattern, replacement, input) - confirm there.
    string[] patterns =
    {
        "&#[^>]*;",
        "</?marquee[^>]*>",
        "</?object[^>]*>",
        "</?param[^>]*>",
        "</?embed[^>]*>",
        "</?table[^>]*>",
        " ",
        "</?tr[^>]*>",
        "</?th[^>]*>",
        "</?p[^>]*>",
        "</?a[^>]*>",
        "</?img[^>]*>",
        "</?tbody[^>]*>",
        "</?li[^>]*>",
        "</?span[^>]*>",
        "</?div[^>]*>",
        "</?th[^>]*>",
        "</?td[^>]*>",
        "</?script[^>]*>",
        "(javascript|jscript|vbscript|vbs):",
        "on(mouse|exit|error|click|key)",
        "<\\?xml[^>]*>",
        "<\\/?[a-z]+:[^>]*>",
        "</?font[^>]*>",
        "</?b[^>]*>",
        "</?u[^>]*>",
        "</?i[^>]*>",
        "</?strong[^>]*>"
    };

    foreach (string pattern in patterns)
    {
        Content = ReplaceHtml(pattern, "", Content);
    }

    return Content;
}
/// <summary>
/// Extracts links from an HTML document using a caller-supplied regular expression.
/// </summary>
/// <remarks>
/// If <paramref name="html"/> has no closing body tag it is treated as an incomplete download
/// and the page is fetched again from <paramref name="url"/> exactly once. The re-fetched
/// content replaces <paramref name="html"/> only when it does contain a closing body tag;
/// otherwise the links are extracted from the original text. (The previous implementation
/// recursed without limit and discarded the result of the re-fetch.)
/// </remarks>
/// <param name="html">HTML source to scan; null is treated as empty.</param>
/// <param name="url">Address the HTML came from, used for the single re-fetch.</param>
/// <param name="strReg">
/// Regular expression (evaluated with RegexOptions.Singleline) that must define a named
/// group "key" holding the link.
/// </param>
/// <returns>The value of the "key" group of every match, in match order; empty when nothing matches.</returns>
public string[] GetLinks(string html, string url, string strReg)
{
    const string closingBody = "</body>";

    if (html == null)
    {
        html = string.Empty;
    }

    // A missing </body> suggests a truncated download: retry once and keep the retry only if it is complete.
    if (html.IndexOf(closingBody, StringComparison.OrdinalIgnoreCase) < 0)
    {
        var refetched = GetHttpSource(url, "");
        if (refetched != null && refetched.IndexOf(closingBody, StringComparison.OrdinalIgnoreCase) >= 0)
        {
            html = refetched;
        }
    }

    MatchCollection matches = new Regex(strReg, RegexOptions.Singleline).Matches(html);

    var links = new string[matches.Count];
    for (var i = 0; i < links.Length; i++)
    {
        links[i] = matches[i].Groups["key"].Value;
    }

    return links;
}
0 0
- 爬虫程序
- 爬虫程序
- 爬虫程序
- 爬虫程序
- 网络爬虫程序
- 什么是网络爬虫程序
- Python的爬虫程序
- 网络爬虫程序
- 什么是网络爬虫程序?
- 网页爬虫程序pageSpider
- C#爬虫程序
- 小爬虫程序
- .net爬虫程序
- 什么是网络爬虫程序
- java小爬虫程序
- 第一个爬虫程序
- 网络爬虫程序
- python爬虫程序
- 11【项目2 - 职员有薪水了】
- 产品经理进阶沙盘演练
- 二叉树的深度和广度搜索算法
- 苹果审核指南
- XFire实现身份验证(基于Xfire SOAP Header的WebService安全验证)
- 爬虫程序
- 生产环境使用elasticsearch遇到的一些问题以及解决方法
- android relativelayout
- 关系模型、键值存储、文档存储、列式存储、图形数据库,解析五大流行的数据库模型
- 路由器操作系统那点事
- 用OpenSceneGraph实现的NeHe OpenGL教程 - 第二十五课
- java多线程死锁例子
- PHP定时执行计划任务
- 分享免费接口