常用的抓取web实例
来源:互联网 发布:淘宝网卖家服务中心 编辑:程序博客网 时间:2024/06/18 05:04
NetCatch.cs
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Xml;
using System.Text;
using System.Threading;
using System.Linq;
using System.Data.Linq;
namespace D_NewChannel_Video
{
/// <summary>
/// Option values declared next to the crawler. The numeric values are written
/// out explicitly and equal the implicit declaration-order numbering.
/// </summary>
public enum FixOptions
{
    Logo = 0,
    Picture = 1,
    Flash = 2
}
/// <summary>
/// Mutable state passed between the paging and matching helpers of the crawler:
/// the current page URL, the text being matched and the regular expressions to apply.
/// </summary>
public class PageArgs
{
    /// <summary>
    /// Creates an instance whose two list properties start out as empty string lists.
    /// </summary>
    public PageArgs()
    {
        this.RegStr = new List<string>();
        this.PageUrlList = new List<string>();
    }

    /// <summary>Text the regular expressions are run against.</summary>
    public String Code { get; set; }

    /// <summary>Address of the page currently being processed.</summary>
    public String Url { get; set; }

    /// <summary>Regular expression patterns, applied in list order.</summary>
    public IList<string> RegStr { get; set; }

    /// <summary>Non-generic list that receives the pieces of the start URL.</summary>
    public IList PageUrlList { get; set; }
}
public class NetCatch:IDisposable
{
// Number of videos found to exist already; incremented by GetInfo and checked
// by GetURLsByPage, which stops once it exceeds 5.
internal int repeatTime=0;
// Persistence helper used for the existence check, inserts and deletes.
private VideoManager videoManager;
// Resolution label; only written to the console by GetInfo.
public string Resolution { get; set; }
// Copied into Video.VideoResolutionId by GetInfo.
public int ResolutionID { get; set; }
// Backing field of TypeAbbr.
private string typeId;
// Copied into Video.VideoCategoryId by GetInfo.
public int categoryId { get; set; }
// Root folder for downloaded logo pictures, read from the "LogoPicPath" app setting.
private readonly string logoPicRootPath = ConfigurationManager.AppSettings.Get("LogoPicPath");
// Running item counter; reset to 1 by GetURLsByPage for every page and
// post-incremented each time it is handed to GetInfo.
internal int position = 1;
/// <summary>
/// Type abbreviation of the content being crawled, stored in the private
/// typeId field. It selects the MTV handling in GetURLsByPage and is used as a
/// folder name by DownloadLogoPice.
/// </summary>
public string TypeAbbr
{
    get { return typeId; }
    set { typeId = value; }
}
// URL of the first list page; assigned by GetURLsByPage on its initial call.
private string firstPage;
// Current list page number; incremented by GetURLsByPage and printed by GetInfo.
internal int page=1;
/// <summary>
/// Creates the crawler with two separate VideoManager instances, one for the
/// internal dbHelper field and one for the private videoManager field.
/// </summary>
public NetCatch()
{
    this.dbHelper = new VideoManager();
    this.videoManager = new VideoManager();
}
/// <summary>
/// Removes HTML tags from a fragment by deleting every shortest "&lt;...&gt;" run.
/// </summary>
/// <param name="param">Markup fragment to clean.</param>
/// <returns>The fragment with all matched tags removed.</returns>
private string ReSign(string param)
{
    // "." does not match a line feed with default regex options, so a tag that
    // spans several lines is left in place.
    const string tagPattern = "<.+?>";
    string withoutTags = Regex.Replace(param, tagPattern, "");
    return withoutTags;
}
/// <summary>
/// Narrows args.Code by applying the patterns in args.RegStr one after another.
/// With fewer than two patterns the first pattern is matched against args.Code
/// and args.Code is left untouched. Otherwise every pattern except the last is
/// applied in order and args.Code is overwritten with the text of each match.
/// </summary>
/// <param name="args">Supplies the patterns and the text; its Code may be modified.</param>
/// <returns>The match produced by the last pattern that was applied.</returns>
internal Match GetLastMatchByRegularExpressions(PageArgs args)
{
    int patternCount = args.RegStr.Count;
    if (patternCount < 2)
    {
        return Regex.Match(args.Code, args.RegStr[0]);
    }

    Match narrowed = null;
    int index = 0;
    while (index < patternCount - 1)
    {
        narrowed = Regex.Match(args.Code, args.RegStr[index]);
        // The next pattern only sees what this one matched.
        args.Code = narrowed.ToString();
        index++;
    }
    return narrowed;
}
/// <summary>
/// Returns every match of the last pattern in args.RegStr. With fewer than two
/// patterns the first pattern is run directly against args.Code; otherwise the
/// text is first narrowed by GetLastMatchByRegularExpressions, which also
/// overwrites args.Code.
/// </summary>
/// <param name="args">Supplies the patterns and the text to search.</param>
/// <returns>All matches of the final pattern.</returns>
internal MatchCollection GetLastMatchsByRegularExpressions(PageArgs args)
{
    int lastIndex = args.RegStr.Count - 1;
    if (lastIndex < 1)
    {
        return Regex.Matches(args.Code, args.RegStr[0]);
    }

    Match narrowed = GetLastMatchByRegularExpressions(args);
    string searchText = narrowed.ToString();
    return Regex.Matches(searchText, args.RegStr[lastIndex]);
}
/// <summary>
/// Downloads a page as text, decoded with Encoding.Default.
/// </summary>
/// <param name="url">Absolute address of the page.</param>
/// <returns>
/// The page text, or an empty string when <paramref name="url"/> is not an
/// absolute URI or the download fails. Callers compare the result with "".
/// </returns>
private string GetCode(string url)
{
    // A null, empty or malformed address is reported like any other failed
    // download instead of throwing before the request is even attempted.
    Uri uri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
    {
        return "";
    }

    using (WebClient web = new WebClient())
    {
        web.Encoding = System.Text.Encoding.Default;
        try
        {
            return web.DownloadString(uri);
        }
        catch (Exception)
        {
            // Deliberate best effort: an unreachable page is skipped.
            return "";
        }
    }
}
/// <summary>
/// Downloads a page as text with a selectable decoding.
/// </summary>
/// <param name="url">Absolute address of the page.</param>
/// <param name="isUTF8">
/// True to decode the response as UTF-8, false to use Encoding.Default.
/// </param>
/// <returns>The page text.</returns>
/// <remarks>
/// Nothing is caught here: an invalid address or a failed download throws to the caller.
/// </remarks>
private string GetCode(string url,bool isUTF8)
{
    using (WebClient web = new WebClient())
    {
        // The flag used to be ignored and UTF-8 was always applied.
        web.Encoding = isUTF8 ? System.Text.Encoding.UTF8 : System.Text.Encoding.Default;
        return web.DownloadString(new Uri(url));
    }
}
/// <summary>
/// Processes the list page at args.Url and then recurses into the following
/// pages. Every item found on a page is handed to HandleMTV when TypeAbbr is
/// "mtv" (compared without regard to case), otherwise to HandleIt if the item
/// contains the "(详情)" detail link.
/// </summary>
/// <param name="args">
/// Paging state. Url must end in "_&lt;number&gt;.&lt;extension&gt;"; it is rewritten to the
/// next page number with an ".html" extension. Code and PageUrlList are also modified.
/// </param>
/// <param name="particularRegStr">Patterns that extract the items of one list page.</param>
/// <remarks>
/// The walk stops when a page cannot be downloaded, when more than five
/// duplicates have been counted in repeatTime, or when the match produced from
/// args.RegStr has fewer than two groups.
/// </remarks>
public void GetURLsByPage(PageArgs args,IList<string> particularRegStr)
{
    if (args.PageUrlList.Count == 0)
    {
        // First call only: remember the two halves of the start URL.
        int lastSlash = args.Url.LastIndexOf('/');
        args.PageUrlList.Add(args.Url.Substring(0, lastSlash));
        args.PageUrlList.Add(args.Url.Substring(lastSlash));
        firstPage = args.Url;
        Console.WriteLine("该类第一页" + args.Url);
    }

    // Item numbering restarts on every page.
    position = 1;
    args.Code = GetCode(args.Url);
    if (args.Code == "")
    {
        return;
    }

    // Ordinal, case-insensitive and null-safe; the previous ToLower() call threw
    // when TypeAbbr had not been set.
    bool isMtv = string.Equals(TypeAbbr, "mtv", StringComparison.OrdinalIgnoreCase);

    IList<string> particularList = GetURLsInOnePage(args.Code, particularRegStr, "");
    foreach (string item in particularList)
    {
        if (isMtv)
        {
            HandleMTV(item, args);
        }
        else if (item.Contains("target=\"_blank\">(详情)</a>"))
        {
            HandleIt(item, args);
        }

        if (repeatTime > 5)
        {
            return;
        }
    }

    Match match = GetLastMatchByRegularExpressions(args);
    args.Code = match.ToString();
    if (match.Groups.Count < 2)
    {
        return;
    }

    // Build ".../name_<n+1>.html" from ".../name_<n>.<ext>". A URL without that
    // shape makes Substring or Int32.Parse throw.
    int underscore = args.Url.LastIndexOf('_');
    string prefix = args.Url.Substring(0, underscore);
    int dot = args.Url.LastIndexOf('.');
    int currentPage = Int32.Parse(args.Url.Substring(underscore + 1, dot - underscore - 1));
    args.Url = prefix + "_" + (currentPage + 1) + ".html";
    page++;
    GetURLsByPage(args, particularRegStr);
}
/// <summary>
/// Processes one non-MTV list item: extracts its download-page URL, detail-page
/// URL and logo URL, downloads the pages, fills a new Video and stores it
/// together with its sources.
/// </summary>
/// <param name="item">HTML fragment of one list entry.</param>
/// <param name="args">Paging state; only Url is read.</param>
/// <remarks>
/// The regular expressions use "\"" and "[\s\S]" escapes; the copy this file was
/// taken from had those backslashes replaced by forward slashes. An item without
/// the expected download link or logo URL makes Substring throw.
/// </remarks>
private void HandleIt(string item,PageArgs args)
{
    Video video = new Video();

    // Download-page link: the first http URL inside the "ml_photo" paragraph,
    // matched up to and including its closing quote, which is then cut off.
    Match param = Regex.Match(item, "<p class=\"ml_photo\">([\\s\\S]+?)<img");
    param = Regex.Match(param.Groups[1].ToString(), "http://([\\s\\S]+?)/([\\s\\S]+?)\"");
    string sourceUrl = param.ToString().Substring(0, param.ToString().LastIndexOf('"'));

    param = Regex.Match(item, "http://data.movie.xunlei.com/movie/(\\d+)");
    string particularUrl = param.ToString();

    // Remark records the three addresses this video came from, concatenated.
    video.Remark = args.Url + sourceUrl + particularUrl;

    string infoPageStr = GetCode(sourceUrl);
    string parPageStr = GetCode(particularUrl, true);
    string parDPageStr = GetCode(particularUrl.Replace("data.", "") + "/introduction", true);

    // Logo URL, matched with its trailing quote, which is dropped.
    param = Regex.Match(item, "http://img.gougou.com/([\\s\\S]+?)\"");
    video.LogoFilePath = param.ToString().Substring(0, param.Value.Length - 1);

    try
    {
        GetParticular(parPageStr, video);
        GetParticularDeeply(parDPageStr, video);
    }
    catch
    {
        // Best effort: an item whose detail pages cannot be parsed is not stored.
        return;
    }

    if (GetInfo(infoPageStr, position++, video))
    {
        // The source list is served by the "zz?" variant of the download page.
        sourceUrl = sourceUrl.Replace("down?", "zz?");
        infoPageStr = GetCode(sourceUrl);
        if (!GetSource(infoPageStr, video))
        {
            // A video without sources is removed again.
            videoManager.DeleteSourceByVideoId(video.VideoId);
            videoManager.DeleteByID(video.VideoId);
        }
    }
    else if (video.VideoId != 0)
    {
        videoManager.DeleteByID(video.VideoId);
    }
}
/// <summary>
/// Processes one MTV list item: extracts its download-page URL, singer, album
/// and logo URL, fills a new Video and stores it together with its sources.
/// The singer is stored in Video.Director and the album in Video.Player.
/// </summary>
/// <param name="item">HTML fragment of one list entry.</param>
/// <param name="args">Paging state; only Url is read.</param>
/// <remarks>
/// The regular expressions use "\"" and "[\s\S]" escapes; the copy this file was
/// taken from had those backslashes replaced by forward slashes. An item without
/// the expected download link or logo URL makes Substring throw.
/// </remarks>
private void HandleMTV(string item, PageArgs args)
{
    Video video = new Video();

    // Download-page link: the first http URL inside the "ml_photo" paragraph,
    // matched up to and including its closing quote, which is then cut off.
    Match param = Regex.Match(item, "<p class=\"ml_photo\">([\\s\\S]+?)<img");
    param = Regex.Match(param.Groups[1].ToString(), "http://([\\s\\S]+?)/([\\s\\S]+?)\"");
    string sourceUrl = param.ToString().Substring(0, param.ToString().LastIndexOf('"'));

    // Remark records the addresses this video came from, concatenated.
    video.Remark = args.Url + sourceUrl;

    try
    {
        param = Regex.Match(item, "class=\"c_hui\">歌手:</span>([\\s\\S]+?)</p>");
        video.Director = ReSign(param.Groups[1].ToString());
        param = Regex.Match(item, "class=\"c_hui\">专辑:</span>([\\s\\S]*?)</div>");
        video.Player = ReSign(param.Groups[1].ToString());
    }
    catch
    {
        // Best effort: an item whose singer/album cannot be read is skipped.
        return;
    }

    string infoPageStr = GetCode(sourceUrl);

    // Logo URL, matched with its trailing quote, which is dropped.
    param = Regex.Match(item, "http://images.music.xunlei.com/([\\s\\S]+?)\"");
    video.LogoFilePath = param.ToString().Substring(0, param.Value.Length - 1);

    if (GetInfo(infoPageStr, position++, video))
    {
        // The source list is served by the "zz?" variant of the download page.
        sourceUrl = sourceUrl.Replace("down?", "zz?");
        infoPageStr = GetCode(sourceUrl);
        if (!GetSource(infoPageStr, video))
        {
            // A video without sources is removed again.
            videoManager.DeleteSourceByVideoId(video.VideoId);
            videoManager.DeleteByID(video.VideoId);
        }
    }
    else if (video.VideoId != 0)
    {
        videoManager.DeleteByID(video.VideoId);
    }
}
/// <summary>
/// Downloads the picture at video.LogoFilePath into
/// "&lt;logoPicRootPath&gt;//VideoSitePic//&lt;TypeAbbr&gt;/&lt;new guid&gt;.jpg" and, on success,
/// replaces video.LogoFilePath with that path relative to the root folder.
/// </summary>
/// <param name="video">Video whose LogoFilePath holds the remote picture address.</param>
/// <remarks>
/// When the download fails the method returns quietly and video.LogoFilePath
/// keeps its previous value. Errors while creating the folder are not caught.
/// </remarks>
public void DownloadLogoPice(Video video)
{
    // NOTE(review): the separators are forward slashes in this copy of the
    // file; confirm whether the stored relative path is expected to use
    // backslashes instead.
    string typeFolder = "//VideoSitePic//" + TypeAbbr;

    // A fresh GUID name cannot collide with an existing file, so no existence
    // check is needed before downloading.
    string relativePath = typeFolder + @"/" + Guid.NewGuid().ToString() + ".jpg";

    // CreateDirectory creates missing parent folders and does nothing when the
    // folder already exists.
    Directory.CreateDirectory(logoPicRootPath + typeFolder);

    using (WebClient web = new WebClient())
    {
        try
        {
            web.DownloadFile(video.LogoFilePath, logoPicRootPath + relativePath);
        }
        catch (Exception)
        {
            // Best effort: a missing logo must not stop the crawl.
            return;
        }
    }

    video.LogoFilePath = relativePath;
}
// Second VideoManager instance; assigned in the constructor and not used by any
// other member visible in this file.
internal VideoManager dbHelper=null;
/// <summary>
/// Starts an asynchronous UTF-8 download of a page. The result is delivered to
/// web_DownloadStringCompleted.
/// </summary>
/// <param name="url">Absolute address of the page.</param>
/// <remarks>
/// The handler is attached before the download is started so the completion
/// cannot be missed, and the client is disposed only after the handler has run
/// instead of immediately after the request was issued.
/// </remarks>
public void GetCodeAsyn(string url)
{
    Uri uri = new Uri(url);
    WebClient web = new WebClient();
    web.Encoding = System.Text.Encoding.UTF8;
    web.DownloadStringCompleted += (sender, e) =>
    {
        try
        {
            web_DownloadStringCompleted(sender, e);
        }
        finally
        {
            web.Dispose();
        }
    };

    try
    {
        web.DownloadStringAsync(uri);
    }
    catch
    {
        // The operation never started, so the completion handler will not run.
        web.Dispose();
        throw;
    }
}
/// <summary>
/// Completion handler of the asynchronous download: stores the information
/// found in the downloaded page through GetInfo.
/// </summary>
/// <param name="sender">The client that raised the event.</param>
/// <param name="e">Outcome of the download.</param>
void web_DownloadStringCompleted(object sender, DownloadStringCompletedEventArgs e)
{
    // Reading e.Result of a failed or cancelled download throws, so such
    // downloads are skipped.
    if (e.Cancelled || e.Error != null)
    {
        return;
    }

    // GetInfo writes into the video it is given, so it needs a real instance.
    GetInfo(e.Result, position++, new Video());
}
/// <summary>
/// Collects the first capture group of every match found in one page,
/// prefixed with <paramref name="urlPath"/>, dropping duplicates while keeping
/// the order of first appearance.
/// </summary>
/// <param name="code">Text of the page.</param>
/// <param name="RegStr">Patterns handed to GetLastMatchsByRegularExpressions.</param>
/// <param name="urlPath">Prefix put in front of every captured value.</param>
/// <returns>The distinct prefixed values.</returns>
public IList<string> GetURLsInOnePage(String code, IList<string> RegStr, string urlPath)
{
    PageArgs searchArgs = new PageArgs();
    searchArgs.RegStr = RegStr;
    searchArgs.Code = code;

    IList<string> distinctItems = new List<string>();
    HashSet<string> seen = new HashSet<string>();
    foreach (Match found in GetLastMatchsByRegularExpressions(searchArgs))
    {
        string candidate = urlPath + (found.Groups[1].ToString());
        // Add returns false for a value that was collected before.
        if (seen.Add(candidate))
        {
            distinctItems.Add(candidate);
        }
    }
    return distinctItems;
}
/// <summary>
/// Reads the "g_filelist" array of a download page and stores one Source per
/// entry for the given video.
/// </summary>
/// <param name="code">Text of the download page.</param>
/// <param name="video">Video the sources belong to; its VideoId must be non-zero.</param>
/// <returns>
/// True when at least one entry was found and stored; false when the page has
/// no entries or the video has not been saved (VideoId is 0).
/// </returns>
/// <remarks>
/// The patterns use "\[", "\{", "\"" and "[\s\S]" escapes and the URL is split
/// on the backslash character; the copy this file was taken from had those
/// backslashes replaced by forward slashes.
/// </remarks>
private bool GetSource(string code,Video video)
{
    Match fileList = Regex.Match(code, "var g_filelist=\\[([\\s\\S]+?)\\];");
    MatchCollection entries = Regex.Matches(fileList.Groups[1].ToString(), "\\{([\\s\\S]+?)\\}");
    if (entries.Count == 0)
    {
        return false;
    }

    // Nothing can be attached to a video that has no id yet.
    if (video.VideoId == 0)
    {
        return false;
    }

    foreach (Match item in entries)
    {
        string entry = item.ToString();
        Source source = new Source();

        // Group values are never null; a failed match simply yields "".
        source.SourceName = Regex.Match(entry, "\"name\":\"([\\s\\S]+?)\",").Groups[1].ToString();
        source.Remark = Regex.Match(entry, "\"cid\":\"([\\s\\S]+?)\",").Groups[1].ToString();
        source.Refer = Regex.Match(entry, "\"refer\":\"([\\s\\S]+?)\",").Groups[1].ToString();

        string rawUrl = Regex.Match(entry, "\"url\":\"([\\s\\S]+?)\"").Groups[1].ToString();
        source.FirstUrl = rawUrl;

        // The raw value is split on backslashes; everything before the first
        // one is ignored and each following piece is decoded to one character
        // by GetUTF8String.
        string[] pieces = rawUrl.Split('\\');
        StringBuilder decoded = new StringBuilder();
        for (int i = 1; i < pieces.Length; i++)
        {
            decoded.Append(GetUTF8String(pieces[i]));
        }
        source.Url = decoded.ToString();
        source.ThunderUrl = "thunder://" + EncodeBase64(source.Url);

        source.ffmt = Regex.Match(entry, "\"ffmt\":\"([\\s\\S]+?)\",").Groups[1].ToString();
        source.SourceVideoId = video.VideoId;

        // Same lock object as GetInfo uses, so inserts stay serialized with it.
        lock (this)
        {
            videoManager.InsertSourceAfterVideoSave(source);
        }
    }
    return true;
}
/// <summary>
/// Decodes one piece of the form "?HHLL..." into text: the two hex digits at
/// positions 1-2 become the high byte and those at positions 3-4 the low byte of
/// a single little-endian UTF-16 code unit. Anything after position 4 is ignored.
/// </summary>
/// <param name="param">Piece to decode; must be at least five characters long.</param>
/// <returns>The decoded text.</returns>
private string GetUTF8String(string param)
{
    byte lowByte = Convert.ToByte(Convert.ToInt32(param.Substring(3, 2), 16));
    byte highByte = Convert.ToByte(Convert.ToInt32(param.Substring(1, 2), 16));
    byte[] codeUnit = new byte[] { lowByte, highByte };
    return Encoding.Unicode.GetString(codeUnit, 0, codeUnit.Length);
}
/// <summary>
/// Wraps the text as "AA" + source + "ZZ" and returns the Base64 form of its
/// UTF-8 bytes. This is an encoding, not an encryption. GetSource appends the
/// result to "thunder://".
/// </summary>
/// <param name="source">Text to encode; null is treated like an empty string.</param>
/// <returns>The Base64 string.</returns>
public string EncodeBase64(string source)
{
    // ToBase64String only rejects a null array, which GetBytes never returns,
    // so no fallback path is needed.
    byte[] wrapped = Encoding.UTF8.GetBytes("AA" + source + "ZZ");
    return Convert.ToBase64String(wrapped);
}
/// <summary>
/// Fills a video from the script variables of a download page (g_title,
/// g_pageurl, g_preViewd, g_filelist), downloads its logo and inserts it unless
/// an equal video already exists.
/// </summary>
/// <param name="code">Text of the download page.</param>
/// <param name="position">Item number; only written to the console.</param>
/// <param name="video">
/// Video to fill. A null value is replaced by a fresh instance so the data can
/// still be stored; the caller then cannot read the result back.
/// </param>
/// <returns>
/// True when the video was inserted; false when it already existed, in which
/// case repeatTime is incremented.
/// </returns>
/// <remarks>
/// The patterns use "\"", "\d", "\[", "\{" and "[\s\S]" escapes; the copy this
/// file was taken from had those backslashes replaced by forward slashes.
/// </remarks>
public bool GetInfo(String code, int position,Video video)
{
    if (video == null)
    {
        video = new Video();
    }

    video.VideoResolutionId = ResolutionID;
    video.VideoCategoryId = categoryId;

    Match m = Regex.Match(code, "var g_title=\"([\\s\\S]+?)\";");
    video.Title = m.Groups[1].ToString();

    m = Regex.Match(code, "var g_pageurl = \"([\\s\\S]+?)\";");
    video.Refer = m.Groups[1].ToString();

    m = Regex.Match(code, "var g_preViewd={\"definition\":\"(\\d)\",\"healthy\":\"\\d*\",\"code\":\"([\\d]+?)\",\"size\":\"([\\s\\S]+?)\",\"format\":\"([\\s\\S]+?)\",\"playtime\":\"([\\s\\S]+?)\"");
    video.Definition = m.Groups[1].ToString();
    video.Code = m.Groups[2].ToString();
    video.Size = m.Groups[3].ToString();
    video.Format = m.Groups[4].ToString();
    video.PlayTime = m.Groups[5].ToString();

    // One "{...}" entry per file; a page without any entry counts as one file.
    m = Regex.Match(code, "var g_filelist=\\[([\\s\\S]+?)\\];");
    MatchCollection files = Regex.Matches(m.ToString(), "\\{([\\s\\S]+?)\\}");
    video.FileNum = files.Count == 0 ? 1 : files.Count;

    video.Uploaddate = DateTime.Now.ToShortDateString();
    DownloadLogoPice(video);

    bool inserted;
    // Same lock object as GetSource uses, so check-and-insert stays serialized with it.
    lock (this)
    {
        if (videoManager.HasExists(video))
        {
            repeatTime++;
            inserted = false;
            Console.WriteLine("重复Video,"+video.Title+","+video.Size+","+video.VideoResolutionId);
        }
        else
        {
            video.VideoId = videoManager.InsertInGetURLsInOnePage(video);
            inserted = true;
            Console.WriteLine(position + "/" + page + "/" + Resolution + "/" + typeId + " 新数据Id:" + video.VideoId + " Name:" + video.Title);
        }
    }
    return inserted;
}
/// <summary>
/// Fills the descriptive fields of a video from its detail page: names,
/// director, players, country, languages and release date.
/// </summary>
/// <param name="code">Text of the detail page.</param>
/// <param name="video">Video to fill.</param>
/// <remarks>
/// The patterns use "[\s\S]"; the copy this file was taken from had those
/// backslashes replaced by forward slashes, which left a class matching only
/// '/', 's' and 'S'.
/// </remarks>
private void GetParticular(string code,Video video)
{
    Match m = Regex.Match(code, "<h2>([\\s\\S]+?) <em>([\\s\\S]*?) <span>[\\s\\S]+?</h2>");
    video.VideoName = ReSign(m.Groups[1].ToString());
    video.EnName = ReSign(m.Groups[2].ToString());
    video.Director = ReSign(GetDirector(code));

    // Players: the text of every link, appended to any existing value and
    // separated by single spaces.
    m = GetPlayer(code);
    MatchCollection links = Regex.Matches(m.Groups[1].ToString(), ">([\\s\\S]+?)</a>");
    foreach (Match link in links)
    {
        video.Player += ReSign(link.Groups[1].ToString()) + " ";
    }
    if (!string.IsNullOrEmpty(video.Player))
    {
        // Drop the last character (the trailing separator when names were added).
        video.Player = video.Player.Substring(0, video.Player.Length - 1);
    }

    video.Country = Getcountry(code);

    // Languages: the text of every link, separated by '|'.
    m = Regex.Match(code, "<p><strong>语言: </strong>([\\s\\S]+?)</p>");
    links = Regex.Matches(m.Groups[1].ToString(), ">([\\s\\S]+?)</a>");
    video.Language = "";
    foreach (Match link in links)
    {
        video.Language += link.Groups[1].ToString() + "|";
    }
    if (!string.IsNullOrEmpty(video.Language))
    {
        video.Language = video.Language.Substring(0, video.Language.Length - 1);
    }

    video.Viewdate = GetViewDate(code);
}
/// <summary>
/// Fills Summary, CurtainBehind and Feature from the "&lt;h3&gt; ... &lt;/div&gt;" sections
/// of the introduction page. The first section is the summary, the second the
/// behind-the-scenes text and the third the feature text. With any other
/// number of sections than one, two or three, all three fields are set to "".
/// </summary>
/// <param name="code">Text of the introduction page.</param>
/// <param name="video">Video to fill.</param>
/// <remarks>
/// The patterns use "[\s\S]" and "[\s]"; the copy this file was taken from had
/// those backslashes replaced by forward slashes.
/// </remarks>
private void GetParticularDeeply(string code, Video video)
{
    // Paragraph text of a section; shortest match.
    const string lazyParagraph = "<p>([\\s\\S]+?)</p>([\\s]+)</div>";
    // Used when the page has a single section; longest match, as before.
    const string greedyParagraph = "<p>([\\s\\S]+)</p>([\\s]+)</div>";

    MatchCollection sections = Regex.Matches(code, "<h3>([\\s\\S]+?)</div>");

    string feature = "";
    string curtainBehind = "";
    string summary = "";

    switch (sections.Count)
    {
        case 3:
            feature = Regex.Match(sections[2].ToString(), lazyParagraph).Groups[1].ToString();
            curtainBehind = Regex.Match(sections[1].ToString(), lazyParagraph).Groups[1].ToString();
            summary = Regex.Match(sections[0].ToString(), lazyParagraph).Groups[1].ToString();
            break;
        case 2:
            curtainBehind = Regex.Match(sections[1].ToString(), lazyParagraph).Groups[1].ToString();
            summary = Regex.Match(sections[0].ToString(), lazyParagraph).Groups[1].ToString();
            break;
        case 1:
            summary = Regex.Match(sections[0].ToString(), greedyParagraph).Groups[1].ToString();
            break;
    }

    video.Feature = feature;
    video.CurtainBehind = curtainBehind;
    video.Summary = summary;
}
/// <summary>
/// Reads the country from the detail page, looking first for the
/// "制片国家/地区" line and then for the "国家/地区" line.
/// </summary>
/// <param name="code">Text of the detail page.</param>
/// <returns>The text of the last link on that line, or "" when neither line is found.</returns>
/// <remarks>
/// The patterns use "[\s\S]"; the copy this file was taken from had those
/// backslashes replaced by forward slashes.
/// </remarks>
private string Getcountry(string code)
{
    Match m = Regex.Match(code, "<p><strong>制片国家/地区: </strong>([\\s\\S]+?)>([\\s\\S]+?)</a></p>");
    if (!m.Success)
    {
        m = Regex.Match(code, "<p><strong>国家/地区: </strong>([\\s\\S]+?)>([\\s\\S]+?)</a></p>");
    }

    // The group value of a failed match is "", never null.
    return m.Groups[2].ToString();
}
/// <summary>
/// Reads the director names from the detail page, falling back to the
/// "作者" (author) line and then the "主持" (host) line when no "导演" line exists.
/// </summary>
/// <param name="code">Text of the detail page.</param>
/// <returns>
/// The text of every link on the line, each followed by one space (so a
/// non-empty result ends with a space), or "" when nothing is found.
/// </returns>
/// <remarks>
/// The patterns use "[\s\S]"; the copy this file was taken from had those
/// backslashes replaced by forward slashes.
/// </remarks>
private string GetDirector(string code)
{
    Match m = Regex.Match(code, "<p><strong>导演: </strong>([\\s\\S]+?)</p>");
    if (!m.Success)
    {
        m = Regex.Match(code, "<p><strong>作者: </strong>([\\s\\S]+?)</p>");
        if (!m.Success)
        {
            m = Regex.Match(code, "<p><strong>主持: </strong>([\\s\\S]+?)</p>");
        }
    }

    MatchCollection links = Regex.Matches(m.Groups[1].ToString(), ">([\\s\\S]+?)</a>");
    StringBuilder names = new StringBuilder();
    foreach (Match link in links)
    {
        names.Append(link.Groups[1].ToString()).Append(" ");
    }
    return names.ToString();
}
/// <summary>
/// Finds the cast line of the detail page: the "主演" (starring) line, or the
/// "配音" (voice cast) line when the first is absent.
/// </summary>
/// <param name="code">Text of the detail page.</param>
/// <returns>
/// The match whose first group holds the markup of the line; an unsuccessful
/// match when neither line exists.
/// </returns>
/// <remarks>
/// The patterns use "[\s\S]"; the copy this file was taken from had those
/// backslashes replaced by forward slashes.
/// </remarks>
private Match GetPlayer(string code)
{
    Match m = Regex.Match(code, "<p><strong>主演: </strong>([\\s\\S]+?)</p>");
    if (!m.Success)
    {
        m = Regex.Match(code, "<p><strong>配音: </strong>([\\s\\S]+?)</p>");
    }
    return m;
}
/// <summary>
/// Reads the release date from the "上映日期" line of the detail page.
/// </summary>
/// <param name="code">Text of the detail page.</param>
/// <returns>
/// The date text, or "" when the line is missing. When the first pattern yields
/// fewer than five characters, the text of the first link inside the line's
/// span is used instead.
/// </returns>
/// <remarks>
/// The patterns use "[\s\S]"; the copy this file was taken from had those
/// backslashes replaced by forward slashes.
/// </remarks>
private string GetViewDate(string code)
{
    Match m = Regex.Match(code, "<p><strong>上映日期: </strong>([\\s\\S]+?)>([\\s\\S]+?) ([\\s\\S]+?)</span></p>");
    string date = m.Groups[2].ToString();

    if (m.Groups[2].Length < 5)
    {
        // Too short to be a date: fall back to the linked text.
        m = Regex.Match(code, "<p><strong>上映日期: </strong><span>([\\s\\S]+?)</span></p>");
        m = Regex.Match(m.Groups[1].ToString(), ">([\\s\\S]+?)</a>");
        if (!m.Success)
        {
            return "";
        }
        date = m.Groups[1].ToString();
    }

    return date;
}
#region IDisposable 成员
// Explicit IDisposable implementation with an empty body: nothing is released
// here. It exists so instances can be used in a using statement.
// NOTE(review): the two VideoManager fields are not disposed; confirm whether
// VideoManager holds resources that need releasing.
void IDisposable.Dispose()
{
}
#endregion
}
}
- 常用的抓取web实例
- python的web抓取技术
- JAVA Servlet 简单实例和web.xml常用的配置
- java web中常用对象对应的实例化接口
- WEB:常用标签学习实例
- Web抓取
- 简单的python爬虫抓取图片实例
- 一个RCurl抓取美团网信息的实例
- hibernate之优化抓取(定义全局抓取计划--延迟抓取的实例1)
- hibernate之优化抓取(定义全局抓取计划--延迟抓取的实例2)
- 从web抓取数据的一点经验教训
- 用于抓取网页内容的常用正则
- iptables 常用的实例
- jQuery常用的实例
- string的常用实例
- 常用的javascript实例
- PHP常用的实例
- (网页抓取)一个用PHP实现的网页抓取的实例
- 权限
- 新闻,商品推荐
- VC++远程控制软件的通信架构与源码分析
- 史上BT的排序算法
- 键盘钩子的类(VB.NET)
- 常用的抓取web实例
- Printf And Scalable Param in C Language (水滴石穿C语言之可变参数问题 )
- Windows Azure(三):结构
- ubuntu下安装 Eclipse 和 其KINECT配置
- 在Ubuntu上安装opencv 并且整合到Eclipse中
- XML与Object间序列化
- 计算机爱好者 VS. 程序员
- 泄露你的JavaScript技术很烂的五个表现
- [摘抄]1937年西方人拍电影反映中国生活《大地》