抓取网页相关方法

来源:互联网 发布:网上买旧书知乎 编辑:程序博客网 时间:2024/06/05 14:49

最近在学习搜索引擎,记录一下相关信息

嵌入式数据库

  • BDB
  • Perst

信息过滤

  • Aho-Corasick

抓取网页的方法

  • 使用TCPClient
/// <summary>
/// Fetches the root page of www.sina.com.cn over a raw TCP connection by
/// writing an HTTP/1.1 GET request by hand, then prints the whole response
/// (status line, headers and body) to the console.
/// </summary>
/// <remarks>
/// Any exception raised while connecting, writing or reading is caught and
/// only its message is printed; the connection is always closed.
/// </remarks>
private static void TCPClientMethod()
{
    string hostName = "www.sina.com.cn";
    // NOTE(review): plain HTTP is normally served on port 80; 8080 is kept
    // from the original sample - confirm the target really listens there.
    int portNumber = 8080;

    TcpClient client = new TcpClient();
    try
    {
        client.Connect(hostName, portNumber);
        Console.Write("链接上了");

        // The network stream is used both to send the request and to read the reply.
        NetworkStream clientStream = client.GetStream();

        // Build a writer on top of the stream to send the request text.
        // The request line needs single spaces between method, target and
        // version ("GET / HTTP/1.1"); without them the server cannot parse it.
        StreamWriter writeStream = new StreamWriter(clientStream);
        writeStream.Write("GET / HTTP/1.1\r\n"
            + "User-Agent:craler request!\r\n"
            + "Host:www.sina.com.cn\r\n"
            + "Connection:Close\r\n"
            + "\r\n");
        writeStream.Flush();

        // Decode through a StreamReader instead of decoding each 1024-byte
        // chunk separately: this uses only the bytes actually received and
        // keeps multi-byte UTF-8 sequences intact across chunk boundaries.
        // "Connection:Close" makes the server end the stream, so ReadToEnd returns.
        StreamReader readStream = new StreamReader(clientStream, Encoding.UTF8);
        string text = readStream.ReadToEnd();
        Console.WriteLine(text);
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
    finally
    {
        // Closing the client also releases the underlying NetworkStream.
        client.Close();
    }
}
  • 使用WebRequest
/// <summary>
/// Downloads the resource at <paramref name="urlDownLoad"/> with
/// <see cref="HttpWebRequest"/>, reads the body as UTF-8 text and prints
/// every response header to the console.
/// </summary>
/// <param name="urlDownLoad">Absolute HTTP(S) URL to request.</param>
private static void WebRequestMethod(string urlDownLoad)
{
    // Build the request object that represents the HTTP call to the URL.
    HttpWebRequest request = (HttpWebRequest)System.Net.WebRequest.Create(urlDownLoad);

    // Body is decoded as UTF-8 regardless of what the server declares.
    Encoding encode = Encoding.GetEncoding("utf-8");

    // "using" guarantees the response and the reader are released even when
    // reading the body or enumerating the headers throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (TextReader tr = new StreamReader(response.GetResponseStream(), encode))
    {
        // Read the whole body; the sample does not use it further.
        string htmlcontent = tr.ReadToEnd();

        // Dump the response headers.
        WebHeaderCollection whc = response.Headers;
        for (int i = 0; i < whc.Count; i++)
        {
            Console.WriteLine("Header " + whc.GetKey(i) + " : " + whc[i]);
        }
    }
}
  • 使用WebClient
/// <summary>
/// Downloads the resource at <paramref name="url"/> with
/// <see cref="WebClient"/> and prints its content to the console.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
private static void WebClientMethod(string url)
{
    // Nested "using" blocks release the client, stream and reader in reverse
    // order even if OpenRead or ReadToEnd throws.
    using (WebClient webclient = new WebClient())
    using (Stream stream = webclient.OpenRead(url))
    using (StreamReader reader = new StreamReader(stream))
    {
        string strResult = reader.ReadToEnd();
        Console.WriteLine(strResult);
    }
}

提交数据的方法

  • GET
/// <summary>
/// Prepares an HTTP GET request for <paramref name="url"/> with a browser-like
/// User-Agent header. The request is only configured here; it is not sent.
/// </summary>
/// <param name="url">Address the request targets.</param>
/// <remarks>
/// The request is obtained with an <c>as</c> cast, so a URL whose scheme does
/// not produce an <see cref="HttpWebRequest"/> yields null and the first
/// property assignment throws <see cref="NullReferenceException"/>.
/// </remarks>
private static void GETMethod(string url)
{
    const string userAgent = "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.2;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727)";

    WebRequest created = WebRequest.Create(url);
    HttpWebRequest httpRequest = created as HttpWebRequest;

    httpRequest.Method = "GET";
    // Header setup.
    httpRequest.UserAgent = userAgent;
}
  • POST
/// <summary>
/// Prepares an HTTP POST request for <paramref name="url"/> with a
/// form-urlencoded content type. The request is only configured here; no
/// body is written and it is not sent.
/// </summary>
/// <param name="url">Address the request targets.</param>
/// <remarks>
/// The request is obtained with an <c>as</c> cast, so a URL whose scheme does
/// not produce an <see cref="HttpWebRequest"/> yields null and the first
/// property assignment throws <see cref="NullReferenceException"/>.
/// </remarks>
private static void POSTMethod(string url)
{
    WebRequest created = WebRequest.Create(url);
    HttpWebRequest httpRequest = created as HttpWebRequest;

    httpRequest.Method = "POST";
    // Header setup.
    httpRequest.ContentType = "application/x-www-form-urlencoded";
}

  • HEAD:获取 url 页面的长度(Content-Length)

/// <summary>
/// Issues an HTTP HEAD request to <paramref name="url"/> and returns the
/// content length reported by the response.
/// </summary>
/// <param name="url">Absolute URL to probe.</param>
/// <returns>
/// The response's <see cref="HttpWebResponse.ContentLength"/> when the status
/// is 200 OK; 0 when the status is anything else or when any exception occurs
/// (malformed URL, non-HTTP scheme, timeout, network or protocol error).
/// </returns>
private static long HEADGetWebLength(string url)
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.CreateDefault(new Uri(url));
        req.Method = "HEAD";
        // Give up after 5 seconds.
        req.Timeout = 5000;

        // "using" releases the response on every path, including exceptions
        // raised after GetResponse has returned.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        {
            if (res.StatusCode != HttpStatusCode.OK)
            {
                return 0;
            }

            return res.ContentLength;
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: every failure is reported as length 0.
        return 0;
    }
}
0 0