C#程序设计之多线程爬虫程序

来源：互联网发布：2016淘宝卖家电脑配置编辑：程序博客网时间：2024/06/14 19:09

一、简单介绍：
技术方面主要包括：
（1）技术选型：
1）课程设计使用的开发语言是C#。
2）课程设计选用了文件流方式获取网站数据。
3）课程设计使用多线程抓取网页代码。
4）课程设计使用了正则表达式对源码进行解析处理。

(2)程序运行流程：
通过图示可以更形象地了解程序运行的整个流程：
1）程序首先下载网站首页的源代码
2）对首页源代码进行分析，提取出网站建设类目下的链接并存储到队列中。
3）运用多线程，分别同时下载队列中的链接。
4）利用正则表达式对下载的链接源码进行分析。提取图片的URL并下载图片。截取需要的文本信息。
5）保存下载的文本和图片。把下载分析链接的信息显示在操作界面上。

程序的整体流程：
这里写图片描述

二、再贴代码：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.Threading;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Diagnostics;

namespace crawlWebsiteAndExtractInfo
{
    /// <summary>
    /// Main form of the crawler. It downloads a start page, extracts article
    /// links from it into a shared queue, and drains that queue with several
    /// worker threads. Each worker saves the images referenced by an article
    /// under e:/c/ and appends the article's title and paragraphs to e:/b.txt.
    /// </summary>
    public partial class frmCrawlWebsite : Form
    {
        /// <summary>Pending article URLs. Guarded by <see cref="locker1"/>.</summary>
        public static Queue<String> q = new Queue<string>();

        /// <summary>
        /// Image URLs already seen, used to skip duplicates. Only the first
        /// <see cref="a"/> entries are in use. Guarded by <see cref="locker3"/>.
        /// </summary>
        public static string[] surl = new string[100];

        /// <summary>Number of distinct image URLs recorded in <see cref="surl"/>.</summary>
        public static int a = 0;

        /// <summary>Number of URLs taken from the queue so far. Guarded by <see cref="locker1"/>.</summary>
        public static int j = 0;

        public static bool b = false;

        /// <summary>Serializes appends to the text output file.</summary>
        public static object locker = new object();

        /// <summary>Guards <see cref="q"/> and <see cref="j"/>.</summary>
        public static object locker1 = new object();

        public static object locker2 = new object();

        /// <summary>Guards <see cref="surl"/> and <see cref="a"/>.</summary>
        public static object locker3 = new object();

        public static Stopwatch watch = new Stopwatch();
        public static bool flag = true;
        public static string textbox = string.Empty;

        /// <summary>Number of worker threads started by the extract button.</summary>
        public int Num = 5;

        private const string SiteRoot = "http://www.hyzbi.com";

        // Regex instances are safe for concurrent matching, so they are built
        // once instead of on every call.
        private static readonly Regex ArticleLinkRegex = new Regex(@"/portal/article/index/cid/19/id/(\d+)");
        private static readonly Regex ImageSrcRegex = new Regex(@"src\s*=\s*[""']?([^'"" >]+?)[ '""][^>]*?>");
        private static readonly Regex TitleRegex = new Regex(@"<h2>(.*)</h2>");
        private static readonly Regex ParagraphRegex = new Regex(@"<p.*>(<span .*>)?(.*)(</span>)?</p>");
        private static readonly Regex TagRegex = new Regex("<.+?>", RegexOptions.IgnoreCase);

        public frmCrawlWebsite()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page whose address is typed in the URL box and shows
        /// its HTML in the source text box.
        /// </summary>
        private void btnCrawlAndExtract_Click(object sender, EventArgs e)
        {
            string urlToCrawl = txbUrlToCrawl.Text;
            try
            {
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(urlToCrawl);
                req.Method = "GET";

                Encoding htmlEncoding = Encoding.GetEncoding("utf-8");

                // Dispose the response and reader so the connection is released.
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                using (StreamReader sr = new StreamReader(resp.GetResponseStream(), htmlEncoding))
                {
                    rtbExtractedHtml.Text = sr.ReadToEnd();
                }
            }
            catch (UriFormatException ex)
            {
                // Empty or malformed address typed by the user.
                MessageBox.Show(this, ex.Message);
            }
            catch (NotSupportedException ex)
            {
                // URI scheme that WebRequest cannot handle.
                MessageBox.Show(this, ex.Message);
            }
            catch (WebException ex)
            {
                MessageBox.Show(this, ex.Message);
            }
        }

        /// <summary>
        /// Extracts the article links from the downloaded HTML, queues them and
        /// starts <see cref="Num"/> worker threads running <see cref="DownLoad"/>.
        /// </summary>
        private void btnExtractInfo_Click(object sender, EventArgs e)
        {
            MatchCollection foundH1user = ArticleLinkRegex.Matches(rtbExtractedHtml.Text);

            // Workers from an earlier click may still be dequeuing, so the
            // queue is filled under the same lock they use.
            lock (locker1)
            {
                foreach (Match m in foundH1user)
                {
                    q.Enqueue(SiteRoot + m.Value);
                }
            }

            // Size the array from Num; a fixed size overflowed once the user
            // configured more threads than the array had slots.
            int threadCount = Math.Max(Num, 0);
            Thread[] downloadThread = new Thread[threadCount];

            watch.Start();
            for (int i = 0; i < threadCount; i++)
            {
                downloadThread[i] = new Thread(new ThreadStart(DownLoad));
                // Background threads do not keep the process alive after the
                // form is closed.
                downloadThread[i].IsBackground = true;
                downloadThread[i].Start();
            }
        }

        public delegate void ProcessDelegate();

        /// <summary>Replaces the content of the progress box with <paramref name="ss"/>.</summary>
        public void richTextShow(string ss)
        {
            richTextBox1.Text = ss;
        }

        /// <summary>
        /// Finds every <c>src=</c> reference in <paramref name="rrh"/>, and for
        /// each URL not seen before downloads it and saves it as
        /// <c>e:/c/&lt;n&gt;.jpg</c>. Download and save failures are logged and skipped.
        /// </summary>
        /// <param name="rrh">HTML source of a downloaded page.</param>
        public void picture(string rrh)
        {
            if (rrh == null)
            {
                return;
            }

            foreach (Match mm in ImageSrcRegex.Matches(rrh))
            {
                string urll = SiteRoot + mm.Groups[1].Value;
                int imageNumber;

                // Duplicate check, registration and numbering happen atomically
                // so two threads cannot claim the same URL or file name.
                lock (locker3)
                {
                    if (Array.IndexOf(surl, urll, 0, a) != -1)
                    {
                        continue;
                    }

                    if (a == surl.Length)
                    {
                        // Grow instead of overflowing the fixed-size table.
                        Array.Resize(ref surl, Math.Max(surl.Length * 2, 100));
                    }

                    surl[a] = urll;
                    a++;
                    imageNumber = a;
                }

                try
                {
                    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(urll);
                    req.Method = "GET";

                    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
                    using (Stream body = res.GetResponseStream())
                    using (Bitmap img = new Bitmap(body))
                    {
                        // Each image has its own number, so no lock is needed here.
                        img.Save(@"e:/c/" + imageNumber + ".jpg");
                    }
                }
                catch (Exception ee)
                {
                    // Best effort: one broken image must not stop the crawl,
                    // but the failure is at least recorded.
                    Debug.WriteLine("picture: " + urll + " failed: " + ee.Message);
                }
            }
        }

        /// <summary>
        /// Extracts the first <c>&lt;h2&gt;</c> title and all <c>&lt;p&gt;</c>
        /// paragraphs from <paramref name="rrh"/>, strips their markup and
        /// appends them to <c>e:/b.txt</c>.
        /// </summary>
        /// <param name="rrh">HTML source of a downloaded page.</param>
        /// <returns>The title with markup removed (empty when no title matched).</returns>
        public string wenben(string rrh)
        {
            string hstrOutput = StripTags(TitleRegex.Match(rrh).Value);
            MatchCollection tp = ParagraphRegex.Matches(rrh);

            // Only one thread may append to the output file at a time.
            lock (locker)
            {
                // The using block closes the file even if a write fails.
                using (StreamWriter sw = new StreamWriter("e:/b.txt", true))
                {
                    sw.WriteLine(hstrOutput);
                    sw.WriteLine("\n");

                    foreach (Match n in tp)
                    {
                        sw.WriteLine(StripTags(n.Value));
                        sw.WriteLine("\n");
                    }
                }
            }

            return hstrOutput;
        }

        /// <summary>
        /// Removes tags from an HTML fragment, then drops any leftover
        /// <c>&lt;</c>, turns a leftover <c>&gt;</c> into a carriage return and
        /// removes spaces.
        /// </summary>
        private static string StripTags(string html)
        {
            string text = TagRegex.Replace(html, "");
            text = text.Replace("<", "");
            text = text.Replace(">", "\r");
            text = text.Replace(" ", "");
            return text;
        }

        /// <summary>
        /// Worker loop: repeatedly takes a URL from the queue, downloads the
        /// page, processes its images and text, and reports the result in the
        /// progress box. Returns when the queue is empty.
        /// </summary>
        public void DownLoad()
        {
            while (true)
            {
                string url;

                // The emptiness check and the dequeue must be one atomic step;
                // otherwise two workers can both see one remaining item and
                // the second Dequeue throws.
                lock (locker1)
                {
                    if (q.Count == 0)
                    {
                        break;
                    }

                    url = q.Dequeue();
                    j++;
                }

                try
                {
                    HttpWebRequest rr = (HttpWebRequest)WebRequest.Create(url);
                    rr.Method = "GET";

                    Encoding htmlEncoding = Encoding.GetEncoding("utf-8");
                    string rh;
                    using (HttpWebResponse resp = (HttpWebResponse)rr.GetResponse())
                    using (StreamReader sr = new StreamReader(resp.GetResponseStream(), htmlEncoding))
                    {
                        rh = sr.ReadToEnd();
                    }

                    picture(rh);
                    string h2 = wenben(rh);

                    // Invoke marshals the call onto the UI thread, which
                    // already serializes updates; no extra lock is required.
                    ProcessDelegate showProcess = delegate()
                    {
                        richTextBox1.AppendText(url + h2 + DateTime.Now.ToString() + "\n");
                    };
                    richTextBox1.Invoke(showProcess);
                }
                catch (Exception eee)
                {
                    // Best effort: a failed page is logged and the worker
                    // moves on to the next URL.
                    Debug.WriteLine("DownLoad: " + url + " failed: " + eee.Message);
                }
            }
        }

        /// <summary>Opens the image output folder in Explorer.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string path2 = @"e:\c";
            System.Diagnostics.Process.Start("explorer", path2);
        }

        /// <summary>Opens the text output file through Explorer.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            string path1 = @"e:\b.txt";
            System.Diagnostics.Process.Start("explorer", path1);
        }

        /// <summary>
        /// Reads the desired worker-thread count from the text box. Input that
        /// is not a positive integer is ignored and the previous value kept.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            textbox = this.textBox1.Text;

            int requested;
            if (int.TryParse(textbox, out requested) && requested > 0)
            {
                Num = requested;
            }
        }
    }
}

三、多说一点：
（1）文件流方式：
C#中通常有三种方法获取网页内容。第一种方式为：使用WebClient；第二种方式为：WebBrowser；第三种方式为：HttpWebRequest/HttpWebResponse。
此次程序设计中选用的是第三种方式，即HttpWebRequest/HttpWebResponse方式。这是一种比较通用的获取方式。
（2）c#中的多线程：
在Visual C#中，System.Threading 命名空间提供了一些可以进行多线程编程的类和接口，其中线程的创建有以下三种方法：Thread、ThreadPool、Timer。
本次课程设计中选用的是Thread方式。这也许是最复杂的方法，但它提供了对线程的各种灵活控制。首先你必须使用它的构造函数创建一个线程实例，它的参数比较简单，只有一个ThreadStart 委托：
public Thread(ThreadStart start);
然后调用Start（）启动它。
（3）正则表达式：
在编写处理字符串的程序或网页时，经常会有查找符合某些复杂规则的字符串的需要。正则表达式就是用于描述这些规则的工具。换句话说，正则表达式就是记录文本规则的代码。
在此次课程设计中，使用了正则表达式对网页的源码进行截取和分析，以获得想要的数据信息。
程序流程详解：
（1）程序使用文件流方式从首页hyzbi.com中下载整个页面的源代码。
（2）通过使用正则表达式，将网站建设类目下的网页URL截取下来，并保存在一个队列中。
（3）启用多线程，每个线程从队列中拿取一个URL。此时使用了互斥锁，避免线程冲突。线程每拿取一个URL就将队列中的URL记录删除，避免重复下载。线程中同样运用文件流的方式下载网页URL的源代码。并封装了两个函数，分别对应图片和文本的下载分析。
当线程工作完成时，将完成时间以及完成项目显示在操作界面上。
（4）对于图片的解析，使用正则表达式截取网页源码中所有的图片URL。把图片URL保存在数组中，每获得一个URL就和数组中的项比较，有重复项就舍弃，无重复项就下载，这样就做到了图片的去重，避免下载重复的图片。
（5）对于文本的解析，则使用了正则表达式的截取和替代。在文本的保存上，也加了一个互斥锁，使同一时刻只有一个线程可以访问本地文件保存文本。
四、程序运行结果：
1、主程序界面
这里写图片描述
2、爬取的网站中的图片信息：

3、爬取的网站中的文本信息：

五、更多了解：
完整的代码及设计说明我已上传至资源，手戳这里是链接儿
【注】程序中指定了爬取的网站网址
不知为什么贴的代码，移动端看，csdn的编辑器会出乱码？？？
我看了一下，电脑上看，显示的代码是正确的，最好把代码从我的资源中下下来看，没有问题。

阅读全文

1 0