C# 远程网页抓取器 (beta 1.0)

来源:互联网 发布:unity3d 播放动画 编辑:程序博客网 时间:2024/05/23 13:00

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Diagnostics;
namespace frmTest
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form and runs <c>InitializeComponent</c>, which is defined in
/// another part of this partial class.
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// Seeds the form with its starting state: default save directory and URL,
/// first encoding entry selected, timer idle, empty status label, tooltips set.
/// Presumably wired to the form's Load event outside this view.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load(object sender, EventArgs e)
{
    // The save directory defaults to the folder the application was started from.
    string startupDirectory = Application.StartupPath;
    txtFilePath.Text = startupDirectory;

    const string defaultUrl = "http://www.cfchina.cn";
    txtWeb.Text = defaultUrl;

    // Pre-select the first entry of the encoding list.
    cboEnCode.SelectedIndex = 0;

    // Nothing is fetched until the user asks for it.
    timer1.Enabled = false;
    lblResult.Text = string.Empty;

    initToolTip();
}

/// <summary>
/// Shows the folder browser dialog and, when it is confirmed with OK, copies
/// the chosen folder into the save-path text box.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnSelect_Click(object sender, EventArgs e)
{
    DialogResult choice = folderBrowserDlg.ShowDialog();
    if (choice != DialogResult.OK)
    {
        // Dialog dismissed: keep the current save path.
        return;
    }

    txtFilePath.Text = folderBrowserDlg.SelectedPath;
}
/// <summary>
/// Starts periodic fetching: validates the interval typed into txtTime, applies
/// it to the timer, fetches the page once immediately and then enables the timer.
/// </summary>
/// <remarks>
/// The interval is applied here, before the timer is enabled, so the very first
/// tick already honours the user's setting instead of whatever interval the
/// timer held before. An unusable interval is reported in lblResult and nothing
/// is started.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnStart_Click(object sender, EventArgs e)
{
    int seconds;
    bool usable = int.TryParse(txtTime.Text, out seconds)
        && seconds >= 1
        && seconds <= int.MaxValue / 1000; // keeps seconds * 1000 inside int range
    if (!usable)
    {
        lblResult.Text = "错误:请输入有效的抓取间隔秒数";
        return;
    }

    lblResult.Text = "正在初始化...";
    timer1.Interval = seconds * 1000; // Timer.Interval is expressed in milliseconds
    getUrl();
    timer1.Enabled = true; // start the timer
}
//抓取网页并保存
/// <summary>
/// Downloads the page at the address in txtWeb, decodes it with the encoding
/// chosen in cboEnCode, shows the markup in textBox1 and saves it to a
/// timestamped .html file inside the directory named by txtFilePath. The saved
/// file's path is appended to lst1.
/// </summary>
/// <remarks>
/// Download, path and file-system failures are reported in lblResult rather than
/// propagated, because this method also runs from the timer tick.
/// </remarks>
private void getUrl()
{
    try
    {
        // Build the name from a fixed, culture-independent pattern. The default
        // DateTime.ToString() follows regional settings and may contain '/' or
        // other characters that are not valid inside a file name.
        string stamp = DateTime.Now.ToString(
            "yyyyMMddHHmmss",
            System.Globalization.CultureInfo.InvariantCulture);
        string fileName = Path.Combine(txtFilePath.Text, stamp + ".html");

        // Download the raw bytes from the requested address.
        lblResult.Text = "正在尝试从指定网址下载数据...";
        byte[] pagedata;
        using (WebClient client = new WebClient())
        {
            pagedata = client.DownloadData(txtWeb.Text);
        }

        // Pick the decoder. GB2312 is requested by name so the result does not
        // depend on the machine's ANSI code page (which is what Encoding.Default
        // reflects). Convert.ToString turns a missing selection into "" so an
        // empty combo box falls back to UTF-8 instead of throwing.
        Encoding sourceEncoding = Encoding.UTF8;
        if (Convert.ToString(cboEnCode.SelectedItem) == "GB2312")
        {
            sourceEncoding = Encoding.GetEncoding("gb2312");
        }

        // The file is written as UTF-8 (StreamWriter's default), so a declared
        // "gb2312" charset inside the page is rewritten to match; otherwise the
        // saved page would not display correctly.
        string pagehtml = sourceEncoding.GetString(pagedata).Replace("gb2312", "utf-8");
        textBox1.Text = pagehtml;

        using (StreamWriter sw = new StreamWriter(fileName))
        {
            sw.WriteLine(pagehtml);
        }

        // Only list the file once it has been written and closed.
        lst1.Items.Add(fileName);
    }
    catch (WebException webEx)
    {
        lblResult.Text = "错误:" + webEx.Message;
    }
    catch (IOException ioEx)
    {
        // Missing directory, disk full, file in use, ...
        lblResult.Text = "错误:" + ioEx.Message;
    }
    catch (UnauthorizedAccessException accessEx)
    {
        lblResult.Text = "错误:" + accessEx.Message;
    }
    catch (ArgumentException argEx)
    {
        // Malformed save path or address, or an encoding name the runtime
        // does not provide.
        lblResult.Text = "错误:" + argEx.Message;
    }
}
//定时器事件
/// <summary>
/// Timer tick: refreshes the timer interval from txtTime (seconds) when that
/// text is a usable value, then fetches the page again.
/// </summary>
/// <remarks>
/// The text box may be edited while the timer is running, so it can momentarily
/// hold an empty, non-numeric, zero or negative value. In that case the current
/// interval is kept instead of throwing from inside the tick handler.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void timer1_Tick(object sender, EventArgs e)
{
    int seconds;
    bool usable = int.TryParse(txtTime.Text, out seconds)
        && seconds >= 1
        && seconds <= int.MaxValue / 1000; // keeps seconds * 1000 inside int range
    if (usable)
    {
        timer1.Interval = seconds * 1000; // Timer.Interval is expressed in milliseconds
    }

    getUrl();
}
/// <summary>
/// Stops fetching: marks the status as stopped, halts the timer, deletes the
/// files listed in lst1 and then empties the list.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnEnd_Click(object sender, EventArgs e)
{
    lblResult.Text = "已停止";

    // Halt the timer before cleaning up.
    timer1.Stop();

    // The files must be deleted while the list still holds their paths.
    delmyFile();
    lst1.Items.Clear();
}
//双击打开生成的网页
/// <summary>
/// Opens the saved page currently selected in lst1 with Internet Explorer.
/// </summary>
/// <remarks>
/// Does nothing when no entry is selected, which covers both an empty list and
/// a double-click on the blank area below the last entry. A failure to launch
/// the browser is reported in lblResult.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void lst1_DoubleClick(object sender, EventArgs e)
{
    object selected = lst1.SelectedItem;
    if (selected == null)
    {
        return;
    }

    try
    {
        using (Process p = new Process())
        {
            p.StartInfo.FileName = "IEXPLORE.EXE";
            // Quote the path so a directory containing spaces is still passed
            // to the browser as a single argument.
            p.StartInfo.Arguments = "\"" + selected.ToString() + "\"";
            p.Start();
        }
    }
    catch (Win32Exception launchEx)
    {
        // Raised by Process.Start when the executable cannot be found or started.
        lblResult.Text = "错误:" + launchEx.Message;
    }
}

//删除生成的网页
/// <summary>
/// Deletes every file whose path is listed in lst1, skipping entries that no
/// longer exist on disk. The list itself is left untouched.
/// </summary>
private void delmyFile()
{
    foreach (object entry in lst1.Items)
    {
        string path = entry.ToString();
        if (!File.Exists(path))
        {
            continue;
        }

        File.Delete(path);
    }
}

//初始化toolTip
/// <summary>
/// Attaches the explanatory tooltip text to each control of the form.
/// </summary>
private void initToolTip()
{
    // "\n" (not "/n") is the line-break escape; the slash form showed up literally in the tooltip.
    toolTip1.SetToolTip(lst1, "抓取并保存后的网页名称列表\n双击可直接用IE打开抓取下来的网页");
    toolTip1.SetToolTip(btnEnd, "停止网页抓取,并删除抓取下来的网页");
    toolTip1.SetToolTip(btnStart, "开始抓取指定的网页");
    toolTip1.SetToolTip(btnSelect, "选择保存的文件路径");
    toolTip1.SetToolTip(txtTime, "定时器间隔秒数,即每隔多少秒抓取一次网页");
    toolTip1.SetToolTip(lblTime, "定时器间隔秒数,即每隔多少秒抓取一次网页");
    toolTip1.SetToolTip(textBox1, "显示抓取下来的网页代码内容");
    toolTip1.SetToolTip(txtWeb, "输入要抓取的网址,必须以http://开头");
    toolTip1.SetToolTip(cboEnCode, "选取网页的保存编码,必须与网站源代码一致,否则抓取下来后可能显示不正常");
    toolTip1.SetToolTip(txtFilePath, "输入文件保存目录,抓下来的网页将保存在这里");
    // The pause button's text previously read "temporarily fetch"; it now reads "pause fetching".
    toolTip1.SetToolTip(btnPause, "暂停抓取");
}

//抓取暂停
/// <summary>
/// Pauses fetching by halting the timer. Already saved files and the list
/// of their paths are left as they are.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnPause_Click(object sender, EventArgs e)
{
    timer1.Stop();
}
}
}


 

原创粉丝点击