天涯文章采集器

来源:互联网 发布:linux c语言开发工具 编辑:程序博客网 时间:2024/05/16 12:13
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using lintTools;

namespace ty
{
    /// <summary>
    /// Console tool that downloads articles from the Tianya forum, either a single
    /// (possibly multi-page) article or every article linked from a run of listing pages.
    /// Network access, file output and HTML stripping are delegated to the
    /// <c>lintTools</c> helpers (<c>Net</c>, <c>LintSys</c>, <c>Trans</c>, <c>Reg</c>).
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Recognised arguments (each of the form <c>-x:value</c>):
        /// <c>-s</c> single article URL, <c>-u</c> first listing page URL,
        /// <c>-p</c> number of listing pages, <c>-d</c> output directory,
        /// <c>-f</c> minimum content length below which an article is skipped.
        /// </summary>
        /// <param name="args">Command-line arguments.</param>
        static void Main(string[] args)
        {
            SortedList[] arrTitle;
            string content = "", writer = "", title = "", nextUrl = "";

            LintSys.WriteLine("********************天涯文章下载大师v0.1 Copyright By lintg.200801***********", ConsoleColor.Yellow);
            LintSys.WriteLine("-s:单篇文章地址,默认取文章名称为下载文件名称,默认下载多页", ConsoleColor.Yellow);
            LintSys.WriteLine("-u:下载页面的首页,默认取天涯舞文弄墨首页", ConsoleColor.Yellow);
            LintSys.WriteLine("-p:下载页数,默认取100页", ConsoleColor.Yellow);
            LintSys.WriteLine("-d:下载目录,默认取当前目录", ConsoleColor.Yellow);
            LintSys.WriteLine("-f:过滤文件大小,默认过滤4000字节以下文件", ConsoleColor.Yellow);

            string downMode = "m";
            string url = "http://cache.tianya.cn/pub/list/0/culture.shtml";
            string downDir = "./";
            string sUrl = "";
            int downPage = 100;
            int filterBytes = 4000;

            // The published listing had this pattern mangled ("/S" for "\S" and the closing
            // `$"` replaced by an editor placeholder); restored here.
            Regex regex = new Regex(@"-(?<paramName>[supdf]):(?<paramValue>\S*?)$", RegexOptions.Singleline);
            for (int i = 0; i < args.Length; i++)
            {
                Match m = regex.Match(args[i]);
                if (!m.Success)
                {
                    continue;
                }

                string paramValue = m.Groups["paramValue"].Value;
                int parsed;
                switch (m.Groups["paramName"].Value)
                {
                    case "s":
                        downMode = "s";
                        sUrl = paramValue;
                        Console.WriteLine(sUrl);
                        break;
                    case "u":
                        url = paramValue;
                        break;
                    case "p":
                        if (int.TryParse(paramValue, out parsed))
                        {
                            downPage = parsed;
                        }
                        else
                        {
                            Console.WriteLine("参数错误,-p:页数");
                        }
                        break;
                    case "d":
                        downDir = paramValue;
                        break;
                    case "f":
                        if (int.TryParse(paramValue, out parsed))
                        {
                            filterBytes = parsed;
                        }
                        else
                        {
                            Console.WriteLine("参数错误,-f:过滤文件大小");
                        }
                        break;
                }
            }

            switch (downMode)
            {
                case "s":
                    GetAuthor(sUrl, ref writer, ref title);
                    content = GetArticle(sUrl, writer);
                    // Sanitise the title the same way GetTitle does for listing entries,
                    // since it becomes a file name.
                    title = (new Regex(Reg.dirStr)).Replace(title, "");
                    LintSys.WriteFile(BuildOutputPath(downDir, title), content);
                    break;

                default:
                    for (int j = 0; j < downPage; j++)
                    {
                        LintSys.WriteLine("第" + (j + 1).ToString() + "页:" + url, ConsoleColor.Red);
                        arrTitle = GetTitle(url, ref nextUrl);

                        // NOTE(review): the original deliberately skipped the first and the last
                        // match of every listing page; that range is kept. Confirm those two
                        // links really are not articles.
                        for (int i = 1; i < arrTitle.Length - 1; i++)
                        {
                            string articleTitle = arrTitle[i]["title"].ToString();
                            string articleWriter = arrTitle[i]["writer"].ToString();

                            LintSys.WriteLine((j + 1).ToString() + "-" + i.ToString() + ",下载文章:" + articleTitle + ",作者:" + articleWriter, ConsoleColor.Green);
                            content = GetArticle(arrTitle[i]["url"].ToString(), articleWriter);

                            if (content.Length > filterBytes)
                            {
                                // Honour -d here as well; previously only single mode used it.
                                LintSys.WriteFile(BuildOutputPath(downDir, articleTitle), content, FileMode.Create);
                            }
                            else
                            {
                                Console.WriteLine("文件:" + content.Length.ToString() + "<" + filterBytes.ToString() + ",被过滤");
                            }
                        }

                        // GetTitle sets nextUrl to null when the page has no "next page" link.
                        if (nextUrl != null)
                        {
                            url = nextUrl;
                        }
                        else
                        {
                            break;
                        }
                    }
                    break;
            }
        }

        /// <summary>
        /// Joins the output directory and an article title into a ".txt" path, inserting a
        /// directory separator when <paramref name="downDir"/> does not already end with one.
        /// </summary>
        /// <param name="downDir">Target directory as given by <c>-d</c> (may be empty).</param>
        /// <param name="title">Article title used as the file name.</param>
        /// <returns>The path of the text file to write.</returns>
        static string BuildOutputPath(string downDir, string title)
        {
            string dir = downDir;
            if (dir.Length > 0)
            {
                char last = dir[dir.Length - 1];
                if (last != '/' && last != '\\')
                {
                    dir += Path.DirectorySeparatorChar;
                }
            }
            return dir + title + ".txt";
        }

        /// <summary>
        /// Downloads an article page and extracts its title (contents of the
        /// <c>&lt;TITLE&gt;</c> element) and the author name (text of the link following "作者:").
        /// </summary>
        /// <param name="url">Article URL.</param>
        /// <param name="writer">Receives the author name; left unchanged when not found.</param>
        /// <param name="title">Receives the page title; left unchanged when not found.</param>
        /// <returns><c>true</c> when both the title and the author were found.</returns>
        static bool GetAuthor(string url, ref string writer, ref string title)
        {
            CookieContainer cc = new CookieContainer();
            string content = Net.GetContent(url, ref cc);

            Match titleMatch = new Regex("<TITLE>(?<title>.*?)</TITLE>", RegexOptions.Singleline).Match(content);
            if (titleMatch.Success)
            {
                title = titleMatch.Groups["title"].Value;
            }

            Match writerMatch = new Regex(@"作者:<a .*?>(?<writer>.*?)</a>", RegexOptions.Singleline).Match(content);
            if (writerMatch.Success)
            {
                writer = writerMatch.Groups["writer"].Value;
            }

            return titleMatch.Success && writerMatch.Success;
        }

        /// <summary>
        /// Downloads an article, following "下一页" (next page) links, and collects the posts
        /// written by <paramref name="writer"/> that are long enough to be kept.
        /// </summary>
        /// <param name="url">URL of the article's first page.</param>
        /// <param name="writer">Author name whose posts are extracted.</param>
        /// <returns>
        /// The kept posts, each prefixed with a running "(n)" marker, after being passed
        /// through <c>Trans.ReplaceHtml</c>.
        /// </returns>
        static string GetArticle(string url, string writer)
        {
            StringBuilder kept = new StringBuilder();
            int postNumber = 0;
            CookieContainer cc = new CookieContainer();

            // Escape the whole author name: nicknames may contain any regex metacharacter,
            // not only '*'.
            Regex postRegex = new Regex(Regex.Escape(writer) + "</a>.*?</table>(?<content>.*?)(<TABLE)", RegexOptions.Singleline);
            Regex nextPageRegex = new Regex(@"<a \S*? href=(?<url>\S*?)>下一页</a>", RegexOptions.Singleline);

            while (true)
            {
                Console.WriteLine("连接" + url + ".....");
                string content = Net.GetContent(url, ref cc);

                MatchCollection posts = postRegex.Matches(content);
                LintSys.WriteLine("匹配回帖:" + posts.Count.ToString(), ConsoleColor.Yellow);

                foreach (Match post in posts)
                {
                    string replyContent = post.Groups["content"].Value.Trim();

                    // A post counts as valid when longer than 50 characters, or longer than
                    // 100 characters when it starts with "作者" (an author header line).
                    bool startsWithAuthor = replyContent.StartsWith("作者", StringComparison.Ordinal);
                    int minLength = startsWithAuthor ? 100 : 50;
                    if (replyContent.Length > minLength)
                    {
                        kept.Append("(").Append((postNumber++).ToString()).Append(")\n").Append(replyContent);
                    }
                }

                Match next = nextPageRegex.Match(content);
                if (!next.Success)
                {
                    break;
                }
                url = next.Groups["url"].Value;
            }

            LintSys.WriteLine("下载完成....", ConsoleColor.DarkGreen);
            return Trans.ReplaceHtml(kept.ToString());
        }

        /// <summary>
        /// Downloads a listing page and returns one entry per article link found on it.
        /// The raw page is also written to "log.txt".
        /// </summary>
        /// <param name="url">Listing page URL.</param>
        /// <param name="nextUrl">
        /// Receives the URL of the "下一页" (next page) link, or <c>null</c> when the page has none.
        /// </param>
        /// <returns>
        /// An array (possibly empty) of lists with the keys "url", "title" and "writer".
        /// Titles have every match of <c>Reg.dirStr</c> removed.
        /// </returns>
        static SortedList[] GetTitle(string url, ref string nextUrl)
        {
            CookieContainer cc = new CookieContainer();
            string content = Net.GetContent(url, ref cc);
            LintSys.WriteFile("log.txt", content);

            Regex linkRegex = new Regex(@"<a href='(?<url>http://cache.tianya.cn/publicforum/content\S*)'.*?>(?<title>.*?)<.*?vwriter=(?<writer>.*?)'", RegexOptions.Singleline);
            MatchCollection links = linkRegex.Matches(content);

            // presumably Reg.dirStr matches characters not allowed in file names — confirm
            Regex titleCleaner = new Regex(Reg.dirStr);

            // Fill every slot: the original left the last entry null and indexed title[0]
            // unconditionally, which threw on a page without any matching link.
            SortedList[] title = new SortedList[links.Count];
            for (int i = 0; i < links.Count; i++)
            {
                title[i] = new SortedList();
                title[i]["url"] = links[i].Groups["url"].Value;
                title[i]["title"] = titleCleaner.Replace(links[i].Groups["title"].Value, "");
                title[i]["writer"] = links[i].Groups["writer"].Value;
            }

            Match next = new Regex(@"<a href=(?<url>\S*)?>下一页</a>", RegexOptions.Singleline).Match(content);
            if (next.Success && next.Groups["url"].Value.Length > 0)
            {
                nextUrl = next.Groups["url"].Value;
            }
            else
            {
                // Signal "no further page" so the caller's loop can stop instead of
                // re-fetching a stale URL.
                nextUrl = null;
            }
            return title;
        }
    }
}
http://info95.vicp.net/info95/non-cgi/usr/5/5_6.rar
原创粉丝点击