下载并读取PDF文本内容

来源:互联网 发布:ecmall多用户商城源码 编辑:程序博客网 时间:2024/05/29 16:17
    string strUrlFilePath = string.Empty;

            string url = @"http://www.jsgsj.gov.cn:58888//province/NoticeServlet.json?showCrcontentPdf=true&org=2156&id=70789446&seqId=1";

            HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

            request.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-CN; rv:1.8.1.8) Gecko/20071008 Firefox/2.0.0.8";

            request.Accept = "text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";

            request.AllowAutoRedirect = true;

            request.Headers.Add(HttpRequestHeader.AcceptCharset, "gb2312,utf-8;q=0.7,*;q=0.7");

            request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

            request.Headers.Add(HttpRequestHeader.Cookie, "id58=05dz8VMG5yk8RCViPEsFAg==; city=wz; 58home=wz; ipcity=wz|%u6E29%u5DDE; myfeet_tooltip=end; __utma=253535702.1542940914.1392961328.1392961328.1392961328.1; __utmc=253535702; __utmz=253535702.1392961328.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)");

            request.Timeout = 30000;

            HttpWebResponse httpresponse = (HttpWebResponse)request.GetResponse();

            Stream stream = httpresponse.GetResponseStream();

            if (stream != null)
            {
                strUrlFilePath = httpresponse.ResponseUri.ToString();//拿到跳转后的地址
            }

            httpresponse.Close();

            WebClient wc = new WebClient();

            String pdf_path = @"D:\其它\PDFDownloadFile\aaaa.pdf";

            if (!System.IO.Directory.Exists(@"D:\其它\PDFDownloadFile"))
            {
                System.IO.Directory.CreateDirectory(@"D:\其它\PDFDownloadFile");//不存在就创建目录 
            }

            if (!string.IsNullOrEmpty(strUrlFilePath))
            {
                wc.DownloadFile(strUrlFilePath, pdf_path);//下载文件
            }

            //获取pdf内容
            FileInfo file = new FileInfo(pdf_path);
            PDDocument doc = PDDocument.load(file.FullName);

            PDFTextStripper pdfStripper = new PDFTextStripper();

            string text = pdfStripper.getText(doc);
原创粉丝点击