get deflate stream 的代码

来源:互联网 发布:mac 看不到隐藏文件夹 编辑:程序博客网 时间:2024/04/28 20:27

最近在忙别的事情,没空关心 PDF 了。既然有人需要,我找了找,大概是下面这段代码。

// Regular expression that recognizes a FlateDecode-compressed PDF stream object.
// Hand-written by the author, so it may not cover every PDF producer's output.
// It matches the "<< ... /FlateDecode ... >>" dictionary, the "stream" keyword and
// its end-of-line, then lazily captures everything up to "endstream" in group 1.
// RegexOptions.Singleline lets '.' span the line breaks inside the stream data.
Regex streamRegex = new Regex(@"<<[^>]*/FlateDecode[^>]*>>\s*stream\s*\n(.*?)\s*endstream", RegexOptions.Singleline);

。。。。。

               match = streamRegex.Match(pdfContent);
                if (match.Length > 0)
                {
                    pdfContent = match.Groups[1].Value;
                    documentContentStart += (match.Groups[1].Index + 2);
                    strLen = match.Groups[1].Length;
                    if (strLen < 2) {
                        strContent = "";
                    }
                    byte[] bufTemp = new byte[strLen - 2];
                    try
                    {
                        for (int i =0; i < strLen - 2; i++)    
                        {
                            bufTemp[i] = pdfBuf[documentContentStart + i];                                   
                        }

                        MemoryStream ms = new MemoryStream();
                        ms.Write(bufTemp, 0, bufTemp.Length);
                        ms.Position = 0;

//解压缩
                        DeflateStream deStream = new DeflateStream(ms, CompressionMode.Decompress, true);
                        //GZipStream deStream = new GZipStream(ms, CompressionMode.Decompress);
                        deStream.Flush();

                        int nSize = 16 * 1024;
                        byte[] decompressedBuffer = new byte[nSize]; //16*1024 + 256 Maxium
                        int totalCount = deStream.Read(decompressedBuffer, 0, nSize);
                        //int totalCount = ReadAllBytesFromStream(deStream, decompressedBuffer);
                        deStream.Close();
                        pdfContent = Encoding.Default.GetString(decompressedBuffer, 0, totalCount);
                        File.WriteAllText("c:/tmp/pdftxt.txt", pdfContent);
                        //strContent = strContent + "<P>" + pdfContent + "</P>";
                    }
                    catch (Exception ex)
                    {
                        throw new Exception("error inflate string", ex);
                    }
                }
           
            }
           

 

解压出来的文本还需要继续分析:算法比较复杂,需要用矩阵计算来判断各段文本的位置。暂时没空继续做了。