java解析word2003 doc文件中的表格

来源:互联网 发布:捕鱼机网络后台服务器 编辑:程序博客网 时间:2024/06/08 01:57

1;apache poi插件链接 http://poi.apache.org/ ,这个插件主要用于提取office文件中的文本内容以及富文本(表格,图片)等,还支持对已知密码的office文件进行提取,
 黑框里面的是Word (HWPF+XWPF)
其他格式的文件加密以及解密都有 见链接http://poi.apache.org/encryption.html
http://www.openoffice.org/sc/compdocfileformat.pdf
因为之前做全文检索的时候需要提取word03里面的表格进行分析,使用这个插件对加密的文件不支持,所以研究了下doc文件格式以及它的加密方式,另外office07系列的比较简单,基本都是对xml解析,这里就不分析了。再说一句,这里只是对linux平台,其他调用微软com的那套就不用说了。

2;首先先说下word97-03文件是怎么加密的,word加密时并不是所有字节都加密,只是带有文本,表格,富文本,图片的stream才会加密,涉及到的加密算法有RC4,MD5算法。
具体细节参考:
http://wenku.baidu.com/link?url=TrCAFtr1mHXZbh3qnnOlhYmXTS7-ynw-CES0W6KOUzRzzRg7l04Y5LXl0V3W8pQRGO4SxyzXDlXk5zlLOzPphlXOxfC6UglqRzJlnb6439_

这里只是列出一些代码实现:
字符串转Unicode转义序列(全角字符转为半角)

 /**
  * Converts a string to an ASCII-only representation.
  *
  * <ul>
  *   <li>Characters in the Basic Latin block are copied unchanged.</li>
  *   <li>Fullwidth ASCII variants (U+FF01 to U+FF5E) are folded to their
  *       halfwidth ASCII counterparts.</li>
  *   <li>Every other character is written as a backslash, the letter 'u' and
  *       exactly four lowercase hexadecimal digits.</li>
  * </ul>
  *
  * @param inStr the text to convert; must not be {@code null}
  * @return the ASCII-only form of {@code inStr}
  * @throws NullPointerException if {@code inStr} is {@code null}
  */
 public static String utf8ToUnicode(String inStr) {
     StringBuilder sb = new StringBuilder(inStr.length());
     for (int i = 0; i < inStr.length(); i++) {
         char c = inStr.charAt(i);
         if (Character.UnicodeBlock.of(c) == Character.UnicodeBlock.BASIC_LATIN) {
             sb.append(c);
         } else if (c >= 0xFF01 && c <= 0xFF5E) {
             // Only this sub-range of the halfwidth/fullwidth block mirrors ASCII;
             // 0xFEE0 (65248) is the distance between the two ranges. Shifting the
             // rest of the block (halfwidth katakana, fullwidth currency signs)
             // would yield unrelated characters, so those are escaped instead.
             sb.append((char) (c - 0xFEE0));
         } else {
             // Widen to int, not short: a short cast sign-extends chars at or
             // above 0x8000 and produced eight hex digits. %04x also pads small
             // code units to the four digits an escape requires.
             sb.append(String.format("\\u%04x", (int) c));
         }
     }
     return sb.toString();
 }
//验证passwdpublic boolean  verifypwd(byte pwarray[], byte docid[], byte salt[], byte hashedsalt[],HWMPFEMD5 valContext)  {      HWPFRC4 tool  =new HWPFRC4();      HWPFRC4 key  =new HWPFRC4();      int offset=0, keyoffset=0;      int tocopy=5;      HWMPFEMD5 md5=new HWMPFEMD5();      HWMPFEMD5 md51=new HWMPFEMD5();      md5.md5Init();      md5.md5Update(pwarray, 64);      md5.getMD5StoreDigest(md5);       valContext.md5Init();      while (offset != 16)      {          if ((64 - offset) < 5)              tocopy = 64 - offset;          for(int y=0;y<tocopy;y++)          {             pwarray[offset+y]=md5.digest[keyoffset+y];          }          offset += tocopy;          if (offset == 64)            {            valContext.md5Update(pwarray, 64);            keyoffset = tocopy;            tocopy = 5 - tocopy;            offset = 0;            continue;            }          keyoffset = 0;          tocopy = 5;          for(int y=0;y<16;y++)          {              pwarray[offset+y]=docid[y];          }          offset += 16;       }       pwarray[16] = (byte) 0x80;       for(int i=0;i<47;i++)       {           pwarray[17+i]=0;       }       pwarray[56] = (byte)0x80;       pwarray[57] = (byte)0x0A;       valContext.md5Update(pwarray,64);       valContext.getMD5StoreDigest(valContext);        tool.makekey (0, key,valContext);       tool.rc4 (salt, 16, key);       tool.rc4 (hashedsalt, 16, key);       salt[16] = (byte) 0x80;       for(int i=0;i<47;i++) salt[17+i]=0;       salt[56] = (byte) 0x80;       md51.md5Init();       md51.md5Update(salt, 64);        md51.getMD5StoreDigest(md51);       for(int i=0;i<16;i++)       {              if(hashedsalt[i]!=md51.digest[i]) return false;       }       return true;  }
 /**
  * Runs the RC4 key schedule: resets {@code key.state} to the identity
  * permutation, zeroes {@code key.x} / {@code key.y}, then shuffles the state
  * using the key material.
  *
  * @param key_data_ptr key material; bytes {@code 0 .. key_data_len-1} are used cyclically
  * @param key_data_len number of key bytes to use; must be positive
  * @param key          RC4 context whose {@code state}, {@code x} and {@code y} are overwritten
  */
 public void preparekey(byte[] key_data_ptr, int key_data_len, HWPFRC4 key) {
     // Work directly on the context's own state array (no scratch copy needed).
     byte[] state = key.state;
     for (int counter = 0; counter < 256; counter++) {
         state[counter] = (byte) counter;
     }
     key.x = 0;
     key.y = 0;

     int index1 = 0;
     int index2 = 0;
     for (int counter = 0; counter < 256; counter++) {
         // "& 0xff" treats the signed Java bytes as unsigned values 0..255.
         index2 = ((key_data_ptr[index1] & 0xff) + (state[counter] & 0xff) + index2) & 0xff;
         byte btemp = state[counter];
         state[counter] = state[index2];
         state[index2] = btemp;
         index1 = (index1 + 1) % key_data_len;
     }
 }

 /**
  * Derives the RC4 key for one block and installs it into {@code rc4key}.
  *
  * <p>The input block is: the first 5 bytes of {@code md5.digest}, the block
  * number as 4 little-endian bytes, a 0x80 marker at offset 9, zeros, and 0x48
  * (= 72 bits = 9 bytes) at offset 56. It is hashed with a fresh MD5 context and
  * the resulting 16-byte digest is passed to {@link #preparekey}.
  *
  * @param block  block number mixed into the key
  * @param rc4key RC4 context to initialise
  * @param md5    MD5 context whose {@code digest} supplies the first 5 key bytes
  */
 public void makekey(int block, HWPFRC4 rc4key, HWMPFEMD5 md5) {
     // A new byte[] is already zero-filled, so no explicit clearing is needed.
     byte[] pwarray = new byte[64];
     HWMPFEMD5 temp = new HWMPFEMD5();

     System.arraycopy(md5.digest, 0, pwarray, 0, 5);
     pwarray[5] = (byte) (block & 0xFF);
     pwarray[6] = (byte) ((block >> 8) & 0xFF);
     pwarray[7] = (byte) ((block >> 16) & 0xFF);
     pwarray[8] = (byte) ((block >> 24) & 0xFF);
     pwarray[9] = (byte) 0x80;
     pwarray[56] = (byte) 0x48;

     temp.md5Init();
     temp.md5Update(pwarray, 64);
     temp.getMD5StoreDigest(temp);
     preparekey(temp.digest, 16, rc4key);
 }

 /**
  * Applies the RC4 key stream to a buffer in place. Because the operation is
  * an XOR, the same call both encrypts and decrypts.
  *
  * <p>The stream position is read from and written back to {@code key.x} /
  * {@code key.y}, so consecutive calls with the same {@code key} continue the
  * same key stream.
  *
  * @param buffer_ptr data to transform; the first {@code buffer_len} bytes are overwritten
  * @param buffer_len number of bytes to process
  * @param key        RC4 context previously initialised by {@link #preparekey}
  */
 void rc4(byte[] buffer_ptr, int buffer_len, HWPFRC4 key) {
     int x = key.x;
     int y = key.y;
     byte[] state = key.state;

     for (int counter = 0; counter < buffer_len; counter++) {
         x = (x + 1) & 0xff;
         y = ((state[x] & 0xff) + y) & 0xff;
         byte btemp = state[x];
         state[x] = state[y];
         state[y] = btemp;
         int xorIndex = ((state[x] & 0xff) + (state[y] & 0xff)) & 0xff;
         buffer_ptr[counter] ^= state[xorIndex];
     }

     key.x = x;
     key.y = y;
 }

这里代码是按照文档里面写的doc文件格式,先对密码进行unicode解码,然后取出x,docid,salt,hashedsalt,调用verifypwd进行验证,密码正确后会返回HWMPFEMD5对象,然后对_tableStream的每512字节进行RC4操作,再对_mainStream进行RC4操作,然后
_fib = new FileInformationBlock(_mainStream);
_fib.fillVariableFields(_mainStream, _tableStream);
这里存储文件头部信息,然后再对_dataStream进行解码,这里就是简单的针对512字节进行RC4操作,这里只是对文档的简单说明。
3;Java调用方式

 // Open the password-protected .doc and iterate over its tables.
 POIFSFileSystem pfs;
 HWPFDocument hwpf;
 // try-with-resources closes the file handle even when parsing fails.
 // NOTE(review): assumes the document does not read lazily from the stream
 // after the constructors return - confirm for the POI build in use.
 try (FileInputStream in = new FileInputStream("./test/20030523jm.doc")) {
     pfs = new POIFSFileSystem(in);
     hwpf = new HWPFDocument(pfs, "111111");
 } catch (Exception e) {
     // Fail loudly with the cause instead of swallowing the error and hitting
     // a NullPointerException on hwpf a few lines later.
     throw new IllegalStateException("Failed to open ./test/20030523jm.doc", e);
 }
 Range range = hwpf.getOverallRange();
 TableIterator it = new TableIterator(range);

4:github链接
https://github.com/DusonWang/word95-03parse.git

参考项目代码http://sourceforge.net/projects/wvware/

5:jar包下载
http://download.csdn.net/detail/dusonblog/9310899

0 0
原创粉丝点击