提取word文档的图片及表格并用html代码替换

来源:互联网 发布:php分销系统源码 编辑:程序博客网 时间:2024/06/01 10:15

实现功能:从word文档中提取图片及表格,并用html代码替换,将处理后的word 文档存放到一个html文档中

 /*
     * 实现步骤:1、源word另存为html。可以得到图片和html文件。
     *          2、从html文件中提取出table,替换word中的table。
     *          3、将得到的图片名称以img标签替换源word中的图片区域。
     *          4、将替换完成后的源文件内容读出来,存放到New.html中

     */

我引用的.NET组件有Microsoft.Office.Interop.Word和Office,版本均为12.0.0.0。

第一步:

 

    // Step 1: have Word save the source document as HTML. This yields the
    // HTML file plus the pictures that Word exports alongside it.
    string htmlpath = WordBase.WordToHtml(sourceWord);

    #region Collect the names of the pictures produced by the HTML export
    // Derive the picture folder from the path WordToHtml actually returned
    // instead of text-replacing ".doc"/".docx" inside the source path, so the
    // two always agree (the old Replace() also rewrote any ".doc" occurring
    // earlier in the path).
    string imgFullpath = Path.ChangeExtension(htmlpath, ".files");

    List<string> imageUrl = new List<string>(); // picture file names, without directory
    DirectoryInfo di = new DirectoryInfo(imgFullpath);

    // A document without pictures may have no companion folder at all;
    // treat that as "no images" rather than throwing.
    if (di.Exists)
    {
        foreach (FileInfo fi in di.GetFiles())
        {
            // The original tested ".jpg" twice; the duplicate is replaced by
            // the other common raster formats so those files are not dropped.
            string ext = fi.Extension.ToLowerInvariant();
            if (ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".gif" || ext == ".bmp")
            {
                imageUrl.Add(fi.Name);
            }
        }

        // GetFiles() does not guarantee any order, but WordPic pairs the n-th
        // picture in the document with the n-th name in this list, so make the
        // order deterministic.
        // NOTE(review): assumes the exported file names sort in document order
        // (e.g. image001, image002, ...) - confirm against real exports.
        imageUrl.Sort(StringComparer.OrdinalIgnoreCase);
    }
    #endregion

    #region Copy the exported pictures into the output "image" folder
    string imageFolder = newSourceFolder + "image" + "\\";

    // CreateDirectory is a no-op when the folder already exists.
    Directory.CreateDirectory(imageFolder);

    foreach (string imageName in imageUrl)
    {
        File.Copy(Path.Combine(imgFullpath, imageName), imageFolder + imageName, true);
    }

    // Steps 2-4: replace tables and pictures in the document and write the HTML.
    parseFile(sourceWord, htmlpath, imageUrl);
    #endregion




第二步到第四步:

 

 /// <summary>
 /// Opens the source Word document, replaces its tables and pictures with
 /// HTML markup (via <c>WordBase.WordTable</c> and <c>WordBase.WordPic</c>),
 /// wraps the text of every paragraph in a <c>&lt;p&gt;</c> element and
 /// writes the result to <c>New.html</c> under <c>newSourceFolder</c>.
 /// </summary>
 /// <param name="filepath">Path of the source Word document.</param>
 /// <param name="htmlpath">Path of the HTML file generated from the document.</param>
 /// <param name="imgurl">Names of all pictures generated by the HTML export.</param>
 public void parseFile(string filepath, string htmlpath, List<string> imgurl)
 {
     object Nothing = System.Reflection.Missing.Value;
     object filename = filepath;

     // The document is only modified in memory to extract its text; closing
     // without an explicit save option on a modified document can make Word
     // ask whether to save, which blocks unattended runs.
     // NOTE(review): this discards the in-memory edits on purpose - confirm
     // that the source document is not meant to be overwritten.
     object doNotSave = Microsoft.Office.Interop.Word.WdSaveOptions.wdDoNotSaveChanges;

     StringBuilder sbContent = new StringBuilder();
     Microsoft.Office.Interop.Word.Application WordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
     Microsoft.Office.Interop.Word.Document WordDoc = null;

     try
     {
         WordDoc = WordApp.Documents.Open(ref filename, ref Nothing, ref Nothing,
             ref Nothing, ref Nothing, ref Nothing,
             ref Nothing, ref Nothing, ref Nothing,
             ref Nothing, ref Nothing, ref Nothing, ref Nothing, ref Nothing, ref Nothing, ref Nothing);

         #region Replace tables with their HTML markup
         WordBase.WordTable(WordDoc, htmlpath);
         #endregion

         #region Replace pictures with img tags
         WordBase.WordPic(WordDoc, newSourceFolder + "image" + "\\", imgurl);
         #endregion

         #region Collect the paragraph text
         foreach (Microsoft.Office.Interop.Word.Paragraph item in WordDoc.Paragraphs)
         {
             if (item != null)
             {
                 // The text is intentionally not HTML-encoded: it already
                 // contains the markup inserted by WordTable and WordPic.
                 sbContent.Append("<p>");
                 sbContent.Append(item.Range.Text);
                 sbContent.Append("</p>");
             }
         }
         #endregion
     }
     finally
     {
         // Always shut Word down, even when a step above throws; otherwise
         // the Word process started above is left running.
         try
         {
             if (WordDoc != null)
             {
                 WordDoc.Close(ref doNotSave, ref Nothing, ref Nothing);
             }
         }
         finally
         {
             WordApp.Quit(ref doNotSave, ref Nothing, ref Nothing);
         }
     }

     // Write the generated HTML; "using" closes the file even if Write fails.
     using (StreamWriter sw = new StreamWriter(newSourceFolder + "New.html", false))
     {
         sw.Write(sbContent.ToString());
     }
 }

WordBase.cs如下:

 

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using Microsoft.Office.Interop.Word;
// Alias used to qualify interop types explicitly; a bare "Range" is ambiguous
// with System.Range on newer frameworks.
using Word = Microsoft.Office.Interop.Word;

namespace WordOperator
{
    /// <summary>
    /// Helpers that drive Word through COM interop: export a document to HTML
    /// and replace the tables and pictures of an open document with HTML markup.
    /// </summary>
    class WordBase
    {
        // Matches one table element of the exported HTML (non-greedy, across lines).
        private static readonly Regex TableRegex = new Regex("<table.*?>.*?</table>", RegexOptions.Singleline);

        // Matches single-quoted inline style attributes, which are stripped from the table markup.
        private static readonly Regex StyleRegex = new Regex("style='.*?'", RegexOptions.Singleline);

        public WordBase()
        {
        }

        /// <summary>
        /// Opens a Word document read-only and saves it as filtered HTML next
        /// to the source file (same path, extension replaced by ".html").
        /// </summary>
        /// <param name="wordFileName">Path of the Word document.</param>
        /// <returns>Path of the generated HTML file.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="wordFileName"/> is null.</exception>
        public static string WordToHtml(object wordFileName)
        {
            if (wordFileName == null)
            {
                throw new ArgumentNullException("wordFileName");
            }

            // Only the extension is replaced. The previous Split('.')[0] cut the
            // path at its FIRST dot, which broke on dotted folder or file names
            // and on relative paths such as "..\doc.docx".
            string strSaveFileName = System.IO.Path.ChangeExtension(wordFileName.ToString(), ".html");

            Word.ApplicationClass word = new Word.ApplicationClass();
            Type wordType = word.GetType();
            try
            {
                Word.Documents docs = word.Documents;
                Type docsType = docs.GetType();

                // Arguments: file name, ConfirmConversions = true, ReadOnly = true.
                Word.Document doc = (Word.Document)docsType.InvokeMember(
                    "Open",
                    System.Reflection.BindingFlags.InvokeMethod,
                    null,
                    docs,
                    new object[] { wordFileName, true, true });

                Type docType = doc.GetType();
                try
                {
                    docType.InvokeMember(
                        "SaveAs",
                        System.Reflection.BindingFlags.InvokeMethod,
                        null,
                        doc,
                        new object[] { strSaveFileName, Word.WdSaveFormat.wdFormatFilteredHTML });
                }
                finally
                {
                    docType.InvokeMember("Close", System.Reflection.BindingFlags.InvokeMethod, null, doc, null);
                }
            }
            finally
            {
                // Quit Word even when opening or saving failed, so no Word
                // process is left behind.
                wordType.InvokeMember("Quit", System.Reflection.BindingFlags.InvokeMethod, null, word, null);
            }

            return strSaveFileName;
        }

        /// <summary>
        /// Replaces every table of the document with the markup of the table at
        /// the same position in the exported HTML, after stripping inline
        /// <c>style='...'</c> attributes and line breaks from that markup.
        /// </summary>
        /// <param name="WordDoc">The open Word document to modify.</param>
        /// <param name="htmlpath">Path of the HTML file exported from the document.</param>
        public static void WordTable(Word.Document WordDoc, string htmlpath)
        {
            // NOTE(review): the export is read as GB2312; confirm this matches
            // the encoding Word actually writes on the target machines.
            string htmlString = GetHtml(Encoding.GetEncoding("GB2312"), htmlpath);
            MatchCollection htmlTables = TableRegex.Matches(htmlString);

            // Snapshot the tables first: deleting from a collection while it is
            // being enumerated can disturb the enumeration.
            List<Word.Table> tables = new List<Word.Table>();
            foreach (Word.Table ta in WordDoc.Tables)
            {
                tables.Add(ta);
            }

            for (int i = 0; i < tables.Count; i++)
            {
                if (i >= htmlTables.Count)
                {
                    // Fewer tables were recognised in the HTML than exist in the
                    // document (previously an index-out-of-range crash); leave
                    // the remaining tables untouched.
                    System.Diagnostics.Trace.TraceWarning(
                        "WordTable: {0} table(s) in the document but only {1} found in the HTML; remaining tables left unchanged.",
                        tables.Count,
                        htmlTables.Count);
                    break;
                }

                string tablehtml = StyleRegex.Replace(htmlTables[i].Value, "");

                // Capture the range before deleting the table, then insert the
                // markup where the table used to be.
                Word.Range rangetemp = tables[i].Range;
                tables[i].Delete();
                rangetemp.InsertBefore(tablehtml.Replace("\r\n", ""));
            }
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> (a local path works
        /// too) as text using the given encoding.
        /// </summary>
        /// <param name="enc">Encoding used to decode the content.</param>
        /// <param name="url">Address or file path to read.</param>
        /// <returns>The decoded content.</returns>
        protected static string GetHtml(Encoding enc, string url)
        {
            using (WebClient wc = new WebClient())
            {
                wc.Encoding = enc;
                return wc.DownloadString(url);
            }
        }

        /// <summary>
        /// Removes the pictures from the document and inserts an <c>img</c> tag
        /// at each picture position. Pictures are paired with image names by
        /// their order of appearance in the document.
        /// </summary>
        /// <param name="WordDoc">The open Word document to modify.</param>
        /// <param name="virtualPath">Path prefix put in front of every image name.</param>
        /// <param name="ImageUrl">Names of all pictures exported from this document.</param>
        public static void WordPic(Word.Document WordDoc, string virtualPath, List<string> ImageUrl)
        {
            List<Word.Range> ranges = new List<Word.Range>();

            // Inline pictures: snapshot first, then delete, so the deletions
            // cannot disturb the enumeration of the collection.
            List<Word.InlineShape> inlinePictures = new List<Word.InlineShape>();
            foreach (Word.InlineShape s in WordDoc.InlineShapes)
            {
                if (s.Type == Word.WdInlineShapeType.wdInlineShapePicture
                    || s.Type == Word.WdInlineShapeType.wdInlineShapeEmbeddedOLEObject)
                {
                    inlinePictures.Add(s);
                }
            }

            foreach (Word.InlineShape picture in inlinePictures)
            {
                ranges.Add(picture.Range);
                picture.Delete();
            }

            // Floating (non-inline) pictures are anchored to paragraphs, so every
            // paragraph has to be inspected.
            foreach (Word.Paragraph item in WordDoc.Paragraphs)
            {
                foreach (Word.Shape s in item.Range.ShapeRange)
                {
                    try
                    {
                        s.Select();

                        // Only pictures are handled; rectangles and similar
                        // auto shapes are skipped.
                        if (!s.AutoShapeType.ToString().Equals("msoShapeMixed"))
                        {
                            continue;
                        }

                        Word.Range range = WordDoc.ActiveWindow.Selection.Range;
                        ranges.Add(range);
                        WordDoc.ActiveWindow.Selection.Delete();
                    }
                    catch (Exception ex)
                    {
                        // Still best-effort: a shape that cannot be processed is
                        // skipped, but the failure is no longer silent.
                        System.Diagnostics.Trace.TraceWarning("WordPic: skipped a shape: {0}", ex.Message);
                    }
                }
            }

            // Order the collected ranges by their position in the document.
            ranges = ranges.OrderBy(r => r.Start).ToList();

            // Never index past the supplied image names (previously an
            // index-out-of-range crash when the document had more pictures).
            int available = ImageUrl == null ? 0 : ImageUrl.Count;
            int count = ranges.Count < available ? ranges.Count : available;
            if (count < ranges.Count)
            {
                System.Diagnostics.Trace.TraceWarning(
                    "WordPic: {0} picture(s) in the document but only {1} image name(s) supplied.",
                    ranges.Count,
                    available);
            }

            for (int i = 0; i < count; i++)
            {
                Word.Range r = ranges[i];
                r.InsertBefore("<img width='90%' src='" + string.Concat(virtualPath, ImageUrl[i]) + "'>");
                r.InsertAfter("</img>\r");
            }
        }
    }
}







原创粉丝点击