Java引用POI实现Word转Html方法 _HTML/CSS

Java引用POI实现Word转Html方法
1.下载POI工具并引用
2.读取整个doc文档，获得该文档的所有字符串。
3.从该字符串中得到标题，把该标题构成一个HTML格式的字符串，如<html><head><title>测试文档</title></head><body>。
4.从该文档中判断是否有表格，如有，把每个表格的开始偏移量，结束偏移量记录下来，同时根据每个表格的行，列读取表格的内容，并构造出表格的HTML字符串。
5.从该字符串的第一个字符开始逐个字符循环，得到字符的字体，字号大小，直到下一个字符的字体，字号不一样时，把这些字符内容构造成一个HTML格式的字符串。
6.如果碰到字符为回车符，制表符，把回车符，制表符构造成HTML格式的字符串。
7.如果碰到字符为图片，读取图片，把图片放在指定路径，再把这一路径的信息构造成HTML字符串，如<img src='c://test//1.jpg'/>。
8.如读取字符串的位置等于表格的开始偏移量时，插入前面已构造出的表格HTML字符串，同时跳过表格的结束偏移量，继续往下循环读取字符。
9.由于以上读取是按字符串逐个读取，并且根据字符的变化同时构造出HTML字符串，所以当字符串读取完毕后，即构造出一个完整的HTML字符串。
10.举例
Word文件
HTML文件
11.源代码
WordExcelToHtml.java

01.package com;
02.import java.io.BufferedWriter;
03.import java.io.File;
04.import java.io.FileInputStream;
05.import java.io.FileNotFoundException;
06.import java.io.FileOutputStream;
07.import java.io.IOException;
08.import java.io.OutputStream;
09.import java.io.OutputStreamWriter;
10.
11.import org.apache.poi.hwpf.HWPFDocument;
12.import org.apache.poi.hwpf.model.PicturesTable;
13.import org.apache.poi.hwpf.usermodel.CharacterRun;
14.import org.apache.poi.hwpf.usermodel.Picture;
15.import org.apache.poi.hwpf.usermodel.Range;
16.import org.apache.poi.hwpf.extractor.WordExtractor;
17.import org.apache.poi.hwpf.usermodel.Paragraph;
18.import org.apache.poi.hwpf.usermodel.Table;
19.import org.apache.poi.hwpf.usermodel.TableCell;
20.import org.apache.poi.hwpf.usermodel.TableIterator;
21.import org.apache.poi.hwpf.usermodel.TableRow;
22.
23.
24.public class WordExcelToHtml {
25.
26.    /**
27.     * ASCII code of the carriage-return character.
28.     */
29.    private static final short ENTER_ASCII = 13;
30.
31.    /**
32.     * ASCII code of the space character.
33.     */
34.    private static final short SPACE_ASCII = 32;
35.
36.    /**
37.     * ASCII code of the horizontal-tab character.
38.     */
39.    private static final short TABULATION_ASCII = 9;
40.
41.    public static String htmlText = ""; // HTML document built so far; appended to by getWordAndStyle and readPicture
42.    public static String htmlTextTbl = ""; // HTML of the table currently being built by readTable
43.    public static int counter=0; // index of the table most recently read by readTable (reset to -1 there)
44.    public static int beginPosi=0; // start offset of the table currently being read
45.    public static int endPosi=0; // end offset of the table currently being read
46.    public static int beginArray[]; // start offset of each table, indexed by table order
47.    public static int endArray[]; // end offset of each table, indexed by table order
48.    public static String htmlTextArray[]; // generated HTML of each table, indexed by table order
49.    public static boolean tblExist=false; // set to true by readTable once a table is found
50.
51.    public static final String inputFile="c://bb.doc"; // path of the Word document that main converts
52.    public static void main(String argv[])
53.    {
54.        try {
55.            getWordAndStyle(inputFile);
56.        } catch (Exception e) {
57.            // TODO Auto-generated catch block
58.            e.printStackTrace();
59.        }
60.    }
61.
62.    /**
63.     * 读取每个文字样式
64.     *
65.     * @param fileName
66.     * @throws Exception
67.     */
68.
69.
70.    public static void getWordAndStyle(String fileName) throws Exception {
71.        FileInputStream in = new FileInputStream(new File(fileName));
72.        HWPFDocument doc = new HWPFDocument(in);
73.
74.         Range rangetbl = doc.getRange();//得到文档的读取范围
75.         TableIterator it = new TableIterator(rangetbl);
76.         int num=100;
77.
78.
79.         beginArray=new int[num];
80.         endArray=new int[num];
81.         htmlTextArray=new String[num];
82.
83.
84.
85.
86.
87.
88.        // 取得文档中字符的总数
89.        int length = doc.characterLength();
90.        // 创建图片容器
91.        PicturesTable pTable = doc.getPicturesTable();
92.
93.        htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";
94.        // 创建临时字符串,好加以判断一串字符是否存在相同格式
95.
96.         if(it.hasNext())
97.         {
98.             readTable(it,rangetbl);
99.         }
100.
101.         int cur=0;
102.
103.        String tempString = "";
104.        for (int i = 0; i < length - 1; i++) {
105.            // 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
106.            Range range = new Range(i, i + 1, doc);
107.
108.
109.
110.            CharacterRun cr = range.getCharacterRun(0);
111.            //beginArray=new int[num];
112.             //endArray=new int[num];
113.             //htmlTextArray=new String[num];
114.            if(tblExist)
115.            {
116.                if(i==beginArray[cur])
117.                {
118.                    htmlText+=tempString+htmlTextArray[cur];
119.                    tempString="";
120.                    i=endArray[cur]-1;
121.                    cur++;
122.                    continue;
123.                }
124.            }
125.            if (pTable.hasPicture(cr)) {
126.                htmlText += tempString ;
127.                // 读写图片
128.                readPicture(pTable, cr);
129.                tempString = "";
130.            }
131.            else {
132.
133.                Range range2 = new Range(i + 1, i + 2, doc);
134.                // 第二个字符
135.                CharacterRun cr2 = range2.getCharacterRun(0);
136.                char c = cr.text().charAt(0);
137.
138.                System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);
139.
140.                // 判断是否为回车符
141.                if (c == ENTER_ASCII)
142.                    {
143.                    tempString += "<br/>";
144.
145.                    }
146.                // 判断是否为空格符
147.                else if (c == SPACE_ASCII)
148.                    tempString += " ";
149.                // 判断是否为水平制表符
150.                else if (c == TABULATION_ASCII)
151.                    tempString += "    ";
152.                // 比较前后2个字符是否具有相同的格式
153.                boolean flag = compareCharStyle(cr, cr2);
154.                if (flag)
155.                    tempString += cr.text();
156.                else {
157.                    String fontStyle = "<span style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
158.
159.                    if (cr.isBold())
160.                        fontStyle += "font-weight:bold;";
161.                    if (cr.isItalic())
162.                        fontStyle += "font-style:italic;";
163.
164.                    htmlText += fontStyle + "" mce_style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
165.
166.                    if (cr.isBold())
167.                        fontStyle += "font-weight:bold;";
168.                    if (cr.isItalic())
169.                        fontStyle += "font-style:italic;";
170.
171.                    htmlText += fontStyle + "">" + tempString + cr.text() + "</span>";
172.                    tempString = "";
173.                }
174.            }
175.        }
176.
177.        htmlText += tempString+"</body></html>";
178.        writeFile(htmlText);
179.    }
180.
181.    /**
182.     * 读写文档中的表格
183.     *
184.     * @param pTable
185.     * @param cr
186.     * @throws Exception
187.     */
188.    public static void readTable(TableIterator it, Range rangetbl) throws Exception {
189.
190.        htmlTextTbl="";
191.        //迭代文档中的表格
192.
193.        counter=-1;
194.        while (it.hasNext())
195.        {
196.            tblExist=true;
197.             htmlTextTbl="";
198.             Table tb = (Table) it.next();
199.             beginPosi=tb.getStartOffset() ;
200.             endPosi=tb.getEndOffset();
201.
202.             System.out.println("............"+beginPosi+"...."+endPosi);
203.             counter=counter+1;
204.             //迭代行，默认从0开始
205.             beginArray[counter]=beginPosi;
206.             endArray[counter]=endPosi;
207.
208.             htmlTextTbl+="<table border>";
209.             for (int i = 0; i < tb.numRows(); i++) {
210.             TableRow tr = tb.getRow(i);
211.
212.             htmlTextTbl+="<tr>";
213.             //迭代列，默认从0开始
214.             for (int j = 0; j < tr.numCells(); j++) {
215.                 TableCell td = tr.getCell(j);//取得单元格
216.                 int cellWidth=td.getWidth();
217.
218.                 //取得单元格的内容
219.                 for(int k=0;k<td.numParagraphs();k++){
220.                         Paragraph para =td.getParagraph(k);
221.                         String s = para.text().toString().trim();
222.                         if(s=="")
223.                         {
224.                             s=" ";
225.                         }
226.                         System.out.println(s);
227.                         htmlTextTbl += "<td width="+cellWidth+ ">"+s+"</td>";
228.                         System.out.println(i+":"+j+":"+cellWidth+":"+s);
229.                    } //end for
230.                 }   //end for
231.              }   //end for
232.            htmlTextTbl+="</table>" ;
233.            htmlTextArray[counter]=htmlTextTbl;
234.
235.        } //end while
236.    }
237.
238.    /**
239.     * 读写文档中的图片
240.     *
241.     * @param pTable
242.     * @param cr
243.     * @throws Exception
244.     */
245.    public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
246.        // 提取图片
247.        Picture pic = pTable.extractPicture(cr, false);
248.        // 返回POI建议的图片文件名
249.        String afileName = pic.suggestFullFileName();
250.        OutputStream out = new FileOutputStream(new File("c://test" + File.separator + afileName));
251.        pic.writeImageContent(out);
252.        htmlText += "<img src="c://test//" + afileName + "" mce_src="c://test//" + afileName + ""/>";
253.    }
254.
255.    public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2)
256.    {
257.        boolean flag = false;
258.        if (cr1.isBold() == cr2.isBold() && cr1.isItalic() == cr2.isItalic() && cr1.getFontName().equals(cr2.getFontName()) && cr1.getFontSize() == cr2.getFontSize())
259.        {
260.            flag = true;
261.        }
262.        return flag;
263.    }
264.
265.
266.    /**
267.     * 写文件
268.     *
269.     * @param s
270.     */
271.    public static void writeFile(String s) {
272.        FileOutputStream fos = null;
273.        BufferedWriter bw = null;
274.        try {
275.            File file = new File("c://abc.html");
276.            fos = new FileOutputStream(file);
277.            bw = new BufferedWriter(new OutputStreamWriter(fos));
278.            bw.write(s);
279.        } catch (FileNotFoundException fnfe) {
280.            fnfe.printStackTrace();
281.        } catch (IOException ioe) {
282.            ioe.printStackTrace();
283.        } finally {
284.            try {
285.                if (bw != null)
286.                    bw.close();
287.                if (fos != null)
288.                    fos.close();
289.            } catch (IOException ie) {
290.            }
291.        }
292.    }
293.
294.