1.下载POI工具并引用
2.读取整个doc文档,获得该文档的所有字符串。
3.从该字符串中得到标题,把该标题构成一个HTML格式的字符串,如<html><head><title>测试文档</title></head><body>。
4.从该文档中判断是否有表格,如有,把每个表格的开始偏移量,结束偏移量记录下来,同时根据每个表格的行,列读取表格的内容,并构造出表格的HTML字符串。
5.从该字符串的第一个字符开始逐个字符循环,得到字符的字体,字号大小,直到下一个字符的字体,字号不一样时,把这些字符内容构造成一个HTML格式的字符串。
6.如果碰到字符为回车符,制表符,把回车符,制表符构造成HTML格式的字符串。
7.如果碰到字符为图片,读取图片,把图片放在指定路径,再把这一路径的信息构造成HTML字符串,如<img src='c://test//1.jpg'/>。
8.如读取字符串的位置等于表格的开始偏移量时,插入前面已构造出的表格HTML字符串,同时跳到表格的结束偏移量处,继续往下循环读取字符。
9.由于以上读取是按字符串逐个读取,并且根据字符的变化同时构造出HTML字符串,所以当字符串读取完毕后,即构造出一个完整的HTML字符串。
10.举例
Word文件
HTML文件
11.源代码
WordExcelToHtml.java
01.package com;
02.import java.io.BufferedWriter;
03.import java.io.File;
04.import java.io.FileInputStream;
05.import java.io.FileNotFoundException;
06.import java.io.FileOutputStream;
07.import java.io.IOException;
08.import java.io.OutputStream;
09.import java.io.OutputStreamWriter;
10.
11.import org.apache.poi.hwpf.HWPFDocument;
12.import org.apache.poi.hwpf.model.PicturesTable;
13.import org.apache.poi.hwpf.usermodel.CharacterRun;
14.import org.apache.poi.hwpf.usermodel.Picture;
15.import org.apache.poi.hwpf.usermodel.Range;
16.import org.apache.poi.hwpf.extractor.WordExtractor;
17.import org.apache.poi.hwpf.usermodel.Paragraph;
18.import org.apache.poi.hwpf.usermodel.Table;
19.import org.apache.poi.hwpf.usermodel.TableCell;
20.import org.apache.poi.hwpf.usermodel.TableIterator;
21.import org.apache.poi.hwpf.usermodel.TableRow;
22.
23.
24.public class WordExcelToHtml {
25.
26. /**
27. * ASCII code of the carriage return character.
28. */
29. private static final short ENTER_ASCII = 13;
30.
31. /**
32. * ASCII code of the space character.
33. */
34. private static final short SPACE_ASCII = 32;
35.
36. /**
37. * ASCII code of the horizontal tab character.
38. */
39. private static final short TABULATION_ASCII = 9;
40.
41. public static String htmlText = ""; // HTML output accumulated by getWordAndStyle/readPicture and handed to writeFile
42. public static String htmlTextTbl = ""; // HTML of the table readTable is currently building
43. public static int counter=0; // index of the most recently recorded table; readTable restarts it at -1
44. public static int beginPosi=0; // start offset of the table readTable is currently processing
45. public static int endPosi=0; // end offset of the table readTable is currently processing
46. public static int beginArray[]; // start offset of each recorded table, indexed by counter
47. public static int endArray[]; // end offset of each recorded table, indexed by counter
48. public static String htmlTextArray[]; // generated HTML of each recorded table, indexed by counter
49. public static boolean tblExist=false; // set to true by readTable once it has seen a table
50.
51. public static final String inputFile="c://bb.doc"; // document that main passes to getWordAndStyle
52. public static void main(String argv[])
53. {
54. try {
55. getWordAndStyle(inputFile);
56. } catch (Exception e) {
57. // TODO Auto-generated catch block
58. e.printStackTrace();
59. }
60. }
61.
62. /**
63. * 读取每个文字样式
64. *
65. * @param fileName
66. * @throws Exception
67. */
68.
69.
70. public static void getWordAndStyle(String fileName) throws Exception {
71. FileInputStream in = new FileInputStream(new File(fileName));
72. HWPFDocument doc = new HWPFDocument(in);
73.
74. Range rangetbl = doc.getRange();//得到文档的读取范围
75. TableIterator it = new TableIterator(rangetbl);
76. int num=100;
77.
78.
79. beginArray=new int[num];
80. endArray=new int[num];
81. htmlTextArray=new String[num];
82.
83.
84.
85.
86.
87.
88. // 取得文档中字符的总数
89. int length = doc.characterLength();
90. // 创建图片容器
91. PicturesTable pTable = doc.getPicturesTable();
92.
93. htmlText = "<html><head><title>" + doc.getSummaryInformation().getTitle() + "</title></head><body>";
94. // 创建临时字符串,好加以判断一串字符是否存在相同格式
95.
96. if(it.hasNext())
97. {
98. readTable(it,rangetbl);
99. }
100.
101. int cur=0;
102.
103. String tempString = "";
104. for (int i = 0; i < length - 1; i++) {
105. // 整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
106. Range range = new Range(i, i + 1, doc);
107.
108.
109.
110. CharacterRun cr = range.getCharacterRun(0);
111. //beginArray=new int[num];
112. //endArray=new int[num];
113. //htmlTextArray=new String[num];
114. if(tblExist)
115. {
116. if(i==beginArray[cur])
117. {
118. htmlText+=tempString+htmlTextArray[cur];
119. tempString="";
120. i=endArray[cur]-1;
121. cur++;
122. continue;
123. }
124. }
125. if (pTable.hasPicture(cr)) {
126. htmlText += tempString ;
127. // 读写图片
128. readPicture(pTable, cr);
129. tempString = "";
130. }
131. else {
132.
133. Range range2 = new Range(i + 1, i + 2, doc);
134. // 第二个字符
135. CharacterRun cr2 = range2.getCharacterRun(0);
136. char c = cr.text().charAt(0);
137.
138. System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);
139.
140. // 判断是否为回车符
141. if (c == ENTER_ASCII)
142. {
143. tempString += "<br/>";
144.
145. }
146. // 判断是否为空格符
147. else if (c == SPACE_ASCII)
148. tempString += " ";
149. // 判断是否为水平制表符
150. else if (c == TABULATION_ASCII)
151. tempString += " ";
152. // 比较前后2个字符是否具有相同的格式
153. boolean flag = compareCharStyle(cr, cr2);
154. if (flag)
155. tempString += cr.text();
156. else {
157. String fontStyle = "<span style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
158.
159. if (cr.isBold())
160. fontStyle += "font-weight:bold;";
161. if (cr.isItalic())
162. fontStyle += "font-style:italic;";
163.
164. htmlText += fontStyle + "" mce_style="font-family:" + cr.getFontName() + ";font-size:" + cr.getFontSize() / 2 + "pt;";
165.
166. if (cr.isBold())
167. fontStyle += "font-weight:bold;";
168. if (cr.isItalic())
169. fontStyle += "font-style:italic;";
170.
171. htmlText += fontStyle + "">" + tempString + cr.text() + "</span>";
172. tempString = "";
173. }
174. }
175. }
176.
177. htmlText += tempString+"</body></html>";
178. writeFile(htmlText);
179. }
180.
181. /**
182. * 读写文档中的表格
183. *
184. * @param pTable
185. * @param cr
186. * @throws Exception
187. */
188. public static void readTable(TableIterator it, Range rangetbl) throws Exception {
189.
190. htmlTextTbl="";
191. //迭代文档中的表格
192.
193. counter=-1;
194. while (it.hasNext())
195. {
196. tblExist=true;
197. htmlTextTbl="";
198. Table tb = (Table) it.next();
199. beginPosi=tb.getStartOffset() ;
200. endPosi=tb.getEndOffset();
201.
202. System.out.println("............"+beginPosi+"...."+endPosi);
203. counter=counter+1;
204. //迭代行,默认从0开始
205. beginArray[counter]=beginPosi;
206. endArray[counter]=endPosi;
207.
208. htmlTextTbl+="<table border>";
209. for (int i = 0; i < tb.numRows(); i++) {
210. TableRow tr = tb.getRow(i);
211.
212. htmlTextTbl+="<tr>";
213. //迭代列,默认从0开始
214. for (int j = 0; j < tr.numCells(); j++) {
215. TableCell td = tr.getCell(j);//取得单元格
216. int cellWidth=td.getWidth();
217.
218. //取得单元格的内容
219. for(int k=0;k<td.numParagraphs();k++){
220. Paragraph para =td.getParagraph(k);
221. String s = para.text().toString().trim();
222. if(s=="")
223. {
224. s=" ";
225. }
226. System.out.println(s);
227. htmlTextTbl += "<td width="+cellWidth+ ">"+s+"</td>";
228. System.out.println(i+":"+j+":"+cellWidth+":"+s);
229. } //end for
230. } //end for
231. } //end for
232. htmlTextTbl+="</table>" ;
233. htmlTextArray[counter]=htmlTextTbl;
234.
235. } //end while
236. }
237.
238. /**
239. * 读写文档中的图片
240. *
241. * @param pTable
242. * @param cr
243. * @throws Exception
244. */
245. public static void readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
246. // 提取图片
247. Picture pic = pTable.extractPicture(cr, false);
248. // 返回POI建议的图片文件名
249. String afileName = pic.suggestFullFileName();
250. OutputStream out = new FileOutputStream(new File("c://test" + File.separator + afileName));
251. pic.writeImageContent(out);
252. htmlText += "<img src="c://test//" + afileName + "" mce_src="c://test//" + afileName + ""/>";
253. }
254.
255. public static boolean compareCharStyle(CharacterRun cr1, CharacterRun cr2)
256. {
257. boolean flag = false;
258. if (cr1.isBold() == cr2.isBold() && cr1.isItalic() == cr2.isItalic() && cr1.getFontName().equals(cr2.getFontName()) && cr1.getFontSize() == cr2.getFontSize())
259. {
260. flag = true;
261. }
262. return flag;
263. }
264.
265.
266. /**
267. * 写文件
268. *
269. * @param s
270. */
271. public static void writeFile(String s) {
272. FileOutputStream fos = null;
273. BufferedWriter bw = null;
274. try {
275. File file = new File("c://abc.html");
276. fos = new FileOutputStream(file);
277. bw = new BufferedWriter(new OutputStreamWriter(fos));
278. bw.write(s);
279. } catch (FileNotFoundException fnfe) {
280. fnfe.printStackTrace();
281. } catch (IOException ioe) {
282. ioe.printStackTrace();
283. } finally {
284. try {
285. if (bw != null)
286. bw.close();
287. if (fos != null)
288. fos.close();
289. } catch (IOException ie) {
290. }
291. }
292. }
293.
294.