最近在开发过程中,遇到了需要将附件中的文本读出并存储到数据库中的问题。下面记录一下各种格式文件的文本读取方式,大家按需取用;如有不明白的地方,欢迎留言讨论。
String filePath = “你的文件路径";
注意:路径要精确到文件的后缀名,比如 D:/demo.doc。
// Handle used only for the "is this an existing regular file" guard below.
// NOTE(review): the snippets below assign to a variable named fileContent that is
// not declared anywhere in the visible text — presumably a String declared by the
// surrounding method or class; confirm before compiling.
File file = new File(filePath);
当文件后缀为doc时:
// Proceed only when the path refers to an existing regular file.
// NOTE(review): the closing brace of this block is not visible in this excerpt —
// confirm where the guard is meant to end.
if(file.isFile() && file.exists()) {
if (filePath.endsWith(".doc")) {
InputStream is = null;WordExtractor re = null;try {
is = new FileInputStream(new File(filePath));re = new WordExtractor(is);fileContent = re.getText();} catch (IOException e) {
e.printStackTrace();} finally {
try {
if(re != null){
re.close();}if(is != null){
is.close();}} catch (IOException e) {
e.printStackTrace();}}}
当文件后缀为docx时:
// Word 2007+ (.docx): extract the document text into fileContent.
if (filePath.endsWith(".docx")) {
    OPCPackage opcPackage = null;
    POIXMLTextExtractor extractor = null;
    try {
        opcPackage = POIXMLDocument.openPackage(filePath);
        extractor = new XWPFWordExtractor(opcPackage);
        fileContent = extractor.getText();
    } catch (Exception e) {
        // Best effort: on failure fileContent keeps its previous value.
        e.printStackTrace();
    } finally {
        try {
            if (extractor != null) {
                // Per the POI Javadoc, closing the extractor also closes the
                // package it was created from.
                extractor.close();
            } else if (opcPackage != null) {
                // The extractor was never constructed, so nothing owns the package
                // yet. revert() closes it without saving; previously the package
                // was simply leaked on this path.
                opcPackage.revert();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
当文件后缀为pdf时:
if(filePath.endsWith(".pdf")){
FileInputStream in = null;RandomAccessRead randomAccessRead = null;try {
in = new FileInputStream(new File(filePath));randomAccessRead = new RandomAccessBufferedFileInputStream(in);PDFParser parser = new PDFParser(randomAccessRead);parser.parse();PDDocument pdDocument = parser.getPDDocument();PDFTextStripper stripper = new PDFTextStripper();fileContent = stripper.getText(pdDocument);} catch (IOException e) {
e.printStackTrace();} finally {
try {
if(randomAccessRead != null){
randomAccessRead.close();}if(in != null){
in.close();}} catch (IOException e) {
e.printStackTrace();}}}
当文件后缀为xls时:
if (filePath.endsWith(".xls")) {
List<String> listXLS = new ArrayList<>();// 解析excelPOIFSFileSystem fs = null;// 获取整个excelHSSFWorkbook hb = null;try {
fs = new POIFSFileSystem(new FileInputStream(filePath));hb = new HSSFWorkbook(fs);// 遍历多个sheet页for(int sheetIndex=0;sheetIndex<hb.getNumberOfSheets();sheetIndex++) {
HSSFSheet sheet = hb.getSheetAt(sheetIndex);//HSSFSheet sheet = hb.getSheetAt(0);// 获取第一行int firstrow = sheet.getFirstRowNum();// 获取最后一行int lastrow = sheet.getLastRowNum();// 循环行数依次获取列数for (int i = firstrow; i < lastrow + 1; i++) {
// 获取哪一行iHSSFRow row = sheet.getRow(i);if (row != null) {
// 获取这一行的第一列int firstcell = row.getFirstCellNum();// 获取这一行的最后一列int lastcell = row.getLastCellNum();//将每一行的每一列数据都存入集合中for (int j = firstcell; j < lastcell; j++) {
// 获取第j列HSSFCell cell = row.getCell(j);if (cell != null) {
String cellStr = cell.toString();if(CommonUtils.isNotEmpty(cellStr)){
listXLS.add(cell.toString());}}}}}fileContent= String.valueOf(listXLS);}} catch (IOException e) {
e.printStackTrace();} finally {
try {
if(hb != null){
hb.close();}if(fs != null){
fs.close();}} catch (IOException e) {
e.printStackTrace();}}}
当文件后缀为xlsx时:
if (filePath.endsWith(".xlsx")) {
List<String> listXLSX = new ArrayList<>();// 用流的方式先读取到你想要的excel的文件FileInputStream fis = null;// 获取整个excelXSSFWorkbook hb = null;try {
fis = new FileInputStream(new File(filePath));hb = new XSSFWorkbook(fis);// 遍历表单sheetfor(int sheetIndex=0;sheetIndex<hb.getNumberOfSheets();sheetIndex++) {
Sheet sheet = hb.getSheetAt(sheetIndex);//Sheet sheet = hb.getSheetAt(0);// 获取第一行int firstrow = sheet.getFirstRowNum();// 获取最后一行int lastrow = sheet.getLastRowNum();// 循环行数依次获取列数for (int i = firstrow; i < lastrow + 1; i++) {
// 获取哪一行iRow row = sheet.getRow(i);if (row != null) {
// 获取这一行的第一列int firstcell = row.getFirstCellNum();// 获取这一行的最后一列int lastcell = row.getLastCellNum();// 创建一个集合,用处将每一行的每一列数据都存入集合中for (int j = firstcell; j < lastcell; j++) {
// 获取第j列Cell cell = row.getCell(j);if (cell != null) {
String cellStr = cell.toString();if(CommonUtils.isNotEmpty(cellStr)){
listXLSX.add(cell.toString());}}}}}fileContent = String.valueOf(listXLSX);}} catch (IOException e) {
e.printStackTrace();} finally {
try {
if(hb != null){
hb.close();}if(fis != null){
fis.close();}} catch (IOException e) {
e.printStackTrace();}}}
大家如果有更好的方法,欢迎留言讨论。