Java 读取Word文本框中的文本、图片、表格
Java 读取Word中的文本和图片
Java 提取Word中的文本和图片:本文将介绍通过Java来提取或读取Word文档中文本和图片的方法。
这里提取文本和图片包括同时提取文档正文当中以及页眉、页脚中的文本和图片。
使用工具:Spire.Doc for Java。Jar文件导入方法(参考):方法1:下载jar文件包。
下载后解压文件,并将lib文件夹下的Spire.Doc.jar文件导入到java程序。
导入效果参考如下:方法2:可通过maven导入。
参考导入方法。
测试文档如下:Java 代码示例(供参考)【示例1】提取Word 中的文本 import com.spire.doc.*; import java.io.FileWriter;import java.io.IOException;public class ExtractText {public static void main(String[] args) throws IOException{//加载测试文档Document doc = new Document();doc.loadFromFile("test.docx");//获取文本保存为StringString text = doc.getText();//将String写入TxtwriteStringToTxt(text,"提取文本.txt");}public static void writeStringToTxt(String content, String txtFileName) throws IOException {FileWriter fWriter= new FileWriter(txtFileName,true);try {fWriter.write(content);}catch(IOException ex){ex.printStackTrace();}finally{try{fWriter.flush();fWriter.close();} catch (IOException ex) {ex.printStackTrace();}}}}文本提取结果:【示例2】提取Word中的图片import com.spire.doc.Document;import com.spire.doc.documents.DocumentObjectType;import com.spire.doc.fields.DocPicture;import com.spire.doc.interfaces.ICompositeObject;import com.spire.doc.interfaces.IDocumentObject;import javax.imageio.ImageIO;import java.awt.image.RenderedImage;import java.io.File;import java.io.IOException;import java.util.ArrayList;import java.util.LinkedList;import java.util.List;import java.util.Queue;public class ExtractImg {public static void main(String[] args) throws IOException { //加载Word文档Document document = new Document();document.loadFromFile("test.docx");//创建Queue对象Queue nodes = new LinkedList();nodes.add(document);//创建List对象List images = new ArrayList();//遍历文档中的子对象while (nodes.size() > 0) {ICompositeObject node = (ICompositeObject) nodes.poll();for (int i = 0; i < node.getChildObjects().getCount(); i++) {IDocumentObject child = node.getChildObjects().get(i);if (child instanceof ICompositeObject) {nodes.add((ICompositeObject) child);//获取图片并添加到Listif (child.getDocumentObjectType() == DocumentObjectType.Picture) { DocPicture picture = (DocPicture) child;images.add(picture.getImage());}}}}//将图片保存为PNG格式文件for (int i = 0; i < images.size(); i++) {File file = new File(String.format("图片-%d.png", i));ImageIO.write((RenderedImage) images.get(i), "PNG", 
file);}}}图片提取结果:(本文完)。
JavaPOI操作word文档内容、表格
JavaPOI操作word⽂档内容、表格⼀、pom<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml-schemas</artifactId><version>4.0.0</version></dependency>⼆、直接上代码word模板中${content} 注意我只有在.docx⽤XWPFDocument才有效2.1/*** 获取document**/XWPFDocument document = null;try {document = new XWPFDocument(inputStream);} catch (IOException ioException) {ioException.printStackTrace();}/*** 替换段落⾥⾯的变量** @param doc 要替换的⽂档* @param params 参数*/private void replaceInPara(XWPFDocument doc, Map<String, String> params) {for (XWPFParagraph para : doc.getParagraphs()) {replaceInPara(para, params);}}/*** 替换段落⾥⾯的变量** @param para 要替换的段落* @param params 参数*/private void replaceInPara(XWPFParagraph para, Map<String, String> params) {List<XWPFRun> runs;Matcher matcher;replaceText(para);//如果para拆分的不对,则⽤这个⽅法修改成正确的if (matcher(para.getParagraphText()).find()) {runs = para.getRuns();for (int i = 0; i < runs.size(); i++) {XWPFRun run = runs.get(i);String runText = run.toString();matcher = matcher(runText);if (matcher.find()) {while ((matcher = matcher(runText)).find()) {runText = matcher.replaceFirst(String.valueOf(params.get(matcher.group(1))));}//直接调⽤XWPFRun的setText()⽅法设置⽂本时,在底层会重新创建⼀个XWPFRun,把⽂本附加在当前⽂本后⾯, para.removeRun(i);para.insertNewRun(i).setText(runText);}}}}/*** 替换⽂本内容* @param para* @return*/private List<XWPFRun> replaceText(XWPFParagraph para) {List<XWPFRun> runs = para.getRuns();String str = "";boolean flag = false;for (int i = 0; i < runs.size(); i++) {XWPFRun run = runs.get(i);String runText = run.toString();if (flag || runText.equals("${")) {str = str + runText;flag = true;para.removeRun(i);if (runText.equals("}")) {flag = 
false;para.insertNewRun(i).setText(str);str = "";}i--;}}return runs;}2.22.2.1XWPFTable table = document.getTableArray(0);//获取当前表格XWPFTableRow twoRow = table.getRow(2);//获取某⼀⾏XWPFTableRow nextRow = table.insertNewTableRow(3);//插⼊⼀⾏XWPFTableCell firstRowCellOne = firstRow.getCell(0);firstRowCellOne.removeParagraph(0);//删除默认段落,要不然表格内第⼀条为空⾏XWPFParagraph pIO2 =firstRowCellOne.addParagraph();XWPFRun rIO2 = pIO2.createRun();rIO2.setFontFamily("宋体");//字体rIO2.setFontSize(8);//字体⼤⼩rIO2.setBold(true);//是否加粗rIO2.setColor("FF0000");//字体颜⾊rIO2.setText("这是写⼊的内容");//rIO2.addBreak(BreakType.TEXT_WRAPPING);//软换⾏,亲测有效/*** 复制单元格和样式** @param targetRow 要复制的⾏* @param sourceRow 被复制的⾏*/public void createCellsAndCopyStyles(XWPFTableRow targetRow, XWPFTableRow sourceRow) {targetRow.getCtRow().setTrPr(sourceRow.getCtRow().getTrPr());List<XWPFTableCell> tableCells = sourceRow.getTableCells();if (CollectionUtils.isEmpty(tableCells)) {return;}for (XWPFTableCell sourceCell : tableCells) {XWPFTableCell newCell = targetRow.addNewTableCell();newCell.getCTTc().setTcPr(sourceCell.getCTTc().getTcPr());List sourceParagraphs = sourceCell.getParagraphs();if (CollectionUtils.isEmpty(sourceParagraphs)) {continue;}XWPFParagraph sourceParagraph = (XWPFParagraph) sourceParagraphs.get(0);List targetParagraphs = newCell.getParagraphs();if (CollectionUtils.isEmpty(targetParagraphs)) {XWPFParagraph p = newCell.addParagraph();p.getCTP().setPPr(sourceParagraph.getCTP().getPPr());XWPFRun run = p.getRuns().isEmpty() ? p.createRun() : p.getRuns().get(0);run.setFontFamily(sourceParagraph.getRuns().get(0).getFontFamily());} else {XWPFParagraph p = (XWPFParagraph) targetParagraphs.get(0);p.getCTP().setPPr(sourceParagraph.getCTP().getPPr());XWPFRun run = p.getRuns().isEmpty() ? 
p.createRun() : p.getRuns().get(0);if (sourceParagraph.getRuns().size() > 0) {run.setFontFamily(sourceParagraph.getRuns().get(0).getFontFamily());}}}}#### 2.2.3/*** 合并单元格** @param table 表格对象* @param beginRowIndex 开始⾏索引* @param endRowIndex 结束⾏索引* @param colIndex 合并列索引*/public void mergeCell(XWPFTable table, int beginRowIndex, int endRowIndex, int colIndex) { if (beginRowIndex == endRowIndex || beginRowIndex > endRowIndex) {return;}//合并⾏单元格的第⼀个单元格CTVMerge startMerge = CTVMerge.Factory.newInstance();startMerge.setVal(STMerge.RESTART);//合并⾏单元格的第⼀个单元格之后的单元格CTVMerge endMerge = CTVMerge.Factory.newInstance();endMerge.setVal(STMerge.CONTINUE);table.getRow(beginRowIndex).getCell(colIndex).getCTTc().getTcPr().setVMerge(startMerge); for (int i = beginRowIndex + 1; i <= endRowIndex; i++) {table.getRow(i).getCell(colIndex).getCTTc().getTcPr().setVMerge(endMerge);}}/*** insertRow 在word表格中指定位置插⼊⼀⾏,并将某⼀⾏的样式复制到新增⾏* @param copyrowIndex 需要复制的⾏位置* @param newrowIndex 需要新增⼀⾏的位置* */public static void insertRow(XWPFTable table, int copyrowIndex, int newrowIndex) {// 在表格中指定的位置新增⼀⾏XWPFTableRow targetRow = table.insertNewTableRow(newrowIndex);// 获取需要复制⾏对象XWPFTableRow copyRow = table.getRow(copyrowIndex);//复制⾏对象targetRow.getCtRow().setTrPr(copyRow.getCtRow().getTrPr());//或许需要复制的⾏的列List<XWPFTableCell> copyCells = copyRow.getTableCells();//复制列对象XWPFTableCell targetCell = null;for (int i = 0; i < copyCells.size(); i++) {XWPFTableCell copyCell = copyCells.get(i);targetCell = targetRow.addNewTableCell();targetCell.getCTTc().setTcPr(copyCell.getCTTc().getTcPr());if (copyCell.getParagraphs() != null && copyCell.getParagraphs().size() > 0) {targetCell.getParagraphs().get(0).getCTP().setPPr(copyCell.getParagraphs().get(0).getCTP().getPPr()); if (copyCell.getParagraphs().get(0).getRuns() != null&& copyCell.getParagraphs().get(0).getRuns().size() > 0) {XWPFRun cellR = targetCell.getParagraphs().get(0).createRun();cellR.setBold(copyCell.getParagraphs().get(0).getRuns().get(0).isBold());}}}}/*** 正则匹配字符串** 
@param str* @return*/private Matcher matcher(String str) {Pattern pattern = Pattern.compile("\\$\\{(.+?)\\}", Pattern.CASE_INSENSITIVE);Matcher matcher = pattern.matcher(str);return matcher;}。
java中读取word文档里的内容
Iterator<XWPFTable> itTable = document.getTablesIterator(); int ind = 0; while (itTable.hasNext()){
ind++; XWPFTable table = (XWPFTable) itTable.next(); //行 int rcount = table.getNumberOfRows(); for (int i = 0; i < rcount; i++){
// OPCPt.openPackage(srcPath); // XWPFDocument doc = new XWPFDocument(pack); return xdoc; } catch (IOException e) { System.out.println("读取文件出错!"); e.printStackTrace(); return null; } } return null; }
}
//pom.xml文件
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId></groupId> <artifactId>excelReadAndWrite</artifactId> <version>0.0.1-SNAPSHOT</version> <dependencies> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>3.8.1</version> <scope>test</scope> </dependency> <dependency> <groupId>org.apache.directory.studio</groupId> <artifactId>org.apache.commons.codec</artifactId> <version>1.8</version> </dependency> <dependency> <groupId>net.sourceforge.jexcelapi</groupId> <artifactId>jxl</artifactId> <version>2.6.12</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.9</version> </dependency> </dependencies> </project>
java读取word并解析
java读取word并解析java POI3.8处理word模板,⽂字图⽚表格将word模板⾥⾯的特殊标签换成⽂字,图⽚,以下是处理的代码特殊标签最好的复制粘贴到word模板⾥⾯ ,因为⼿动敲⼊可能有点⼩的差别都导致这个标签不是⼀⼩块(chunk)这样会⽆法识别,⽂字样式设置的时候也最好选择特殊标签整体进⾏设置,尽量不要多选(例如标签后⾯跟上⼀个空格)这⾥的替换包含了⽂字样式的替换,图⽚的替换-------------------------------------------------------------------------------------------------------------------------------------package com.util.export;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import java.util.Map.Entry;import org.apache.poi.POIXMLDocument;import org.apache.poi.openxml4j.exceptions.InvalidFormatException;import ermodel.ParagraphAlignment;import ermodel.UnderlinePatterns;import ermodel.VerticalAlign;import ermodel.XWPFDocument;import ermodel.XWPFParagraph;import ermodel.XWPFRun;import ermodel.XWPFTable;import ermodel.XWPFTableCell;import ermodel.XWPFTableRow;import org.apache.xmlbeans.XmlException;import org.apache.xmlbeans.XmlToken;import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;import org.openxmlformats.schemas.drawingml.x2006.main.CTPositiveSize2D;import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline;public class WordCompileReport {public static void searchAndReplace(String srcPath, String destPath,Map map,Map mapImage) {try {XWPFDocument document = new XWPFDocument(POIXMLDocument.openPackage(srcPath));//替换表格占位符checkTables(document,map);//替换段落占位符checkParagraphs(document,map);//在末尾添加⽂字addParagraphToWord(document,"这⾥添加⽂字",30,0,"#EB9074",true);//替换图⽚replaceTextToImage(document,mapImage,200,200);FileOutputStream outStream = null;outStream = new FileOutputStream(destPath);document.write(outStream);outStream.close();} catch (Exception e) {e.printStackTrace();}}public static void checkTables(XWPFDocument document,Map map) {Iterator it = document.getTablesIterator();while (it.hasNext()) {XWPFTable table = (XWPFTable) it.next();int rcount = 
table.getNumberOfRows();for (int i = 0; i < rcount; i++) {XWPFTableRow row = table.getRow(i);List cells = row.getTableCells();for (XWPFTableCell cell : cells) {List listCell;for (Entry e : map.entrySet()) {listCell = cell.getParagraphs();List cellRun;Map mapAttr = new HashMap();for (int j = 0; j < listCell.size(); j++) {if (listCell.get(j).getText().indexOf(e.getKey()) != -1) {cellRun = listCell.get(j).getRuns();for (int c = 0; c < cellRun.size(); c++) {if (cellRun.get(c).getText(0).equals(e.getKey())) {mapAttr = getWordXWPFRunStyle(cellRun.get(c));listCell.get(j).removeRun(c);XWPFRun newRun = listCell.get(j).insertNewRun(c);setWordXWPFRunStyle(newRun, mapAttr,e.getValue(), false);}}}}}}}}}public static void checkParagraphs(XWPFDocument document,Map map){List listRun;Map mapAttr = new HashMap();List listParagraphs = document.getParagraphs();for (int sa = 0; sa < listParagraphs.size(); sa++) {for (Entry e : map.entrySet()) {if (listParagraphs.get(sa).getText().indexOf(e.getKey()) != -1) {listRun = listParagraphs.get(sa).getRuns();for (int p = 0; p < listRun.size(); p++) {if (listRun.get(p).toString().equals(e.getKey())) {//得到占位符的⽂本格式XWPFRun runOld = listParagraphs.get(sa).getRuns().get(p);mapAttr=getWordXWPFRunStyle(runOld); //封装该占位符⽂本样式到map listParagraphs.get(sa).removeRun(p);//移除占位符//创建设置对应占位符的⽂本XWPFRun runNew = listParagraphs.get(sa).insertNewRun(p);setWordXWPFRunStyle(runNew,mapAttr,e.getValue(),true);}}}}}}public static Map getWordXWPFRunStyle(XWPFRun runOld){Map mapAttr = new HashMap();mapAttr.put("Color", runOld.getColor());if(-1==runOld.getFontSize()){mapAttr.put("FontSize", 12);}else{mapAttr.put("FontSize", runOld.getFontSize());}mapAttr.put("Subscript", runOld.getSubscript());mapAttr.put("Underline", runOld.getUnderline());mapAttr.put("FontFamily",runOld.getFontFamily());return mapAttr;}public static XWPFRun setWordXWPFRunStyle(XWPFRun runNew,Map mapAttr,String text,boolean flag){ runNew.setColor((String) 
mapAttr.get("Color"));if("-1".equals(mapAttr.get("FontSize").toString())){//处理⼩四字号读取为-1的问题runNew.setFontSize(12);}else{runNew.setFontSize((Integer) mapAttr.get("FontSize"));}runNew.setBold(flag);runNew.setUnderline((UnderlinePatterns) mapAttr.get("Underline"));runNew.setText(text);runNew.setSubscript((VerticalAlign) mapAttr.get("Subscript"));runNew.setFontFamily((String) mapAttr.get("FontFamily"));return runNew;}public static void updatePicture(XWPFDocument document,int id, int width, int height) {if(id==0){id = document.getAllPictures().size()-1;}final int EMU = 9525;width *= EMU;height *= EMU;String blipId = document.getAllPictures().get(id).getPackageRelationship().getId();CTInline inline = document.createParagraph().createRun().getCTR().addNewDrawing().addNewInline();String picXml = ""+ ""+ " "+ " "+ " " + "+ id+ "\" name=\"Generated\"/>"+ " "+ " "+ " "+ "+ blipId+ "\" xmlns:r=\"/officeDocument/2006/relationships\"/>"+ " "+ " "+ " "+ " "+ " "+ " "+ " "+ "+ width+ "\" cy=\""+ height+ "\"/>"+ " "+ " "+ " "+ " "+ " "+ " "+ " " + "";// CTGraphicalObjectData graphicData =inline.addNewGraphic().addNewGraphicData();XmlToken xmlToken = null;try {xmlToken = XmlToken.Factory.parse(picXml);} catch (XmlException xe) {xe.printStackTrace();}inline.set(xmlToken);// graphicData.set(xmlToken);inline.setDistT(0);inline.setDistB(0);inline.setDistL(0);inline.setDistR(0);CTPositiveSize2D extent = inline.addNewExtent();extent.setCx(width);extent.setCy(height);CTNonVisualDrawingProps docPr = inline.addNewDocPr();docPr.setId(id);docPr.setName("IMG_" + id);docPr.setDescr("IMG_" + id);}public static void addPictureToWord(XWPFDocument document,String imagePath,int imageType,int width,int height){if(0==imageType){imageType=XWPFDocument.PICTURE_TYPE_JPEG;}try {String ind = document.addPictureData(new FileInputStream(imagePath), imageType);} catch (InvalidFormatException e) {e.printStackTrace();} catch (FileNotFoundException e) 
{e.printStackTrace();}updatePicture(document,document.getAllPictures().size()-1,400,400);}public static void addParagraphToWord(XWPFDocument document,String text,int fontSize,int alignment,String RGBColor,boolean isBold){XWPFParagraph paragraph = document.createParagraph();if(1==alignment){paragraph.setAlignment(ParagraphAlignment.CENTER);}else if(2==alignment){paragraph.setAlignment(ParagraphAlignment.CENTER);}else if(3==alignment){paragraph.setAlignment(ParagraphAlignment.RIGHT);}else{paragraph.setIndentationLeft(alignment);}XWPFRun runOne = paragraph.createRun();runOne.setText(text);runOne.setBold(isBold);runOne.setFontSize(fontSize);if(RGBColor.startsWith("#")){runOne.setColor(RGBColor.substring(1));}else{runOne.setColor(RGBColor);}}public static void addRunToParagraph(XWPFParagraph paragraph,String text,int fontSize,String RGBColor,boolean isBold,boolean isWrap){XWPFRun runText = paragraph.createRun();// runText.setStrike(true); //删除线runText.setBold(isBold);runText.setColor(RGBColor);runText.setFontSize(fontSize);runText.setText(text);if(isWrap)runText.addBreak();}public static void replaceTextToImage(XWPFDocument document,Map mapImage,int width,int height){List listRun;List listParagraphs = document.getParagraphs();for (int sa = 0; sa < listParagraphs.size(); sa++) {for (Entry e : mapImage.entrySet()) {if (listParagraphs.get(sa).getText().indexOf(e.getKey()) != -1) {listRun = listParagraphs.get(sa).getRuns();for (int p = 0; p < listRun.size(); p++) {if (listRun.get(p).toString().equals(e.getKey())) {listParagraphs.get(sa).removeRun(p);//移除占位符//获得当前CTInlineCTInline inline = listParagraphs.get(sa).createRun().getCTR().addNewDrawing().addNewInline();try {insertPicture(document,e.getValue(),inline,width,height);} catch (InvalidFormatException e1) {e1.printStackTrace();} catch (FileNotFoundException e1) {e1.printStackTrace();}}}}}}}public static void insertPicture(XWPFDocument document,String filePath,CTInline inline,int width, int height) throws 
InvalidFormatException, FileNotFoundException{String ind = document.addPictureData(new FileInputStream(filePath), 5);int id = document.getAllPictures().size()-1;final int EMU = 9525;width *= EMU;height *= EMU;String blipId = document.getAllPictures().get(id).getPackageRelationship().getId();String picXml = ""+ ""+ " "+ " "+ " " + "+ id+ "\" name=\"Generated\"/>"+ " "+ " "+ " "+ "+ blipId+ "\" xmlns:r=\"/officeDocument/2006/relationships\"/>"+ " "+ " "+ " "+ " "+ " "+ " "+ " "+ "+ width+ "\" cy=\""+ height+ "\"/>"+ " "+ " "+ " "+ " "+ " "+ " "+ " " + "";inline.addNewGraphic().addNewGraphicData();XmlToken xmlToken = null;try {xmlToken = XmlToken.Factory.parse(picXml);} catch (XmlException xe) {xe.printStackTrace();}inline.set(xmlToken);inline.setDistT(0);inline.setDistB(0);inline.setDistL(0);inline.setDistR(0);CTPositiveSize2D extent = inline.addNewExtent();extent.setCx(width);extent.setCy(height);CTNonVisualDrawingProps docPr = inline.addNewDocPr(); docPr.setId(id);docPr.setName("IMG_" + id);docPr.setDescr("IMG_" + id);}public static void main(String[] args) {HashMap map = new HashMap();HashMap mapImage = new HashMap();map.put("${name}$", "02");map.put("${userIDs}$", "5201314");mapImage.put("${image1}$", "F:\\A.jpg");mapImage.put("${image2}$", "F:\\B.jpg");String srcPath = "c:\\zhenli\\cc.docx";String destPath = "c:\\zhenli\\输出模版.docx";searchAndReplace(srcPath, destPath, map,mapImage);}}。
Java实现word文档在线预览,读取office(word,excel,ppt)文件
Java实现word文档在线预览,读取office(word,excel,ppt)文件:想要实现word或者其他office文件的在线预览,大部分都是用的两种方式,一种是使用openoffice转换之后再通过其他插件预览,还有一种方式就是通过POI读取内容然后预览。
一、使用openoffice方式实现word预览。主要思路是:1.通过第三方工具openoffice,将word、excel、ppt、txt等文件转换为pdf文件;2.通过swfTools将pdf文件转换成swf格式的文件;3.通过FlexPaper文档组件在页面上进行展示。我使用的工具版本:openoffice:3.4.1;swfTools:1007;FlexPaper:这个关系不大,我随便下的一个。
推荐使用1.5.1。JODConverter:需要jar包,如果是maven管理直接引用就可以。操作步骤:1.office准备:下载openoffice,从过往文件、其他语言中找到中文版3.4.1的版本;下载后,解压缩,安装;然后找到安装目录下的program文件夹,在目录下运行 soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard 。如果运行失败,可能会有提示,那就加上 .\ 再运行试一下。这样openoffice的服务就开启了。
2.将flexpaper⽂件中的js⽂件夹(包含了flexpaper_flash_debug.js,flexpaper_flash.js,jquery.js,这三个js⽂件主要是预览swf⽂件的插件)拷贝⾄⽹站根⽬录;将FlexPaperViewer.swf拷贝⾄⽹站根⽬录下(该⽂件主要是⽤在⽹页中播放swf⽂件的播放器)项⽬结构:页⾯代码:fileUpload.jsp<%@ page language="java" contentType="text/html; charset=UTF-8"pageEncoding="UTF-8"%><!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "/TR/html4/loose.dtd"><html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>⽂档在线预览系统</title><style>body {margin-top:100px;background:#fff;font-family: Verdana, Tahoma;}a {color:#CE4614;}#msg-box {color: #CE4614; font-size:0.9em;text-align:center;}#msg-box .logo {border-bottom:5px solid #ECE5D9;margin-bottom:20px;padding-bottom:10px;}#msg-box .title {font-size:1.4em;font-weight:bold;margin:0 0 30px 0;}#msg-box .nav {margin-top:20px;}</style></head><body><div id="msg-box"><form name="form1" method="post" enctype="multipart/form-data" action="docUploadConvertAction.jsp"><div class="title">请上传要处理的⽂件,过程可能需要⼏分钟,请稍候⽚刻。
JAVA读取WORD_pdf等
JAVA读取WORD,EXCEL,POWERPOINT,PDF文件的方式OFFICE文档使用POI控件,PDF可以使用PDFBOX0.7.3控件,完全支持中文,用XPDF也行.java2word 是一个在java程序中调用MS Office Word 文档的组件(类库)。
该组件提供了一组简单的接口,以便java程序调用它的服务来操作Word文档。
这些服务包括:打开文档、新建文档、查找文字、替换文字,插入文字、插入图片、插入表格,在书签处插入文字、插入图片、插入表格等。
填充数据到表格中、读取表格数据,更多激动人心的功能见详细说明。用jacob:其实jacob是一个bridge,连接java和com或者win32函数的一个中间件,jacob并不能直接抽取word,excel等文件,需要自己写dll,不过已经有为你写好的了,就是jacob的作者一并提供了。
jacob下载:下载了jacob并放到指定的路径之后(dll放到path,jar文件放到classpath),就可以写你自己的抽取程序了,下面是一个例子:import java.io.File;import com.jacob.com.*;import com.jacob.activeX.*;public class FileExtracter{public static void main(String[] args) {ActiveXComponent app = new ActiveXComponent("Word.Application");String inFile = "c:\\test.doc";String tpFile = "c:\\temp.htm";String otFile = "c:\\temp.xml";boolean flag = false;try {app.setProperty("Visible", new Variant(false));Object docs = app.getProperty("Documents").toDispatch();Object doc = Dispatch.invoke(docs,"Open", Dispatch.Method, new Object[]{inFile,new Variant(false)}, new int[1]).toDispatch();Dispatch.invoke(doc,"SaveAs", Dispatch.Method, new Object[]{tpFile,new Variant(8)}, new int[1]); Variant f = new Variant(false);Dispatch.call(doc, "Close", f);flag = true;} catch (Exception e) {e.printStackTrace();} finally {app.invoke("Quit", new Variant[] {});}}}2。
[原创]java读写word文档,完美解决方案
[原创]java读写word文档,完美解决方案 做项目的过程中,经常需要把数据库里的数据读出来,经过加工,以word格式输出。
在网上找了很多解决方案都不太理想,偶然发现了PageOffice,一个国产的Office插件,开发调用非常简单!比网上介绍的poi,jacob等解决方案容易上手多了!功能接口虽然没有poi,jacob那么多,但是满足了一般的需求的百分之八九十,而且不像poi那样还需要区分处理07格式和03格式那么麻烦。
下面是百度百科PageOffice的介绍: https://baike.baidu.com/item/PageOffice/2737741?fr=aladdin PageOffice的基本功能包括: 在web网页里打开、编辑、打印预览、打印Word、Excel、PowerPoint等Office文档。
文档并发控制机制。
提供Word修订痕迹、手写批注、圈阅划线、键盘批注、电子印章等OA公文模块的必备功能。
根据数据库动态将数据,包括文本、图片、表格等填充、导出到Word、Excel模板中指定的位置处,并且可以动态指定内容的各种格式。
提取Word、Excel文档中指定位置处的内容,包括文本、图片(Excel暂不支持)、表格等,保存到数据库。
其他功能就不细说了......更多的接⼝说明参考PageOffice官⽹API:/help/java3/index.html下载地址:/dowm/下载PageOffice开发包之后,拷贝 Samples4 ⽂件夹到 Tomcat 的 Webapps ⽬录下,访问:http://localhost:8080/Samples4/index.html 就会看到所有官⽅⽰例效果的列表,重点参考以下⼏个⽰例: ⼀、11、给Word⽂档中的数据区域(DataRegion)赋值的简单⽰例 ⼀、17、给Word⽂档中Table赋值的简单⽰例 ⼀、18、使⽤数据标签(DataTag)给Word⽂件填充⽂本数据 ⼆、6、后台编程插⼊Word⽂件到数据区域 ⼆、7、后台编程插⼊图⽚到数据区域 ⼆、8、后台编程插⼊Excel⽂件到数据区域 ⼆、9、给Word⽂档添加⽔印 ⼆、10、使⽤数据标签(DataTag)给Word⽂件填充带格式的数据 ⼆、11、在Word中动态创建数据区域我们可以写⼀个简单的程序测试⼀下效果:PageOfficeCtrl poCtrl1 = new PageOfficeCtrl(request);poCtrl1.setServerPage(request.getContextPath()+"/poserver.zz");WordDocument worddoc = new WordDocument();//先在要插⼊word⽂件的位置⼿动插⼊书签,书签必须以“PO_”为前缀//给DataRegion赋值,值的形式为:"纯⽂本内容、[word]word⽂件路径[/word]、[image]图⽚路径[/image]"DataRegion data1 = worddoc.openDataRegion("PO_p1");data1.setValue("测试字符串");//纯⽂本内容DataRegion data2 = worddoc.openDataRegion("PO_p2");data2.setValue("[word]doc/2.doc[/word]");//插⼊word⽂件DataRegion data3 = worddoc.openDataRegion("PO_p3");data3.setValue("[image]doc/1.jpg[/image]");//插⼊图⽚//打开⽂件、填充数据poCtrl1.setWriter(worddoc);poCtrl1.webOpen("doc/template.doc", OpenModeType.docNormalEdit, "⽤户名"); 代码很简单。
java读取word文档,提取标题和内容的实例
java读取word⽂档,提取标题和内容的实例使⽤的⼯具为poi,需要导⼊的依赖如下<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>3.17</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>3.17</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>3.17</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>ooxml-schemas</artifactId><version>1.1</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml-schemas</artifactId><version>3.17</version></dependency>我采⽤的分离⽅式是根据字体⼤⼩判断。
Java 提取、删除Word文本框中的表格
Java 获取、删除Word文本框中的表格本文介绍如何来获取Word文本框中包含的表格,以及删除表格。
程序测试环境包括:IDEAJDK 1.8.0Spire.Doc.jar注:jar导入,可通过创建Maven程序项目,并在pom.xml中配置Maven仓库路径,并指定Free Spire.Doc for Java的Maven依赖,点击“Import Changes”即可导入JAR包。
(如果使用的Eclipse,点击保存按钮导入),配置如下:<repositories><repository><id>com.e-iceblue</id><url>https://repo.e-iceblue.cn/repository/maven-public/</url></repository></repositories><dependencies><dependency><groupId>e-iceblue</groupId><artifactId>spire.doc.free</artifactId><version>2.7.3</version></dependency></dependencies>导入效果:另外,也可通过下载jar包,手动导入Spire.Doc.jar到Java程序。
Word测试文档如下,包含一个表格:Java代码1. 获取Word文本框中的表格import com.spire.doc.*;import com.spire.doc.documents.Paragraph;import com.spire.doc.fields.TextBox;import java.io.BufferedWriter;import java.io.File;import java.io.FileWriter;import java.io.IOException;public class ExtractTable {public static void main(String[]args) throws IOException { //加载Word测试文档Document doc = new Document();doc.loadFromFile("test.docx");//获取第一个文本框TextBox textbox = doc.getTextBoxes().get(0);//获取文本框中第一个表格Table table = textbox.getBody().getTables().get(0);//保存文本String output = "EtractTableFromTextbox.txt";File file = new File(output);if (!file.exists()) {file.delete();}file.createNewFile();FileWriter fw = new FileWriter(file, true);BufferedWriter bw = new BufferedWriter(fw);//遍历表格中的段落并提取文本for (int i = 0; i < table.getRows().getCount(); i++) {TableRow row = table.getRows().get(i);for (int j = 0; j < row.getCells().getCount(); j++) {TableCell cell = row.getCells().get(j);for (int k = 0; k < cell.getParagraphs().getCount(); k++) { Paragraph paragraph = cell.getParagraphs().get(k);bw.write(paragraph.getText() + "\t");}}bw.write("\r\n");}bw.flush();bw.close();fw.close();}}表格内容获取结果:2. 删除Word文本框中的表格import com.spire.doc.*;import com.spire.doc.fields.TextBox;public class DeleteTableInTextbox {public static void main(String[] args) {//加载Word测试文档Document doc = new Document();doc.loadFromFile("test.docx");//获取第一个文本框TextBox textbox = doc.getTextBoxes().get(0);//获取文本框中第一个表格textbox.getBody().getTables().get(0);//删除第一个表格textbox.getBody().getTables().removeAt(0);//保存文档doc.saveToFile("DeleteTableInTextbox.docx", FileFormat.Docx_2013); doc.dispose();}}表格删除结果:。
Java读取word文件,字体,颜色
Java读取word文件,字体,颜色:在Android读取Word文件时,在网上查看时可以用tm-extractors,但好像没有提到怎么读取Word文档中字体的颜色,字体,上下标等相关的属性。
但由于需要,要把doc文档中的内容(字体,下划线,颜色等)读取应用到android中(不包括图片和图表)。
后面采用的是poi第三方jar包(原包太大,可以从源代码里自己抽取有用的一些代码减少包的大小)。
我的想法是:把doc中的内容解析出来后,加上html对应的标签,在android中通过Html.fromHtml在TextView中进行显示,或者通过WebView.loadData进行加载显示。但测试后,发现如果加载太多内容的话,在Android中效率不行。
效果(该图的效果是在TextView中的效果,在WebView中效果会更好些):doc图:android图:做法1:(解析为span样式的,这种做法只能⽤WebView⽅式加载,Html.fromHtml⽆效)Java代码1. /**Span样式2. * 通过字体的样式进⾏加载3. * @param inputStream4. * @return5. */6. public static String readDocToSpanByRun(InputStream inputStream) {7. HWPFDocument hwpfDocument = null;8. if(inputStream == null)9. throw new RuntimeException("inputStream is null ...");10. try{11. hwpfDocument = new HWPFDocument(inputStream);12. }catch(Exception e) {13. throw new RuntimeException("HWPFDocment Exception", e);14. }15. Range allRange = hwpfDocument.getRange();16. int length = allRange.numCharacterRuns();17. StringBuffer sb = new StringBuffer();18. CharacterRun cur;19. String text = "";20. for (int i = 0; i < length; i++) {21. cur = allRange.getCharacterRun(i);22. sb.append(CharacterRunUtils.toSpanType(cur));23. text = CharacterRunUtils.getSpicalSysbomByRun(cur.text());24. if(cur.getSubSuperScriptIndex() == 1)25. sb.append("<sup>").append(text).append("</sup>");26. else if(cur.getSubSuperScriptIndex() == 2)27. sb.append("<sub>").append(text).append("</sub>");28. else29. sb.append(text);30. sb.append("</span>");31. }32. return sb.toString();33. }34.做法2:(解析为font样式的,Html.fromHtml有效,但对应size的设置⽆效果)Java代码1. /**2. * Html样式3. * 通过字体样式解析4. * @param inputStream5. * @return6. */7. public static String readDocToHtml(InputStream inputStream) {8. HWPFDocument hwpfDocument = null;9. if(inputStream == null)10. throw new RuntimeException("inputStream is null ...");11. try{12. hwpfDocument = new HWPFDocument(inputStream);13. }catch(Exception e) {14. throw new RuntimeException("HWPFDocment Exception", e);15. }16. CharacterRun cur = null;17. StringBuffer sb = new StringBuffer();18. StringBuffer charStr = new StringBuffer();19. Range allRange = hwpfDocument.getRange();20. for(int i = 0; i < allRange.numCharacterRuns(); i++) {21. cur = allRange.getCharacterRun(i);22. sb.append(CharacterRunUtils.fontFaceColorSizeToHtml(cur));23. 
charStr.append(CharacterRunUtils.toSupOrSub(cur, CharacterRunUtils.getSpicalSysbomByRun(cur.text())));24. if(cur.isBold()) {25. charStr.insert(0, "<b>");26. charStr.insert(charStr.length(), "</b>");27. }28. if(cur.getUnderlineCode() != 0) {29. charStr.insert(0, "<u>");30. charStr.insert(charStr.length(), "</u>");31. }32. if(cur.isItalic()) {33. charStr.insert(0, "<i>");34. charStr.insert(charStr.length(), "</i>");35. }36. if(cur.isStrikeThrough()) {37. charStr.insert(0, "<s>");38. charStr.insert(charStr.length(), "</s>");39. }40. sb.append(charStr).append("</font>");41. charStr.setLength(0);42. }43. hwpfDocument = null;44. return sb.toString();45. } 以下是会⽤到的⽅法:Java代码1. /**2. *处理字体相关的属性3. */4. public class CharacterRunUtils {5.6. private static final short ENTER_ASCII = 13;7. private static final short SPACE_ASCII = 32;8. private static final short TABULATION_ASCII = 9;9.10. /**11. * ⽐对字体是否相同12. * 可以继续加其它属性13. * @param cr114. * @param cr215. * @return16. */17. public static boolean compareCharStyleForSpan(CharacterRun cr1,18. CharacterRun cr2) {19. return cr1.isBold() == cr2.isBold()20. && cr1.getFontName().equals(cr2.getFontName())21. && cr1.getFontSize() == cr2.getFontSize()22. && cr1.isItalic() == cr2.isItalic()23. && cr1.getColor() == cr2.getColor()24. && cr1.getUnderlineCode() == cr2.getUnderlineCode()25. && cr1.isStrikeThrough() == cr2.isStrikeThrough()26. && cr1.getColor() == cr2.getColor();27. }28.29. public static boolean compareCharColor(CharacterRun cr1, CharacterRun cr2) {30. return cr1.getFontName().equals(cr2.getFontName())31. && cr1.getFontSize() == cr2.getFontSize()32. && cr1.getColor() == cr2.getColor();33. }34.35. public static String getSpicalSysbom(char currentChar) {36. String tempStr = "";37. if (currentChar == ENTER_ASCII) {38. tempStr += "<br/>";39. } else if (currentChar == SPACE_ASCII) {40. tempStr += " ";41. } else if (currentChar == TABULATION_ASCII) {42. tempStr += " ";43. } else {44. tempStr += currentChar;45. }46. return tempStr;47. 
}48.49. public static String getSpicalSysbomSpan(char currentChar) {50. String tempStr = "";51. if (currentChar == ENTER_ASCII) {52. tempStr += "<br/>";53. } else if (currentChar == SPACE_ASCII) {54. tempStr += " ";55. } else if (currentChar == TABULATION_ASCII) {56. tempStr += " ";57. }58. return tempStr;59. }60.61. /**62. * 特殊字符的取代63. * @param currentChar64. * @return65. */66. public static String getSpicalSysbomByRun(String currentChar) {67. StringBuffer tempStr = new StringBuffer();68. int length = currentChar.length();69. for (int i = 0; i < length; i++) {70. tempStr.append(getSpicalSysbom(currentChar.charAt(i)));71. }72. return tempStr.toString();73. }74.75. /**76. * span⽅式前缀77. * @param cr78. * @return79. */80. public static String toSpanType(CharacterRun cr) {81. StringBuffer spanStyle = new StringBuffer("<span style='font-family:");82. spanStyle.append(cr.getFontName()).append("; font-size:")83. .append(cr.getFontSize() / 2).append("pt;");84. if (cr.isBold())85. spanStyle.append("font-weight:bold;");86. if (cr.isItalic())87. spanStyle.append("font-style:italic;");88. if (cr.isStrikeThrough())89. spanStyle.append("text-decoration:line-through;");90. if (cr.getUnderlineCode() != 0)91. spanStyle.append("text-decoration:underline;");92. spanStyle.append("color:")93. .append(ColorUtils.getHexColor(cr.getIco24())).append(";")94. .append("'>");95. return spanStyle.toString();96. }97.98. /**99. * 为font⽅式提供<font前缀100. * @param cr101. * @return102. */103. public static String fontFaceColorSizeToHtml(CharacterRun cr) { 104. StringBuffer htmlType = new StringBuffer("<font ");105. htmlType.append("size='").append(cr.getFontSize() / 2).append("' ") 106. .append("face='").append(cr.getFontName()).append("' ") 107. .append("color='")108. .append(ColorUtils.getHexColor(cr.getIco24())).append("'>"); 109. return htmlType.toString();110. }111.112. /**113. * 处理上下标114. * @param cr115. * @param currentChar116. * @return117. */118. 
public static String toSupOrSub(CharacterRun cr, String currentChar) { 119. int sub = cr.getSubSuperScriptIndex();120. if (sub != 0) {121. if (sub == 1)122. // 上标123. return "<sup>" + currentChar + "</sup>";124. else125. // 下标126. return "<sub>" + currentChar + "</sub>";127. } else128. return currentChar;129. }130.131. public static String toSupOrSub(CharacterRun cr, char currentChar) { 132. return toSupOrSub(cr, new String(new char[]{currentChar}));133. }134. }⽤到的颜⾊的转换(进⾏简单的颜⾊转换)Java代码1. public class ColorUtils {2.3. public static int red(int c) {4. return c & 0XFF;5. }6.7. public static int green(int c) {8. return (c >> 8) & 0XFF;9. }10.11. public static int blue(int c) {12. return (c >> 16) & 0XFF;13. }14.15. public static int rgb(int c) {16. return (red(c) << 16) | (green(c) <<8) | blue(c);17. }18.19. public static String rgbToSix(String rgb) {20. int length = 6 - rgb.length();21. String str = "";22. while(length > 0){23. str += "0";24. length--;25. }26. return str + rgb;27. }28.29. public static String getHexColor(int color) {30. color = color == -1 ? 0 : color;31. int rgb = rgb(color);32. return "#" + rgbToSix(Integer.toHexString(rgb));33. }34. }。
JAVA读取WORD文档解决方案
JAVA读取WORD文档解决方案在Java中,可以使用Apache POI库来读取和操作Word文档。
Apache POI库提供了一组Java API,用于读取、写入和操作Microsoft Office格式的文件,包括Word文档。
以下是使用Apache POI库来读取Word文档的解决方案:1. 添加依赖:首先,需要在项目中添加Apache POI库的依赖。
可以在maven或gradle构建文件中添加以下依赖:```xml<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>4.1.2</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>4.1.2</version></dependency>```2. 创建文档对象:使用POIXMLDocument类来创建一个XWPFDocument对象,它代表Word文档。
```javaFile file = new File("path/to/word/document.docx");XWPFDocument document = new XWPFDocument(new FileInputStream(file));```3.读取文档内容:通过遍历文档中的段落和表格来读取文档的内容。
```java
// 遍历段落
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph paragraph : paragraphs) {
    String text = paragraph.getText();
    System.out.println(text);
}
// 遍历表格
List<XWPFTable> tables = document.getTables();
for (XWPFTable table : tables) {
    List<XWPFTableRow> rows = table.getRows();
    for (XWPFTableRow row : rows) {
        List<XWPFTableCell> cells = row.getTableCells();
        for (XWPFTableCell cell : cells) {
            String text = cell.getText();
            System.out.println(text);
        }
    }
}
```
4. 关闭文档:在读取完成后,需要关闭XWPFDocument对象来释放资源。
JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例
JAVA读取WORD,EXCEL,PDF,TXT,RTF,HTML文件文本内容的方法示例2012-06-29 17:13:08| 分类:JAVA | 标签:|字号大中小订阅以下是Java对几种文本文件内容读取代码。
其中,OFFICE文档(WORD,EXCEL)使用了POI控件,PDF使用了PDFBOX 控件。
查看相关控件的下载地址和配置方法。
转自:/allan811112/blog/item/d77b70492f0a65fb82025c3b.htmlWORDJava代码package textReader;import java.io.*;import org.apache.poi.hwpf.extractor.WordExtractor;public class WordReader {public WordReader(){}/*** @param filePath 文件路径* @return 读出的Word的内容*/public String getTextFromWord(String filePath){String result = null;File file = new File(filePath);try{FileInputStream fis = new FileInputStream(file);WordExtractor wordExtractor = new WordExtractor(fis);result = wordExtractor.getText();}catch(FileNotFoundException e){e.printStackTrace();}catch(IOException e){e.printStackTrace();};return result;}}EXCELJava代码package textReader;import ermodel.HSSFWorkbook;import ermodel.HSSFSheet;import ermodel.HSSFRow;import ermodel.HSSFCell;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;public class ExcelReader {@SuppressWarnings("deprecation")/*** @param filePath 文件路径* @return 读出的Excel的内容*/public String getTextFromExcel(String filePath) {StringBuffer buff = new StringBuffer();try {//创建对Excel工作簿文件的引用HSSFWorkbook wb = new HSSFWorkbook(new FileInputStream(filePath));//创建对工作表的引用。
JAVA直接对MS Word的操作
【关键词】直接读取MS Word 表格数据【分类】JAVA 编码实现【提供者】曾明奇管理信息化【问题描述】我们在实际的运用中,用纯JAVA操纵MS Excel的较多,但是用JAVA直接对MS Word 的操作较少,这里介绍下直接读取MS Word里的表格中的数据。
这里需要用到POI的几个包,到其官方网站下载下来:/一、直接读取MS Word里的文本内容//读文档@Testpublic void testReadWord(){try{FileInputStream in = new FileInputStream ("c:\\test.doc");WordExtractor extractor = new WordExtractor(in);String str = extractor.getText();log.debug(str);}catch(FileNotFoundException e) {e.printStackTrace();}catch (IOException e) {e.printStackTrace();}}一、直接读取MS Word里的表格数据/***直接读取Word文档中的表格中的内容*/@Testpublic void testTableCellLastParagraph() throws Exception { File file = new File(dirname, "test.doc");FileInputStream in = new FileInputStream(file);H WPFDocument doc = new HWPFDocument(in);Range r = doc.getRange();TableIterator it = new TableIterator(r);//在表格外面插入内容CharacterProperties cp = new CharacterProperties();cp.setBold(true);cp.setCharacterSpacing(10);cp.setChse(cp.SPRM_CHARSCALE);cp.setCapitalized(true);int p = r.numParagraphs();while(it.hasNext()){Table t = (Table)it.next();for(int i=0,j=t.numRows();i<j;i++){TableRow row = t.getRow(i);for(int m=0,n=row.numCells();m<n;m++){TableCell cell = row.getCell(m);String[] a = cell.text().split("_");//这个符号特别,类似小方框□(粘贴到doc文档后变成了下划线)List<String> list = Arrays.asList(a);String name = (String)list.get(0);log.debug("第 "+i+" 行第 "+m+" 列==>>"+name);}}}Paragraph para = r.getParagraph(p-1);log.debug(para.text());para.insertBefore("==>test poi==", cp);log.debug(para.text());in.close();}二、写文档// 写文档public boolean writeWordFile(String filePath, String content) { boolean w = false;try {byte b[] = content.getBytes();ByteArrayInputStream bais = new ByteArrayInputStream(b);POIFSFileSystem fs = new POIFSFileSystem();DirectoryEntry directory = fs.getRoot();DocumentEntry de = directory.createDocument("WordDocument ", bais);FileOutputStream ostream = new FileOutputStream(filePath);fs.writeFilesystem(ostream);bais.close();ostream.close();}catch (IOException e) {e.printStackTrace();}return w;}【分析原因】由于POI对MS Word的操作还不尽完善等原因,在写MS Word时会有一点问题。
Java读取word文档解决方案
Java读取word文档解决方案嘿,小伙伴,今天咱们就来聊聊如何在Java中读取Word文档,让你的程序也能像人一样“读懂”Word文件。
这可是个常用需求,不管你是做数据分析,还是文档处理,这项技能绝对不能少。
下面,我就用我那十年的方案写作经验,带你一起探索这个话题。
咱们得明确一下,Java读取Word文档主要有两种方式:一种是通过ApachePOI库,另一种是通过JODConverter库。
这两种方法各有千秋,下面我会一一介绍。
一、ApachePOI库ApachePOI,这可是Java读取Word文档的经典之作。
它支持读取和写入Word文档,功能强大,稳定性高。
不过,使用起来可能会有点难度,因为它的API相对复杂。
1.引入依赖你需要在项目的pom.xml文件中引入ApachePOI的依赖:xml<dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>5.1.0</version></dependency>2.读取Word文档就是读取Word文档的核心代码了。
这里我以读取.docx格式的文档为例:
```java
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;

public class WordReader {
    public static void main(String[] args) {
        try (FileInputStream fis = new FileInputStream("path/to/your/document.docx");
             XWPFDocument doc = new XWPFDocument(fis)) {
            List<XWPFParagraph> paragraphs = doc.getParagraphs();
            for (XWPFParagraph paragraph : paragraphs) {
                List<XWPFRun> runs = paragraph.getRuns();
                String text = "";
                for (XWPFRun run : runs) {
                    text += run.getText(0);
                }
                System.out.println(text);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
```
这里,我们通过`FileInputStream`读取Word文档,然后创建一个`XWPFDocument`对象来解析文档。
用Java读取Word文档
用Java读取Word文档由于Word的编码方式比较复杂,所以Word文档不可能通过流的方式直接读取;当然如果Word可以转化成TXT文件就可以直接读取了;目前读取Word比较好的开源工具是Poi及Jacob,感觉Poi读取功能要比Jacob略逊一筹,毕竟Jacob可以直接调用Word的COM组件;但是微软产品不开放源码,所以Jacob读取Word文档也只能是摸着石头过河,一点一点破解了。
Jacob读取Word内容,由于Word内容的复杂性,读取也是非常不方便的,目前可以有"按段落读取","按书签读取"及"按照表格读取"等几种形式。
示例讲解(通过Java FileReader,Jacob两种方式读取Word内容)一.通过java流读取Word内容复制代码1.import java.io.BufferedReader;2.import java.io.FileReader;3.import java.io.IOException;4.5.public class ReadWordByStream {6.public static void main(String[] args) throws IOException {7. String rowContent = new String();8. String content = new String();9. BufferedReader in = new BufferedReader(new FileReader("d:\\test3.doc"));10. while ((rowContent = in.readLine()) != null) {11.content = content + rowContent + "\n";12. }13. System.out.println(content.getBytes());14. System.out.println(new String(content.getBytes(),"utf-8"));//因为编码方式不同,不容易解析15. in.close();16.}17.18.}二.通过Jacob读取Word内容复制代码1.import com.jacob.activeX.ActiveXComponent;2.import Thread;3.import .Dispatch;4.import .Variant;5.6.public class WordReader {7.public static void main(String args[]) {8. ComThread.InitSTA();// 初始化com的线程9. ActiveXComponent wordApp = new ActiveXComponent("Word.Application"); // 启动word10. // Set the visible property as required.11. Dispatch.put(wordApp, "Visible", new Variant(true));// //设置word可见12. Dispatch docs = wordApp.getProperty("Documents").toDispatch();//所有文档窗口13.// String inFile = "d:\\test.doc";14.// Dispatch doc = Dispatch.invoke(docs,"Open",Dispatch.Method,15.// new Object[] { inFile, new Variant(false),new Variant(false) },//参数3,false:可写,true:只读16.// new int[1]).toDispatch();//打开文档17.18. Dispatch doc = Dispatch.call(docs, "Add").toDispatch(); //创建一个新文档19. Dispatch wordContent = Dispatch.get(doc, "Content").toDispatch(); //取得word文件的内容20. Dispatch font = Dispatch.get(wordContent, "Font").toDispatch();21. Dispatch.put(font, "Bold", new Variant(true)); // 设置为粗体22.Dispatch.put(font, "Italic", new Variant(true)); // 设置为斜体23.Dispatch.put(font, "Underline", new Variant(true));24.Dispatch.put(font, "Name", new Variant("宋体"));25.Dispatch.put(font, "Size", new Variant(14));26. for(int i=0;i<10;i++){//作为一个段落27.Dispatch.call(wordContent, "InsertAfter", "current paragraph"+i+" ");28. }29. for(int j=0;j<10;j++){//作为十个段落30. 
Dispatch.call(wordContent, "InsertAfter", "current paragraph"+j+"\r");31.}32. Dispatch paragraphs = Dispatch.get(wordContent, "Paragraphs")33. .toDispatch(); //所有段落34. int paragraphCount = Dispatch.get(paragraphs, "Count").getInt();35. System.out.println("paragraphCount:"+paragraphCount);36.37. for (int i = 1; i <= paragraphCount; i++) {38.Dispatch paragraph = Dispatch.call(paragraphs, "Item",39.new Variant(i)).toDispatch();40.Dispatch paragraphRange = Dispatch.get(paragraph, "Range")41..toDispatch();42.String paragraphContent = Dispatch.get(paragraphRange, "Text")43..toString();44.System.out.println(paragraphContent);45.//Dispatch.call(selection, "MoveDown");46. }47. // WordReader.class.getClass().getResource("/").getPath().substring+"test.doc";48. Dispatch.call(doc, "SaveAs","d:\\wordreader.doc");49. // Close the document without saving changes50. // 0 = wdDoNotSaveChanges51. // -1 = wdSaveChanges52. // -2 = wdPromptToSaveChanges53. ComThread.Release();//释放com线程54. Dispatch.call(docs, "Close", new Variant(0));55. docs = null;56. Dispatch.call(wordApp,"Quit");57. wordApp = null;58.}59.}用Java简单的读取word文档中的数据:第一步:下载tm-extractors-0.4.jar下载地址:/browser/elated-core/trunk/lib/tm-extractors-0.4.jar?rev =46并把它放到你的classpath路径下面。
java读取word表格
java读取word表格import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import org.apache.poi.hwpf.HWPFDocument;import/doc/736797535.html,ermodel.Paragraph;import/doc/736797535.html,ermodel.Range;import/doc/736797535.html,ermodel.Table;import/doc/736797535.html,ermodel.TableCell;import/doc/736797535.html,ermodel.TableIterat or;import/doc/736797535.html,ermodel.TableRow;import java.io.File;import java.io.FileInputStream;import java.io.InputStream;import org.apache.poi.POIXMLDocument;import org.apache.poi.POIXMLT extExtractor;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.openxml4j.opc.OPCPackage;import org.apache.poi.xwpf.extractor.XWPFWordExtractor;import org.apache.poi.poifs.filesystem.POIFSFileSystem;public class ExportDocImpl{public void testWord(){try{FileInputStream in = new FileInputStream("D:\\sinye.doc");//载入文档POIFSFileSystem pfs = new POIFSFileSystem(in);HWPFDocument hwpf = new HWPFDocument(pfs);Range range = hwpf.getRange();//得到文档的读取范围TableIterator it = new TableIterator(range);//迭代文档中的表格while (it.hasNext()) {Table tb = (Table) it.next();//迭代行,默认从0开始for (int i = 0; i < tb.numRows(); i++) {TableRow tr = tb.getRow(i);//迭代列,默认从0开始for (int j = 0; j < tr.numCells(); j++) {TableCell td = tr.getCell(j);//取得单元格//取得单元格的内容for(int k=0;kParagraph para =td.getParagraph(k);String s = para.text();System.out.println(s);} //end for} //end for} //end for} //end while}catch(Exception e){e.printStackTrace();}}//end methodpublic void testWord1(){try {//word 2003:图片不会被读取InputStream is = new FileInputStream(new File("D:\\sinye.doc"));WordExtractor ex = new WordExtractor(is);String text2003 = ex.getText();System.out.println(text2003);//word 2007 图片不会被读取,表格中的数据会被放在字符串的最后OPCPackage opcPackage = POIXMLDocument.openPackage("D:\\sinye.doc");POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);String text2007 = extractor.getText();System.out.println(text2007);} catch (Exception e) {e.printStackTrace();}} }。
Java读取word文档解决方案
Java读取word文档解决方案:java读取word文档时,虽然网上介绍了很多插件,如poi、java2Word、jacob、itext等等,但各有不足:poi无法读取格式(新的API估计可行,但好像还处于研发阶段,不太稳定,做项目不太敢用);java2Word、jacob容易报错找不到注册,比较诡异,我曾经在不同的机器上试过,操作方法完全一致,有的机器不报错,有的报错,去他们论坛找高人解决也说不出原因,项目部署用它有点玄;itext好像写很方便,但是我查了好久资料,没有见到过关于读的好办法。
经过一番选择还是折中点采用rtf最好,毕竟rtf是开源格式,不需要借助任何插件,只需基本IO操作外加编码转换即可。
rtf格式文件表面看来和doc没啥区别,都可以用word打开,各种格式都可以设定。
----- 实现的功能:读取rtf模板内容(格式和文本内容),替换变化部分,形成新的rtf文档。
----- 实现思路:模板中固定部分手动输入,变化的部分用$info$表示,只需替换$info$即可。
1、采用字节的形式读取rtf模板内容;2、将可变的内容字符串转为rtf编码;3、替换原文中的可变部分,形成新的rtf文档。主要程序如下:Java代码。以上为核心代码,剩余部分就是替换和重新组装,用java中的String.replace(oldstr,newstr)方法可以实现,在这就不贴了。
源代码部分详见附件。
运行源代码前提:c盘创建YQ目录,将附件中"模板.rtf"复制到YQ目录之下,运行OpreatorRTF.java文件即可,就会在YQ 目录下生成文件名如:21时15分19秒_cheney_记录.rtf 的文件。
文件名是在程序中指定的呵呵。
由于是由商业软件中拆分出的demo所以只是将自己原来的写的程序分离,合并在一个java文件中,所以有的方法在示例程序中看似多余,没有必要那么麻烦。
对于替换部分需要循环的特例程序,我不好拆分,里面很容易暴露商业软件的东西,所以就不贴了,有需要的话可以加我QQ或者MSN,一起讨论呵呵。
java通过url在线预览Word、excel、ppt、pdf、txt文档中的内容(只获得了文字)
java通过url在线预览Word、excel、ppt、pdf、txt文档中的内容(只获得了文字):在页面上显示各种文档中的内容。
在servlet中的逻辑word:BufferedInputStream bis = null;URL url = null;HttpURLConnection httpUrl = null; // 建⽴链接url = new URL(urlReal);httpUrl = (HttpURLConnection) url.openConnection();// 连接指定的资源httpUrl.connect();// 获取⽹络输⼊流bis = new BufferedInputStream(httpUrl.getInputStream());String bodyText = null;WordExtractor ex = new WordExtractor(bis);bodyText = ex.getText();response.getWriter().write(bodyText);excel:BufferedInputStream bis = null;URL url = null;HttpURLConnection httpUrl = null; // 建⽴链接url = new URL(urlReal);httpUrl = (HttpURLConnection) url.openConnection();// 连接指定的资源httpUrl.connect();// 获取⽹络输⼊流bis = new BufferedInputStream(httpUrl.getInputStream());content = new StringBuffer();HSSFWorkbook workbook = new HSSFWorkbook(bis);for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得⼀个sheetcontent.append("/n");if (null == aSheet) {continue;}for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {content.append("/n");HSSFRow aRow = aSheet.getRow(rowNum);if (null == aRow) {continue;}for (short cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) {HSSFCell aCell = aRow.getCell(cellNum);if (null == aCell) {continue;}if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {content.append(aCell.getRichStringCellValue().getString());} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {boolean b = HSSFDateUtil.isCellDateFormatted(aCell);if (b) {Date date = aCell.getDateCellValue();SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");content.append(df.format(date));}}}}}response.getWriter().write(content.toString());ppt:BufferedInputStream bis = null;URL url = null;HttpURLConnection httpUrl = null; // 建⽴链接url = new URL(urlReal);httpUrl = (HttpURLConnection) url.openConnection();// 连接指定的资源httpUrl.connect();// 获取⽹络输⼊流bis = new BufferedInputStream(httpUrl.getInputStream());StringBuffer content = new StringBuffer("");SlideShow ss = new SlideShow(new HSLFSlideShow(bis));Slide[] 
slides = ss.getSlides();for (int i = 0; i < slides.length; i++) {TextRun[] t = slides[i].getTextRuns();for (int j = 0; j < t.length; j++) {content.append(t[j].getText());}content.append(slides[i].getTitle());}response.getWriter().write(content.toString());pdf:BufferedInputStream bis = null;URL url = null;HttpURLConnection httpUrl = null; // 建⽴链接url = new URL(urlReal);httpUrl = (HttpURLConnection) url.openConnection();// 连接指定的资源httpUrl.connect();// 获取⽹络输⼊流bis = new BufferedInputStream(httpUrl.getInputStream());PDDocument pdfdocument = null;PDFParser parser = new PDFParser(bis);parser.parse();pdfdocument = parser.getPDDocument();ByteArrayOutputStream out = new ByteArrayOutputStream();OutputStreamWriter writer = new OutputStreamWriter(out);PDFTextStripper stripper = new PDFTextStripper();stripper.writeText(pdfdocument.getDocument(), writer);writer.close();byte[] contents = out.toByteArray();String ts = new String(contents);response.getWriter().write(ts);txt:BufferedReader bis = null;URL url = null;HttpURLConnection httpUrl = null; // 建⽴链接url = new URL(urlReal);httpUrl = (HttpURLConnection) url.openConnection();// 连接指定的资源httpUrl.connect();// 获取⽹络输⼊流bis = new BufferedReader( new InputStreamReader(httpUrl.getInputStream())); StringBuffer buf=new StringBuffer();String temp;while ((temp = bis.readLine()) != null) {buf.append(temp);response.getWriter().write(temp); if(buf.length()>=1000){break;}}bis.close();。
- 1、下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
- 2、"仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
- 3、如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。
Java 读取Word文本框中的文本/图片/表格
Word可插入文本框,文本框中可嵌入文本、图片、表格等内容。
对文档中的已有文本框,也可以读取其中的内容。
本文以Java程序代码来展示如何读取文本框,包括读取文本框中的文本、图片以及表格等。
【程序环境】
程序编辑环境为IntelliJ IDEA,并在程序中引入了free Spire.Doc.jar 3.9.0,安装的Jdk版本为1.8.0。
【源文档】
程序中用于测试的Word源文档如下图:
【程序代码】
1.读取文本框中的文本
import com.spire.doc.*;
import com.spire.doc.documents.Paragraph;
import com.spire.doc.fields.TextBox;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Reads the text of the first text box in {@code sample.docx} and writes it to
 * {@code ExtractedText.txt}, replacing any previous content of that file.
 */
public class ExtractText {
    /**
     * Program entry point.
     *
     * @param args unused
     * @throws IOException if the output file cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        // Load the Word document that contains the text box.
        Document doc = new Document();
        doc.loadFromFile("sample.docx");

        // Only the first text box of the document is read.
        TextBox textbox = doc.getTextBoxes().get(0);

        // A non-append FileWriter creates the file or truncates an existing one, which is
        // what the former delete + createNewFile + append sequence achieved.
        File file = new File("ExtractedText.txt");

        // try-with-resources flushes and closes the writers even when a write fails.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
            // Walk the direct children of the text box body.
            for (Object object : textbox.getBody().getChildObjects()) {
                // Only paragraphs contribute text; other child objects are skipped.
                if (object instanceof Paragraph) {
                    bw.write(((Paragraph) object).getText());
                }
            }
        }
    }
}
2.读取文本框中的图片
import com.spire.doc.*;
import com.spire.doc.documents.Paragraph;
import com.spire.doc.fields.DocPicture;
import com.spire.doc.fields.TextBox;
import javax.imageio.ImageIO;
import java.awt.image.RenderedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects every picture found in the paragraphs of the first text box in
 * {@code sample.docx} and saves each one as a PNG file named {@code 图片-<index>.png}.
 */
public class ExtractImg {
    /**
     * Program entry point.
     *
     * @param args unused
     * @throws IOException if an image cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Load the Word document that contains the text box.
        Document doc = new Document();
        doc.loadFromFile("sample.docx");

        // Only the first text box of the document is read.
        TextBox textbox = doc.getTextBoxes().get(0);

        // Typed list: avoids the raw-type List and the unchecked cast when saving.
        List<RenderedImage> images = new ArrayList<>();

        // Visit every paragraph of the text box and every child object of each paragraph.
        for (int i = 0; i < textbox.getBody().getParagraphs().getCount(); i++) {
            Paragraph paragraph = textbox.getBody().getParagraphs().get(i);
            for (int j = 0; j < paragraph.getChildObjects().getCount(); j++) {
                Object object = paragraph.getChildObjects().get(j);
                // Keep only picture objects.
                if (object instanceof DocPicture) {
                    images.add(((DocPicture) object).getImage());
                }
            }
        }

        // Save the collected images as PNG files, numbered from 0 in discovery order.
        for (int z = 0; z < images.size(); z++) {
            File file = new File(String.format("图片-%d.png", z));
            // ImageIO.write returns false when no suitable writer exists; surface that
            // instead of silently producing no file.
            if (!ImageIO.write(images.get(z), "PNG", file)) {
                throw new IOException("No PNG writer available for " + file);
            }
        }
    }
}
3.读取文本框中的表格
import com.spire.doc.*;
import com.spire.doc.documents.Paragraph;
import com.spire.doc.fields.TextBox;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
public class ExtractTable {
public static void main(String[]args) throws IOException { //加载Word测试文档
Document doc = new Document();
doc.loadFromFile("sample.docx");
//获取文本框
TextBox textbox = doc.getTextBoxes().get(0);
//获取文本框中的表格
Table table = textbox.getBody().getTables().get(0);
//保存到文本文件
File file = new File("ExtractedTable.txt");
if (file.exists())
{
file.delete();
}
file.createNewFile();
FileWriter fw = new FileWriter(file, true);
BufferedWriter bw = new BufferedWriter(fw);
//遍历表格中的段落并提取文本
for (int i = 0; i < table.getRows().getCount(); i++)
{
TableRow row = table.getRows().get(i);
for (int j = 0; j < row.getCells().getCount(); j++)
{
TableCell cell = row.getCells().get(j);
for (int k = 0; k < cell.getParagraphs().getCount(); k++) {
Paragraph paragraph = cell.getParagraphs().get(k);
bw.write(paragraph.getText() + "\t");
}
}
bw.write("\r\n");
}
bw.flush();
bw.close();
fw.close();
}
}。