Java源码示例:org.apache.poi.xwpf.extractor.XWPFWordExtractor
示例1
/**
 * Extracts the plain text of a Word document, choosing the extractor from the
 * stream's magic bytes: {@code WordExtractor} for OLE2 content and
 * {@code XWPFWordExtractor} for OOXML content.
 *
 * <p>The supplied stream is always closed before this method returns. Extractors
 * and the intermediate document are closed even when text extraction fails, so no
 * POI resources are leaked on the error path.
 *
 * @param filePath path of the document; only used as the log message when the
 *                 content cannot be parsed
 * @param is       stream positioned at the start of the document
 * @return the extracted text, or an empty string when the content is neither OLE2
 *         nor OOXML or when an {@code OfficeXmlFileException} is raised
 * @throws Exception if reading the stream or closing a resource fails
 */
private static String readDoc (String filePath, InputStream is) throws Exception {
    String text = "";
    // prepareToCheckMagic may hand back a wrapping stream, so that result is what
    // gets sniffed, parsed and finally closed.
    try (InputStream in = FileMagic.prepareToCheckMagic(is)) {
        // Sniff the format once instead of once per branch.
        FileMagic magic = FileMagic.valueOf(in);
        if (magic == FileMagic.OLE2) {
            try (WordExtractor ex = new WordExtractor(in)) {
                text = ex.getText();
            }
        } else if (magic == FileMagic.OOXML) {
            try (XWPFDocument doc = new XWPFDocument(in);
                    XWPFWordExtractor extractor = new XWPFWordExtractor(doc)) {
                text = extractor.getText();
            }
        }
    } catch (OfficeXmlFileException e) {
        // Best effort: log the offending file and fall through with whatever text we have.
        logger.error(filePath, e);
    }
    return text;
}
示例2
/**
 * Converts a Microsoft Word document to plain text.
 *
 * <p>The stream is wrapped in a {@link BufferedInputStream} before its header is
 * inspected. Content with a POIFS header is read with {@code WordExtractor};
 * anything else is treated as a .docx and read with {@code XWPFWordExtractor}.
 * The wrapped stream, the extractor and the intermediate document are all closed
 * before returning, including when extraction throws.
 *
 * @param inputStream stream positioned at the start of the Word document; it is
 *                    closed by this method
 * @return the text of the document
 * @throws IOException if the document cannot be read or a resource fails to close
 */
private String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    try (InputStream wordStream = new BufferedInputStream(inputStream)) {
        // NOTE(review): hasPOIFSHeader(InputStream) appears to be deprecated in newer
        // POI releases in favour of FileMagic - confirm against the POI version in use.
        if (POIFSFileSystem.hasPOIFSHeader(wordStream)) {
            try (WordExtractor wordExtractor = new WordExtractor(wordStream)) {
                return wordExtractor.getText();
            }
        }
        try (XWPFDocument document = new XWPFDocument(wordStream);
                XWPFWordExtractor wordXExtractor = new XWPFWordExtractor(document)) {
            return wordXExtractor.getText();
        }
    }
}
示例3
/**
 * Reads the text of the .docx resource identified by the given {@link URI}.
 *
 * <p>This is a best-effort operation: any failure while opening or reading the
 * document is ignored and whatever was accumulated so far is returned.
 *
 * @param uriConverter
 *            the {@link URIConverter} used to open the resource
 * @param uri
 *            the .docx {@link URI}
 * @return a header line followed by the document text, or a shorter (possibly empty)
 *         string when the document could not be read
 */
public static String getTextContent(URIConverter uriConverter, URI uri) {
    final StringBuilder content = new StringBuilder();
    try (InputStream stream = uriConverter.createInputStream(uri);
            OPCPackage pkg = OPCPackage.open(stream);
            XWPFDocument docx = new XWPFDocument(pkg);
            XWPFWordExtractor extractor = new XWPFWordExtractor(docx)) {
        content.append("===== Document Text ====\n");
        content.append(extractor.getText());
        // CHECKSTYLE:OFF
    } catch (Throwable e) {
        // CHECKSTYLE:ON
        /*
         * Deliberately ignored: if POI cannot give us the text content for any reason we
         * move along; the XML and hashes are still obtained elsewhere.
         */
    }
    return content.toString();
}
示例4
/**
 * Extracts the text of a Word 2007+ (.docx) file.
 *
 * <p>The extractor is closed once the text has been read, so the underlying package
 * opened from {@code FilePath} does not stay open after this method returns.
 *
 * @param FilePath path of the .docx file to read
 * @return the document text, or {@code null} when the file cannot be opened or parsed
 *         (the failure is printed to standard error)
 * @throws IOException declared for callers; failures are currently caught and reported
 *         inside the method
 * @throws XmlException declared for callers; see above
 * @throws OpenXML4JException declared for callers; see above
 */
public static String Parse07(String FilePath) throws IOException, XmlException, OpenXML4JException{
    String text2007 = null;
    try {
        OPCPackage opcPackage = POIXMLDocument.openPackage(FilePath);
        // Closing the extractor releases the package it wraps; previously neither was
        // ever closed and the file handle leaked.
        try (POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage)) {
            text2007 = extractor.getText();
        }
    } catch (Exception e) {
        // Kept as a catch-all so callers still receive null instead of an exception.
        e.printStackTrace();
    }
    return text2007;
}
示例5
/**
 * Extracts the text of a .docx file and prints it to standard output.
 *
 * <p>The file stream, the document and the extractor are closed before the method
 * returns, whether or not extraction succeeds.
 *
 * @param filename path of the .docx file to read
 * @throws IOException if the file cannot be opened, read or closed
 */
public static void extractor(String filename) throws IOException {
    try (FileInputStream in = new FileInputStream(filename);
            XWPFDocument docx = new XWPFDocument(in);
            // using XWPFWordExtractor Class
            XWPFWordExtractor we = new XWPFWordExtractor(docx)) {
        System.out.println(we.getText());
    }
}
示例6
/**
 * Reads the bytes of the given content as a .docx package and passes the extracted
 * text to the handler as a single region named {@code "document"}.
 *
 * <p>The package and the extractor are closed as soon as the text has been read,
 * before the handler is invoked.
 *
 * @param cc      the content whose bytes are parsed as a .docx document
 * @param handler receives {@code startRegion("document")}, the text, then
 *                {@code endRegion()}
 * @throws OntopiaRuntimeException wrapping any exception raised while parsing the
 *         document, closing it, or calling the handler
 */
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
    try {
        String text;
        try (OPCPackage opc = OPCPackage.open(new ByteArrayInputStream(cc.getContent()));
                XWPFWordExtractor extractor = new XWPFWordExtractor(opc)) {
            text = extractor.getText();
        }
        char[] chars = text.toCharArray();
        handler.startRegion("document");
        handler.text(chars, 0, chars.length);
        handler.endRegion();
    } catch (Exception e) {
        throw new OntopiaRuntimeException(e);
    }
}
示例7
/**
 * Returns the text of the given .docx file.
 *
 * <p>The file stream, the document and the extractor are closed before the method
 * returns, including when reading fails.
 *
 * @param file the .docx file to read
 * @return the document text, or {@code null} when the file cannot be read or parsed
 *         (the failure is printed to standard error)
 */
public static String getDocxText(File file) {
    try (FileInputStream in = new FileInputStream(file);
            XWPFDocument docx = new XWPFDocument(in);
            XWPFWordExtractor extractor = new XWPFWordExtractor(docx)) {
        return extractor.getText();
    } catch (Exception e) {
        // Kept as a catch-all so callers still receive null instead of an exception.
        e.printStackTrace();
        return null;
    }
}
示例8
/**
 * Checks the media-type field of the document produced by {@code MSWordIndexer}
 * across three consecutive indexing calls.
 *
 * <p>Construction of {@code POIFSFileSystem} is stubbed to throw
 * {@code OfficeXmlFileException} on the first call, return a mock on the second and
 * throw {@code APIManagementException} on the third. Both extractors are mocked to
 * return an empty text.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    // Collaborator mocks.
    POIFSFileSystem mockedFileSystem = Mockito.mock(POIFSFileSystem.class);
    WordExtractor mockedDocExtractor = Mockito.mock(WordExtractor.class);
    XWPFWordExtractor mockedDocxExtractor = Mockito.mock(XWPFWordExtractor.class);
    XWPFDocument mockedDocx = Mockito.mock(XWPFDocument.class);

    // Constructor interception; the order of the chained answers matters.
    PowerMockito.whenNew(POIFSFileSystem.class)
            .withParameterTypes(InputStream.class)
            .withArguments(Mockito.any(InputStream.class))
            .thenThrow(OfficeXmlFileException.class)
            .thenReturn(mockedFileSystem)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(WordExtractor.class)
            .withArguments(mockedFileSystem)
            .thenReturn(mockedDocExtractor);
    PowerMockito.whenNew(XWPFDocument.class)
            .withParameterTypes(InputStream.class)
            .withArguments(Mockito.any())
            .thenReturn(mockedDocx);
    PowerMockito.whenNew(XWPFWordExtractor.class)
            .withArguments(mockedDocx)
            .thenReturn(mockedDocxExtractor);
    Mockito.when(mockedDocExtractor.getText()).thenReturn("");
    Mockito.when(mockedDocxExtractor.getText()).thenReturn("");

    MSWordIndexer wordIndexer = new MSWordIndexer();

    // First call: no media type set on file2Index, so the default one is expected.
    IndexDocument indexed = wordIndexer.getIndexedDocument(file2Index);
    Assert.assertTrue("application/pdf".equals(
            indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0)));

    // Second call: the media type set on file2Index is expected.
    file2Index.mediaType = "text/html";
    indexed = wordIndexer.getIndexedDocument(file2Index);
    Assert.assertTrue("text/html".equals(
            indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0)));

    // Third call: the configured media type is still expected even if an exception
    // occurred while reading the file.
    file2Index.mediaType = "text/html";
    indexed = wordIndexer.getIndexedDocument(file2Index);
    Assert.assertTrue("text/html".equals(
            indexed.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0)));
}