Java源码示例:org.apache.poi.xwpf.extractor.XWPFWordExtractor

示例1
private static String readDoc (String filePath, InputStream is) throws Exception {
    String text= "";
    is = FileMagic.prepareToCheckMagic(is);
    try {
        if (FileMagic.valueOf(is) == FileMagic.OLE2) {
            WordExtractor ex = new WordExtractor(is);
            text = ex.getText();
            ex.close();
        } else if(FileMagic.valueOf(is) == FileMagic.OOXML) {
            XWPFDocument doc = new XWPFDocument(is);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            text = extractor.getText();
            extractor.close();
        }
    } catch (OfficeXmlFileException e) {
        logger.error(filePath, e);
    } finally {
        if (is != null) {
            is.close();
        }
    }
    return text;
}
 
示例2
private String microsoftWordDocumentToString(InputStream inputStream) throws IOException {
    String strRet;

    try (InputStream wordStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(wordStream)) {
            WordExtractor wordExtractor = new WordExtractor(wordStream);
            strRet = wordExtractor.getText();
            wordExtractor.close();
        } else {
            XWPFWordExtractor wordXExtractor = new XWPFWordExtractor(new XWPFDocument(wordStream));
            strRet = wordXExtractor.getText();
            wordXExtractor.close();
        }
    }

    return strRet;
}
 
示例3
/**
 * Gets the textual element of the .docx at the given {@link URI}.
 * 
 * @param uriConverter
 *            the {@link URIConverter}
 * @param uri
 *            the .docx {@link URI}
 * @return the textual element of the .docx at the given {@link URI}
 */
public static String getTextContent(URIConverter uriConverter, URI uri) {
    String result = "";

    try (InputStream is = uriConverter.createInputStream(uri);
            OPCPackage oPackage = OPCPackage.open(is);
            XWPFDocument document = new XWPFDocument(oPackage);
            XWPFWordExtractor ex = new XWPFWordExtractor(document);) {

        result += "===== Document Text ====\n";
        result += ex.getText();
        // CHECKSTYLE:OFF
    } catch (Throwable e) {
        // CHECKSTYLE:ON
        /*
         * if for some reason we can't use POI to get the text content then move along, we'll still get the XML and hashs
         */
    }
    return result;
}
 
示例4
public static String Parse07(String FilePath) throws IOException, XmlException, OpenXML4JException{
    String text2007=null;
    try{
        OPCPackage opcPackage = POIXMLDocument.openPackage(FilePath);
        POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
        text2007 = extractor.getText();

    } catch (Exception e) {
        e.printStackTrace();
    }
    return text2007;
}
 
示例5
/**
 * 文本提取
 *
 * @param filename
 * @throws IOException
 */
public static void extractor(String filename) throws IOException {
    XWPFDocument docx = new XWPFDocument(new FileInputStream(filename));
    // using XWPFWordExtractor Class
    XWPFWordExtractor we = new XWPFWordExtractor(docx);
    System.out.println(we.getText());
}
 
示例6
@Override
public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
  try {
    OPCPackage opc = OPCPackage.open(new ByteArrayInputStream(cc.getContent()));
    XWPFWordExtractor extractor = new XWPFWordExtractor(opc);
    String s = extractor.getText();
    char[] c = s.toCharArray();
    handler.startRegion("document");
    handler.text(c, 0, c.length);
    handler.endRegion();
  } catch (Exception e) {
    throw new OntopiaRuntimeException(e);
  }    
}
 
示例7
public static String getDocxText(File file) {
    try {
        XWPFDocument docx = new XWPFDocument(new FileInputStream(file));
        XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
        String text = extractor.getText();
        return text;
    }
    catch(Exception e) {
        e.printStackTrace();
    }
    return null;
}
 
示例8
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws Exception {
    POIFSFileSystem poiFS = Mockito.mock(POIFSFileSystem.class);
    WordExtractor wordExtractor = Mockito.mock(WordExtractor.class);
    XWPFWordExtractor xwpfExtractor = Mockito.mock(XWPFWordExtractor.class);
    XWPFDocument xwpfDocument = Mockito.mock(XWPFDocument.class);
    PowerMockito.whenNew(POIFSFileSystem.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any(InputStream.class))
            .thenThrow(OfficeXmlFileException.class)
            .thenReturn(poiFS)
            .thenThrow(APIManagementException.class);
    PowerMockito.whenNew(WordExtractor.class).withArguments(poiFS).thenReturn(wordExtractor);
    PowerMockito.whenNew(XWPFDocument.class).withParameterTypes(InputStream.class)
            .withArguments(Mockito.any())
            .thenReturn(xwpfDocument);
    PowerMockito.whenNew(XWPFWordExtractor.class).withArguments(xwpfDocument).thenReturn(xwpfExtractor);
    Mockito.when(wordExtractor.getText()).thenReturn("");
    Mockito.when(xwpfExtractor.getText()).thenReturn("");
    MSWordIndexer indexer = new MSWordIndexer();

    IndexDocument wordDoc = indexer.getIndexedDocument(file2Index);

    // should return the default media type when media type is not defined in file2Index
    if (!"application/pdf".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index
    file2Index.mediaType = "text/html";
    wordDoc = indexer.getIndexedDocument(file2Index);
    if (!"text/html".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index even if exception occurred while reading the file
    file2Index.mediaType = "text/html";
    wordDoc = indexer.getIndexedDocument(file2Index);
    if (!"text/html".equals(wordDoc.getFields().get(IndexingConstants.FIELD_MEDIA_TYPE).get(0))) {
        Assert.fail();
    }
}