Java source code examples: edu.stanford.nlp.process.DocumentPreprocessor
Example 1
public static void main(String[] args) {
    try {
        Reader reader = new FileReader(getResourcePath());
        DocumentPreprocessor dp = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
        dp.setElementDelimiter("sentence");
        for (List<HasWord> sentence : dp) {
            ListIterator<HasWord> list = sentence.listIterator();
            while (list.hasNext()) {
                System.out.print(list.next() + " ");
            }
            System.out.println();
        }
    } catch (FileNotFoundException ex) {
        Logger.getLogger(XMLProcessingDemo.class.getName()).log(Level.SEVERE, null, ex);
    }
}
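Example 1 assumes an input document whose sentences are wrapped in <sentence> elements. A minimal self-contained sketch of the same idea (the XML content here is made up, and the usual imports from edu.stanford.nlp.process, edu.stanford.nlp.ling and java.io are assumed):

public static void main(String[] args) {
    // Hypothetical input; only the text inside <sentence> elements is tokenized
    String xml = "<doc>"
            + "<sentence>The quick brown fox jumps over the lazy dog.</sentence>"
            + "<sentence>It barely noticed.</sentence>"
            + "</doc>";
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(xml), DocumentPreprocessor.DocType.XML);
    dp.setElementDelimiter("sentence");
    for (List<HasWord> sentence : dp) {
        System.out.println(SentenceUtils.listToString(sentence));
    }
}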
Example 2
/**
 * Get an IDF map for the given document string.
 *
 * @param document the raw document text
 * @return a counter over the noun tokens that occur in the document
 */
private static Counter<String> getIDFMapForDocument(String document) {
    // Clean up -- remove some Gigaword patterns that slow things down
    // and don't help anything
    document = headingSeparator.matcher(document).replaceAll("");

    DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
    preprocessor.setTokenizerFactory(tokenizerFactory);

    Counter<String> idfMap = new ClassicCounter<String>();
    for (List<HasWord> sentence : preprocessor) {
        if (sentence.size() > MAX_SENTENCE_LENGTH)
            continue;

        List<TaggedWord> tagged = tagger.tagSentence(sentence);

        for (TaggedWord w : tagged) {
            if (w.tag().startsWith("n"))
                idfMap.incrementCount(w.word());
        }
    }
    return idfMap;
}
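Example 2 depends on fields declared elsewhere in its class (headingSeparator, tokenizerFactory, tagger, MAX_SENTENCE_LENGTH). A hedged sketch of plausible declarations; the regular expression and the length cutoff are illustrative guesses, and only the tagger model path is taken from Example 7 below:

// Illustrative declarations only; the original project's values may differ
private static final Pattern headingSeparator = Pattern.compile("^\\s*<[^>]*>\\s*$", Pattern.MULTILINE);
private static final int MAX_SENTENCE_LENGTH = 100;
private static final TokenizerFactory<CoreLabel> tokenizerFactory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
private static final MaxentTagger tagger =
        new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");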
Example 3
public static void main(String[] args) throws IOException {
    for (String arg : args) {
        // option #1: By sentence.
        DocumentPreprocessor dp = new DocumentPreprocessor(arg);
        for (List<HasWord> sentence : dp) {
            System.out.println(sentence);
        }
        // option #2: By token
        PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(new FileReader(arg), new CoreLabelTokenFactory(), "");
        while (ptbt.hasNext()) {
            CoreLabel label = ptbt.next();
            System.out.println(label);
        }
    }
}
Example 4
private static void usingStanfordDocumentPreprocessor() {
    // option #1: By sentence.
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    for (List<HasWord> sentence : dp) {
        System.out.println(sentence);
    }
    // try {
    //     Reader reader = new FileReader("XMLText.xml");
    //     DocumentPreprocessor dp = new DocumentPreprocessor(
    //             reader, DocumentPreprocessor.DocType.XML);
    //     dp.setElementDelimiter("sentence");
    //     for (List sentence : dp) {
    //         System.out.println(sentence);
    //     }
    // } catch (FileNotFoundException ex) {
    //     // Handle exception
    // }
    // // option #2: By token
    // PTBTokenizer ptbt = new PTBTokenizer(reader,
    //         new CoreLabelTokenFactory(), "");
    // CoreLabel label;
    // while (ptbt.hasNext()) {
    //     System.out.println(ptbt.next());
    // }
}
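Examples 4 and 8 read a paragraph field declared elsewhere in their class; a hypothetical declaration for trying them out standalone:

// Hypothetical sample text; the original field's content is not shown on this page
private static String paragraph =
        "The first sentence is here. The second sentence follows it. A third one ends the paragraph.";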
Example 5
public List<String> splitParagraph(String paragraph) {
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    for (List<HasWord> sentence : dp) {
        String sentenceString = SentenceUtils.listToString(sentence);
        sentenceList.add(sentenceString);
    }
    return sentenceList;
}
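A possible call site for the splitParagraph helper above (the sample text is made up):

List<String> sentences = splitParagraph("Dr. Smith went to Washington. He arrived on Tuesday.");
for (String s : sentences) {
    System.out.println(s);
}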
Example 6
/**
 * Construct a parse tree using the Stanford NLP parser. Only the first sentence
 * of the input is used, and the dependency labels (relation names) are omitted.
 *
 * @param text   input text
 * @param parser wrapper exposing the POS tagger and the dependency parser
 */
public ParseTree(String text, NLParser parser) {
    // pre-processing the input text
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    List<HasWord> sentence = null;
    for (List<HasWord> sentenceHasWord : tokenizer) {
        sentence = sentenceHasWord;
        break;
    }
    // part-of-speech tagging
    List<TaggedWord> tagged = parser.tagger.tagSentence(sentence);
    // dependency syntax parsing
    GrammaticalStructure gs = parser.parser.predict(tagged);
    // Reading the parsed sentence into ParseTree
    int N = sentence.size() + 1;
    Node[] nodes = new Node[N];
    root = new Node(0, "ROOT", "ROOT");
    nodes[0] = root;
    for (int i = 0; i < N - 1; i++) {
        nodes[i + 1] = new Node(i + 1,
                sentence.get(i).word(), tagged.get(i).tag());
    }
    for (TypedDependency typedDep : gs.allTypedDependencies()) {
        int from = typedDep.gov().index();
        int to = typedDep.dep().index();
        // String label = typedDep.reln().getShortName(); // omitting the label
        nodes[to].parent = nodes[from];
        nodes[from].children.add(nodes[to]);
    }
}
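Example 6 also assumes a Node type, a root field of that type on the enclosing ParseTree class, and an NLParser that exposes a MaxentTagger and a DependencyParser. A hedged sketch inferred from how those names are used; it is not the original project's code:

// Inferred from usage in the constructor above; names and fields are guesses
static class Node {
    int index;
    String word;
    String tag;
    Node parent;
    List<Node> children = new ArrayList<>();

    Node(int index, String word, String tag) {
        this.index = index;
        this.word = word;
        this.tag = tag;
    }
}

static class NLParser {
    MaxentTagger tagger;        // e.g. new MaxentTagger(taggerPath)
    DependencyParser parser;    // e.g. DependencyParser.loadFromModelFile(modelPath)
}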
Example 7
public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    for (int argIndex = 0; argIndex < args.length;) {
        switch (args[argIndex]) {
            case "-tagger":
                taggerPath = args[argIndex + 1];
                argIndex += 2;
                break;
            case "-model":
                modelPath = args[argIndex + 1];
                argIndex += 2;
                break;
            default:
                throw new RuntimeException("Unknown argument " + args[argIndex]);
        }
    }

    String text = "Return authors who have more papers than Bob in VLDB after 2000";

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        GrammaticalStructure gs = parser.predict(tagged);
        // Print typed dependencies
        log.info(gs);
    }
}
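The log.info(gs) call at the end mirrors Stanford's own DependencyParserDemo, which declares a Redwood logging channel. If you adapt the example, a declaration like the one below works (use the enclosing class in place of DependencyParserDemo), or simply call System.out.println(gs):

// Redwood is CoreNLP's logging utility (edu.stanford.nlp.util.logging.Redwood)
private static final Redwood.RedwoodChannels log = Redwood.channels(DependencyParserDemo.class);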
Example 8
private static void usingTheStanfordTokenizer() {
    // Using PTBTokenizer
    System.out.println("----PTBTokenizer Example");
    // First example
    // PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph),
    //         new CoreLabelTokenFactory(), null);
    // while (ptb.hasNext()) {
    //     System.out.println(ptb.next());
    // }

    // CoreLabel example
    CoreLabelTokenFactory ctf = new CoreLabelTokenFactory();
    PTBTokenizer<CoreLabel> ptb = new PTBTokenizer<>(new StringReader(paragraph),
            ctf, "invertible=true");
    // PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph),
    //         new WordTokenFactory(), null);
    while (ptb.hasNext()) {
        CoreLabel cl = ptb.next();
        System.out.println(cl.originalText() + " ("
                + cl.beginPosition() + "-" + cl.endPosition() + ")");
    }

    // Using a DocumentPreprocessor
    System.out.println("----DocumentPreprocessor Example");
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(reader);
    Iterator<List<HasWord>> it = documentPreprocessor.iterator();
    while (it.hasNext()) {
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            System.out.println(token);
        }
    }
    // for (List<HasWord> sentence : documentPreprocessor) {
    //     for (HasWord token : sentence) {
    //         System.out.println(token);
    //     }
    // }

    // Using a pipeline
    System.out.println("----pipeline Example");
    Properties properties = new Properties();
    properties.put("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
    Annotation annotation = new Annotation(paragraph);
    pipeline.annotate(annotation);
    pipeline.prettyPrint(annotation, System.out);
}