Java源码示例:morfologik.stemming.DictionaryLookup
示例1
public PolishStemmer() {
synchronized (getClass()) {
if (dictionary == null) {
try {
dictionary = AccessController.doPrivileged(new PrivilegedExceptionAction<Dictionary>() {
@Override
public Dictionary run() throws Exception {
URL dictResource = getClass().getResource("polish.dict");
if (dictResource == null) {
throw new IOException("Polish dictionary resource not found.");
}
return Dictionary.read(dictResource);
}
});
} catch (PrivilegedActionException e) {
throw new RuntimeException("Could not read dictionary data.", e.getException());
}
}
}
lookup = new DictionaryLookup(dictionary);
}
示例2
/**
 * Suggests two-word replacements for a word that looks like two dictionary
 * words written together. The approach follows spell.cc from the s_fsa
 * package by Jan Daciuk.
 *
 * <p>Input conversion pairs from the dictionary metadata, if any, are applied
 * first. Nothing is proposed when the (converted) word is itself in the
 * dictionary or when the dictionary does not support run-on words.
 *
 * @param original
 *          The original misspelled word.
 * @return The suggested pairs as CandidateData, each holding the two parts
 *         joined by a single space; empty when there is no suggestion.
 */
public List<CandidateData> replaceRunOnWordCandidates(final String original) {
  final List<CandidateData> suggestions = new ArrayList<>();

  final String word = dictionaryMetadata.getInputConversionPairs().isEmpty()
      ? original
      : DictionaryLookup.applyReplacements(original, dictionaryMetadata.getInputConversionPairs());

  if (isInDictionary(word) || !dictionaryMetadata.isSupportingRunOnWords()) {
    return suggestions;
  }

  final Locale locale = dictionaryMetadata.getLocale();
  // Try every split point, moving from left to right.
  for (int split = 1; split < word.length(); split++) {
    final String head = word.substring(0, split);
    final String tail = word.substring(split);
    if (!isInDictionary(tail)) {
      continue;
    }
    // The head is accepted as-is, or in lowercase when its capital letter may
    // only be there because the word started a sentence.
    final boolean headAccepted = isInDictionary(head)
        || (Character.isUpperCase(head.charAt(0)) && isInDictionary(head.toLowerCase(locale)));
    if (headAccepted) {
      addReplacement(suggestions, head + " " + tail);
    }
  }
  return suggestions;
}
示例3
/**
* Creates a filter with a given dictionary.
*
* @param in input token stream.
* @param dict Dictionary to use for stemming.
*/
public MorfologikFilter(final TokenStream in, final Dictionary dict) {
super(in);
this.input = in;
this.stemmer = new DictionaryLookup(dict);
this.lemmaList = Collections.emptyList();
}
示例4
/**
 * Appends a suggestion to the candidate list, applying the dictionary's
 * output conversion pairs first when any are defined.
 *
 * @param candidates the list the new candidate is appended to.
 * @param replacement the suggested replacement text before output conversion.
 */
private void addReplacement(List<CandidateData> candidates, String replacement) {
  String converted = replacement;
  if (!dictionaryMetadata.getOutputConversionPairs().isEmpty()) {
    converted = DictionaryLookup.applyReplacements(replacement,
        dictionaryMetadata.getOutputConversionPairs());
  }
  candidates.add(new CandidateData(converted, 1));
}
示例5
/**
 * Looks up every non-empty input line in the dictionary and prints the
 * result to standard output: one line per stem found (followed by the tag
 * unless tags are skipped or absent), or a "[not found]" marker.
 *
 * @return the status from argument validation if it reported one, otherwise
 *         {@code ExitStatus.SUCCESS} once all input has been processed.
 * @throws Exception if reading the dictionary or the input fails.
 */
@Override
public ExitStatus call() throws Exception {
  final ExitStatus validation = validateArguments();
  if (validation != null) {
    return validation;
  }

  final DictionaryLookup lookup = new DictionaryLookup(Dictionary.read(this.dictionary));
  try (final LineSupplier input = determineInput()) {
    for (String line = input.nextLine(); line != null; line = input.nextLine()) {
      if (line.isEmpty()) {
        continue;
      }

      final List<WordData> matches = lookup.lookup(line);
      if (matches.isEmpty()) {
        System.out.println(line + " => [not found]");
        continue;
      }

      for (WordData match : matches) {
        final CharSequence stem = match.getStem();
        final CharSequence tag = match.getTag();
        final String rendered;
        if (skipTags || tag == null) {
          rendered = line + " => " + stem;
        } else {
          rendered = line + " => " + stem + " " + tag;
        }
        System.out.println(rendered);
      }
    }
  }
  return ExitStatus.SUCCESS;
}
示例6
/**
 * Decides whether a word is misspelled by running a sequence of checks
 * driven by the dictionary metadata. Input conversion pairs, if any, are
 * applied to the word first. The checks run in this order, and the first
 * one that applies determines the result:
 *
 * <ol>
 * <li>An empty word is never misspelled.</li>
 * <li>If punctuation is ignored, a single non-alphabetic character is not
 * misspelled.</li>
 * <li>If numbers are ignored, a word containing a digit is not
 * misspelled.</li>
 * <li>If CamelCase is ignored, a CamelCase word is not misspelled.</li>
 * <li>If all-uppercase words are ignored, an all-uppercase word (other than
 * a single non-alphabetic character) is not misspelled.</li>
 * <li>A word found in the dictionary is not misspelled.</li>
 * <li>If the dictionary does not convert case, the word is misspelled.</li>
 * <li>A mixed-case word is misspelled.</li>
 * <li>A word whose lowercase form (in the dictionary locale) is in the
 * dictionary is not misspelled.</li>
 * <li>An all-uppercase word whose initial-uppercase form is in the
 * dictionary is not misspelled; anything else is misspelled.</li>
 * </ol>
 *
 * @param word
 *          - the word to be checked
 * @return true if the word is misspelled
 **/
public boolean isMisspelled(final String word) {
  final String candidate = dictionaryMetadata.getInputConversionPairs().isEmpty()
      ? word
      : DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs());

  // Dictionaries usually do not contain punctuation: only a one-character
  // word is ever classified as non-alphabetic here.
  final boolean treatedAsAlphabetic = candidate.length() != 1 || isAlphabetic(candidate.charAt(0));

  if (candidate.length() <= 0) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringPunctuation() && !treatedAsAlphabetic) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringNumbers() && !containsNoDigit(candidate)) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(candidate)) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringAllUppercase() && treatedAsAlphabetic && isAllUppercase(candidate)) {
    return false;
  }
  if (isInDictionary(candidate)) {
    return false;
  }

  // Not found verbatim: without case conversion there is nothing left to try.
  if (!dictionaryMetadata.isConvertingCase()) {
    return true;
  }
  if (isMixedCase(candidate)) {
    return true;
  }
  if (isInDictionary(candidate.toLowerCase(dictionaryMetadata.getLocale()))) {
    return false;
  }
  return !(isAllUppercase(candidate) && isInDictionary(initialUppercase(candidate)));
}
示例7
/**
 * Compiles a small dictionary whose entries use {@code '_'} as the separator
 * with the SUFFIX encoder, then checks that compilation succeeds, that the
 * output file exists, and that the compiled dictionary can be opened and
 * iterated.
 */
@Test
public void testSeparatorInEncoded() throws Exception {
  final Path input = newTempDir().resolve("dictionary.input");
  final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input);

  try (Writer metadataOut = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) {
    DictionaryMetadata.builder()
        .separator('_')
        .encoder(EncoderType.SUFFIX)
        .encoding(StandardCharsets.UTF_8)
        .build()
        .write(metadataOut);
  }

  // The set holds at most these two entries, or none when the random count is zero.
  final Set<String> entries = new LinkedHashSet<>();
  int remaining = randomIntBetween(0, 100);
  while (--remaining >= 0) {
    entries.add("anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
    entries.add("Anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
  }

  try (Writer inputOut = Files.newBufferedWriter(input, StandardCharsets.UTF_8)) {
    for (String entry : entries) {
      inputOut.write(entry);
      inputOut.write('\n');
    }
  }

  Assertions.assertThat(new DictCompile(input, false, true, false, false, false).call())
      .isEqualTo(ExitStatus.SUCCESS);

  final Path compiled = input.resolveSibling("dictionary.dict");
  Assertions.assertThat(compiled).isRegularFile();

  // Verify the dictionary is valid by reading it back and walking all entries.
  final DictionaryLookup compiledLookup = new DictionaryLookup(Dictionary.read(compiled));
  for (WordData entry : compiledLookup) {
    System.out.println(entry);
  }
}
示例8
/**
* Reads a dictionary in morfologik FSA format.
*
* @param dictURL
* the URL containing the dictionary
* the language
* @throws IllegalArgumentException
* if an exception is illegal
* @throws IOException
* throws an exception if dictionary path is not correct
*/
public MorfologikLemmatizer(final URL dictURL)
throws IOException {
this.dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
}
示例9
/**
* Reads a dictionary in morfologik FSA format.
*
* @param dictURL
* the URL containing the dictionary
* @param aLang
* the language
* @throws IOException
* throws an exception if dictionary path is not correct
*/
public MorfologikTagger(final URL dictURL, final String aLang)
throws IOException {
this.dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
}