Java源码示例:morfologik.stemming.DictionaryLookup

示例1
public PolishStemmer() {
  synchronized (getClass()) {
    if (dictionary == null) {
      try {
        dictionary = AccessController.doPrivileged(new PrivilegedExceptionAction<Dictionary>() {
          @Override
          public Dictionary run() throws Exception {
            URL dictResource = getClass().getResource("polish.dict");
            if (dictResource == null) {
              throw new IOException("Polish dictionary resource not found.");
            }
            return Dictionary.read(dictResource);
          }
        });
      } catch (PrivilegedActionException e) {
        throw new RuntimeException("Could not read dictionary data.", e.getException());
      }
    }
  }

  lookup = new DictionaryLookup(dictionary);
}
 
示例2
/**
 * Suggests corrections for a misspelled word that may actually be two
 * dictionary words written together without a space (a "run-on" word).
 * The approach is inspired by spell.cc in the s_fsa package by Jan Daciuk.
 *
 * <p>Input conversion pairs from the dictionary metadata, if any, are applied
 * first. Nothing is suggested when the converted word is itself in the
 * dictionary or when the dictionary does not support run-on words.
 *
 * @param original
 *          The original misspelled word.
 * @return The suggested pairs as CandidateData whose text is the two parts
 *         joined by a single space; empty when no split is found.
 */
public List<CandidateData> replaceRunOnWordCandidates(final String original) {
  final List<CandidateData> suggestions = new ArrayList<>();

  final String word = dictionaryMetadata.getInputConversionPairs().isEmpty()
      ? original
      : DictionaryLookup.applyReplacements(original, dictionaryMetadata.getInputConversionPairs());

  if (isInDictionary(word) || !dictionaryMetadata.isSupportingRunOnWords()) {
    return suggestions;
  }

  final Locale locale = dictionaryMetadata.getLocale();
  // Try every split point, moving from left to right.
  for (int split = 1, end = word.length(); split < end; split++) {
    final String head = word.substring(0, split);
    final String tail = word.substring(split);
    if (!isInDictionary(tail)) {
      continue;
    }

    // The head is accepted either as-is, or in lowercase when it starts with
    // an uppercase letter (a word capitalized only because it opens a sentence).
    final boolean headAccepted = isInDictionary(head)
        || (Character.isUpperCase(head.charAt(0)) && isInDictionary(head.toLowerCase(locale)));
    if (headAccepted) {
      addReplacement(suggestions, head + " " + tail);
    }
  }
  return suggestions;
}
 
示例3
/**
 * Builds a filter over the given token stream that uses the supplied
 * dictionary for stemming.
 *
 * @param in input token stream.
 * @param dict Dictionary to use for stemming.
 */
public MorfologikFilter(final TokenStream in, final Dictionary dict) {
  super(in);
  // Start with an empty lemma list; the dictionary is wrapped in a lookup
  // that serves as the stemmer.
  this.lemmaList = Collections.emptyList();
  this.stemmer = new DictionaryLookup(dict);
  this.input = in;
}
 
示例4
/**
 * Appends a suggestion to the candidate list, first applying the dictionary's
 * output conversion pairs when any are defined. The candidate is created with
 * the value 1 as its second constructor argument.
 *
 * @param candidates list that receives the new candidate
 * @param replacement suggested replacement text
 */
private void addReplacement(List<CandidateData> candidates, String replacement) {
  final boolean hasOutputConversions = !dictionaryMetadata.getOutputConversionPairs().isEmpty();
  final String text = hasOutputConversions
      ? DictionaryLookup.applyReplacements(replacement, dictionaryMetadata.getOutputConversionPairs())
      : replacement;
  candidates.add(new CandidateData(text, 1));
}
 
示例5
/**
 * Looks up every non-empty input line in the dictionary and prints the result
 * to standard output: one {@code line => stem [tag]} row per match, or
 * {@code line => [not found]} when the lookup returns nothing.
 *
 * @return the status from argument validation if it is non-null, otherwise
 *         {@link ExitStatus#SUCCESS} once all input has been processed
 * @throws Exception if reading the dictionary or the input fails
 */
@Override
public ExitStatus call() throws Exception {
  final ExitStatus validationStatus = validateArguments();
  if (validationStatus != null) {
    return validationStatus;
  }

  final DictionaryLookup lookup = new DictionaryLookup(Dictionary.read(this.dictionary));
  try (final LineSupplier input = determineInput()) {
    for (String line = input.nextLine(); line != null; line = input.nextLine()) {
      if (line.isEmpty()) {
        continue;
      }

      final List<WordData> matches = lookup.lookup(line);
      if (matches.isEmpty()) {
        System.out.println(line + " => [not found]");
        continue;
      }

      for (WordData match : matches) {
        final CharSequence stem = match.getStem();
        final CharSequence tag = match.getTag();

        // The tag is appended only when tags are wanted and one is present.
        final String rendered;
        if (skipTags || tag == null) {
          rendered = String.valueOf(stem);
        } else {
          rendered = stem + " " + tag;
        }
        System.out.println(line + " => " + rendered);
      }
    }
  }

  return ExitStatus.SUCCESS;
}
 
示例6
/**
 * Decides whether a word is misspelled by running a series of checks driven
 * by the dictionary metadata. Input conversion pairs, if any, are applied to
 * the word before checking.
 *
 * The word is reported as correctly spelled (the method returns false) when:
 * <ul>
 * <li>it is empty;</li>
 * <li><code>fsa.dict.speller.ignore-punctuation</code> is set and the word is
 * a single non-alphabetic character;</li>
 * <li><code>fsa.dict.speller.ignore-numbers</code> is set and the word
 * contains a decimal digit;</li>
 * <li><code>fsa.dict.speller.ignore-camel-case</code> is set and the word is
 * CamelCase;</li>
 * <li><code>fsa.dict.speller.ignore-all-uppercase</code> is set and the word
 * is alphabetic and all uppercase;</li>
 * <li>it is found in the dictionary.</li>
 * </ul>
 *
 * Otherwise, if the dictionary does not perform case conversions
 * (<code>fsa.dict.speller.convert-case</code>), the word is misspelled. With
 * case conversions, a non-mixed-case word is still accepted when its
 * lowercase form is in the dictionary, or when it is all uppercase and its
 * initial-uppercase form is in the dictionary.
 *
 * @param word
 *          - the word to be checked
 * @return true if the word is misspelled
 **/
public boolean isMisspelled(final String word) {
  final String candidate = dictionaryMetadata.getInputConversionPairs().isEmpty()
      ? word
      : DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs());

  // Only single-character words are tested for being alphabetic; anything
  // else counts as alphabetic here (dictionaries usually lack punctuation).
  final boolean treatedAsAlphabetic = candidate.length() != 1 || isAlphabetic(candidate.charAt(0));

  if (candidate.length() <= 0) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringPunctuation() && !treatedAsAlphabetic) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringNumbers() && !containsNoDigit(candidate)) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(candidate)) {
    return false;
  }
  if (dictionaryMetadata.isIgnoringAllUppercase() && treatedAsAlphabetic && isAllUppercase(candidate)) {
    return false;
  }
  if (isInDictionary(candidate)) {
    return false;
  }

  // Not found verbatim: without case conversion, or for mixed-case words,
  // that is final.
  if (!dictionaryMetadata.isConvertingCase()) {
    return true;
  }
  if (isMixedCase(candidate)) {
    return true;
  }

  // Case-converted fallbacks: lowercase form, then initial-uppercase form
  // for all-uppercase words.
  if (isInDictionary(candidate.toLowerCase(dictionaryMetadata.getLocale()))) {
    return false;
  }
  return !(isAllUppercase(candidate) && isInDictionary(initialUppercase(candidate)));
}
 
示例7
/**
 * Compiles a small dictionary whose metadata uses '_' as the separator with
 * the SUFFIX encoder, checks that compilation succeeds and produces a regular
 * file, then reads the result back and prints every entry.
 */
@Test
public void testSeparatorInEncoded() throws Exception {
  final Path input = newTempDir().resolve("dictionary.input");
  final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input);

  // Write the metadata file next to the input.
  final char separator = '_';
  try (Writer metadataWriter = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) {
    DictionaryMetadata.builder()
        .separator(separator)
        .encoder(EncoderType.SUFFIX)
        .encoding(StandardCharsets.UTF_8)
        .build()
        .write(metadataWriter);
  }

  // A random number of rounds, each adding the same two entries to the set.
  final Set<String> sequences = new LinkedHashSet<>();
  int remaining = randomIntBetween(0, 100);
  while (remaining-- > 0) {
    sequences.add("anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
    sequences.add("Anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB");
  }

  // One entry per line in the input file.
  try (Writer inputWriter = Files.newBufferedWriter(input, StandardCharsets.UTF_8)) {
    for (String entry : sequences) {
      inputWriter.append(entry).append('\n');
    }
  }

  final ExitStatus status = new DictCompile(input, false, true, false, false, false).call();
  Assertions.assertThat(status).isEqualTo(ExitStatus.SUCCESS);

  final Path dict = input.resolveSibling("dictionary.dict");
  Assertions.assertThat(dict).isRegularFile();

  // Verify the dictionary is valid by reading it back and iterating over it.
  final DictionaryLookup compiled = new DictionaryLookup(Dictionary.read(dict));
  for (WordData entry : compiled) {
    System.out.println(entry);
  }
}
 
示例8
/**
 * Reads a dictionary in morfologik FSA format.
 * 
 * @param dictURL
 *          the URL containing the dictionary
 *          the language
 * @throws IllegalArgumentException
 *           if an exception is illegal
 * @throws IOException
 *           throws an exception if dictionary path is not correct
 */
public MorfologikLemmatizer(final URL dictURL)
    throws IOException {
  this.dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
}
 
示例9
/**
 * Reads a dictionary in morfologik FSA format.
 * 
 * @param dictURL
 *          the URL containing the dictionary
 * @param aLang
 *          the language
 * @throws IOException
 *           throws an exception if dictionary path is not correct
 */
public MorfologikTagger(final URL dictURL, final String aLang)
    throws IOException {
  this.dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
}