Java源码示例:opennlp.tools.stemmer.Stemmer

示例1
/**
 * Runs one raw token through the processing steps in order: normalization,
 * lowercasing, optional accent removal and optional stemming. Each stage is
 * traced at {@code FINEST} level.
 *
 * @param token         the raw token text
 * @param language      language handed to the accent-dropping transformer
 * @param stemMode      stemming is skipped when this is {@code StemMode.NONE}
 * @param removeAccents whether accents are dropped before stemming
 * @param stemmer       stemmer applied when stemming is enabled
 * @return the fully processed token string
 */
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
                            Stemmer stemmer) {
    // 'token' is never reassigned here, so the logging lambdas can capture it directly.
    log.log(Level.FINEST, () -> "processToken '" + token + "'");

    String lowered = LinguisticsCase.toLowerCase(normalizer.normalize(token));
    final String unstemmed = removeAccents ? transformer.accentDrop(lowered, language) : lowered;

    if (stemMode == StemMode.NONE) {
        log.log(Level.FINEST, () -> "processed token is: " + unstemmed);
        return unstemmed;
    }

    final String stemmed = doStemming(unstemmed, stemmer);
    log.log(Level.FINEST, () -> "stem '" + unstemmed + "' to '" + stemmed + "'");
    log.log(Level.FINEST, () -> "processed token is: " + stemmed);
    return stemmed;
}
 
示例2
/**
 * Splits {@code input} into tokens and attaches a processed token string to each.
 *
 * <p>When no stemmer is available for the language/stem mode, the work is handed
 * to {@code simpleTokenizer} unchanged. Otherwise the input is scanned one code
 * point at a time: a token ends whenever the current or the following code point
 * type is not indexable, so a run of indexable code points forms one token and
 * every non-indexable code point becomes a token of its own.
 *
 * @param input         the text to tokenize; an empty string yields an empty result
 * @param language      language used to select the stemmer and to process tokens
 * @param stemMode      stemming mode used to select the stemmer and to process tokens
 * @param removeAccents whether accents are dropped while processing each token
 * @return the tokens in input order
 */
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
    if (input.isEmpty()) {
        return Collections.emptyList();
    }
    Stemmer stemmer = getStemmerForLanguage(language, stemMode);
    if (stemmer == null) {
        return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
    }

    final int length = input.length();
    List<Token> result = new ArrayList<>();
    int codePoint = input.codePointAt(0);
    TokenType currentType = SimpleTokenType.valueOf(codePoint);
    int start = 0;
    int end = Character.charCount(codePoint);
    while (end <= length) {
        // Past the last character a SPACE_CODE sentinel is used as the look-ahead
        // (presumably non-indexable, so the final token gets flushed - confirm).
        codePoint = (end < length) ? input.codePointAt(end) : SPACE_CODE;
        TokenType upcomingType = SimpleTokenType.valueOf(codePoint);
        boolean insideIndexableRun = currentType.isIndexable() && upcomingType.isIndexable();
        if (!insideIndexableRun) {
            String original = input.substring(start, end);
            String processed = processToken(original, language, stemMode, removeAccents, stemmer);
            result.add(new SimpleToken(original).setOffset(start)
                    .setType(currentType)
                    .setTokenString(processed));
            start = end;
            currentType = upcomingType;
        }
        // Advance by the width of the look-ahead code point (1 or 2 chars).
        end += Character.charCount(codePoint);
    }
    return result;
}
 
示例3
/**
 * Creates the Snowball stemmer that corresponds to the given language.
 *
 * @param language the language to stem; may be {@code null}
 * @param stemMode the requested stemming mode
 * @return a new {@code SnowballStemmer} for the language, or {@code null} when
 *         {@code language} is {@code null} or English, when {@code stemMode} is
 *         {@code StemMode.NONE}, or when the language has no mapping in this method
 */
private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
    log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode);
    // English is rejected here on purpose-looking grounds: no Snowball stemmer is
    // returned for it (presumably the caller handles English another way - confirm).
    if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) {
        return null;
    }
    SnowballStemmer.ALGORITHM alg;
    switch (language) {
        case DANISH:
            alg = SnowballStemmer.ALGORITHM.DANISH;
            break;
        case DUTCH:
            alg = SnowballStemmer.ALGORITHM.DUTCH;
            break;
        case FINNISH:
            alg = SnowballStemmer.ALGORITHM.FINNISH;
            break;
        case FRENCH:
            alg = SnowballStemmer.ALGORITHM.FRENCH;
            break;
        case GERMAN:
            alg = SnowballStemmer.ALGORITHM.GERMAN;
            break;
        case HUNGARIAN:
            alg = SnowballStemmer.ALGORITHM.HUNGARIAN;
            break;
        case IRISH:
            alg = SnowballStemmer.ALGORITHM.IRISH;
            break;
        case ITALIAN:
            alg = SnowballStemmer.ALGORITHM.ITALIAN;
            break;
        case NORWEGIAN_BOKMAL:
        case NORWEGIAN_NYNORSK:
            // Both Norwegian variants share the single Norwegian algorithm.
            alg = SnowballStemmer.ALGORITHM.NORWEGIAN;
            break;
        case PORTUGUESE:
            alg = SnowballStemmer.ALGORITHM.PORTUGUESE;
            break;
        case ROMANIAN:
            alg = SnowballStemmer.ALGORITHM.ROMANIAN;
            break;
        case RUSSIAN:
            alg = SnowballStemmer.ALGORITHM.RUSSIAN;
            break;
        case SPANISH:
            alg = SnowballStemmer.ALGORITHM.SPANISH;
            break;
        case SWEDISH:
            alg = SnowballStemmer.ALGORITHM.SWEDISH;
            break;
        case TURKISH:
            alg = SnowballStemmer.ALGORITHM.TURKISH;
            break;
        // No ENGLISH case: the guard above already returns for English, so such a
        // case could never execute.
        default:
            return null;
    }
    return new SnowballStemmer(alg);
}
 
示例4
/**
 * Stems a single token with the supplied stemmer.
 *
 * @param token   the token text to stem
 * @param stemmer the stemmer to apply
 * @return the stemmer's result converted to a {@code String}
 */
private String doStemming(String token, Stemmer stemmer) {
    CharSequence stemmed = stemmer.stem(token);
    return stemmed.toString();
}