Java源码示例:opennlp.tools.stemmer.Stemmer
示例1
/**
 * Produces the indexed form of a single raw token.
 *
 * <p>The token is passed through {@code normalizer.normalize}, lower-cased with
 * {@code LinguisticsCase.toLowerCase}, optionally stripped of accents, and
 * optionally stemmed. Each stage is traced at {@code FINEST} level.
 *
 * @param token         the raw token text
 * @param language      the language handed to the accent-dropping transformer
 * @param stemMode      stemming is skipped when this is {@code StemMode.NONE}
 * @param removeAccents whether {@code transformer.accentDrop} is applied
 * @param stemmer       the stemmer used when stemming is enabled
 * @return the processed token string
 */
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
                            Stemmer stemmer) {
    // 'token' is never reassigned here, so it can be captured by the logging lambda directly.
    log.log(Level.FINEST, () -> "processToken '"+token+"'");

    final String lowered = LinguisticsCase.toLowerCase(normalizer.normalize(token));
    final String prepared = removeAccents ? transformer.accentDrop(lowered, language) : lowered;

    final String result;
    if (stemMode == StemMode.NONE) {
        result = prepared;
    } else {
        final String stemmed = doStemming(prepared, stemmer);
        log.log(Level.FINEST, () -> "stem '"+prepared+"' to '"+stemmed+"'");
        result = stemmed;
    }

    log.log(Level.FINEST, () -> "processed token is: "+result);
    return result;
}
示例2
/**
 * Splits {@code input} into tokens and attaches a processed token string to each.
 *
 * <p>Empty input yields an empty list. When no stemmer is available for the given
 * language and stem mode, the work is delegated to {@code simpleTokenizer}.
 * Otherwise the input is scanned code point by code point and a token is emitted
 * at every position where the current run's type or the upcoming code point's
 * type is not indexable.
 *
 * @param input         the text to tokenize
 * @param language      the language used to select a stemmer and process tokens
 * @param stemMode      the stemming mode
 * @param removeAccents whether accents are dropped from the token strings
 * @return the tokens in input order
 */
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
    if (input.isEmpty()) {
        return Collections.emptyList();
    }

    Stemmer stemmer = getStemmerForLanguage(language, stemMode);
    if (stemmer == null) {
        return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
    }

    final int length = input.length();
    List<Token> result = new ArrayList<>();

    int codePoint = input.codePointAt(0);
    TokenType currentType = SimpleTokenType.valueOf(codePoint);
    int start = 0;
    int end = Character.charCount(codePoint);

    while (end <= length) {
        // Past the last character a SPACE_CODE sentinel is used so the final run is flushed.
        codePoint = (end < length) ? input.codePointAt(end) : SPACE_CODE;
        TokenType upcomingType = SimpleTokenType.valueOf(codePoint);

        // Keep extending the current run only while both sides of the boundary are indexable.
        boolean continuesRun = currentType.isIndexable() && upcomingType.isIndexable();
        if (!continuesRun) {
            String original = input.substring(start, end);
            String processed = processToken(original, language, stemMode, removeAccents, stemmer);
            Token emitted = new SimpleToken(original).setOffset(start)
                                                     .setType(currentType)
                                                     .setTokenString(processed);
            result.add(emitted);
            start = end;
            currentType = upcomingType;
        }

        end += Character.charCount(codePoint);
    }
    return result;
}
示例3
/**
 * Selects a Snowball stemmer for the given language.
 *
 * <p>Returns {@code null} when {@code language} is {@code null} or
 * {@code Language.ENGLISH}, when {@code stemMode} is {@code StemMode.NONE}, or when
 * the language has no entry in the mapping below. English is rejected by the
 * initial guard, so it intentionally has no case in the switch (a case for it
 * could never be reached).
 *
 * @param language the language to find a stemmer for; may be {@code null}
 * @param stemMode the requested stemming mode
 * @return a new {@code SnowballStemmer} for the language, or {@code null} if none applies
 */
private Stemmer getStemmerForLanguage(Language language, StemMode stemMode) {
    log.log(Level.FINEST, () -> "getStemmerForLanguage '"+language+"' mode: "+stemMode);
    if (language == null || Language.ENGLISH.equals(language) || StemMode.NONE.equals(stemMode)) {
        return null;
    }
    SnowballStemmer.ALGORITHM alg;
    switch (language) {
        case DANISH:
            alg = SnowballStemmer.ALGORITHM.DANISH;
            break;
        case DUTCH:
            alg = SnowballStemmer.ALGORITHM.DUTCH;
            break;
        case FINNISH:
            alg = SnowballStemmer.ALGORITHM.FINNISH;
            break;
        case FRENCH:
            alg = SnowballStemmer.ALGORITHM.FRENCH;
            break;
        case GERMAN:
            alg = SnowballStemmer.ALGORITHM.GERMAN;
            break;
        case HUNGARIAN:
            alg = SnowballStemmer.ALGORITHM.HUNGARIAN;
            break;
        case IRISH:
            alg = SnowballStemmer.ALGORITHM.IRISH;
            break;
        case ITALIAN:
            alg = SnowballStemmer.ALGORITHM.ITALIAN;
            break;
        // Both written Norwegian standards share the single Norwegian algorithm.
        case NORWEGIAN_BOKMAL:
        case NORWEGIAN_NYNORSK:
            alg = SnowballStemmer.ALGORITHM.NORWEGIAN;
            break;
        case PORTUGUESE:
            alg = SnowballStemmer.ALGORITHM.PORTUGUESE;
            break;
        case ROMANIAN:
            alg = SnowballStemmer.ALGORITHM.ROMANIAN;
            break;
        case RUSSIAN:
            alg = SnowballStemmer.ALGORITHM.RUSSIAN;
            break;
        case SPANISH:
            alg = SnowballStemmer.ALGORITHM.SPANISH;
            break;
        case SWEDISH:
            alg = SnowballStemmer.ALGORITHM.SWEDISH;
            break;
        case TURKISH:
            alg = SnowballStemmer.ALGORITHM.TURKISH;
            break;
        default:
            // No Snowball algorithm is mapped for this language.
            return null;
    }
    return new SnowballStemmer(alg);
}
示例4
/**
 * Applies the given stemmer to a token.
 *
 * @param token   the token to stem
 * @param stemmer the stemmer to apply
 * @return the string form of the stemmer's result
 */
private String doStemming(String token, Stemmer stemmer) {
    CharSequence stemmed = stemmer.stem(token);
    return stemmed.toString();
}