Feature extractors
Bases: BaseFeatureExtractor
Source code in simstring\feature_extractor\character_ngram.py
class CharacterNgramFeatureExtractor(BaseFeatureExtractor):
    """Feature extractor that works on character n-grams of a string.

    The input is padded on both sides with ``n - 1`` ``'$'`` characters
    before being handed to the inherited ``_each_cons`` helper, and the
    result is passed through the inherited ``uniquify_list`` helper.
    NOTE(review): both helpers are presumably defined on
    ``BaseFeatureExtractor``; their exact output is not visible here.
    """

    def __init__(self, n: int = 2):
        """Store the n-gram size.

        Args:
            n: Length of each character n-gram (defaults to 2).
        """
        self.n = n

    def features(self, string: str) -> List[str]:
        """Return the features extracted from ``string``.

        Args:
            string: Text to extract character n-grams from.

        Returns:
            Whatever ``uniquify_list`` produces for the n-grams of the
            padded string.
        """
        # n - 1 marker characters on each side of the input; for n == 1
        # the padding is empty.
        edge_padding = '$' * (self.n - 1)
        padded_string = edge_padding + string + edge_padding
        ngrams = self._each_cons(padded_string, self.n)
        return self.uniquify_list(ngrams)
Bases: BaseFeatureExtractor
Source code in simstring\feature_extractor\word_ngram.py
class WordNgramFeatureExtractor(BaseFeatureExtractor):
    """Feature extractor that works on word n-grams of a text.

    Tokenization is a plain ``str.split`` on the configured ``splitter``
    (a single space by default). If you need a more sophisticated
    tokenizer, for example one from a library such as NLTK, implement
    your own extractor instead.
    """

    def __init__(self, n=2, splitter=" "):
        """Store the n-gram size and the token separator.

        Args:
            n: Number of words per n-gram (defaults to 2).
            splitter: Separator passed to ``str.split`` (defaults to a
                single space).
        """
        self.splitter = splitter
        self.n = n

    def features(self, text: str) -> List[str]:
        """Return the word n-gram features of ``text``.

        Args:
            text: Text to split into words.

        Returns:
            The result of the inherited ``_words_ngram`` helper, called
            with the tokens, ``self.n`` and ``SENTINAL_CHAR``.
            NOTE(review): the helper is presumably defined on
            ``BaseFeatureExtractor``; its exact output is not visible here.
        """
        # With an explicit separator, str.split keeps empty strings for
        # consecutive separators (e.g. double spaces).
        tokens = text.split(self.splitter)
        return self._words_ngram(tokens, self.n, SENTINAL_CHAR)
Bases: BaseFeatureExtractor
Source code in simstring\feature_extractor\mecab_ngram.py
class MecabNgramFeatureExtractor(BaseFeatureExtractor):
    """Feature extractor that builds word n-grams from MeCab tokens.

    The text is tokenized with a ``MecabTokenizer`` and the surface form
    of each token is used as a word.
    """

    def __init__(self, n=2, user_dic_path='', sys_dic_path=''):
        """Store the n-gram size and create the tokenizer.

        Args:
            n: Number of words per n-gram (defaults to 2).
            user_dic_path: Forwarded to ``MecabTokenizer`` as its first
                argument (defaults to an empty string).
            sys_dic_path: Forwarded to ``MecabTokenizer`` as its second
                argument (defaults to an empty string).
        """
        self.n = n
        self.mecab = MecabTokenizer(user_dic_path, sys_dic_path)

    def features(self, text: str) -> List[str]:
        """Return the word n-gram features of ``text``.

        Args:
            text: Text to tokenize.

        Returns:
            The result of the inherited ``_words_ngram`` helper, called
            with the token surfaces, ``self.n`` and ``SENTINAL_CHAR``.
            NOTE(review): the helper is presumably defined on
            ``BaseFeatureExtractor``; its exact output is not visible here.
        """
        tokens = self.mecab.tokenize(text)
        surfaces = [token.surface() for token in tokens]
        return self._words_ngram(surfaces, self.n, SENTINAL_CHAR)