Feature extractors

Bases: BaseFeatureExtractor

Source code in simstring\feature_extractor\character_ngram.py
class CharacterNgramFeatureExtractor(BaseFeatureExtractor):
    """Feature extractor that works on character n-grams of a string.

    The input is padded on both sides with ``n - 1`` ``'$'`` characters
    before the n-grams are taken, so the beginning and the end of the
    string contribute features of their own.
    """

    def __init__(self, n:int=2):
        """Store the n-gram size.

        Args:
            n: Number of characters per n-gram (2 by default).
        """
        self.n = n

    def features(self, string: str) -> List[str]:
        """Return the character n-gram features of ``string``.

        Args:
            string: Text to extract features from.

        Returns:
            Whatever the inherited ``uniquify_list`` helper produces from
            the n-grams built by the inherited ``_each_cons`` helper.
            NOTE(review): both helpers live on the base class and are not
            visible here -- presumably a sliding window followed by
            de-duplication; confirm against ``BaseFeatureExtractor``.
        """
        # n - 1 sentinel characters on each side; empty when n == 1.
        padding = '$' * (self.n - 1)
        padded = padding + string + padding
        ngrams = self._each_cons(padded, self.n)
        return self.uniquify_list(ngrams)

Bases: BaseFeatureExtractor

Source code in simstring\feature_extractor\word_ngram.py
class WordNgramFeatureExtractor(BaseFeatureExtractor):
    """Feature extractor that works on word n-grams of a text.

    Words are obtained with a plain ``str.split`` on a configurable
    separator. For more elaborate tokenization (e.g. with NLTK),
    implement your own extractor instead.
    """

    def __init__(self, n=2, splitter=" "):
        """Store the n-gram size and the word separator.

        Args:
            n: Number of words per n-gram (2 by default).
            splitter: Separator passed to ``str.split`` (a single space
                by default).
        """
        self.n = n
        self.splitter = splitter

    def features(self, text: str) -> List[str]:
        """Return the word n-gram features of ``text``.

        Args:
            text: Text to extract features from.

        Returns:
            The result of the inherited ``_words_ngram`` helper, called
            with the split words, ``self.n`` and ``SENTINAL_CHAR``.
            NOTE(review): the helper is defined on the base class and is
            not visible here; confirm the element type it returns.
        """
        # Split on the configured separator, not on arbitrary whitespace:
        # consecutive separators therefore yield empty-string tokens.
        tokens = text.split(self.splitter)
        ngrams = self._words_ngram(tokens, self.n, SENTINAL_CHAR)
        return ngrams

Bases: BaseFeatureExtractor

Source code in simstring\feature_extractor\mecab_ngram.py
class MecabNgramFeatureExtractor(BaseFeatureExtractor):
    """Feature extractor that works on n-grams of MeCab tokens.

    The text is tokenized with a ``MecabTokenizer`` and the surface form
    of every token is used as a word.
    """

    def __init__(self, n=2, user_dic_path='', sys_dic_path=''):
        """Store the n-gram size and create the tokenizer.

        Args:
            n: Number of tokens per n-gram (2 by default).
            user_dic_path: Forwarded as the first argument of
                ``MecabTokenizer`` (presumably a user dictionary path;
                empty string by default).
            sys_dic_path: Forwarded as the second argument of
                ``MecabTokenizer`` (presumably a system dictionary path;
                empty string by default).
        """
        self.n = n
        self.mecab = MecabTokenizer(user_dic_path, sys_dic_path)

    def features(self, text: str) -> List[str]:
        """Return the token n-gram features of ``text``.

        Args:
            text: Text to tokenize and extract features from.

        Returns:
            The result of the inherited ``_words_ngram`` helper, called
            with the token surfaces, ``self.n`` and ``SENTINAL_CHAR``.
            NOTE(review): the helper is defined on the base class and is
            not visible here; confirm the element type it returns.
        """
        # Collect the surface string of each token, in tokenizer order.
        surfaces = []
        for token in self.mecab.tokenize(text):
            surfaces.append(token.surface())
        return self._words_ngram(surfaces, self.n, SENTINAL_CHAR)