Searcher

Source code in simstring\searcher.py
class Searcher:
    """Approximate string search over a feature-indexed string database.

    The searcher extracts features from a query string, asks the similarity
    measure which candidate feature-set sizes and overlap counts can satisfy
    the requested threshold, and collects the database strings that share
    enough features with the query.
    """

    def __init__(self, db, measure) -> None:
        """Searcher class

        This is the main way of interacting with the simstring search.

        Args:
            db (database): A database, can be a dict or mongo one as defined by the `database` module
            measure (measure): The similarity measure as defined by `measure`
        """
        self.db = db
        self.measure = measure
        self.feature_extractor = db.feature_extractor
        # Memo of database lookups: feature-set size -> feature -> strings.
        self.lookup_strings_result: dict = defaultdict(dict)

    def search(self, query_string: str, alpha: float) -> List[str]:
        """Return the database strings that match `query_string`.

        Args:
            query_string: The string to look up.
            alpha: Similarity threshold, forwarded unchanged to the measure.

        Returns:
            The matching strings, grouped by increasing candidate
            feature-set size. No ordering is applied within a group.
        """
        # Work on a private copy: __overlap_join sorts the list it is given
        # in place, and the list returned by the feature extractor is not
        # ours to reorder (the extractor may cache or reuse it).
        features = list(self.feature_extractor.features(query_string))
        lf = len(features)
        min_feature_size = self.measure.min_feature_size(lf, alpha)
        max_feature_size = self.measure.max_feature_size(lf, alpha)
        results = []

        for candidate_feature_size in range(min_feature_size, max_feature_size + 1):
            tau = self.__min_overlap(lf, candidate_feature_size, alpha)
            results.extend(self.__overlap_join(features, tau, candidate_feature_size))
        return results

    def ranked_search(self, query_string: str, alpha: float) -> List[Tuple[float, str]]:
        """Return the matches for `query_string` together with their scores.

        Args:
            query_string: The string to look up.
            alpha: Similarity threshold, forwarded unchanged to the measure.

        Returns:
            `(score, string)` tuples sorted by descending score; ties are
            broken by the string in ascending order.
        """
        results = self.search(query_string, alpha)
        features = self.feature_extractor.features(query_string)
        results_with_score = [
            (
                self.measure.similarity(
                    features, self.feature_extractor.features(name)
                ),
                name,
            )
            for name in results
        ]
        results_with_score.sort(key=lambda pair: (-pair[0], pair[1]))
        return results_with_score

    def __min_overlap(
        self, query_size: int, candidate_feature_size: int, alpha: float
    ) -> int:
        """Return the minimum number of shared features the measure requires."""
        return self.measure.minimum_common_feature_count(
            query_size, candidate_feature_size, alpha
        )

    def __overlap_join(self, features, tau, candidate_feature_size: int) -> List[str]:
        """Find strings of one feature-set size sharing at least `tau` features.

        Note:
            `features` is sorted in place (rarest feature first), so callers
            must pass a list they own.

        Args:
            features: The query features (a list; reordered by this call).
            tau: Minimum number of features a match must share with the query.
            candidate_feature_size: Feature-set size of the strings to consider.

        Returns:
            The strings of that feature-set size with at least `tau` features
            in common with the query.
        """
        query_feature_size = len(features)

        features_mapped_to_lookup_strings_sets = {
            feature: self.__lookup_strings_by_feature_set_size_and_feature(
                candidate_feature_size, feature
            )
            for feature in features
        }

        # Rarest features first keeps the candidate pool small.
        features.sort(
            key=lambda feature: len(features_mapped_to_lookup_strings_sets[feature])
        )

        # A string sharing at least tau of the query's features must contain
        # at least one of any (query_feature_size - tau + 1) of them, so only
        # this prefix is needed to enumerate every possible candidate.
        prefix_size = query_feature_size - tau + 1

        candidate_string_to_matched_count: Dict[str, int] = defaultdict(int)
        for feature in features[:prefix_size]:
            for s in features_mapped_to_lookup_strings_sets[feature]:
                candidate_string_to_matched_count[s] += 1

        # With tau == 1 the prefix covers every feature and there is nothing
        # left to verify: every collected candidate already qualifies.
        if tau == 1:
            return list(candidate_string_to_matched_count)

        results = []
        for (
            candidate,
            candidate_match_count,
        ) in candidate_string_to_matched_count.items():
            # Verify the candidate against the features outside the prefix.
            for i in range(prefix_size, query_feature_size):
                if candidate in features_mapped_to_lookup_strings_sets[features[i]]:
                    candidate_match_count += 1
                if candidate_match_count >= tau:
                    results.append(candidate)
                    break
                # Give up once the remaining features cannot reach tau.
                remaining_feature_count = query_feature_size - i - 1
                if candidate_match_count + remaining_feature_count < tau:
                    break

        return results

    def __lookup_strings_by_feature_set_size_and_feature(self, feature_size: int, feature: str):
        """Return the database lookup for (`feature_size`, `feature`), memoized.

        The database is queried only the first time a given pair is
        requested; later calls return the stored result.
        """
        if feature not in self.lookup_strings_result[feature_size]:
            self.lookup_strings_result[feature_size][
                feature
            ] = self.db.lookup_strings_by_feature_set_size_and_feature(
                feature_size, feature
            )
        return self.lookup_strings_result[feature_size][feature]

__init__(db, measure)

Searcher class

This is the main way of interacting with the simstring search.

Parameters:

Name Type Description Default
db database

A database; can be a dict or a mongo one, as defined by the database module

required
measure measure

The similarity measure as defined by measure

required
Source code in simstring\searcher.py
def __init__(self, db, measure) -> None:
    """Searcher class

    This is the main way of interacting with the simstring search.

    Args:
        db (database): A database, can be a dict or mongo one as defined by the `database` module
        measure (measure): The similarity measure as defined by `measure`
    """
    self.db = db
    self.measure = measure
    # The searcher uses the database's own feature extractor.
    self.feature_extractor = db.feature_extractor
    # Starts empty; defaultdict(dict) creates the inner dict for a key on first access.
    self.lookup_strings_result: dict = defaultdict(dict)