Measure

The measure defines the formula by which the distance between strings is measured.

Use as:

from simstring.measure import CosineMeasure, JaccardMeasure, OverlapMeasure, DiceMeasure

But be careful: they are not identical to the usual definitions of these measures.

Cosine Measure is different to scipy.spatial.distance.cosine as it works on strings and not vectors.

Jaccard distance does not discard duplicates in its sets, unlike the commonly used definition. This means that 'fooo' is seen as more different from 'fo' than 'foo' is, which is a more useful way of looking at the string difference, but it is not the usual definition of the distance as implemented by scipy.spatial.distance.jaccard, Wikipedia, or any public calculator.

Cosine Measure

Bases: BaseMeasure

Source code in simstring\measure\cosine.py
class CosineMeasure(BaseMeasure):
    """Cosine similarity over sets of string features.

    For feature sets X and Y the similarity is
    ``|X & Y| / sqrt(|X| * |Y|)``.  The remaining methods derive size
    bounds from the query's feature count and the threshold ``alpha``.
    """

    def min_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``ceil(alpha**2 * query_size)``."""
        return int(math.ceil(alpha * alpha * query_size))

    def max_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``floor(query_size / alpha**2)``.

        ``alpha`` must be non-zero, otherwise the division raises
        ``ZeroDivisionError``.
        """
        return int(math.floor(query_size / (alpha * alpha)))

    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
        """Return ``ceil(alpha * sqrt(query_size * y_size))``."""
        return int(math.ceil(alpha * math.sqrt(query_size * y_size)))

    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
        """Return the cosine similarity of the distinct features in X and Y.

        Returns 0.0 when either input has no features instead of dividing
        by zero.
        """
        # Build each set exactly once: the inputs are only required to be
        # iterable, and a one-shot iterator would be empty on a second pass.
        x_features = set(X)
        y_features = set(Y)
        if not x_features or not y_features:
            return 0.0
        shared = len(x_features & y_features)
        return shared / math.sqrt(len(x_features) * len(y_features))

Jaccard Measure

Bases: BaseMeasure

Source code in simstring\measure\jaccard.py
class JaccardMeasure(BaseMeasure):
    """Jaccard similarity over sets of string features.

    For feature sets X and Y the similarity is ``|X & Y| / |X | Y|``.
    The remaining methods derive size bounds from the query's feature
    count and the threshold ``alpha``.
    """

    def min_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``ceil(alpha * query_size)``."""
        return int(math.ceil(alpha * query_size))

    def max_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``floor(query_size / alpha)``.

        ``alpha`` must be non-zero, otherwise the division raises
        ``ZeroDivisionError``.
        """
        return int(math.floor(query_size / alpha))

    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
        """Return ``ceil(alpha * (query_size + y_size) / (1 + alpha))``."""
        return int(math.ceil(alpha * (query_size + y_size) / (1 + alpha)))

    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
        """Return the Jaccard similarity of the distinct features in X and Y.

        Returns 0.0 when both inputs have no features instead of dividing
        by zero.
        """
        # Build each set exactly once: the inputs are only required to be
        # iterable, and a one-shot iterator would be empty on a second pass.
        x_features = set(X)
        y_features = set(Y)
        union_size = len(x_features | y_features)
        if union_size == 0:
            return 0.0
        return len(x_features & y_features) / union_size

OverlapMeasures

Bases: BaseMeasure

Source code in simstring\measure\overlap.py
class OverlapMeasure(BaseMeasure):
    """Overlap-coefficient similarity over sets of string features.

    For feature sets X and Y the similarity is
    ``|X & Y| / min(|X|, |Y|)``.  The upper size bound is a fixed value
    chosen at construction time rather than derived from the query.
    """

    def __init__(self, db=None, maxsize: int = 100) -> None:
        """Set the upper feature-size bound.

        Args:
            db: Optional object exposing ``max_feature_size()``.  When it is
                truthy its value is used as the upper bound.
            maxsize: Upper bound used when ``db`` is not given (or falsy).
        """
        super().__init__()
        if db:
            self.maxsize = db.max_feature_size()
        else:
            self.maxsize = maxsize

    def min_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``floor(query_size * alpha)``, but never less than 1."""
        # `or 1` replaces a floor of 0 so the lower bound is at least one.
        return math.floor(query_size * alpha) or 1

    def max_feature_size(self, query_size: int, alpha: float) -> int:
        """Return the fixed upper bound; both arguments are ignored."""
        return self.maxsize

    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
        """Return ``ceil(alpha * min(query_size, y_size))``."""
        return int(math.ceil(alpha * min(query_size, y_size)))

    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
        """Return the overlap coefficient of the distinct features in X and Y.

        The shared-feature count is divided by the size of the smaller set,
        which matches the threshold used by ``minimum_common_feature_count``.
        Returns 0.0 when either input has no features.
        """
        # Build each set exactly once: the inputs are only required to be
        # iterable, and a one-shot iterator would be empty on a second pass.
        x_features = set(X)
        y_features = set(Y)
        smaller = min(len(x_features), len(y_features))
        if smaller == 0:
            return 0.0
        return len(x_features & y_features) / smaller

Bases: BaseMeasure

Source code in simstring\measure\overlap.py
class LeftOverlapMeasure(BaseMeasure):
    """Asymmetric overlap: the fraction of X's features also present in Y.

    The similarity is ``1 - |X - Y| / |X|``, so only the first argument's
    size appears in the denominator.  The upper size bound is a fixed value
    chosen at construction time rather than derived from the query.
    """

    def __init__(self, db=None, maxsize: int = 100) -> None:
        """Set the upper feature-size bound.

        Args:
            db: Optional object exposing ``max_feature_size()``.  When it is
                truthy its value is used as the upper bound.
            maxsize: Upper bound used when ``db`` is not given (or falsy).
        """
        super().__init__()
        if db:
            self.maxsize = db.max_feature_size()
        else:
            self.maxsize = maxsize

    def min_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``floor(query_size * alpha)``, but never less than 1."""
        # `or 1` replaces a floor of 0 so the lower bound is at least one.
        return math.floor(query_size * alpha) or 1

    def max_feature_size(self, query_size: int, alpha: float) -> int:
        """Return the fixed upper bound; both arguments are ignored."""
        return self.maxsize

    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
        """Return ``floor(query_size * alpha)``, but never less than 1.

        ``y_size`` is accepted for interface compatibility and is not used.
        """
        return math.floor(query_size * alpha) or 1

    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
        """Return the fraction of X's distinct features that also occur in Y.

        Returns 0.0 when X has no features instead of dividing by zero.
        """
        # Build each set exactly once: the inputs are only required to be
        # iterable, and a one-shot iterator would be empty on a second pass.
        x_features = set(X)
        y_features = set(Y)
        if not x_features:
            return 0.0
        missing = len(x_features - y_features)
        return 1 - missing / len(x_features)

DiceMeasure

Bases: BaseMeasure

Source code in simstring\measure\dice.py
class DiceMeasure(BaseMeasure):
    """Dice-coefficient similarity over sets of string features.

    For feature sets X and Y the similarity is
    ``2 * |X & Y| / (|X| + |Y|)``.  The remaining methods derive size
    bounds from the query's feature count and the threshold ``alpha``.
    """

    def min_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``ceil(alpha / (2 - alpha) * query_size)``."""
        return int(math.ceil(alpha / (2 - alpha) * query_size))

    def max_feature_size(self, query_size: int, alpha: float) -> int:
        """Return ``floor((2 - alpha) * query_size / alpha)``.

        ``alpha`` must be non-zero, otherwise the division raises
        ``ZeroDivisionError``.
        """
        return int(math.floor((2 - alpha) * query_size / alpha))

    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
        """Return ``ceil(0.5 * alpha * (query_size + y_size))``.

        This follows from requiring ``2 * common / (query_size + y_size)``
        to be at least ``alpha``.
        """
        # The sizes are summed, not multiplied: a product would demand far
        # more shared features than the Dice threshold actually requires.
        return int(math.ceil(0.5 * alpha * (query_size + y_size)))

    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
        """Return the Dice coefficient of the distinct features in X and Y.

        Returns 0.0 when both inputs have no features instead of dividing
        by zero.
        """
        # Build each set exactly once: the inputs are only required to be
        # iterable, and a one-shot iterator would be empty on a second pass.
        x_features = set(X)
        y_features = set(Y)
        total = len(x_features) + len(y_features)
        if total == 0:
            return 0.0
        return len(x_features & y_features) * 2.0 / total