# Source code for pl_fuzzy_frame_match.models

from dataclasses import dataclass
from typing import Literal

FuzzyTypeLiteral = Literal["levenshtein", "jaro", "jaro_winkler", "hamming", "damerau_levenshtein", "indel"]



# [docs] -- documentation-page link marker left over from HTML extraction; not code.
@dataclass
class JoinMap:
    """Pair of column names identifying the two sides of a join.

    Attributes:
        left_col (str): Name of the join column in the left dataframe.
        right_col (str): Name of the join column in the right dataframe.
    """

    left_col: str
    right_col: str




# [docs] -- documentation-page link marker left over from HTML extraction; not code.
@dataclass
class FuzzyMapping(JoinMap):
    """Represents the configuration for a fuzzy string match between two columns.

    This class defines all the necessary parameters to perform a fuzzy join,
    including the columns to match, the specific algorithm to use, and the
    similarity threshold required to consider two strings a match.

    It generates a default name for the output score column if one is not
    provided.

    Attributes:
        left_col (str): The name of the column in the left dataframe to join on.
        right_col (str): The name of the column in the right dataframe to join on.
            Falls back to ``left_col`` when not supplied to the constructor.
        threshold_score (float): The similarity score threshold required for a
            match, typically on a scale of 0 to 100. Defaults to 80.0.
        fuzzy_type (FuzzyTypeLiteral): The string-matching algorithm to use.
            Defaults to "levenshtein".
        perc_unique (float): A parameter that may be used to assess column
            uniqueness before performing a costly fuzzy match. It is only
            stored by this class. Defaults to 0.0.
        output_column_name (str | None): The name for the new column that will
            contain the calculated fuzzy match score. If None is passed to the
            constructor, a name is generated automatically in the format
            'fuzzy_score_{left_col}_{right_col}'.
        valid (bool): A flag to indicate whether this mapping is active and should
            be used in a join operation. Defaults to True.
        reversed_threshold_score (float): A read-only property that converts the
            0-100 threshold score into a 0.0-1.0 distance score, where 0.0 is a
            perfect match.
    """

    threshold_score: float = 80.0
    fuzzy_type: FuzzyTypeLiteral = "levenshtein"
    perc_unique: float = 0.0
    output_column_name: str | None = None
    valid: bool = True

    def __init__(
        self,
        left_col: str,
        right_col: str | None = None,
        threshold_score: float = 80.0,
        fuzzy_type: FuzzyTypeLiteral = "levenshtein",
        perc_unique: float = 0.0,
        output_column_name: str | None = None,
        valid: bool = True,
    ):
        """Initializes the FuzzyMapping configuration.

        Args:
            left_col (str): The name of the column in the left dataframe.
            right_col (str | None, optional): The name of the column in the
                right dataframe. If None, it defaults to the value of left_col.
            threshold_score (float, optional): The similarity threshold for a
                match (0-100). Defaults to 80.0.
            fuzzy_type (FuzzyTypeLiteral, optional): The fuzzy matching algorithm
                to use. Defaults to "levenshtein".
            perc_unique (float, optional): The percentage of unique values.
                Defaults to 0.0.
            output_column_name (str | None, optional): Name for the output score
                column. Defaults to None, which triggers auto-generation.
            valid (bool, optional): Whether the mapping is considered active.
                Defaults to True.
        """
        # Resolve the right column first: the generated score-column name
        # below is built from the resolved value.
        if right_col is None:
            right_col = left_col

        # @dataclass keeps an __init__ defined in the class body instead of
        # generating one, so every field has to be assigned by hand here.
        super().__init__(left_col=left_col, right_col=right_col)
        self.valid = valid
        self.threshold_score = threshold_score
        self.fuzzy_type = fuzzy_type
        self.perc_unique = perc_unique
        self.output_column_name = (
            output_column_name if output_column_name is not None else f"fuzzy_score_{left_col}_{right_col}"
        )

    @property
    def reversed_threshold_score(self) -> float:
        """Converts similarity score (0-100) to a distance score (1.0-0.0).

        For example, a `threshold_score` of 80 becomes a distance of 0.2, and
        a `threshold_score` of 80.5 becomes 0.195. This is useful for
        libraries that measure string distance rather than similarity.

        Returns:
            float: The converted distance score.
        """
        # Plain float arithmetic keeps the fractional part of the threshold;
        # subtracting from 100 (rather than negating) avoids a -0.0 result
        # when the threshold is exactly 100.
        return (100 - self.threshold_score) / 100