Source code for pl_fuzzy_frame_match.models

from dataclasses import dataclass
from typing import Literal

FuzzyTypeLiteral = Literal["levenshtein", "jaro", "jaro_winkler", "hamming", "damerau_levenshtein", "indel"]


[docs] @dataclass class JoinMap: """A simple data structure to hold left and right column names for a join.""" left_col: str right_col: str
[docs] @dataclass class FuzzyMapping(JoinMap): """Represents the configuration for a fuzzy string match between two columns. This class defines all the necessary parameters to perform a fuzzy join, including the columns to match, the specific algorithm to use, and the similarity threshold required to consider two strings a match. It generates a default name for the output score column if one is not provided. Attributes: left_col (str): The name of the column in the left dataframe to join on. right_col (str): The name of the column in the right dataframe to join on. threshold_score (float): The similarity score threshold required for a match, typically on a scale of 0 to 100. Defaults to 80.0. fuzzy_type (FuzzyTypeLiteral): The string-matching algorithm to use. Defaults to "levenshtein". perc_unique (float): A parameter that may be used to assess column uniqueness before performing a costly fuzzy match. Defaults to 0.0. output_column_name (str | None): The name for the new column that will contain the calculated fuzzy match score. If None, a name is generated automatically in the format 'fuzzy_score_{left_col}_{right_col}'. valid (bool): A flag to indicate whether this mapping is active and should be used in a join operation. Defaults to True. reversed_threshold_score (float): A property that converts the 0-100 threshold score into a 0.0-1.0 distance score, where 0.0 is a perfect match. """ threshold_score: float = 80.0 fuzzy_type: FuzzyTypeLiteral = "levenshtein" perc_unique: float = 0.0 output_column_name: str | None = None valid: bool = True
[docs] def __init__( self, left_col: str, right_col: str | None = None, threshold_score: float = 80.0, fuzzy_type: FuzzyTypeLiteral = "levenshtein", perc_unique: float = 0, output_column_name: str | None = None, valid: bool = True, ): """Initializes the FuzzyMapping configuration. Args: left_col (str): The name of the column in the left dataframe. right_col (str | None, optional): The name of the column in the right dataframe. If None, it defaults to the value of left_col. threshold_score (float, optional): The similarity threshold for a match (0-100). Defaults to 80.0. fuzzy_type (FuzzyTypeLiteral, optional): The fuzzy matching algorithm to use. Defaults to "levenshtein". perc_unique (float, optional): The percentage of unique values. Defaults to 0. output_column_name (str | None, optional): Name for the output score column. Defaults to None, which triggers auto-generation. valid (bool, optional): Whether the mapping is considered active. Defaults to True. """ if right_col is None: right_col = left_col # The dataclass's __init__ is overridden, so all fields must be manually assigned. super().__init__(left_col=left_col, right_col=right_col) self.valid = valid self.threshold_score = threshold_score self.fuzzy_type = fuzzy_type self.perc_unique = perc_unique self.output_column_name = ( output_column_name if output_column_name is not None else f"fuzzy_score_{left_col}_{right_col}" )
@property def reversed_threshold_score(self) -> float: """Converts similarity score (0-100) to a distance score (1.0-0.0). For example, a `threshold_score` of 80 becomes a distance of 0.2. This is useful for libraries that measure string distance rather than similarity. Returns: float: The converted distance score. """ return ((int(self.threshold_score) - 100) * -1) / 100