from dataclasses import dataclass
from enum import Enum
from typing import Literal
FuzzyTypeLiteral = Literal["levenshtein", "jaro", "jaro_winkler", "hamming", "damerau_levenshtein", "indel"]
[docs]
class LogicalOp(Enum):
"""Enum representing logical operators for combining FuzzyMapExpr."""
AND = "and"
OR = "or"
[docs]
@dataclass
class JoinMap:
"""A simple data structure to hold left and right column names for a join."""
left_col: str
right_col: str
[docs]
@dataclass
class FuzzyMapping(JoinMap):
"""Represents the configuration for a fuzzy string match between two columns.
This class defines all the necessary parameters to perform a fuzzy join,
including the columns to match, the specific algorithm to use, and the
similarity threshold required to consider two strings a match.
It generates a default name for the output score column if one is not
provided.
Attributes:
left_col (str): The name of the column in the left dataframe to join on.
right_col (str): The name of the column in the right dataframe to join on.
threshold_score (float): The similarity score threshold required for a
match, typically on a scale of 0 to 100. Defaults to 80.0.
fuzzy_type (FuzzyTypeLiteral): The string-matching algorithm to use.
Defaults to "levenshtein".
perc_unique (float): A parameter that may be used to assess column
uniqueness before performing a costly fuzzy match. Defaults to 0.0.
output_column_name (str | None): The name for the new column that will
contain the calculated fuzzy match score. If None, a name is
generated automatically in the format 'fuzzy_score_{left_col}_{right_col}'.
valid (bool): A flag to indicate whether this mapping is active and should
be used in a join operation. Defaults to True.
reversed_threshold_score (float): A property that converts the 0-100
threshold score into a 0.0-1.0 distance score, where 0.0 is a
perfect match.
"""
threshold_score: float = 80.0
fuzzy_type: FuzzyTypeLiteral = "levenshtein"
perc_unique: float = 0.0
output_column_name: str | None = None
valid: bool = True
[docs]
def __init__(
self,
left_col: str,
right_col: str | None = None,
threshold_score: float = 80.0,
fuzzy_type: FuzzyTypeLiteral = "levenshtein",
perc_unique: float = 0,
output_column_name: str | None = None,
valid: bool = True,
):
"""Initializes the FuzzyMapping configuration.
Args:
left_col (str): The name of the column in the left dataframe.
right_col (str | None, optional): The name of the column in the
right dataframe. If None, it defaults to the value of left_col.
threshold_score (float, optional): The similarity threshold for a
match (0-100). Defaults to 80.0.
fuzzy_type (FuzzyTypeLiteral, optional): The fuzzy matching algorithm
to use. Defaults to "levenshtein".
perc_unique (float, optional): The percentage of unique values.
Defaults to 0.
output_column_name (str | None, optional): Name for the output score
column. Defaults to None, which triggers auto-generation.
valid (bool, optional): Whether the mapping is considered active.
Defaults to True.
"""
if right_col is None:
right_col = left_col
# The dataclass's __init__ is overridden, so all fields must be manually assigned.
super().__init__(left_col=left_col, right_col=right_col)
self.valid = valid
self.threshold_score = threshold_score
self.fuzzy_type = fuzzy_type
self.perc_unique = perc_unique
self.output_column_name = (
output_column_name if output_column_name is not None else f"fuzzy_score_{left_col}_{right_col}"
)
@property
def reversed_threshold_score(self) -> float:
"""Converts similarity score (0-100) to a distance score (1.0-0.0).
For example, a `threshold_score` of 80 becomes a distance of 0.2.
This is useful for libraries that measure string distance rather than
similarity.
Returns:
float: The converted distance score.
"""
return ((int(self.threshold_score) - 100) * -1) / 100
[docs]
class FuzzyMapExpr:
"""A composable fuzzy mapping expression that supports AND/OR logical operators.
This class allows you to combine multiple FuzzyMapping configurations using
logical operators (&, |) to create complex matching conditions, similar to
how Polars expressions work.
A FuzzyMapExpr can be:
- A leaf node wrapping a single FuzzyMapping
- An internal node combining two FuzzyMapExpr with AND or OR
Python's operator precedence ensures that:
- `a | b & c` evaluates to `a | (b & c)`
- `c & a | b` evaluates to `(c & a) | b`
Example:
>>> city = FuzzyMapExpr(left_col="city", right_col="city", threshold_score=80)
>>> zipcode = FuzzyMapExpr(left_col="zipcode", right_col="zipcode", threshold_score=90)
>>> street = FuzzyMapExpr(left_col="street", right_col="street", threshold_score=70)
>>> email = FuzzyMapExpr(left_col="email", right_col="email")
>>>
>>> # Complex expression: (city AND zipcode) OR (street AND zipcode) OR email
>>> expr = (city & zipcode) | (street & zipcode) | email
>>>
>>> # Use in fuzzy matching
>>> result = fuzzy_match_dfs(left_df, right_df, expr)
Attributes:
mapping (FuzzyMapping | None): The underlying FuzzyMapping for leaf nodes.
left (FuzzyMapExpr | None): Left child for binary operations.
right (FuzzyMapExpr | None): Right child for binary operations.
op (LogicalOp | None): The logical operator (AND/OR) for internal nodes.
"""
[docs]
def __init__(
self,
left_col: str | None = None,
right_col: str | None = None,
threshold_score: float = 80.0,
fuzzy_type: FuzzyTypeLiteral = "levenshtein",
output_column_name: str | None = None,
*,
_mapping: "FuzzyMapping | None" = None,
_left: "FuzzyMapExpr | None" = None,
_right: "FuzzyMapExpr | None" = None,
_op: LogicalOp | None = None,
):
"""Initialize a FuzzyMapExpr.
When called with column parameters, creates a leaf node wrapping a FuzzyMapping.
Internal parameters (prefixed with _) are used for creating combined expressions.
Args:
left_col: The name of the column in the left dataframe.
right_col: The name of the column in the right dataframe.
If None, defaults to left_col.
threshold_score: The similarity threshold for a match (0-100). Defaults to 80.0.
fuzzy_type: The fuzzy matching algorithm to use. Defaults to "levenshtein".
output_column_name: Name for the output score column. Defaults to None.
_mapping: Internal - pre-existing FuzzyMapping (for wrapping).
_left: Internal - left child expression.
_right: Internal - right child expression.
_op: Internal - logical operator.
"""
if _left is not None and _right is not None and _op is not None:
# Internal node (combined expression)
self.mapping: FuzzyMapping | None = None
self.left: FuzzyMapExpr | None = _left
self.right: FuzzyMapExpr | None = _right
self.op: LogicalOp | None = _op
elif _mapping is not None:
# Wrap existing FuzzyMapping
self.mapping = _mapping
self.left = None
self.right = None
self.op = None
elif left_col is not None:
# Create a new FuzzyMapping (leaf node)
self.mapping = FuzzyMapping(
left_col=left_col,
right_col=right_col,
threshold_score=threshold_score,
fuzzy_type=fuzzy_type,
output_column_name=output_column_name,
)
self.left = None
self.right = None
self.op = None
else:
raise ValueError(
"FuzzyMapExpr must be initialized with either column parameters "
"or internal tree structure parameters."
)
[docs]
@classmethod
def from_mapping(cls, mapping: "FuzzyMapping") -> "FuzzyMapExpr":
"""Create a FuzzyMapExpr from an existing FuzzyMapping.
Args:
mapping: The FuzzyMapping to wrap.
Returns:
A new FuzzyMapExpr wrapping the provided mapping.
"""
return cls(_mapping=mapping)
[docs]
def __and__(self, other: "FuzzyMapExpr") -> "FuzzyMapExpr":
"""Combine two expressions with AND logic.
When both this expression and the other must match for a row to be included.
Args:
other: The other FuzzyMapExpr to combine with.
Returns:
A new FuzzyMapExpr representing (self AND other).
"""
return FuzzyMapExpr(_left=self, _right=other, _op=LogicalOp.AND)
[docs]
def __or__(self, other: "FuzzyMapExpr") -> "FuzzyMapExpr":
"""Combine two expressions with OR logic.
When either this expression or the other must match for a row to be included.
Args:
other: The other FuzzyMapExpr to combine with.
Returns:
A new FuzzyMapExpr representing (self OR other).
"""
return FuzzyMapExpr(_left=self, _right=other, _op=LogicalOp.OR)
[docs]
def is_leaf(self) -> bool:
"""Check if this expression is a leaf node (contains a single FuzzyMapping).
Returns:
True if this is a leaf node, False if it's a combined expression.
"""
return self.mapping is not None
[docs]
def to_branches(self) -> list[list["FuzzyMapping"]]:
"""Convert the expression tree to a list of branches (Disjunctive Normal Form).
Each branch is a list of FuzzyMappings that should be AND-ed together.
The branches are then OR-ed (union) together.
For example:
- `(A & B) | (C & D) | E` becomes `[[A, B], [C, D], [E]]`
- `A & B & C` becomes `[[A, B, C]]`
- `A | B | C` becomes `[[A], [B], [C]]`
Returns:
A list of branches, where each branch is a list of FuzzyMappings.
"""
if self.is_leaf():
assert self.mapping is not None
return [[self.mapping]]
assert self.left is not None and self.right is not None and self.op is not None
left_branches = self.left.to_branches()
right_branches = self.right.to_branches()
if self.op == LogicalOp.OR:
# OR: concatenate branches
return left_branches + right_branches
else: # AND
# AND: combine each left branch with each right branch
combined_branches = []
for left_branch in left_branches:
for right_branch in right_branches:
combined_branches.append(left_branch + right_branch)
return combined_branches
[docs]
def get_all_mappings(self) -> list["FuzzyMapping"]:
"""Get all FuzzyMappings in this expression tree (deduplicated).
Returns:
A list of all unique FuzzyMappings in the expression.
"""
seen: set[int] = set()
result: list[FuzzyMapping] = []
def collect(expr: FuzzyMapExpr) -> None:
if expr.is_leaf():
assert expr.mapping is not None
mapping_id = id(expr.mapping)
if mapping_id not in seen:
seen.add(mapping_id)
result.append(expr.mapping)
else:
if expr.left:
collect(expr.left)
if expr.right:
collect(expr.right)
collect(self)
return result
[docs]
def __repr__(self) -> str:
"""Return a string representation of the expression."""
if self.is_leaf():
assert self.mapping is not None
return f"FuzzyMapExpr({self.mapping.left_col} vs {self.mapping.right_col})"
else:
op_str = "&" if self.op == LogicalOp.AND else "|"
return f"({self.left!r} {op_str} {self.right!r})"