Examples
Simple Matching
Basic fuzzy matching between two dataframes:
import polars as pl
from pl_fuzzy_frame_match import fuzzy_match_dfs, FuzzyMapping
import logging
# Logger that is handed to the matcher in every example on this page.
logger = logging.getLogger(__name__)

# Left side: internal company records, wrapped as a LazyFrame.
companies = pl.DataFrame(
    {
        "company_id": [1, 2, 3, 4, 5],
        "company_name": [
            "Apple Inc.",
            "Microsoft Corporation",
            "Amazon.com Inc",
            "Google LLC",
            "Meta Platforms Inc",
        ],
    }
).lazy()

# Right side: vendor records whose names are spelled differently.
vendors = pl.DataFrame(
    {
        "vendor_id": ["V001", "V002", "V003", "V004", "V005"],
        "vendor_name": [
            "Apple",
            "Microsoft Corp.",
            "Amazon",
            "Alphabet/Google",
            "Facebook/Meta",
        ],
    }
).lazy()

# A single mapping: company_name is compared with vendor_name using the
# jaro_winkler algorithm and a threshold score of 70.0.
name_mapping = FuzzyMapping(
    left_col="company_name",
    right_col="vendor_name",
    threshold_score=70.0,
    fuzzy_type="jaro_winkler",
)
fuzzy_maps = [name_mapping]

# Run the fuzzy match between the two frames.
result = fuzzy_match_dfs(
    left_df=companies,
    right_df=vendors,
    fuzzy_maps=fuzzy_maps,
    logger=logger,
)

# Print the identifying columns next to the score, highest score first.
display_columns = [
    "company_id",
    "company_name",
    "vendor_id",
    "vendor_name",
    "fuzzy_score_0",
]
print(result.select(display_columns).sort("fuzzy_score_0", descending=True))
Multi-Column Matching
Match on multiple columns with different algorithms and thresholds:
# Left side: the customer database.
customers = pl.DataFrame(
    {
        "customer_id": [1, 2, 3, 4],
        "name": ["John Smith", "Jane Doe", "Bob Johnson", "Alice Brown"],
        "address": ["123 Main St", "456 Oak Ave", "789 Pine Rd", "321 Elm St"],
        "city": ["New York", "Los Angeles", "Chicago", "Houston"],
    }
).lazy()

# Right side: the vendor database, holding near-duplicates of the customers.
vendors = pl.DataFrame(
    {
        "vendor_id": ["A", "B", "C", "D"],
        "vendor_name": ["Jon Smith", "Jane Do", "Robert Johnson", "Alicia Brown"],
        "vendor_address": [
            "123 Main Street",
            "456 Oak Avenue",
            "789 Pine Road",
            "321 Elm Street",
        ],
        "vendor_city": ["New York", "Los Angeles", "Chicago", "Houston"],
    }
).lazy()

# One mapping per column pair, each with its own algorithm and threshold.
name_mapping = FuzzyMapping(
    left_col="name",
    right_col="vendor_name",
    threshold_score=85.0,
    fuzzy_type="jaro_winkler",
)
address_mapping = FuzzyMapping(
    left_col="address",
    right_col="vendor_address",
    threshold_score=80.0,
    fuzzy_type="levenshtein",
)
city_mapping = FuzzyMapping(
    left_col="city",
    right_col="vendor_city",
    threshold_score=95.0,
    fuzzy_type="jaro",
)
# The list order determines which fuzzy_score_<index> column belongs to
# which mapping in the weighting step below.
fuzzy_maps = [name_mapping, address_mapping, city_mapping]

# Run the fuzzy match across all three column pairs.
result = fuzzy_match_dfs(
    left_df=customers,
    right_df=vendors,
    fuzzy_maps=fuzzy_maps,
    logger=logger,
)

# Blend the three per-column scores into one weighted score:
# name 50%, address 30%, city 20%.
weighted_name = pl.col("fuzzy_score_0") * 0.5
weighted_address = pl.col("fuzzy_score_1") * 0.3
weighted_city = pl.col("fuzzy_score_2") * 0.2
combined_score = (weighted_name + weighted_address + weighted_city).alias(
    "combined_score"
)
result = result.with_columns(combined_score)
Large Dataset Optimization
Handling large datasets with automatic optimization:
import time

# For large datasets, the library automatically optimizes.
# Let's simulate with medium-sized data. The row counts are named once so
# the frames and the reported pair count below always stay in sync.
LEFT_ROWS = 10000
RIGHT_ROWS = 8000

left_df = pl.DataFrame(
    {
        "id": range(LEFT_ROWS),
        "text": [f"Company Name {i}" for i in range(LEFT_ROWS)],
    }
).lazy()

right_df = pl.DataFrame(
    {
        "id": range(RIGHT_ROWS),
        "text": [f"Company Name {i}" for i in range(RIGHT_ROWS)],
    }
).lazy()

# Compare the "text" column of both frames with the levenshtein algorithm
# and a threshold score of 90.0.
fuzzy_maps = [
    FuzzyMapping(
        left_col="text",
        right_col="text",
        threshold_score=90.0,
        fuzzy_type="levenshtein",
    )
]

# Time the operation. perf_counter() is a monotonic, high-resolution clock,
# so the measured interval is not distorted by system clock adjustments the
# way a time.time() difference can be.
start = time.perf_counter()
result = fuzzy_match_dfs(
    left_df=left_df,
    right_df=right_df,
    fuzzy_maps=fuzzy_maps,
    logger=logger,
)
duration = time.perf_counter() - start

print(f"Matched {len(result)} records in {duration:.2f} seconds")
# Number of left/right row pairs, i.e. the product of the two row counts.
print(f"Potential matches: {LEFT_ROWS * RIGHT_ROWS:,}")
Controlling Join Strategy
You can explicitly control the join strategy with the use_appr_nearest_neighbor_for_new_matches argument:
# True: force approximate matching (requires polars-simed).
result = fuzzy_match_dfs(
    left_df=left_df,
    right_df=right_df,
    fuzzy_maps=fuzzy_maps,
    logger=logger,
    use_appr_nearest_neighbor_for_new_matches=True,
)

# False: force the standard cross join.
result = fuzzy_match_dfs(
    left_df=left_df,
    right_df=right_df,
    fuzzy_maps=fuzzy_maps,
    logger=logger,
    use_appr_nearest_neighbor_for_new_matches=False,
)

# None (default): let the library decide.
result = fuzzy_match_dfs(
    left_df=left_df,
    right_df=right_df,
    fuzzy_maps=fuzzy_maps,
    logger=logger,
    use_appr_nearest_neighbor_for_new_matches=None,
)