Experimented with fuzzy matching, but it was ultimately not a good idea: it produced 2 false positives and nothing more.

This commit is contained in:
Jonas Linter
2025-11-18 19:45:37 +01:00
parent b4522d2e2a
commit 84caa3590a

View File

@@ -2,8 +2,9 @@
import asyncio
import xml.etree.ElementTree as ET
from datetime import datetime
from datetime import datetime, date
from decimal import Decimal
from difflib import SequenceMatcher
from typing import Any
from sqlalchemy import or_, select
@@ -430,6 +431,7 @@ class ConversionService:
guest_first_name,
guest_last_name,
guest_email,
guest_birth_date,
advertising_partner,
session,
)
@@ -703,6 +705,7 @@ class ConversionService:
guest_first_name: str | None = None,
guest_last_name: str | None = None,
guest_email: str | None = None,
guest_birth_date: date | None = None,
advertising_partner: str | None = None,
session: AsyncSession | None = None,
) -> dict[str, Any]:
@@ -719,6 +722,7 @@ class ConversionService:
guest_first_name: Guest first name for matching
guest_last_name: Guest last name for matching
guest_email: Guest email for matching
guest_birth_date: Guest birth date (optional, improves matching confidence)
advertising_partner: Partner info (matches utm_medium for additional filtering)
session: AsyncSession to use. If None, uses self.session.
@@ -765,7 +769,7 @@ class ConversionService:
guest_email or guest_first_name or guest_last_name
):
matched_reservation = await self._match_by_guest_details(
hotel_id, guest_first_name, guest_last_name, guest_email, session
hotel_id, guest_first_name, guest_last_name, guest_email, guest_birth_date, session
)
if matched_reservation:
@@ -896,9 +900,10 @@ class ConversionService:
guest_first_name: str | None,
guest_last_name: str | None,
guest_email: str | None,
guest_birth_date: date | None = None,
session: AsyncSession | None = None,
) -> Reservation | None:
"""Match reservation by guest name and email using cached data.
"""Match reservation by guest name, email, and birth date using cached data.
This method uses the reservation cache populated at the start of XML processing.
If cache is not available, falls back to database queries.
@@ -908,6 +913,7 @@ class ConversionService:
guest_first_name: Guest first name
guest_last_name: Guest last name
guest_email: Guest email
guest_birth_date: Guest birth date (optional, improves matching confidence)
session: AsyncSession to use. If None, uses self.session.
Returns:
@@ -956,7 +962,7 @@ class ConversionService:
all_reservations = db_result.scalars().all()
return self._match_reservations_by_guest_details(
all_reservations, guest_first_name, guest_last_name, guest_email
all_reservations, guest_first_name, guest_last_name, guest_email, guest_birth_date
)
def _match_reservations_by_guest_details(
@@ -965,65 +971,148 @@ class ConversionService:
guest_first_name: str | None,
guest_last_name: str | None,
guest_email: str | None,
guest_birth_date: date | None = None,
) -> Reservation | None:
"""Match a reservation from a list by guest name and email (non-async).
"""Match a reservation from a list by guest name, email, and birth date (non-async).
Uses a scoring system to find the best match:
- Exact email match: 100 points (highest confidence)
- Exact name + exact birth date: 90 points (very high confidence)
- Fuzzy name + exact birth date: 75-85 points (medium-high confidence)
- Exact name only: 50 points (lower confidence, could be common names)
- Fuzzy name only: 50-60 points (requires high similarity, works without birth date)
Args:
reservations: List of reservations to search through
guest_first_name: Guest first name
guest_last_name: Guest last name
guest_email: Guest email
guest_birth_date: Guest birth date (optional, improves confidence)
Returns:
Matched Reservation or None
Matched Reservation with highest score (only if score >= 50), or None
"""
# Filter by guest details
FUZZY_THRESHOLD = 0.75 # Minimum similarity ratio for fuzzy matching
MIN_SCORE = 50 # Minimum score to consider a match valid
candidates = []
for reservation in reservations:
customer = reservation.customer
if not customer:
continue
# Match by email (highest priority)
if guest_email:
if (
customer.email_address
and customer.email_address.lower() == guest_email.lower()
):
score = 0
match_details = []
# Strategy 1: Match by email (highest priority)
if guest_email and customer.email_address:
if customer.email_address.lower() == guest_email.lower():
score = 100
match_details.append("exact_email")
_LOGGER.info(
"Found exact email match for %s (reservation_id=%s)",
"Found exact email match for %s (reservation_id=%s, score=%d)",
guest_email,
reservation.id,
score,
)
candidates.append((reservation, 3)) # Highest score
candidates.append((reservation, score))
continue
# Match by name (first + last)
# Strategy 2: Match by name and birth date
if guest_first_name and guest_last_name:
first_match = (
# Exact name matching
first_exact = (
customer.given_name
and customer.given_name.lower() == guest_first_name.lower()
)
last_match = (
last_exact = (
customer.surname
and customer.surname.lower() == guest_last_name.lower()
)
if first_match and last_match:
# Fuzzy name matching
first_fuzzy_ratio = 0.0
last_fuzzy_ratio = 0.0
if customer.given_name:
first_fuzzy_ratio = SequenceMatcher(
None,
customer.given_name.lower(),
guest_first_name.lower(),
).ratio()
if customer.surname:
last_fuzzy_ratio = SequenceMatcher(
None,
customer.surname.lower(),
guest_last_name.lower(),
).ratio()
# Birth date matching
birth_date_match = (
guest_birth_date
and customer.birth_date
and customer.birth_date == str(guest_birth_date)
)
# Calculate score based on name matching
if first_exact and last_exact:
if birth_date_match:
score = 90 # Exact name + exact birth date
match_details.append("exact_name_exact_birthdate")
else:
score = 50 # Exact name only
match_details.append("exact_name_only")
_LOGGER.info(
"Found exact name match for %s %s (reservation_id=%s)",
"Found exact name match for %s %s (reservation_id=%s, score=%d, match_details=%s)",
guest_first_name,
guest_last_name,
reservation.id,
score,
match_details,
)
candidates.append((reservation, 2)) # Medium-high score
continue
elif (
first_fuzzy_ratio >= FUZZY_THRESHOLD
and last_fuzzy_ratio >= FUZZY_THRESHOLD
):
# Both first and last names are fuzzy matched
fuzzy_score = min(first_fuzzy_ratio, last_fuzzy_ratio)
if birth_date_match:
# Fuzzy name + exact birth date (high confidence)
score = int(75 + (fuzzy_score - FUZZY_THRESHOLD) * 40)
match_details.append(
f"fuzzy_name_exact_birthdate(ratio={fuzzy_score:.2f})"
)
else:
# Fuzzy name only (lower confidence, but still usable)
# Scale from 50-60 based on similarity ratio above threshold
score = int(50 + (fuzzy_score - FUZZY_THRESHOLD) * 40)
match_details.append(f"fuzzy_name_only(ratio={fuzzy_score:.2f})")
_LOGGER.info(
"Found fuzzy name match for %s %s (reservation_id=%s, score=%d, match_details=%s)",
guest_first_name,
guest_last_name,
reservation.id,
score,
match_details,
)
# Only add candidates that meet minimum score threshold
if score >= MIN_SCORE:
candidates.append((reservation, score))
# Return highest-scoring match
if candidates:
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates[0][0]
best_match = candidates[0][0]
best_score = candidates[0][1]
_LOGGER.debug(
"Selected best match (reservation_id=%s) with score=%d out of %d candidates",
best_match.id,
best_score,
len(candidates),
)
return best_match
return None