Experimented with fuzzy matching, but it was ultimately not a good idea: it produced 2 false positives and nothing more.

This commit is contained in:
Jonas Linter
2025-11-18 19:45:37 +01:00
parent b4522d2e2a
commit 84caa3590a

View File

@@ -2,8 +2,9 @@
import asyncio import asyncio
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from datetime import datetime from datetime import datetime, date
from decimal import Decimal from decimal import Decimal
from difflib import SequenceMatcher
from typing import Any from typing import Any
from sqlalchemy import or_, select from sqlalchemy import or_, select
@@ -430,6 +431,7 @@ class ConversionService:
guest_first_name, guest_first_name,
guest_last_name, guest_last_name,
guest_email, guest_email,
guest_birth_date,
advertising_partner, advertising_partner,
session, session,
) )
@@ -703,6 +705,7 @@ class ConversionService:
guest_first_name: str | None = None, guest_first_name: str | None = None,
guest_last_name: str | None = None, guest_last_name: str | None = None,
guest_email: str | None = None, guest_email: str | None = None,
guest_birth_date: date | None = None,
advertising_partner: str | None = None, advertising_partner: str | None = None,
session: AsyncSession | None = None, session: AsyncSession | None = None,
) -> dict[str, Any]: ) -> dict[str, Any]:
@@ -719,6 +722,7 @@ class ConversionService:
guest_first_name: Guest first name for matching guest_first_name: Guest first name for matching
guest_last_name: Guest last name for matching guest_last_name: Guest last name for matching
guest_email: Guest email for matching guest_email: Guest email for matching
guest_birth_date: Guest birth date (optional, improves matching confidence)
advertising_partner: Partner info (matches utm_medium for additional filtering) advertising_partner: Partner info (matches utm_medium for additional filtering)
session: AsyncSession to use. If None, uses self.session. session: AsyncSession to use. If None, uses self.session.
@@ -765,7 +769,7 @@ class ConversionService:
guest_email or guest_first_name or guest_last_name guest_email or guest_first_name or guest_last_name
): ):
matched_reservation = await self._match_by_guest_details( matched_reservation = await self._match_by_guest_details(
hotel_id, guest_first_name, guest_last_name, guest_email, session hotel_id, guest_first_name, guest_last_name, guest_email, guest_birth_date, session
) )
if matched_reservation: if matched_reservation:
@@ -896,9 +900,10 @@ class ConversionService:
guest_first_name: str | None, guest_first_name: str | None,
guest_last_name: str | None, guest_last_name: str | None,
guest_email: str | None, guest_email: str | None,
guest_birth_date: date | None = None,
session: AsyncSession | None = None, session: AsyncSession | None = None,
) -> Reservation | None: ) -> Reservation | None:
"""Match reservation by guest name and email using cached data. """Match reservation by guest name, email, and birth date using cached data.
This method uses the reservation cache populated at the start of XML processing. This method uses the reservation cache populated at the start of XML processing.
If cache is not available, falls back to database queries. If cache is not available, falls back to database queries.
@@ -908,6 +913,7 @@ class ConversionService:
guest_first_name: Guest first name guest_first_name: Guest first name
guest_last_name: Guest last name guest_last_name: Guest last name
guest_email: Guest email guest_email: Guest email
guest_birth_date: Guest birth date (optional, improves matching confidence)
session: AsyncSession to use. If None, uses self.session. session: AsyncSession to use. If None, uses self.session.
Returns: Returns:
@@ -956,7 +962,7 @@ class ConversionService:
all_reservations = db_result.scalars().all() all_reservations = db_result.scalars().all()
return self._match_reservations_by_guest_details( return self._match_reservations_by_guest_details(
all_reservations, guest_first_name, guest_last_name, guest_email all_reservations, guest_first_name, guest_last_name, guest_email, guest_birth_date
) )
def _match_reservations_by_guest_details( def _match_reservations_by_guest_details(
@@ -965,65 +971,148 @@ class ConversionService:
guest_first_name: str | None, guest_first_name: str | None,
guest_last_name: str | None, guest_last_name: str | None,
guest_email: str | None, guest_email: str | None,
guest_birth_date: date | None = None,
) -> Reservation | None: ) -> Reservation | None:
"""Match a reservation from a list by guest name and email (non-async). """Match a reservation from a list by guest name, email, and birth date (non-async).
Uses a scoring system to find the best match:
- Exact email match: 100 points (highest confidence)
- Exact name + exact birth date: 90 points (very high confidence)
- Fuzzy name + exact birth date: 75-85 points (medium-high confidence)
- Exact name only: 50 points (lower confidence, could be common names)
- Fuzzy name only: 50-60 points (requires high similarity, works without birth date)
Args: Args:
reservations: List of reservations to search through reservations: List of reservations to search through
guest_first_name: Guest first name guest_first_name: Guest first name
guest_last_name: Guest last name guest_last_name: Guest last name
guest_email: Guest email guest_email: Guest email
guest_birth_date: Guest birth date (optional, improves confidence)
Returns: Returns:
Matched Reservation or None Matched Reservation with highest score (only if score >= 50), or None
""" """
# Filter by guest details FUZZY_THRESHOLD = 0.75 # Minimum similarity ratio for fuzzy matching
MIN_SCORE = 50 # Minimum score to consider a match valid
candidates = [] candidates = []
for reservation in reservations: for reservation in reservations:
customer = reservation.customer customer = reservation.customer
if not customer: if not customer:
continue continue
# Match by email (highest priority) score = 0
if guest_email: match_details = []
if (
customer.email_address # Strategy 1: Match by email (highest priority)
and customer.email_address.lower() == guest_email.lower() if guest_email and customer.email_address:
): if customer.email_address.lower() == guest_email.lower():
score = 100
match_details.append("exact_email")
_LOGGER.info( _LOGGER.info(
"Found exact email match for %s (reservation_id=%s)", "Found exact email match for %s (reservation_id=%s, score=%d)",
guest_email, guest_email,
reservation.id, reservation.id,
score,
) )
candidates.append((reservation, 3)) # Highest score candidates.append((reservation, score))
continue continue
# Match by name (first + last) # Strategy 2: Match by name and birth date
if guest_first_name and guest_last_name: if guest_first_name and guest_last_name:
first_match = ( # Exact name matching
first_exact = (
customer.given_name customer.given_name
and customer.given_name.lower() == guest_first_name.lower() and customer.given_name.lower() == guest_first_name.lower()
) )
last_match = ( last_exact = (
customer.surname customer.surname
and customer.surname.lower() == guest_last_name.lower() and customer.surname.lower() == guest_last_name.lower()
) )
if first_match and last_match: # Fuzzy name matching
first_fuzzy_ratio = 0.0
last_fuzzy_ratio = 0.0
if customer.given_name:
first_fuzzy_ratio = SequenceMatcher(
None,
customer.given_name.lower(),
guest_first_name.lower(),
).ratio()
if customer.surname:
last_fuzzy_ratio = SequenceMatcher(
None,
customer.surname.lower(),
guest_last_name.lower(),
).ratio()
# Birth date matching
birth_date_match = (
guest_birth_date
and customer.birth_date
and customer.birth_date == str(guest_birth_date)
)
# Calculate score based on name matching
if first_exact and last_exact:
if birth_date_match:
score = 90 # Exact name + exact birth date
match_details.append("exact_name_exact_birthdate")
else:
score = 50 # Exact name only
match_details.append("exact_name_only")
_LOGGER.info( _LOGGER.info(
"Found exact name match for %s %s (reservation_id=%s)", "Found exact name match for %s %s (reservation_id=%s, score=%d, match_details=%s)",
guest_first_name, guest_first_name,
guest_last_name, guest_last_name,
reservation.id, reservation.id,
score,
match_details,
) )
candidates.append((reservation, 2)) # Medium-high score elif (
continue first_fuzzy_ratio >= FUZZY_THRESHOLD
and last_fuzzy_ratio >= FUZZY_THRESHOLD
):
# Both first and last names are fuzzy matched
fuzzy_score = min(first_fuzzy_ratio, last_fuzzy_ratio)
if birth_date_match:
# Fuzzy name + exact birth date (high confidence)
score = int(75 + (fuzzy_score - FUZZY_THRESHOLD) * 40)
match_details.append(
f"fuzzy_name_exact_birthdate(ratio={fuzzy_score:.2f})"
)
else:
# Fuzzy name only (lower confidence, but still usable)
# Scale from 50-60 based on similarity ratio above threshold
score = int(50 + (fuzzy_score - FUZZY_THRESHOLD) * 40)
match_details.append(f"fuzzy_name_only(ratio={fuzzy_score:.2f})")
_LOGGER.info(
"Found fuzzy name match for %s %s (reservation_id=%s, score=%d, match_details=%s)",
guest_first_name,
guest_last_name,
reservation.id,
score,
match_details,
)
# Only add candidates that meet minimum score threshold
if score >= MIN_SCORE:
candidates.append((reservation, score))
# Return highest-scoring match # Return highest-scoring match
if candidates: if candidates:
candidates.sort(key=lambda x: x[1], reverse=True) candidates.sort(key=lambda x: x[1], reverse=True)
return candidates[0][0] best_match = candidates[0][0]
best_score = candidates[0][1]
_LOGGER.debug(
"Selected best match (reservation_id=%s) with score=%d out of %d candidates",
best_match.id,
best_score,
len(candidates),
)
return best_match
return None return None