Experimented with fuzzy matching, but ultimately it was not a good idea: it produced 2 false positives and nothing more.
This commit is contained in:
@@ -2,8 +2,9 @@
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from datetime import datetime
|
from datetime import datetime, date
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
|
from difflib import SequenceMatcher
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from sqlalchemy import or_, select
|
from sqlalchemy import or_, select
|
||||||
@@ -430,6 +431,7 @@ class ConversionService:
|
|||||||
guest_first_name,
|
guest_first_name,
|
||||||
guest_last_name,
|
guest_last_name,
|
||||||
guest_email,
|
guest_email,
|
||||||
|
guest_birth_date,
|
||||||
advertising_partner,
|
advertising_partner,
|
||||||
session,
|
session,
|
||||||
)
|
)
|
||||||
@@ -703,6 +705,7 @@ class ConversionService:
|
|||||||
guest_first_name: str | None = None,
|
guest_first_name: str | None = None,
|
||||||
guest_last_name: str | None = None,
|
guest_last_name: str | None = None,
|
||||||
guest_email: str | None = None,
|
guest_email: str | None = None,
|
||||||
|
guest_birth_date: date | None = None,
|
||||||
advertising_partner: str | None = None,
|
advertising_partner: str | None = None,
|
||||||
session: AsyncSession | None = None,
|
session: AsyncSession | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
@@ -719,6 +722,7 @@ class ConversionService:
|
|||||||
guest_first_name: Guest first name for matching
|
guest_first_name: Guest first name for matching
|
||||||
guest_last_name: Guest last name for matching
|
guest_last_name: Guest last name for matching
|
||||||
guest_email: Guest email for matching
|
guest_email: Guest email for matching
|
||||||
|
guest_birth_date: Guest birth date (optional, improves matching confidence)
|
||||||
advertising_partner: Partner info (matches utm_medium for additional filtering)
|
advertising_partner: Partner info (matches utm_medium for additional filtering)
|
||||||
session: AsyncSession to use. If None, uses self.session.
|
session: AsyncSession to use. If None, uses self.session.
|
||||||
|
|
||||||
@@ -765,7 +769,7 @@ class ConversionService:
|
|||||||
guest_email or guest_first_name or guest_last_name
|
guest_email or guest_first_name or guest_last_name
|
||||||
):
|
):
|
||||||
matched_reservation = await self._match_by_guest_details(
|
matched_reservation = await self._match_by_guest_details(
|
||||||
hotel_id, guest_first_name, guest_last_name, guest_email, session
|
hotel_id, guest_first_name, guest_last_name, guest_email, guest_birth_date, session
|
||||||
)
|
)
|
||||||
|
|
||||||
if matched_reservation:
|
if matched_reservation:
|
||||||
@@ -896,9 +900,10 @@ class ConversionService:
|
|||||||
guest_first_name: str | None,
|
guest_first_name: str | None,
|
||||||
guest_last_name: str | None,
|
guest_last_name: str | None,
|
||||||
guest_email: str | None,
|
guest_email: str | None,
|
||||||
|
guest_birth_date: date | None = None,
|
||||||
session: AsyncSession | None = None,
|
session: AsyncSession | None = None,
|
||||||
) -> Reservation | None:
|
) -> Reservation | None:
|
||||||
"""Match reservation by guest name and email using cached data.
|
"""Match reservation by guest name, email, and birth date using cached data.
|
||||||
|
|
||||||
This method uses the reservation cache populated at the start of XML processing.
|
This method uses the reservation cache populated at the start of XML processing.
|
||||||
If cache is not available, falls back to database queries.
|
If cache is not available, falls back to database queries.
|
||||||
@@ -908,6 +913,7 @@ class ConversionService:
|
|||||||
guest_first_name: Guest first name
|
guest_first_name: Guest first name
|
||||||
guest_last_name: Guest last name
|
guest_last_name: Guest last name
|
||||||
guest_email: Guest email
|
guest_email: Guest email
|
||||||
|
guest_birth_date: Guest birth date (optional, improves matching confidence)
|
||||||
session: AsyncSession to use. If None, uses self.session.
|
session: AsyncSession to use. If None, uses self.session.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -956,7 +962,7 @@ class ConversionService:
|
|||||||
all_reservations = db_result.scalars().all()
|
all_reservations = db_result.scalars().all()
|
||||||
|
|
||||||
return self._match_reservations_by_guest_details(
|
return self._match_reservations_by_guest_details(
|
||||||
all_reservations, guest_first_name, guest_last_name, guest_email
|
all_reservations, guest_first_name, guest_last_name, guest_email, guest_birth_date
|
||||||
)
|
)
|
||||||
|
|
||||||
def _match_reservations_by_guest_details(
|
def _match_reservations_by_guest_details(
|
||||||
@@ -965,65 +971,148 @@ class ConversionService:
|
|||||||
guest_first_name: str | None,
|
guest_first_name: str | None,
|
||||||
guest_last_name: str | None,
|
guest_last_name: str | None,
|
||||||
guest_email: str | None,
|
guest_email: str | None,
|
||||||
|
guest_birth_date: date | None = None,
|
||||||
) -> Reservation | None:
|
) -> Reservation | None:
|
||||||
"""Match a reservation from a list by guest name and email (non-async).
|
"""Match a reservation from a list by guest name, email, and birth date (non-async).
|
||||||
|
|
||||||
|
Uses a scoring system to find the best match:
|
||||||
|
- Exact email match: 100 points (highest confidence)
|
||||||
|
- Exact name + exact birth date: 90 points (very high confidence)
|
||||||
|
- Fuzzy name + exact birth date: 75-85 points (medium-high confidence)
|
||||||
|
- Exact name only: 50 points (lower confidence, could be common names)
|
||||||
|
- Fuzzy name only: 50-60 points (requires high similarity, works without birth date)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
reservations: List of reservations to search through
|
reservations: List of reservations to search through
|
||||||
guest_first_name: Guest first name
|
guest_first_name: Guest first name
|
||||||
guest_last_name: Guest last name
|
guest_last_name: Guest last name
|
||||||
guest_email: Guest email
|
guest_email: Guest email
|
||||||
|
guest_birth_date: Guest birth date (optional, improves confidence)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Matched Reservation or None
|
Matched Reservation with highest score (only if score >= 50), or None
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# Filter by guest details
|
FUZZY_THRESHOLD = 0.75 # Minimum similarity ratio for fuzzy matching
|
||||||
|
MIN_SCORE = 50 # Minimum score to consider a match valid
|
||||||
|
|
||||||
candidates = []
|
candidates = []
|
||||||
for reservation in reservations:
|
for reservation in reservations:
|
||||||
customer = reservation.customer
|
customer = reservation.customer
|
||||||
if not customer:
|
if not customer:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Match by email (highest priority)
|
score = 0
|
||||||
if guest_email:
|
match_details = []
|
||||||
if (
|
|
||||||
customer.email_address
|
# Strategy 1: Match by email (highest priority)
|
||||||
and customer.email_address.lower() == guest_email.lower()
|
if guest_email and customer.email_address:
|
||||||
):
|
if customer.email_address.lower() == guest_email.lower():
|
||||||
|
score = 100
|
||||||
|
match_details.append("exact_email")
|
||||||
_LOGGER.info(
|
_LOGGER.info(
|
||||||
"Found exact email match for %s (reservation_id=%s)",
|
"Found exact email match for %s (reservation_id=%s, score=%d)",
|
||||||
guest_email,
|
guest_email,
|
||||||
reservation.id,
|
reservation.id,
|
||||||
|
score,
|
||||||
)
|
)
|
||||||
candidates.append((reservation, 3)) # Highest score
|
candidates.append((reservation, score))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Match by name (first + last)
|
# Strategy 2: Match by name and birth date
|
||||||
if guest_first_name and guest_last_name:
|
if guest_first_name and guest_last_name:
|
||||||
first_match = (
|
# Exact name matching
|
||||||
|
first_exact = (
|
||||||
customer.given_name
|
customer.given_name
|
||||||
and customer.given_name.lower() == guest_first_name.lower()
|
and customer.given_name.lower() == guest_first_name.lower()
|
||||||
)
|
)
|
||||||
last_match = (
|
last_exact = (
|
||||||
customer.surname
|
customer.surname
|
||||||
and customer.surname.lower() == guest_last_name.lower()
|
and customer.surname.lower() == guest_last_name.lower()
|
||||||
)
|
)
|
||||||
|
|
||||||
if first_match and last_match:
|
# Fuzzy name matching
|
||||||
|
first_fuzzy_ratio = 0.0
|
||||||
|
last_fuzzy_ratio = 0.0
|
||||||
|
if customer.given_name:
|
||||||
|
first_fuzzy_ratio = SequenceMatcher(
|
||||||
|
None,
|
||||||
|
customer.given_name.lower(),
|
||||||
|
guest_first_name.lower(),
|
||||||
|
).ratio()
|
||||||
|
if customer.surname:
|
||||||
|
last_fuzzy_ratio = SequenceMatcher(
|
||||||
|
None,
|
||||||
|
customer.surname.lower(),
|
||||||
|
guest_last_name.lower(),
|
||||||
|
).ratio()
|
||||||
|
|
||||||
|
# Birth date matching
|
||||||
|
birth_date_match = (
|
||||||
|
guest_birth_date
|
||||||
|
and customer.birth_date
|
||||||
|
and customer.birth_date == str(guest_birth_date)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate score based on name matching
|
||||||
|
if first_exact and last_exact:
|
||||||
|
if birth_date_match:
|
||||||
|
score = 90 # Exact name + exact birth date
|
||||||
|
match_details.append("exact_name_exact_birthdate")
|
||||||
|
else:
|
||||||
|
score = 50 # Exact name only
|
||||||
|
match_details.append("exact_name_only")
|
||||||
_LOGGER.info(
|
_LOGGER.info(
|
||||||
"Found exact name match for %s %s (reservation_id=%s)",
|
"Found exact name match for %s %s (reservation_id=%s, score=%d, match_details=%s)",
|
||||||
guest_first_name,
|
guest_first_name,
|
||||||
guest_last_name,
|
guest_last_name,
|
||||||
reservation.id,
|
reservation.id,
|
||||||
|
score,
|
||||||
|
match_details,
|
||||||
)
|
)
|
||||||
candidates.append((reservation, 2)) # Medium-high score
|
elif (
|
||||||
continue
|
first_fuzzy_ratio >= FUZZY_THRESHOLD
|
||||||
|
and last_fuzzy_ratio >= FUZZY_THRESHOLD
|
||||||
|
):
|
||||||
|
# Both first and last names are fuzzy matched
|
||||||
|
fuzzy_score = min(first_fuzzy_ratio, last_fuzzy_ratio)
|
||||||
|
if birth_date_match:
|
||||||
|
# Fuzzy name + exact birth date (high confidence)
|
||||||
|
score = int(75 + (fuzzy_score - FUZZY_THRESHOLD) * 40)
|
||||||
|
match_details.append(
|
||||||
|
f"fuzzy_name_exact_birthdate(ratio={fuzzy_score:.2f})"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Fuzzy name only (lower confidence, but still usable)
|
||||||
|
# Scale from 50-60 based on similarity ratio above threshold
|
||||||
|
score = int(50 + (fuzzy_score - FUZZY_THRESHOLD) * 40)
|
||||||
|
match_details.append(f"fuzzy_name_only(ratio={fuzzy_score:.2f})")
|
||||||
|
|
||||||
|
_LOGGER.info(
|
||||||
|
"Found fuzzy name match for %s %s (reservation_id=%s, score=%d, match_details=%s)",
|
||||||
|
guest_first_name,
|
||||||
|
guest_last_name,
|
||||||
|
reservation.id,
|
||||||
|
score,
|
||||||
|
match_details,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Only add candidates that meet minimum score threshold
|
||||||
|
if score >= MIN_SCORE:
|
||||||
|
candidates.append((reservation, score))
|
||||||
|
|
||||||
# Return highest-scoring match
|
# Return highest-scoring match
|
||||||
if candidates:
|
if candidates:
|
||||||
candidates.sort(key=lambda x: x[1], reverse=True)
|
candidates.sort(key=lambda x: x[1], reverse=True)
|
||||||
return candidates[0][0]
|
best_match = candidates[0][0]
|
||||||
|
best_score = candidates[0][1]
|
||||||
|
_LOGGER.debug(
|
||||||
|
"Selected best match (reservation_id=%s) with score=%d out of %d candidates",
|
||||||
|
best_match.id,
|
||||||
|
best_score,
|
||||||
|
len(candidates),
|
||||||
|
)
|
||||||
|
return best_match
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user