From fab981f71fa4c7fea2ab0b91dbe99216da0deffe Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Tue, 18 Nov 2025 19:45:37 +0100 Subject: [PATCH] Experimented with fuzzy matching but ultimately not a good idea. 2 false positives and nothing more --- src/alpine_bits_python/conversion_service.py | 135 +++++++++++++++---- 1 file changed, 112 insertions(+), 23 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 67add2f..dfa3af9 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -2,8 +2,9 @@ import asyncio import xml.etree.ElementTree as ET -from datetime import datetime +from datetime import datetime, date from decimal import Decimal +from difflib import SequenceMatcher from typing import Any from sqlalchemy import or_, select @@ -430,6 +431,7 @@ class ConversionService: guest_first_name, guest_last_name, guest_email, + guest_birth_date, advertising_partner, session, ) @@ -703,6 +705,7 @@ class ConversionService: guest_first_name: str | None = None, guest_last_name: str | None = None, guest_email: str | None = None, + guest_birth_date: date | None = None, advertising_partner: str | None = None, session: AsyncSession | None = None, ) -> dict[str, Any]: @@ -719,6 +722,7 @@ class ConversionService: guest_first_name: Guest first name for matching guest_last_name: Guest last name for matching guest_email: Guest email for matching + guest_birth_date: Guest birth date (optional, improves matching confidence) advertising_partner: Partner info (matches utm_medium for additional filtering) session: AsyncSession to use. If None, uses self.session. 
@@ -765,7 +769,7 @@ class ConversionService: guest_email or guest_first_name or guest_last_name ): matched_reservation = await self._match_by_guest_details( - hotel_id, guest_first_name, guest_last_name, guest_email, session + hotel_id, guest_first_name, guest_last_name, guest_email, guest_birth_date, session ) if matched_reservation: @@ -896,9 +900,10 @@ class ConversionService: guest_first_name: str | None, guest_last_name: str | None, guest_email: str | None, + guest_birth_date: date | None = None, session: AsyncSession | None = None, ) -> Reservation | None: - """Match reservation by guest name and email using cached data. + """Match reservation by guest name, email, and birth date using cached data. This method uses the reservation cache populated at the start of XML processing. If cache is not available, falls back to database queries. @@ -908,6 +913,7 @@ class ConversionService: guest_first_name: Guest first name guest_last_name: Guest last name guest_email: Guest email + guest_birth_date: Guest birth date (optional, improves matching confidence) session: AsyncSession to use. If None, uses self.session. Returns: @@ -956,7 +962,7 @@ class ConversionService: all_reservations = db_result.scalars().all() return self._match_reservations_by_guest_details( - all_reservations, guest_first_name, guest_last_name, guest_email + all_reservations, guest_first_name, guest_last_name, guest_email, guest_birth_date ) def _match_reservations_by_guest_details( @@ -965,65 +971,148 @@ class ConversionService: guest_first_name: str | None, guest_last_name: str | None, guest_email: str | None, + guest_birth_date: date | None = None, ) -> Reservation | None: - """Match a reservation from a list by guest name and email (non-async). + """Match a reservation from a list by guest name, email, and birth date (non-async). 
+ + Uses a scoring system to find the best match: + - Exact email match: 100 points (highest confidence) + - Exact name + exact birth date: 90 points (very high confidence) + - Fuzzy name + exact birth date: 75-85 points (medium-high confidence) + - Exact name only: 50 points (lower confidence, could be common names) + - Fuzzy name only: 50-60 points (requires high similarity, works without birth date) Args: reservations: List of reservations to search through guest_first_name: Guest first name guest_last_name: Guest last name guest_email: Guest email + guest_birth_date: Guest birth date (optional, improves confidence) Returns: - Matched Reservation or None + Matched Reservation with highest score (only if score >= 50), or None """ - # Filter by guest details + FUZZY_THRESHOLD = 0.75 # Minimum similarity ratio for fuzzy matching + MIN_SCORE = 50 # Minimum score to consider a match valid + candidates = [] for reservation in reservations: customer = reservation.customer if not customer: continue - # Match by email (highest priority) - if guest_email: - if ( - customer.email_address - and customer.email_address.lower() == guest_email.lower() - ): + score = 0 + match_details = [] + + # Strategy 1: Match by email (highest priority) + if guest_email and customer.email_address: + if customer.email_address.lower() == guest_email.lower(): + score = 100 + match_details.append("exact_email") _LOGGER.info( - "Found exact email match for %s (reservation_id=%s)", + "Found exact email match for %s (reservation_id=%s, score=%d)", guest_email, reservation.id, + score, ) - candidates.append((reservation, 3)) # Highest score + candidates.append((reservation, score)) continue - # Match by name (first + last) + # Strategy 2: Match by name and birth date if guest_first_name and guest_last_name: - first_match = ( + # Exact name matching + first_exact = ( customer.given_name and customer.given_name.lower() == guest_first_name.lower() ) - last_match = ( + last_exact = ( customer.surname 
and customer.surname.lower() == guest_last_name.lower() ) - if first_match and last_match: + # Fuzzy name matching + first_fuzzy_ratio = 0.0 + last_fuzzy_ratio = 0.0 + if customer.given_name: + first_fuzzy_ratio = SequenceMatcher( + None, + customer.given_name.lower(), + guest_first_name.lower(), + ).ratio() + if customer.surname: + last_fuzzy_ratio = SequenceMatcher( + None, + customer.surname.lower(), + guest_last_name.lower(), + ).ratio() + + # Birth date matching + birth_date_match = ( + guest_birth_date + and customer.birth_date + and customer.birth_date == str(guest_birth_date) + ) + + # Calculate score based on name matching + if first_exact and last_exact: + if birth_date_match: + score = 90 # Exact name + exact birth date + match_details.append("exact_name_exact_birthdate") + else: + score = 50 # Exact name only + match_details.append("exact_name_only") _LOGGER.info( - "Found exact name match for %s %s (reservation_id=%s)", + "Found exact name match for %s %s (reservation_id=%s, score=%d, match_details=%s)", guest_first_name, guest_last_name, reservation.id, + score, + match_details, ) - candidates.append((reservation, 2)) # Medium-high score - continue + elif ( + first_fuzzy_ratio >= FUZZY_THRESHOLD + and last_fuzzy_ratio >= FUZZY_THRESHOLD + ): + # Both first and last names are fuzzy matched + fuzzy_score = min(first_fuzzy_ratio, last_fuzzy_ratio) + if birth_date_match: + # Fuzzy name + exact birth date (high confidence) + score = int(75 + (fuzzy_score - FUZZY_THRESHOLD) * 40) + match_details.append( + f"fuzzy_name_exact_birthdate(ratio={fuzzy_score:.2f})" + ) + else: + # Fuzzy name only (lower confidence, but still usable) + # Scale from 50-60 based on similarity ratio above threshold + score = int(50 + (fuzzy_score - FUZZY_THRESHOLD) * 40) + match_details.append(f"fuzzy_name_only(ratio={fuzzy_score:.2f})") + + _LOGGER.info( + "Found fuzzy name match for %s %s (reservation_id=%s, score=%d, match_details=%s)", + guest_first_name, + guest_last_name, + 
reservation.id, + score, + match_details, + ) + + # Only add candidates that meet minimum score threshold + if score >= MIN_SCORE: + candidates.append((reservation, score)) # Return highest-scoring match if candidates: candidates.sort(key=lambda x: x[1], reverse=True) - return candidates[0][0] + best_match = candidates[0][0] + best_score = candidates[0][1] + _LOGGER.debug( + "Selected best match (reservation_id=%s) with score=%d out of %d candidates", + best_match.id, + best_score, + len(candidates), + ) + return best_match return None