From a6e4bcbe1b182c8018689bebc40dd48cc1e4986d Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 15:23:10 +0100 Subject: [PATCH] Significant matching fix. --- src/alpine_bits_python/conversion_service.py | 395 +++++++++++++++---- 1 file changed, 316 insertions(+), 79 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 869a6f7..e76db31 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -631,6 +631,8 @@ class ConversionService: selectinload(Reservation.customer), ) + + query = query.filter(Reservation.hotel_id == self.hotel_id) if self.hotel_id else query result = await session.execute(query) reservations = result.scalars().all() @@ -1050,22 +1052,28 @@ class ConversionService: self, advertising_campagne: str, hotel_id: str | None, - guest_first_name: str | None, - guest_last_name: str | None, - guest_email: str | None, + hashed_first_name: str | None, + hashed_last_name: str | None, + hashed_email: str | None, advertising_partner: str | None, session: AsyncSession | None = None, + raw_first_name: str | None = None, + raw_last_name: str | None = None, + raw_email: str | None = None, ) -> Reservation | None: """Match reservation by advertising tracking data (fbclid/gclid/md5_unique_id). Args: advertising_campagne: Tracking ID from PMS (could be truncated click_id or md5_unique_id) hotel_id: Hotel ID for filtering - guest_first_name: Guest first name for disambiguation - guest_last_name: Guest last name for disambiguation - guest_email: Guest email for disambiguation + hashed_first_name: Guest first name (hashed) for disambiguation + hashed_last_name: Guest last name (hashed) for disambiguation + hashed_email: Guest email (hashed) for disambiguation advertising_partner: Partner info (matches utm_medium) session: AsyncSession to use. If None, uses self.session. + raw_first_name: Plain guest first name (optional fallback) + raw_last_name: Plain guest last name (optional fallback) + raw_email: Plain guest email (optional fallback) Returns: Matched Reservation or None @@ -1098,25 +1106,36 @@ class ConversionService: if not reservations: return None - # If single match, return it - if len(reservations) == 1: + # Determine if this looks like a md5_unique_id (32 hex characters) or a click_id + is_md5_lookup = len(advertising_campagne or "") == 32 + + needs_filtering = len(reservations) > 1 or not is_md5_lookup + + if not needs_filtering: + # Confident single match via md5_unique_id return reservations[0] - # If multiple matches, try to narrow down using guest details + # If multiple matches or click-id matches, try to narrow down using hashed guest details _LOGGER.debug( - "Multiple reservations match advertisingCampagne %s (hotel=%s): found %d matches. " - "Attempting to narrow down using guest details.", + ( + "Ambiguous advertising match for %s (hotel=%s, candidates=%d, md5_lookup=%s). " + "Applying guest detail filtering." + ), advertising_campagne, hotel_id, len(reservations), + is_md5_lookup, ) matched_reservation = self._filter_reservations_by_guest_details( reservations, - guest_first_name, - guest_last_name, - guest_email, + raw_first_name, + raw_last_name, + raw_email, advertising_partner, + hashed_first_name=hashed_first_name, + hashed_last_name=hashed_last_name, + hashed_email=hashed_email, ) if matched_reservation is None: @@ -1126,9 +1145,9 @@ class ConversionService: "(hotel=%s, guest=%s %s, email=%s). Using first match.", advertising_campagne, hotel_id, - guest_first_name, - guest_last_name, - guest_email, + raw_first_name, + raw_last_name, + raw_email, ) matched_reservation = reservations[0] @@ -1210,18 +1229,26 @@ class ConversionService: guest_last_name: str | None, guest_email: str | None, advertising_partner: str | None, + *, + hashed_first_name: str | None = None, + hashed_last_name: str | None = None, + hashed_email: str | None = None, ) -> Reservation | None: """Filter reservations using guest details to find a single match. - First tries to match by guest name and email. If that doesn't yield a single match, - tries matching by advertising_partner against utm_medium. + Prefers hashed comparisons (exact match on hashed email or hashed name pair) and + falls back to plaintext comparison if hashes are unavailable. Finally tries + advertising partner vs utm_medium. Args: reservations: List of candidate reservations - guest_first_name: Guest first name - guest_last_name: Guest last name - guest_email: Guest email + guest_first_name: Guest first name (plaintext, optional) + guest_last_name: Guest last name (plaintext, optional) + guest_email: Guest email (plaintext, optional) advertising_partner: Partner info (e.g., "Facebook_Mobile_Feed") + hashed_first_name: Hashed first name for cross-checking + hashed_last_name: Hashed last name for cross-checking + hashed_email: Hashed email for cross-checking Returns: Single best-match Reservation, or None if no good match found @@ -1229,40 +1256,71 @@ class ConversionService: """ candidates = reservations - # Try to narrow down by guest name and email + # Attempt hashed email match first + if hashed_email: + email_matches = [ + reservation + for reservation in candidates + if reservation.customer + and reservation.customer.hashed_email + and reservation.customer.hashed_email == hashed_email + ] + if len(email_matches) == 1: + _LOGGER.debug("Found unique match via hashed email") + return email_matches[0] + if email_matches: + candidates = email_matches + + # Attempt hashed name match (first + last) + if hashed_first_name and hashed_last_name: + name_matches = [ + reservation + for reservation in candidates + if reservation.customer + and reservation.customer.hashed_given_name == hashed_first_name + and reservation.customer.hashed_surname == hashed_last_name + ] + if len(name_matches) == 1: + _LOGGER.debug("Found unique match via hashed names") + return name_matches[0] + if name_matches: + candidates = name_matches + + # Fallback to plaintext comparison if provided if guest_first_name or guest_last_name or guest_email: - # First try exact match on all available fields for reservation in candidates: customer = reservation.customer - if customer: - name_match = True - email_match = True + if not customer: + continue - if guest_first_name: - name_match = name_match and ( - customer.given_name - and customer.given_name.lower() == guest_first_name.lower() - ) + name_match = True + email_match = True - if guest_last_name: - name_match = name_match and ( - customer.surname - and customer.surname.lower() == guest_last_name.lower() - ) + if guest_first_name: + name_match = name_match and ( + customer.given_name + and customer.given_name.lower() == guest_first_name.lower() + ) - if guest_email: - email_match = ( - customer.email_address - and customer.email_address.lower() == guest_email.lower() - ) + if guest_last_name: + name_match = name_match and ( + customer.surname + and customer.surname.lower() == guest_last_name.lower() + ) - if name_match and email_match: - _LOGGER.debug( - "Found exact match on guest name/email for %s %s", - guest_first_name, - guest_last_name, - ) - return reservation + if guest_email: + email_match = email_match and ( + customer.email_address + and customer.email_address.lower() == guest_email.lower() + ) + + if name_match and email_match: + _LOGGER.debug( + "Found exact plaintext match on guest details for %s %s", + guest_first_name, + guest_last_name, + ) + return reservation # Try to narrow down by advertising_partner matching utm_medium if advertising_partner: @@ -1470,9 +1528,9 @@ class ConversionService: session: AsyncSession for database queries """ - # Collect every guest/customer pair derived from conversions. + # Collect every (hotel, guest) -> customer pair derived from conversions. result = await session.execute( - select(Conversion.guest_id, Conversion.customer_id).where( + select(Conversion.hotel_id, Conversion.guest_id, Conversion.customer_id).where( Conversion.guest_id.isnot(None), Conversion.customer_id.isnot(None) ) ) @@ -1482,27 +1540,54 @@ class ConversionService: _LOGGER.debug("Phase 3d: No matched guests to check for regularity") return - # Deduplicate by guest_id to avoid recalculating when multiple conversions share the same guest. - guest_to_customer: dict[int, int] = {} - for guest_id, customer_id in guest_customer_rows: - if guest_id is None or customer_id is None: + # Group by (hotel_id, guest_id) to detect conflicts. + guest_customer_sets: dict[tuple[str | None, int], set[int]] = {} + for hotel_id, guest_id, customer_id in guest_customer_rows: + if hotel_id is None or guest_id is None or customer_id is None: continue - if guest_id not in guest_to_customer: - guest_to_customer[guest_id] = customer_id - elif guest_to_customer[guest_id] != customer_id: - _LOGGER.warning( - "Guest %s linked to multiple customers (%s, %s); keeping first match", - guest_id, - guest_to_customer[guest_id], - customer_id, + key = (hotel_id, guest_id) + guest_customer_sets.setdefault(key, set()).add(customer_id) + + if not guest_customer_sets: + _LOGGER.debug("Phase 3d: No matched guests to check for regularity") + return + + duplicates = { + key: customer_ids + for key, customer_ids in guest_customer_sets.items() + if len(customer_ids) > 1 + } + if duplicates: + await self._deduplicate_guest_customer_links(duplicates, session) + + guest_to_customer: dict[tuple[str | None, int], int] = {} + for key, customer_ids in guest_customer_sets.items(): + hotel_id, guest_id = key + # After deduplication, reload conversions for this guest to find the remaining customer (if any) + result = await session.execute( + select(Conversion.customer_id) + .where( + Conversion.hotel_id == hotel_id, + Conversion.guest_id == guest_id, + Conversion.customer_id.isnot(None), ) + .limit(1) + ) + chosen_customer = result.scalar_one_or_none() + if chosen_customer: + guest_to_customer[key] = chosen_customer + + if not guest_to_customer: + _LOGGER.debug( + "Phase 3d: No guests remained linked to customers after deduplication" + ) + return _LOGGER.debug( - "Phase 3d: Checking regularity for %d matched guests", - len(guest_to_customer), + "Phase 3d: Checking regularity for %d matched guests", len(guest_to_customer) ) - for guest_id, customer_id in guest_to_customer.items(): + for (hotel_id, guest_id), customer_id in guest_to_customer.items(): await self._check_if_guest_is_regular(guest_id, customer_id, session) async def _match_conversions_from_db_sequential( @@ -1705,15 +1790,21 @@ class ConversionService: # Get conversion_guest if it exists (has the hashed data) conversion_guest = conversion.guest - # Extract hashed data from conversion_guest (already hashed) + # Extract hashed and raw data from conversion_guest hashed_first_name = None hashed_last_name = None hashed_email = None + raw_first_name = None + raw_last_name = None + raw_email = None if conversion_guest: hashed_first_name = conversion_guest.hashed_first_name hashed_last_name = conversion_guest.hashed_last_name hashed_email = conversion_guest.hashed_email + raw_first_name = conversion_guest.guest_first_name + raw_last_name = conversion_guest.guest_last_name + raw_email = conversion_guest.guest_email # Phase 3a: Only try ID-based matching (fbclid/gclid/md5_unique_id) # Guest detail matching is deferred to Phase 3b/3c @@ -1729,6 +1820,9 @@ class ConversionService: hashed_email, conversion.advertising_partner, session, + raw_first_name=raw_first_name, + raw_last_name=raw_last_name, + raw_email=raw_email, ) if matched_reservation: @@ -1838,21 +1932,164 @@ class ConversionService: ) conversion_guest.is_regular = is_regular - if is_regular: - _LOGGER.debug( - "Marking guest %s as regular: earliest paying conversion %s predates first reservation created at %s", - guest_id, - earliest_paying_conversion.reservation_date, - earliest_reservation.created_at, + async def _deduplicate_guest_customer_links( + self, + duplicates: dict[tuple[str | None, int], set[int]], + session: AsyncSession, + ) -> None: + """Resolve guest/customer conflicts by comparing hashed details and severing bad links.""" + for (hotel_id, guest_id), customer_ids in duplicates.items(): + guest_result = await session.execute( + select(ConversionGuest).where( + ConversionGuest.hotel_id == hotel_id, + ConversionGuest.guest_id == guest_id, + ) ) - else: - _LOGGER.debug( - "Guest %s is not regular: first paying conversion %s is after/equal to first reservation created at %s", - guest_id, - earliest_paying_conversion.reservation_date, - earliest_reservation.created_at, + conversion_guest = guest_result.scalar_one_or_none() + + if not conversion_guest: + _LOGGER.warning( + "Guest %s (hotel=%s) missing when resolving duplicates; removing links to customers %s", + guest_id, + hotel_id, + sorted(customer_ids), + ) + for customer_id in customer_ids: + await self._sever_guest_customer_link( + hotel_id, guest_id, customer_id, session + ) + continue + + preferred_customer_id = await self._choose_best_customer_for_guest( + conversion_guest, customer_ids, session ) + if preferred_customer_id: + _LOGGER.warning( + "Guest %s (hotel=%s) linked to multiple customers %s; keeping %s based on hashed data", + guest_id, + hotel_id, + sorted(customer_ids), + preferred_customer_id, + ) + else: + _LOGGER.warning( + "Guest %s (hotel=%s) linked to multiple customers %s but none matched hashed data. Removing all links.", + guest_id, + hotel_id, + sorted(customer_ids), + ) + + for customer_id in customer_ids: + if customer_id != preferred_customer_id: + await self._sever_guest_customer_link( + hotel_id, guest_id, customer_id, session + ) + + async def _choose_best_customer_for_guest( + self, + conversion_guest: ConversionGuest, + candidate_customer_ids: set[int], + session: AsyncSession, + ) -> int | None: + """Pick the most likely customer based on hashed data.""" + if not candidate_customer_ids: + return None + + result = await session.execute( + select(Customer).where(Customer.id.in_(candidate_customer_ids)) + ) + candidates = result.scalars().all() + + if not candidates: + return None + + def score_customer(customer: Customer) -> int: + score = 0 + if ( + conversion_guest.hashed_email + and customer.hashed_email == conversion_guest.hashed_email + ): + score += 100 + if ( + conversion_guest.hashed_first_name + and customer.hashed_given_name == conversion_guest.hashed_first_name + ): + score += 10 + if ( + conversion_guest.hashed_last_name + and customer.hashed_surname == conversion_guest.hashed_last_name + ): + score += 10 + if ( + conversion_guest.hashed_country_code + and customer.hashed_country_code == conversion_guest.hashed_country_code + ): + score += 2 + if ( + conversion_guest.hashed_birth_date + and customer.hashed_birth_date == conversion_guest.hashed_birth_date + ): + score += 1 + return score + + best_customer_id = None + best_score = -1 + is_tied = False + + for customer in candidates: + score = score_customer(customer) + if score > best_score: + best_score = score + best_customer_id = customer.id + is_tied = False + elif score == best_score and score != -1: + is_tied = True + + if best_score <= 0 or is_tied: + return None + + return best_customer_id + + async def _sever_guest_customer_link( + self, + hotel_id: str | None, + guest_id: int, + customer_id: int, + session: AsyncSession, + ) -> None: + """Remove incorrect guest/customer links from conversions.""" + result = await session.execute( + select(Conversion) + .where( + Conversion.hotel_id == hotel_id, + Conversion.guest_id == guest_id, + Conversion.customer_id == customer_id, + ) + .options(selectinload(Conversion.conversion_rooms)) + ) + conversions = result.scalars().all() + + if not conversions: + return + + for conversion in conversions: + conversion.customer_id = None + conversion.reservation_id = None + conversion.directly_attributable = False + conversion.guest_matched = False + conversion.updated_at = datetime.now() + + _LOGGER.warning( + "Removed %d conversion links for guest %s (hotel=%s) customer %s", + len(conversions), + guest_id, + hotel_id, + customer_id, + ) + + + async def _check_if_attributable( self, matched_customer_id: int,