Significant matching fix.

This commit is contained in:
Jonas Linter
2025-12-03 15:23:10 +01:00
parent 50ce0ec486
commit e1bbefb9a3

View File

@@ -631,6 +631,8 @@ class ConversionService:
selectinload(Reservation.customer),
)
query = query.filter(Reservation.hotel_id == self.hotel_id) if self.hotel_id else query
result = await session.execute(query)
reservations = result.scalars().all()
@@ -1050,22 +1052,28 @@ class ConversionService:
self,
advertising_campagne: str,
hotel_id: str | None,
guest_first_name: str | None,
guest_last_name: str | None,
guest_email: str | None,
hashed_first_name: str | None,
hashed_last_name: str | None,
hashed_email: str | None,
advertising_partner: str | None,
session: AsyncSession | None = None,
raw_first_name: str | None = None,
raw_last_name: str | None = None,
raw_email: str | None = None,
) -> Reservation | None:
"""Match reservation by advertising tracking data (fbclid/gclid/md5_unique_id).
Args:
advertising_campagne: Tracking ID from PMS (could be truncated click_id or md5_unique_id)
hotel_id: Hotel ID for filtering
guest_first_name: Guest first name for disambiguation
guest_last_name: Guest last name for disambiguation
guest_email: Guest email for disambiguation
hashed_first_name: Guest first name (hashed) for disambiguation
hashed_last_name: Guest last name (hashed) for disambiguation
hashed_email: Guest email (hashed) for disambiguation
advertising_partner: Partner info (matches utm_medium)
session: AsyncSession to use. If None, uses self.session.
raw_first_name: Plain guest first name (optional fallback)
raw_last_name: Plain guest last name (optional fallback)
raw_email: Plain guest email (optional fallback)
Returns:
Matched Reservation or None
@@ -1098,25 +1106,36 @@ class ConversionService:
if not reservations:
return None
# If single match, return it
if len(reservations) == 1:
# Determine if this looks like a md5_unique_id (32 hex characters) or a click_id
is_md5_lookup = len(advertising_campagne or "") == 32
needs_filtering = len(reservations) > 1 or not is_md5_lookup
if not needs_filtering:
# Confident single match via md5_unique_id
return reservations[0]
# If multiple matches, try to narrow down using guest details
# If multiple matches or click-id matches, try to narrow down using hashed guest details
_LOGGER.debug(
"Multiple reservations match advertisingCampagne %s (hotel=%s): found %d matches. "
"Attempting to narrow down using guest details.",
(
"Ambiguous advertising match for %s (hotel=%s, candidates=%d, md5_lookup=%s). "
"Applying guest detail filtering."
),
advertising_campagne,
hotel_id,
len(reservations),
is_md5_lookup,
)
matched_reservation = self._filter_reservations_by_guest_details(
reservations,
guest_first_name,
guest_last_name,
guest_email,
raw_first_name,
raw_last_name,
raw_email,
advertising_partner,
hashed_first_name=hashed_first_name,
hashed_last_name=hashed_last_name,
hashed_email=hashed_email,
)
if matched_reservation is None:
@@ -1126,9 +1145,9 @@ class ConversionService:
"(hotel=%s, guest=%s %s, email=%s). Using first match.",
advertising_campagne,
hotel_id,
guest_first_name,
guest_last_name,
guest_email,
raw_first_name,
raw_last_name,
raw_email,
)
matched_reservation = reservations[0]
@@ -1210,18 +1229,26 @@ class ConversionService:
guest_last_name: str | None,
guest_email: str | None,
advertising_partner: str | None,
*,
hashed_first_name: str | None = None,
hashed_last_name: str | None = None,
hashed_email: str | None = None,
) -> Reservation | None:
"""Filter reservations using guest details to find a single match.
First tries to match by guest name and email. If that doesn't yield a single match,
tries matching by advertising_partner against utm_medium.
Prefers hashed comparisons (exact match on hashed email or hashed name pair) and
falls back to plaintext comparison if hashes are unavailable. Finally tries
advertising partner vs utm_medium.
Args:
reservations: List of candidate reservations
guest_first_name: Guest first name
guest_last_name: Guest last name
guest_email: Guest email
guest_first_name: Guest first name (plaintext, optional)
guest_last_name: Guest last name (plaintext, optional)
guest_email: Guest email (plaintext, optional)
advertising_partner: Partner info (e.g., "Facebook_Mobile_Feed")
hashed_first_name: Hashed first name for cross-checking
hashed_last_name: Hashed last name for cross-checking
hashed_email: Hashed email for cross-checking
Returns:
Single best-match Reservation, or None if no good match found
@@ -1229,12 +1256,43 @@ class ConversionService:
"""
candidates = reservations
# Try to narrow down by guest name and email
# Attempt hashed email match first
if hashed_email:
email_matches = [
reservation
for reservation in candidates
if reservation.customer
and reservation.customer.hashed_email
and reservation.customer.hashed_email == hashed_email
]
if len(email_matches) == 1:
_LOGGER.debug("Found unique match via hashed email")
return email_matches[0]
if email_matches:
candidates = email_matches
# Attempt hashed name match (first + last)
if hashed_first_name and hashed_last_name:
name_matches = [
reservation
for reservation in candidates
if reservation.customer
and reservation.customer.hashed_given_name == hashed_first_name
and reservation.customer.hashed_surname == hashed_last_name
]
if len(name_matches) == 1:
_LOGGER.debug("Found unique match via hashed names")
return name_matches[0]
if name_matches:
candidates = name_matches
# Fallback to plaintext comparison if provided
if guest_first_name or guest_last_name or guest_email:
# First try exact match on all available fields
for reservation in candidates:
customer = reservation.customer
if customer:
if not customer:
continue
name_match = True
email_match = True
@@ -1251,14 +1309,14 @@ class ConversionService:
)
if guest_email:
email_match = (
email_match = email_match and (
customer.email_address
and customer.email_address.lower() == guest_email.lower()
)
if name_match and email_match:
_LOGGER.debug(
"Found exact match on guest name/email for %s %s",
"Found exact plaintext match on guest details for %s %s",
guest_first_name,
guest_last_name,
)
@@ -1470,9 +1528,9 @@ class ConversionService:
session: AsyncSession for database queries
"""
# Collect every guest/customer pair derived from conversions.
# Collect every (hotel, guest) -> customer pair derived from conversions.
result = await session.execute(
select(Conversion.guest_id, Conversion.customer_id).where(
select(Conversion.hotel_id, Conversion.guest_id, Conversion.customer_id).where(
Conversion.guest_id.isnot(None), Conversion.customer_id.isnot(None)
)
)
@@ -1482,27 +1540,54 @@ class ConversionService:
_LOGGER.debug("Phase 3d: No matched guests to check for regularity")
return
# Deduplicate by guest_id to avoid recalculating when multiple conversions share the same guest.
guest_to_customer: dict[int, int] = {}
for guest_id, customer_id in guest_customer_rows:
if guest_id is None or customer_id is None:
# Group by (hotel_id, guest_id) to detect conflicts.
guest_customer_sets: dict[tuple[str | None, int], set[int]] = {}
for hotel_id, guest_id, customer_id in guest_customer_rows:
if hotel_id is None or guest_id is None or customer_id is None:
continue
if guest_id not in guest_to_customer:
guest_to_customer[guest_id] = customer_id
elif guest_to_customer[guest_id] != customer_id:
_LOGGER.warning(
"Guest %s linked to multiple customers (%s, %s); keeping first match",
guest_id,
guest_to_customer[guest_id],
customer_id,
key = (hotel_id, guest_id)
guest_customer_sets.setdefault(key, set()).add(customer_id)
if not guest_customer_sets:
_LOGGER.debug("Phase 3d: No matched guests to check for regularity")
return
duplicates = {
key: customer_ids
for key, customer_ids in guest_customer_sets.items()
if len(customer_ids) > 1
}
if duplicates:
await self._deduplicate_guest_customer_links(duplicates, session)
guest_to_customer: dict[tuple[str | None, int], int] = {}
for key, customer_ids in guest_customer_sets.items():
hotel_id, guest_id = key
# After deduplication, reload conversions for this guest to find the remaining customer (if any)
result = await session.execute(
select(Conversion.customer_id)
.where(
Conversion.hotel_id == hotel_id,
Conversion.guest_id == guest_id,
Conversion.customer_id.isnot(None),
)
.limit(1)
)
chosen_customer = result.scalar_one_or_none()
if chosen_customer:
guest_to_customer[key] = chosen_customer
if not guest_to_customer:
_LOGGER.debug(
"Phase 3d: No guests remained linked to customers after deduplication"
)
return
_LOGGER.debug(
"Phase 3d: Checking regularity for %d matched guests",
len(guest_to_customer),
"Phase 3d: Checking regularity for %d matched guests", len(guest_to_customer)
)
for guest_id, customer_id in guest_to_customer.items():
for (hotel_id, guest_id), customer_id in guest_to_customer.items():
await self._check_if_guest_is_regular(guest_id, customer_id, session)
async def _match_conversions_from_db_sequential(
@@ -1705,15 +1790,21 @@ class ConversionService:
# Get conversion_guest if it exists (has the hashed data)
conversion_guest = conversion.guest
# Extract hashed data from conversion_guest (already hashed)
# Extract hashed and raw data from conversion_guest
hashed_first_name = None
hashed_last_name = None
hashed_email = None
raw_first_name = None
raw_last_name = None
raw_email = None
if conversion_guest:
hashed_first_name = conversion_guest.hashed_first_name
hashed_last_name = conversion_guest.hashed_last_name
hashed_email = conversion_guest.hashed_email
raw_first_name = conversion_guest.guest_first_name
raw_last_name = conversion_guest.guest_last_name
raw_email = conversion_guest.guest_email
# Phase 3a: Only try ID-based matching (fbclid/gclid/md5_unique_id)
# Guest detail matching is deferred to Phase 3b/3c
@@ -1729,6 +1820,9 @@ class ConversionService:
hashed_email,
conversion.advertising_partner,
session,
raw_first_name=raw_first_name,
raw_last_name=raw_last_name,
raw_email=raw_email,
)
if matched_reservation:
@@ -1838,21 +1932,164 @@ class ConversionService:
)
conversion_guest.is_regular = is_regular
if is_regular:
_LOGGER.debug(
"Marking guest %s as regular: earliest paying conversion %s predates first reservation created at %s",
async def _deduplicate_guest_customer_links(
self,
duplicates: dict[tuple[str | None, int], set[int]],
session: AsyncSession,
) -> None:
"""Resolve guest/customer conflicts by comparing hashed details and severing bad links."""
for (hotel_id, guest_id), customer_ids in duplicates.items():
guest_result = await session.execute(
select(ConversionGuest).where(
ConversionGuest.hotel_id == hotel_id,
ConversionGuest.guest_id == guest_id,
)
)
conversion_guest = guest_result.scalar_one_or_none()
if not conversion_guest:
_LOGGER.warning(
"Guest %s (hotel=%s) missing when resolving duplicates; removing links to customers %s",
guest_id,
earliest_paying_conversion.reservation_date,
earliest_reservation.created_at,
hotel_id,
sorted(customer_ids),
)
for customer_id in customer_ids:
await self._sever_guest_customer_link(
hotel_id, guest_id, customer_id, session
)
continue
preferred_customer_id = await self._choose_best_customer_for_guest(
conversion_guest, customer_ids, session
)
if preferred_customer_id:
_LOGGER.warning(
"Guest %s (hotel=%s) linked to multiple customers %s; keeping %s based on hashed data",
guest_id,
hotel_id,
sorted(customer_ids),
preferred_customer_id,
)
else:
_LOGGER.debug(
"Guest %s is not regular: first paying conversion %s is after/equal to first reservation created at %s",
_LOGGER.warning(
"Guest %s (hotel=%s) linked to multiple customers %s but none matched hashed data. Removing all links.",
guest_id,
earliest_paying_conversion.reservation_date,
earliest_reservation.created_at,
hotel_id,
sorted(customer_ids),
)
for customer_id in customer_ids:
if customer_id != preferred_customer_id:
await self._sever_guest_customer_link(
hotel_id, guest_id, customer_id, session
)
async def _choose_best_customer_for_guest(
    self,
    conversion_guest: ConversionGuest,
    candidate_customer_ids: set[int],
    session: AsyncSession,
) -> int | None:
    """Select the candidate customer whose hashed fields best match the guest.

    Every candidate is loaded from the database and given a weighted score:
    100 for an equal hashed email, 10 each for an equal hashed first name and
    hashed last name, 2 for an equal hashed country code and 1 for an equal
    hashed birth date. A guest field that is empty or None never contributes.

    Args:
        conversion_guest: Guest whose hashed attributes are the reference.
        candidate_customer_ids: IDs of the customers competing for the guest.
        session: AsyncSession used to load the candidate customers.

    Returns:
        The ID of the single highest-scoring customer, or None when there are
        no candidates, no candidate scored above zero, or the top score is
        shared by more than one candidate.
    """
    if not candidate_customer_ids:
        return None

    lookup = select(Customer).where(Customer.id.in_(candidate_customer_ids))
    customers = (await session.execute(lookup)).scalars().all()
    if not customers:
        return None

    # (attribute on the guest, attribute on the customer, weight).
    # The guest and customer models name the name fields differently.
    weighted_fields = (
        ("hashed_email", "hashed_email", 100),
        ("hashed_first_name", "hashed_given_name", 10),
        ("hashed_last_name", "hashed_surname", 10),
        ("hashed_country_code", "hashed_country_code", 2),
        ("hashed_birth_date", "hashed_birth_date", 1),
    )

    def score_customer(customer: Customer) -> int:
        total = 0
        for guest_attr, customer_attr, weight in weighted_fields:
            reference = getattr(conversion_guest, guest_attr)
            # A missing guest value must not match a missing customer value.
            if reference and getattr(customer, customer_attr) == reference:
                total += weight
        return total

    ranked = [(score_customer(customer), customer.id) for customer in customers]
    top_score = max(points for points, _ in ranked)
    if top_score <= 0:
        # Nothing matched at all; there is no evidence for any candidate.
        return None

    leaders = [cust_id for points, cust_id in ranked if points == top_score]
    # A shared top score is ambiguous, so no candidate is preferred.
    return leaders[0] if len(leaders) == 1 else None
async def _sever_guest_customer_link(
    self,
    hotel_id: str | None,
    guest_id: int,
    customer_id: int,
    session: AsyncSession,
) -> None:
    """Detach a guest's conversions from one specific customer.

    Loads every conversion matching the given hotel, guest and customer and
    resets its match state: customer and reservation references are cleared,
    the attribution and guest-matched flags are set to False, and
    ``updated_at`` is refreshed. The changes are applied to the loaded ORM
    objects only; no flush or commit is issued here.

    Args:
        hotel_id: Hotel the conversions belong to.
        guest_id: Guest whose link is being removed.
        customer_id: Customer the guest should no longer be linked to.
        session: AsyncSession used to load the conversions.
    """
    link_filters = (
        Conversion.hotel_id == hotel_id,
        Conversion.guest_id == guest_id,
        Conversion.customer_id == customer_id,
    )
    stmt = (
        select(Conversion)
        .where(*link_filters)
        .options(selectinload(Conversion.conversion_rooms))
    )
    linked = (await session.execute(stmt)).scalars().all()

    if linked:
        for row in linked:
            # Reset everything that was derived from the now-rejected match.
            row.customer_id = None
            row.reservation_id = None
            row.directly_attributable = False
            row.guest_matched = False
            row.updated_at = datetime.now()

        _LOGGER.warning(
            "Removed %d conversion links for guest %s (hotel=%s) customer %s",
            len(linked),
            guest_id,
            hotel_id,
            customer_id,
        )
async def _check_if_attributable(
self,
matched_customer_id: int,