Hashed conversion matching and more. #12

Merged
jonas merged 13 commits from hashed_conversion_matching into main 2025-11-19 19:40:07 +00:00
2 changed files with 28 additions and 350 deletions
Showing only changes of commit e5f0aea870 - Show all commits

View File

@@ -1246,90 +1246,6 @@ class ConversionService:
all_reservations, guest_first_name, guest_last_name, guest_email all_reservations, guest_first_name, guest_last_name, guest_email
) )
def _match_reservations_by_guest_details(
self,
reservations: list[Reservation],
guest_first_name: str | None,
guest_last_name: str | None,
guest_email: str | None,
) -> Reservation | None:
"""Match a reservation from a list by guest name and email (non-async).
Uses hashed data matching for privacy. The guest_first_name, guest_last_name,
and guest_email parameters should be pre-hashed values from ConversionGuest.
Compares hashed values directly against hashed_customer hash fields.
Args:
reservations: List of reservations to search through
guest_first_name: Pre-hashed guest first name (or None)
guest_last_name: Pre-hashed guest last name (or None)
guest_email: Pre-hashed guest email (or None)
Returns:
Matched Reservation or None
"""
# Filter by guest details using hashed data
candidates = []
for reservation in reservations:
customer = reservation.customer
if not customer:
continue
# Access hashed_version - handle both direct objects and relationships
hashed_customer = customer.hashed_version
if not hashed_customer:
continue
# Defensive check: ensure hashed_customer is actually a HashedCustomer object
# (not an InstrumentedList or other SQLAlchemy construct)
if not hasattr(hashed_customer, 'hashed_email'):
_LOGGER.debug(
"Customer %s has invalid hashed_version type: %s",
customer.id,
type(hashed_customer),
)
continue
# Match by email (highest priority) using hashed comparison
if guest_email and hashed_customer.hashed_email:
if hashed_customer.hashed_email == guest_email:
_LOGGER.info(
"Found exact email match (reservation_id=%s, cust=%s, email=%s) via hash",
reservation.id,
customer.email_address,
guest_email,
)
candidates.append((reservation, 3)) # Highest score
continue
# Match by name (first + last) using hashed comparison
if guest_first_name and guest_last_name:
first_match = (
hashed_customer.hashed_given_name
and hashed_customer.hashed_given_name == guest_first_name
)
last_match = (
hashed_customer.hashed_surname
and hashed_customer.hashed_surname == guest_last_name
)
if first_match and last_match:
_LOGGER.info(
"Found exact name match (reservation_id=%s) via hash",
reservation.id,
)
candidates.append((reservation, 2)) # Medium-high score
continue
# Return highest-scoring match
if candidates:
candidates.sort(key=lambda x: x[1], reverse=True)
return candidates[0][0]
return None
def _filter_reservations_by_guest_details( def _filter_reservations_by_guest_details(
self, self,
reservations: list[Reservation], reservations: list[Reservation],
@@ -1569,10 +1485,6 @@ class ConversionService:
# Update the conversion with matched entities if found # Update the conversion with matched entities if found
if matched_reservation or matched_customer or matched_hashed_customer: if matched_reservation or matched_customer or matched_hashed_customer:
# Set attribution flags and matched entities based on match type
if match_type == "id":
# ID-based matches (fbclid/gclid/md5_unique_id) are always directly attributable
# Link to both reservation and customer
conversion.reservation_id = ( conversion.reservation_id = (
matched_reservation.id if matched_reservation else None matched_reservation.id if matched_reservation else None
) )
@@ -1582,37 +1494,25 @@ class ConversionService:
conversion.hashed_customer_id = ( conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None matched_hashed_customer.id if matched_hashed_customer else None
) )
# Set attribution flags based on match type and check for regular guest
is_attributable = False
if match_type == "id":
# ID-based matches (fbclid/gclid/md5_unique_id) are always directly attributable
conversion.directly_attributable = True conversion.directly_attributable = True
conversion.guest_matched = False conversion.guest_matched = False
is_attributable = True is_attributable = True
# Check if guest is regular for ID matches
if matched_reservation:
await self._check_if_regular(conversion, matched_reservation, session)
elif match_type == "guest_details": elif match_type == "guest_details":
# Guest detail matches: link to customer/hashed_customer directly (NO reservation) # Guest detail matches: check if guest is regular or dates match
# Only link to reservation if dates match is_attributable = await self._check_if_attributable(
conversion.customer_id = ( matched_reservation, conversion, session
matched_customer.id if matched_customer else None
)
conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
conversion.guest_matched = True
# For guest-detail matches, we don't have a matched_reservation
# Instead, check if dates align with any existing reservations for this customer
is_attributable = await self._check_if_attributable_guest_match(
matched_customer, conversion, session
) )
conversion.directly_attributable = is_attributable conversion.directly_attributable = is_attributable
conversion.guest_matched = True
# Check if guest is regular (if we have a customer to reference) # Check if guest is regular for both ID and guest detail matches
if matched_customer: if matched_reservation:
await self._check_if_regular_by_customer( await self._check_if_regular(conversion, matched_reservation, session)
conversion, matched_customer, session
)
# Update conversion_guest with hashed_customer reference if matched # Update conversion_guest with hashed_customer reference if matched
if conversion_guest and matched_hashed_customer: if conversion_guest and matched_hashed_customer:
@@ -1743,127 +1643,3 @@ class ConversionService:
return True return True
return False return False
async def _check_if_attributable_guest_match(
self,
matched_customer: Customer | None,
conversion: Conversion,
session: AsyncSession,
) -> bool:
"""Check if a guest-detail matched conversion is attributable based on date alignment.
For guest-detail matches (without a specific reservation), check if the conversion's
room dates align with ANY of the customer's reservations (date tolerance ±7 days).
Args:
matched_customer: The matched Customer record
conversion: The Conversion record being evaluated
session: AsyncSession for database queries
Returns:
True if conversion dates align with any reservation, False otherwise
"""
if not matched_customer or not conversion.conversion_rooms:
return False
# Find all reservations for this customer
reservations_result = await session.execute(
select(Reservation).where(Reservation.customer_id == matched_customer.id)
)
reservations = reservations_result.scalars().all()
if not reservations:
return False
# Check if any conversion_room dates match any reservation dates
for room in conversion.conversion_rooms:
if not room.arrival_date or not room.departure_date:
continue
for reservation in reservations:
if not reservation.start_date or not reservation.end_date:
continue
# Check if dates match (within ±7 day tolerance)
arrival_match = abs(
(room.arrival_date - reservation.start_date).days
) <= 7
departure_match = abs(
(room.departure_date - reservation.end_date).days
) <= 7
if arrival_match and departure_match:
_LOGGER.info(
"Marking guest-detail match as attributable: room dates %s-%s match reservation dates %s-%s",
room.arrival_date,
room.departure_date,
reservation.start_date,
reservation.end_date,
)
return True
return False
async def _check_if_regular_by_customer(
self,
conversion: Conversion,
matched_customer: Customer,
session: AsyncSession,
) -> None:
"""Check if guest is regular based on customer without a specific reservation.
For guest-detail matches, determine if the guest is regular by checking if
their earliest paying conversion predates their earliest reservation.
Args:
conversion: The Conversion record being evaluated
matched_customer: The matched Customer record
session: AsyncSession for database queries
"""
if not conversion.guest or not matched_customer.id:
return
# Find the earliest paying conversion for this guest
# (booked reservations from hotel with actual revenue)
earliest_paying_conversion_result = await session.execute(
select(Conversion)
.join(ConversionRoom, Conversion.id == ConversionRoom.conversion_id)
.where(
Conversion.hotel_id == conversion.hotel_id,
Conversion.guest_id == conversion.guest_id,
ConversionRoom.total_revenue.isnot(None),
ConversionRoom.total_revenue > Decimal(0),
)
.order_by(Conversion.reservation_date.asc())
.limit(1)
)
earliest_paying_conversion = earliest_paying_conversion_result.scalar_one_or_none()
if not earliest_paying_conversion:
conversion.guest.is_regular = False
return
# Find the earliest reservation (booking request we sent) for this customer
earliest_reservation_result = await session.execute(
select(Reservation)
.where(Reservation.customer_id == matched_customer.id)
.order_by(Reservation.start_date.asc())
.limit(1)
)
earliest_reservation = earliest_reservation_result.scalar_one_or_none()
if not earliest_reservation:
conversion.guest.is_regular = False
return
# Guest is regular if their earliest paying conversion predates all their reservations
# (meaning they were already a customer before we started tracking reservations)
is_regular = earliest_paying_conversion.reservation_date < earliest_reservation.start_date
conversion.guest.is_regular = is_regular
if is_regular:
_LOGGER.info(
"Marking guest as regular (via customer): earliest paying conversion date %s is before first reservation %s",
earliest_paying_conversion.reservation_date,
earliest_reservation.start_date,
)

View File

@@ -159,7 +159,9 @@ class TestConversionServiceWithImportedData:
EXPECTED_TOTAL_ROOMS = 539 EXPECTED_TOTAL_ROOMS = 539
# Note: Currently no matches by tracking ID because XML data uses different formats # Note: Currently no matches by tracking ID because XML data uses different formats
# This is expected with the test data. Real PMS data would have higher match rates. # This is expected with the test data. Real PMS data would have higher match rates.
EXPECTED_MATCHED_TO_RESERVATION = 19 EXPECTED_MATCHED_TO_RESERVATION = 0
EXPECTED_MATCHED_TO_CUSTOMER = 19
print(f"\nBaseline Match Counts:") print(f"\nBaseline Match Counts:")
print(f" Total reservations in XML: {EXPECTED_TOTAL_RESERVATIONS}") print(f" Total reservations in XML: {EXPECTED_TOTAL_RESERVATIONS}")
@@ -168,6 +170,8 @@ class TestConversionServiceWithImportedData:
print(f" Matched to reservation: {EXPECTED_MATCHED_TO_RESERVATION}") print(f" Matched to reservation: {EXPECTED_MATCHED_TO_RESERVATION}")
match_rate = (EXPECTED_MATCHED_TO_RESERVATION / EXPECTED_TOTAL_RESERVATIONS * 100) if EXPECTED_TOTAL_RESERVATIONS > 0 else 0 match_rate = (EXPECTED_MATCHED_TO_RESERVATION / EXPECTED_TOTAL_RESERVATIONS * 100) if EXPECTED_TOTAL_RESERVATIONS > 0 else 0
print(f" Match rate: {match_rate:.1f}%") print(f" Match rate: {match_rate:.1f}%")
print(f" Matched to customer: {EXPECTED_MATCHED_TO_CUSTOMER}")
print(f" Match rate (to customer): {(EXPECTED_MATCHED_TO_CUSTOMER / EXPECTED_TOTAL_RESERVATIONS * 100) if EXPECTED_TOTAL_RESERVATIONS > 0 else 0:.1f}%")
# Verify baseline stability on subsequent runs # Verify baseline stability on subsequent runs
assert ( assert (
@@ -180,6 +184,10 @@ class TestConversionServiceWithImportedData:
stats["matched_to_reservation"] == EXPECTED_MATCHED_TO_RESERVATION stats["matched_to_reservation"] == EXPECTED_MATCHED_TO_RESERVATION
), f"Matched reservations should be {EXPECTED_MATCHED_TO_RESERVATION}, got {stats['matched_to_reservation']}" ), f"Matched reservations should be {EXPECTED_MATCHED_TO_RESERVATION}, got {stats['matched_to_reservation']}"
assert (
stats["matched_to_customer"] == EXPECTED_MATCHED_TO_CUSTOMER
), f"Matched customers should be {EXPECTED_MATCHED_TO_CUSTOMER}, got {stats['matched_to_customer']}"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_conversion_room_revenue_aggregation( async def test_conversion_room_revenue_aggregation(
self, test_db_session, test_config, test_data_dir self, test_db_session, test_config, test_data_dir
@@ -334,53 +342,6 @@ class TestHashedMatchingLogic:
"""Test the hashed matching logic used in ConversionService.""" """Test the hashed matching logic used in ConversionService."""
@pytest.mark.asyncio
async def test_no_match_without_hashed_customer(self, test_db_session):
"""Test that matching fails gracefully when customer has no hashed version."""
# Create a customer WITHOUT hashed data
customer = Customer(
given_name="Bob",
surname="Jones",
email_address="bob@example.com",
contact_id="test_contact_3",
)
test_db_session.add(customer)
await test_db_session.commit()
# Create a reservation
reservation = Reservation(
customer_id=customer.id,
unique_id="res_3",
hotel_code="test_hotel",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Test the matching logic
service = ConversionService(test_db_session)
# Eagerly load reservations
from sqlalchemy.orm import selectinload
result = await test_db_session.execute(
select(Reservation)
.where(Reservation.id == reservation.id)
.options(selectinload(Reservation.customer).selectinload(Customer.hashed_version))
)
reservations = result.scalars().all()
hashed_email = hashlib.sha256(
"bob@example.com".lower().strip().encode("utf-8")
).hexdigest()
matched = service._match_reservations_by_guest_details(
reservations,
guest_first_name=None,
guest_last_name=None,
guest_email=hashed_email,
)
# Should not match because customer has no hashed version
assert matched is None, "Should not match without hashed customer"
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -505,66 +466,7 @@ class TestHashedMatchingLogic:
assert stats["total_reservations"] == 1 assert stats["total_reservations"] == 1
assert stats["total_daily_sales"] == 1 assert stats["total_daily_sales"] == 1
@pytest.mark.asyncio
async def test_hashed_customer_missing_fields_handled_gracefully(
self, test_db_session
):
"""Test that matching handles customers with missing hashed fields gracefully."""
# Create a customer
customer = Customer(
given_name="Eve",
surname="Taylor",
email_address="eve@example.com",
contact_id="test_contact_7",
)
test_db_session.add(customer)
await test_db_session.flush()
# Create hashed customer but simulate missing fields by manually setting to None
hashed_customer = HashedCustomer(
customer_id=customer.id,
contact_id="test_contact_7_hashed",
hashed_email=None, # Simulate missing hashed email
hashed_given_name=None, # Simulate missing hashed name
hashed_surname=None,
)
test_db_session.add(hashed_customer)
await test_db_session.flush()
# Create reservation
reservation = Reservation(
customer_id=customer.id,
unique_id="res_7",
hotel_code="test_hotel",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Test matching - should not crash even with missing hashed fields
service = ConversionService(test_db_session)
# Eagerly load reservations
from sqlalchemy.orm import selectinload
result = await test_db_session.execute(
select(Reservation)
.where(Reservation.id == reservation.id)
.options(selectinload(Reservation.customer).selectinload(Customer.hashed_version))
)
reservations = result.scalars().all()
hashed_email = hashlib.sha256(
"eve@example.com".lower().strip().encode("utf-8")
).hexdigest()
matched = service._match_reservations_by_guest_details(
reservations,
guest_first_name=None,
guest_last_name=None,
guest_email=hashed_email,
)
# Should not match because hashed customer fields are None
assert matched is None, "Should not match with missing hashed fields"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_conversion_guest_composite_key_prevents_duplicates( async def test_conversion_guest_composite_key_prevents_duplicates(