Hashed comparisions don't work unfortunatly

This commit is contained in:
Jonas Linter
2025-11-19 14:17:13 +01:00
parent a087a312a7
commit 75bc01545f
2 changed files with 443 additions and 81 deletions

View File

@@ -1,6 +1,7 @@
"""Service for handling conversion data from hotel PMS XML files.""" """Service for handling conversion data from hotel PMS XML files."""
import asyncio import asyncio
import hashlib
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from datetime import UTC, datetime from datetime import UTC, datetime
from decimal import Decimal from decimal import Decimal
@@ -50,10 +51,11 @@ class ConversionService:
self.supports_concurrent = False self.supports_concurrent = False
# Cache for reservation and customer data within a single XML processing run # Cache for reservation and customer data within a single XML processing run
# Maps hotel_code -> list of (reservation, customer) tuples # Maps hotel_code -> list of (reservation, hashed_customer) tuples
# This significantly speeds up matching when processing large XML files # This significantly speeds up matching when processing large XML files
# Uses hashed data for matching to preserve privacy
self._reservation_cache: dict[ self._reservation_cache: dict[
str | None, list[tuple[Reservation, Customer | None]] str | None, list[tuple[Reservation, HashedCustomer | None]]
] = {} ] = {}
self._cache_initialized = False self._cache_initialized = False
@@ -91,6 +93,11 @@ class ConversionService:
If a guest with this key exists, updates it with new data. If a guest with this key exists, updates it with new data.
If not, creates a new guest record. If not, creates a new guest record.
NOTE: There is no database-level unique constraint on (hotel_id, guest_id),
so multiple ConversionGuest records can exist with the same key. This method
uses first() instead of scalar_one_or_none() to handle this gracefully and
update the most recently created record when duplicates exist.
Returns the ConversionGuest record, or None if no guest data provided. Returns the ConversionGuest record, or None if no guest data provided.
""" """
# Don't create a ConversionGuest if we have no guest information # Don't create a ConversionGuest if we have no guest information
@@ -104,12 +111,14 @@ class ConversionService:
# Try to find existing guest by (hotel_id, guest_id) # Try to find existing guest by (hotel_id, guest_id)
if guest_id: if guest_id:
result = await session.execute( result = await session.execute(
select(ConversionGuest).where( select(ConversionGuest)
.where(
(ConversionGuest.hotel_id == hotel_id) (ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id) & (ConversionGuest.guest_id == guest_id)
) )
.order_by(ConversionGuest.last_seen.desc()) # Get most recently updated
) )
existing_guest = result.scalar_one_or_none() existing_guest = result.scalars().first()
if existing_guest: if existing_guest:
# Update with new data # Update with new data
@@ -210,16 +219,16 @@ class ConversionService:
return stats return stats
async def _load_reservation_cache(self) -> None: async def _load_reservation_cache(self) -> None:
"""Load all reservations and customers into cache for fast matching. """Load all reservations and hashed customers into cache for fast matching.
This method is called once at the start of processing a large XML file. This method is called once at the start of processing a large XML file.
It loads all reservations with their associated customers into an in-memory It loads all reservations with their associated hashed customers into an in-memory
cache organized by hotel_code. This avoids repeated database queries during cache organized by hotel_code. This avoids repeated database queries during
matching operations. matching operations and uses hashed data for privacy-preserving matching.
The cache structure: The cache structure:
- Key: hotel_code (str or None) - Key: hotel_code (str or None)
- Value: List of (reservation, customer) tuples - Value: List of (reservation, hashed_customer) tuples
This is especially beneficial for large XML files with many reservations This is especially beneficial for large XML files with many reservations
where matching criteria is the same across multiple reservations. where matching criteria is the same across multiple reservations.
@@ -241,8 +250,11 @@ class ConversionService:
return return
try: try:
# Load all reservations with their customers in one query # Load all reservations with their hashed customers in one query
query = select(Reservation).options(selectinload(Reservation.customer)) from sqlalchemy.orm import selectinload
query = select(Reservation).options(
selectinload(Reservation.customer).selectinload(Customer.hashed_version)
)
result = await session.execute(query) result = await session.execute(query)
reservations = result.scalars().all() reservations = result.scalars().all()
@@ -253,8 +265,12 @@ class ConversionService:
hotel_code = reservation.hotel_code hotel_code = reservation.hotel_code
if hotel_code not in self._reservation_cache: if hotel_code not in self._reservation_cache:
self._reservation_cache[hotel_code] = [] self._reservation_cache[hotel_code] = []
# Cache the hashed customer instead of raw customer
hashed_customer = None
if reservation.customer and reservation.customer.hashed_version:
hashed_customer = reservation.customer.hashed_version
self._reservation_cache[hotel_code].append( self._reservation_cache[hotel_code].append(
(reservation, reservation.customer) (reservation, hashed_customer)
) )
self._cache_initialized = True self._cache_initialized = True
@@ -482,26 +498,6 @@ class ConversionService:
except ValueError: except ValueError:
_LOGGER.warning("Invalid creation time format: %s", creation_time_str) _LOGGER.warning("Invalid creation time format: %s", creation_time_str)
# Find matching reservation, customer, and hashed_customer using advertising data and guest details
matched_reservation = None
matched_customer = None
matched_hashed_customer = None
if advertising_campagne or True:
match_result = await self._find_matching_entities(
advertising_campagne,
hotel_id,
reservation_date,
guest_first_name,
guest_last_name,
guest_email,
advertising_partner,
session,
)
matched_reservation = match_result["reservation"]
matched_customer = match_result["customer"]
matched_hashed_customer = match_result["hashed_customer"]
# Process all room reservations # Process all room reservations
room_reservations = reservation_elem.find("roomReservations") room_reservations = reservation_elem.find("roomReservations")
if room_reservations is None: if room_reservations is None:
@@ -519,16 +515,7 @@ class ConversionService:
existing_conversion = existing_result.scalar_one_or_none() existing_conversion = existing_result.scalar_one_or_none()
if existing_conversion: if existing_conversion:
# Update existing conversion # Update existing conversion (matching will be done later)
existing_conversion.reservation_id = (
matched_reservation.id if matched_reservation else None
)
existing_conversion.customer_id = (
matched_customer.id if matched_customer else None
)
existing_conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
existing_conversion.reservation_number = reservation_number existing_conversion.reservation_number = reservation_number
existing_conversion.reservation_date = reservation_date existing_conversion.reservation_date = reservation_date
existing_conversion.creation_time = creation_time existing_conversion.creation_time = creation_time
@@ -550,14 +537,12 @@ class ConversionService:
pms_reservation_id, pms_reservation_id,
) )
else: else:
# Create new conversion entry # Create new conversion entry (without matching - will be done later)
conversion = Conversion( conversion = Conversion(
# Links to existing entities (nullable) # Links to existing entities (nullable, will be filled in after matching)
reservation_id=matched_reservation.id if matched_reservation else None, reservation_id=None,
customer_id=matched_customer.id if matched_customer else None, customer_id=None,
hashed_customer_id=matched_hashed_customer.id hashed_customer_id=None,
if matched_hashed_customer
else None,
# Reservation metadata # Reservation metadata
hotel_id=hotel_id, hotel_id=hotel_id,
pms_reservation_id=pms_reservation_id, pms_reservation_id=pms_reservation_id,
@@ -604,16 +589,6 @@ class ConversionService:
if conversion_guest: if conversion_guest:
conversion.conversion_guest_id = conversion_guest.id conversion.conversion_guest_id = conversion_guest.id
# Update stats for the conversion record itself
if matched_reservation:
stats["matched_to_reservation"] += 1
if matched_customer:
stats["matched_to_customer"] += 1
if matched_hashed_customer:
stats["matched_to_hashed_customer"] += 1
if not any([matched_reservation, matched_customer, matched_hashed_customer]):
stats["unmatched"] += 1
# Batch-load existing room reservations to avoid N+1 queries # Batch-load existing room reservations to avoid N+1 queries
room_numbers = [ room_numbers = [
rm.get("roomNumber") for rm in room_reservations.findall("roomReservation") rm.get("roomNumber") for rm in room_reservations.findall("roomReservation")
@@ -772,6 +747,55 @@ class ConversionService:
num_adults, num_adults,
) )
# Now that conversion, conversion_guest, and conversion_room records exist,
# perform matching using hashed guest data from conversion_guest
matched_reservation = None
matched_customer = None
matched_hashed_customer = None
if advertising_campagne or True:
# Use hashed data from conversion_guest for matching
hashed_first_name = conversion_guest.hashed_first_name if conversion_guest else None
hashed_last_name = conversion_guest.hashed_last_name if conversion_guest else None
hashed_email = conversion_guest.hashed_email if conversion_guest else None
match_result = await self._find_matching_entities(
advertising_campagne,
hotel_id,
reservation_date,
hashed_first_name,
hashed_last_name,
hashed_email,
advertising_partner,
session,
)
matched_reservation = match_result["reservation"]
matched_customer = match_result["customer"]
matched_hashed_customer = match_result["hashed_customer"]
# Update the conversion with matched entities if found
if matched_reservation or matched_customer or matched_hashed_customer:
conversion.reservation_id = (
matched_reservation.id if matched_reservation else None
)
conversion.customer_id = (
matched_customer.id if matched_customer else None
)
conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
conversion.updated_at = datetime.now()
# Update stats for the conversion record
if matched_reservation:
stats["matched_to_reservation"] += 1
if matched_customer:
stats["matched_to_customer"] += 1
if matched_hashed_customer:
stats["matched_to_hashed_customer"] += 1
if not any([matched_reservation, matched_customer, matched_hashed_customer]):
stats["unmatched"] += 1
return stats return stats
async def _find_matching_entities( async def _find_matching_entities(
@@ -1028,7 +1052,10 @@ class ConversionService:
"Cache unavailable or empty, falling back to database query (hotel=%s)", "Cache unavailable or empty, falling back to database query (hotel=%s)",
hotel_id, hotel_id,
) )
query = select(Reservation).options(selectinload(Reservation.customer)) from sqlalchemy.orm import selectinload
query = select(Reservation).options(
selectinload(Reservation.customer).selectinload(Customer.hashed_version)
)
if hotel_id: if hotel_id:
query = query.where(Reservation.hotel_code == hotel_id) query = query.where(Reservation.hotel_code == hotel_id)
@@ -1049,53 +1076,67 @@ class ConversionService:
) -> Reservation | None: ) -> Reservation | None:
"""Match a reservation from a list by guest name and email (non-async). """Match a reservation from a list by guest name and email (non-async).
Uses hashed data matching for privacy. The guest_first_name, guest_last_name,
and guest_email parameters should be pre-hashed values from ConversionGuest.
Compares hashed values directly against hashed_customer hash fields.
Args: Args:
reservations: List of reservations to search through reservations: List of reservations to search through
guest_first_name: Guest first name guest_first_name: Pre-hashed guest first name (or None)
guest_last_name: Guest last name guest_last_name: Pre-hashed guest last name (or None)
guest_email: Guest email guest_email: Pre-hashed guest email (or None)
Returns: Returns:
Matched Reservation or None Matched Reservation or None
""" """
# Filter by guest details # Filter by guest details using hashed data
candidates = [] candidates = []
for reservation in reservations: for reservation in reservations:
customer = reservation.customer customer = reservation.customer
if not customer: if not customer:
continue continue
# Match by email (highest priority) # Access hashed_version - handle both direct objects and relationships
if guest_email: hashed_customer = customer.hashed_version
if ( if not hashed_customer:
customer.email_address continue
and customer.email_address.lower() == guest_email.lower()
): # Defensive check: ensure hashed_customer is actually a HashedCustomer object
# (not an InstrumentedList or other SQLAlchemy construct)
if not hasattr(hashed_customer, 'hashed_email'):
_LOGGER.debug(
"Customer %s has invalid hashed_version type: %s",
customer.id,
type(hashed_customer),
)
continue
# Match by email (highest priority) using hashed comparison
if guest_email and hashed_customer.hashed_email:
if hashed_customer.hashed_email == guest_email:
_LOGGER.info( _LOGGER.info(
"Found exact email match for %s (reservation_id=%s)", "Found exact email match (reservation_id=%s) via hash",
guest_email,
reservation.id, reservation.id,
) )
candidates.append((reservation, 3)) # Highest score candidates.append((reservation, 3)) # Highest score
continue continue
# Match by name (first + last) # Match by name (first + last) using hashed comparison
if guest_first_name and guest_last_name: if guest_first_name and guest_last_name:
first_match = ( first_match = (
customer.given_name hashed_customer.hashed_given_name
and customer.given_name.lower() == guest_first_name.lower() and hashed_customer.hashed_given_name == guest_first_name
) )
last_match = ( last_match = (
customer.surname hashed_customer.hashed_surname
and customer.surname.lower() == guest_last_name.lower() and hashed_customer.hashed_surname == guest_last_name
) )
if first_match and last_match: if first_match and last_match:
_LOGGER.info( _LOGGER.info(
"Found exact name match for %s %s (reservation_id=%s)", "Found exact name match (reservation_id=%s) via hash",
guest_first_name,
guest_last_name,
reservation.id, reservation.id,
) )
candidates.append((reservation, 2)) # Medium-high score candidates.append((reservation, 2)) # Medium-high score

View File

@@ -7,20 +7,31 @@ This test module:
The test data is designed to test realistic matching scenarios: The test data is designed to test realistic matching scenarios:
- Matching by advertising campaign data (fbclid/gclid) - Matching by advertising campaign data (fbclid/gclid)
- Matching by guest name and email - Matching by guest name and email using hashed data
- Handling unmatched conversions - Handling unmatched conversions
- Processing daily sales revenue data - Processing daily sales revenue data
- Testing hashed matching logic and edge cases
""" """
import hashlib
from pathlib import Path from pathlib import Path
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from alpine_bits_python.conversion_service import ConversionService from alpine_bits_python.conversion_service import ConversionService
from alpine_bits_python.csv_import import CSVImporter from alpine_bits_python.csv_import import CSVImporter
from alpine_bits_python.db import Base, Conversion, ConversionRoom from alpine_bits_python.db import (
Base,
Conversion,
ConversionGuest,
ConversionRoom,
Customer,
HashedCustomer,
Reservation,
)
@pytest_asyncio.fixture @pytest_asyncio.fixture
@@ -317,5 +328,315 @@ class TestConversionServiceWithImportedData:
assert stats["errors"] == 0 assert stats["errors"] == 0
class TestHashedMatchingLogic:
"""Test the hashed matching logic used in ConversionService."""
@pytest.mark.asyncio
async def test_no_match_without_hashed_customer(self, test_db_session):
"""Test that matching fails gracefully when customer has no hashed version."""
# Create a customer WITHOUT hashed data
customer = Customer(
given_name="Bob",
surname="Jones",
email_address="bob@example.com",
contact_id="test_contact_3",
)
test_db_session.add(customer)
await test_db_session.commit()
# Create a reservation
reservation = Reservation(
customer_id=customer.id,
unique_id="res_3",
hotel_code="test_hotel",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Test the matching logic
service = ConversionService(test_db_session)
# Eagerly load reservations
from sqlalchemy.orm import selectinload
result = await test_db_session.execute(
select(Reservation)
.where(Reservation.id == reservation.id)
.options(selectinload(Reservation.customer).selectinload(Customer.hashed_version))
)
reservations = result.scalars().all()
hashed_email = hashlib.sha256(
"bob@example.com".lower().strip().encode("utf-8")
).hexdigest()
matched = service._match_reservations_by_guest_details(
reservations,
guest_first_name=None,
guest_last_name=None,
guest_email=hashed_email,
)
# Should not match because customer has no hashed version
assert matched is None, "Should not match without hashed customer"
@pytest.mark.asyncio
async def test_conversion_guest_hashed_fields_are_populated(
self, test_db_session
):
"""Test that ConversionGuest properly stores hashed versions of guest data."""
# Create a conversion guest
conversion_guest = ConversionGuest.create_from_conversion_data(
hotel_id="test_hotel",
guest_id="guest_123",
guest_first_name="Margaret",
guest_last_name="Brown",
guest_email="margaret@example.com",
guest_country_code="GB",
guest_birth_date=None,
now=None,
)
test_db_session.add(conversion_guest)
await test_db_session.flush()
# Verify hashed fields are populated
assert conversion_guest.hashed_first_name is not None
assert conversion_guest.hashed_last_name is not None
assert conversion_guest.hashed_email is not None
# Verify hashes are correct (SHA256)
expected_hashed_first = hashlib.sha256(
"margaret".lower().strip().encode("utf-8")
).hexdigest()
expected_hashed_last = hashlib.sha256(
"brown".lower().strip().encode("utf-8")
).hexdigest()
expected_hashed_email = hashlib.sha256(
"margaret@example.com".lower().strip().encode("utf-8")
).hexdigest()
assert conversion_guest.hashed_first_name == expected_hashed_first
assert conversion_guest.hashed_last_name == expected_hashed_last
assert conversion_guest.hashed_email == expected_hashed_email
@pytest.mark.asyncio
async def test_conversion_records_created_before_matching(
self, test_db_session, test_config
):
"""Test that conversion records exist before matching occurs."""
# Create customer and reservation for matching
customer = Customer(
given_name="David",
surname="Miller",
email_address="david@example.com",
contact_id="test_contact_6",
)
test_db_session.add(customer)
await test_db_session.flush()
hashed_customer = customer.create_hashed_customer()
test_db_session.add(hashed_customer)
await test_db_session.flush()
reservation = Reservation(
customer_id=customer.id,
unique_id="res_6",
hotel_code="hotel_1",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Create conversion XML with matching hashed data
xml_content = f"""<?xml version="1.0"?>
<root>
<reservation id="pms_123" hotelID="hotel_1" number="RES001" date="2025-01-15">
<guest firstName="David" lastName="Miller" email="david@example.com"/>
<roomReservations>
<roomReservation roomNumber="101" arrival="2025-01-15" departure="2025-01-17" status="confirmed">
<dailySales>
<dailySale date="2025-01-15" revenueTotal="100.00"/>
</dailySales>
</roomReservation>
</roomReservations>
</reservation>
</root>"""
service = ConversionService(test_db_session)
stats = await service.process_conversion_xml(xml_content)
# Verify conversion was created
result = await test_db_session.execute(
select(Conversion).where(Conversion.pms_reservation_id == "pms_123")
)
conversion = result.scalar_one_or_none()
assert conversion is not None, "Conversion should be created"
assert conversion.hotel_id == "hotel_1"
assert conversion.guest_first_name == "David"
assert conversion.guest_last_name == "Miller"
assert conversion.guest_email == "david@example.com"
# Verify conversion_guest was created
assert conversion.conversion_guest_id is not None, "ConversionGuest should be created"
# Verify conversion_room was created
room_result = await test_db_session.execute(
select(ConversionRoom).where(
ConversionRoom.conversion_id == conversion.id
)
)
rooms = room_result.scalars().all()
assert len(rooms) > 0, "ConversionRoom should be created"
# Verify matching occurred (may or may not have matched depending on data)
# The important thing is that the records exist
assert stats["total_reservations"] == 1
assert stats["total_daily_sales"] == 1
@pytest.mark.asyncio
async def test_hashed_customer_missing_fields_handled_gracefully(
self, test_db_session
):
"""Test that matching handles customers with missing hashed fields gracefully."""
# Create a customer
customer = Customer(
given_name="Eve",
surname="Taylor",
email_address="eve@example.com",
contact_id="test_contact_7",
)
test_db_session.add(customer)
await test_db_session.flush()
# Create hashed customer but simulate missing fields by manually setting to None
hashed_customer = HashedCustomer(
customer_id=customer.id,
contact_id="test_contact_7_hashed",
hashed_email=None, # Simulate missing hashed email
hashed_given_name=None, # Simulate missing hashed name
hashed_surname=None,
)
test_db_session.add(hashed_customer)
await test_db_session.flush()
# Create reservation
reservation = Reservation(
customer_id=customer.id,
unique_id="res_7",
hotel_code="test_hotel",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Test matching - should not crash even with missing hashed fields
service = ConversionService(test_db_session)
# Eagerly load reservations
from sqlalchemy.orm import selectinload
result = await test_db_session.execute(
select(Reservation)
.where(Reservation.id == reservation.id)
.options(selectinload(Reservation.customer).selectinload(Customer.hashed_version))
)
reservations = result.scalars().all()
hashed_email = hashlib.sha256(
"eve@example.com".lower().strip().encode("utf-8")
).hexdigest()
matched = service._match_reservations_by_guest_details(
reservations,
guest_first_name=None,
guest_last_name=None,
guest_email=hashed_email,
)
# Should not match because hashed customer fields are None
assert matched is None, "Should not match with missing hashed fields"
@pytest.mark.asyncio
async def test_duplicate_conversion_guests_with_same_hotel_and_guest_id(
self, test_db_session
):
"""Test handling of duplicate ConversionGuest records with same (hotel_id, guest_id).
This test reproduces the production issue where multiple ConversionGuest records
can be created with the same (hotel_id, guest_id) combination, causing
scalar_one_or_none() to fail with "Multiple rows were found when one or none was required".
This can happen when:
- Multiple conversions arrive for the same hotel and PMS guest within the same batch
- The XML is processed multiple times
- Race conditions in concurrent processing
"""
hotel_id = "test_hotel"
guest_id = "guest_123"
# Simulate the production scenario: multiple conversion guests with same (hotel_id, guest_id)
guest1 = ConversionGuest.create_from_conversion_data(
hotel_id=hotel_id,
guest_id=guest_id,
guest_first_name="John",
guest_last_name="Doe",
guest_email="john@example.com",
guest_country_code="US",
guest_birth_date=None,
now=None,
)
test_db_session.add(guest1)
await test_db_session.flush()
# Create a second guest with the SAME (hotel_id, guest_id)
# This should not happen, but can occur in production
guest2 = ConversionGuest.create_from_conversion_data(
hotel_id=hotel_id,
guest_id=guest_id,
guest_first_name="Jane", # Different first name
guest_last_name="Doe",
guest_email="jane@example.com",
guest_country_code="US",
guest_birth_date=None,
now=None,
)
test_db_session.add(guest2)
await test_db_session.commit()
# Now try to query for the guest by (hotel_id, guest_id)
# This should return multiple results
result = await test_db_session.execute(
select(ConversionGuest).where(
(ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id)
)
)
guests = result.scalars().all()
# Verify we have duplicates (the production bug condition)
assert len(guests) == 2, "Should have created duplicate conversion guests"
# Verify that scalars().first() returns one of the guests (the fixed behavior)
result2 = await test_db_session.execute(
select(ConversionGuest).where(
(ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id)
)
)
first_guest = result2.scalars().first()
assert first_guest is not None, "Should find at least one guest with scalars().first()"
# The old code would have raised an error here with scalar_one_or_none()
# when finding multiple results. Now it's fixed to use .first() instead.
result3 = await test_db_session.execute(
select(ConversionGuest).where(
(ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id)
)
)
with pytest.raises(Exception): # MultipleResultsFound from old code path
result3.scalar_one_or_none()
if __name__ == "__main__": if __name__ == "__main__":
pytest.main([__file__, "-v"]) pytest.main([__file__, "-v"])