Hashed comparisions don't work unfortunatly

This commit is contained in:
Jonas Linter
2025-11-19 14:17:13 +01:00
parent a087a312a7
commit 75bc01545f
2 changed files with 443 additions and 81 deletions

View File

@@ -1,6 +1,7 @@
"""Service for handling conversion data from hotel PMS XML files."""
import asyncio
import hashlib
import xml.etree.ElementTree as ET
from datetime import UTC, datetime
from decimal import Decimal
@@ -50,10 +51,11 @@ class ConversionService:
self.supports_concurrent = False
# Cache for reservation and customer data within a single XML processing run
# Maps hotel_code -> list of (reservation, customer) tuples
# Maps hotel_code -> list of (reservation, hashed_customer) tuples
# This significantly speeds up matching when processing large XML files
# Uses hashed data for matching to preserve privacy
self._reservation_cache: dict[
str | None, list[tuple[Reservation, Customer | None]]
str | None, list[tuple[Reservation, HashedCustomer | None]]
] = {}
self._cache_initialized = False
@@ -91,6 +93,11 @@ class ConversionService:
If a guest with this key exists, updates it with new data.
If not, creates a new guest record.
NOTE: There is no database-level unique constraint on (hotel_id, guest_id),
so multiple ConversionGuest records can exist with the same key. This method
uses first() instead of scalar_one_or_none() to handle this gracefully and
update the most recently created record when duplicates exist.
Returns the ConversionGuest record, or None if no guest data provided.
"""
# Don't create a ConversionGuest if we have no guest information
@@ -104,12 +111,14 @@ class ConversionService:
# Try to find existing guest by (hotel_id, guest_id)
if guest_id:
result = await session.execute(
select(ConversionGuest).where(
select(ConversionGuest)
.where(
(ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id)
)
.order_by(ConversionGuest.last_seen.desc()) # Get most recently updated
)
existing_guest = result.scalar_one_or_none()
existing_guest = result.scalars().first()
if existing_guest:
# Update with new data
@@ -210,16 +219,16 @@ class ConversionService:
return stats
async def _load_reservation_cache(self) -> None:
"""Load all reservations and customers into cache for fast matching.
"""Load all reservations and hashed customers into cache for fast matching.
This method is called once at the start of processing a large XML file.
It loads all reservations with their associated customers into an in-memory
It loads all reservations with their associated hashed customers into an in-memory
cache organized by hotel_code. This avoids repeated database queries during
matching operations.
matching operations and uses hashed data for privacy-preserving matching.
The cache structure:
- Key: hotel_code (str or None)
- Value: List of (reservation, customer) tuples
- Value: List of (reservation, hashed_customer) tuples
This is especially beneficial for large XML files with many reservations
where matching criteria is the same across multiple reservations.
@@ -241,8 +250,11 @@ class ConversionService:
return
try:
# Load all reservations with their customers in one query
query = select(Reservation).options(selectinload(Reservation.customer))
# Load all reservations with their hashed customers in one query
from sqlalchemy.orm import selectinload
query = select(Reservation).options(
selectinload(Reservation.customer).selectinload(Customer.hashed_version)
)
result = await session.execute(query)
reservations = result.scalars().all()
@@ -253,8 +265,12 @@ class ConversionService:
hotel_code = reservation.hotel_code
if hotel_code not in self._reservation_cache:
self._reservation_cache[hotel_code] = []
# Cache the hashed customer instead of raw customer
hashed_customer = None
if reservation.customer and reservation.customer.hashed_version:
hashed_customer = reservation.customer.hashed_version
self._reservation_cache[hotel_code].append(
(reservation, reservation.customer)
(reservation, hashed_customer)
)
self._cache_initialized = True
@@ -482,26 +498,6 @@ class ConversionService:
except ValueError:
_LOGGER.warning("Invalid creation time format: %s", creation_time_str)
# Find matching reservation, customer, and hashed_customer using advertising data and guest details
matched_reservation = None
matched_customer = None
matched_hashed_customer = None
if advertising_campagne or True:
match_result = await self._find_matching_entities(
advertising_campagne,
hotel_id,
reservation_date,
guest_first_name,
guest_last_name,
guest_email,
advertising_partner,
session,
)
matched_reservation = match_result["reservation"]
matched_customer = match_result["customer"]
matched_hashed_customer = match_result["hashed_customer"]
# Process all room reservations
room_reservations = reservation_elem.find("roomReservations")
if room_reservations is None:
@@ -519,16 +515,7 @@ class ConversionService:
existing_conversion = existing_result.scalar_one_or_none()
if existing_conversion:
# Update existing conversion
existing_conversion.reservation_id = (
matched_reservation.id if matched_reservation else None
)
existing_conversion.customer_id = (
matched_customer.id if matched_customer else None
)
existing_conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
# Update existing conversion (matching will be done later)
existing_conversion.reservation_number = reservation_number
existing_conversion.reservation_date = reservation_date
existing_conversion.creation_time = creation_time
@@ -550,14 +537,12 @@ class ConversionService:
pms_reservation_id,
)
else:
# Create new conversion entry
# Create new conversion entry (without matching - will be done later)
conversion = Conversion(
# Links to existing entities (nullable)
reservation_id=matched_reservation.id if matched_reservation else None,
customer_id=matched_customer.id if matched_customer else None,
hashed_customer_id=matched_hashed_customer.id
if matched_hashed_customer
else None,
# Links to existing entities (nullable, will be filled in after matching)
reservation_id=None,
customer_id=None,
hashed_customer_id=None,
# Reservation metadata
hotel_id=hotel_id,
pms_reservation_id=pms_reservation_id,
@@ -604,16 +589,6 @@ class ConversionService:
if conversion_guest:
conversion.conversion_guest_id = conversion_guest.id
# Update stats for the conversion record itself
if matched_reservation:
stats["matched_to_reservation"] += 1
if matched_customer:
stats["matched_to_customer"] += 1
if matched_hashed_customer:
stats["matched_to_hashed_customer"] += 1
if not any([matched_reservation, matched_customer, matched_hashed_customer]):
stats["unmatched"] += 1
# Batch-load existing room reservations to avoid N+1 queries
room_numbers = [
rm.get("roomNumber") for rm in room_reservations.findall("roomReservation")
@@ -772,6 +747,55 @@ class ConversionService:
num_adults,
)
# Now that conversion, conversion_guest, and conversion_room records exist,
# perform matching using hashed guest data from conversion_guest
matched_reservation = None
matched_customer = None
matched_hashed_customer = None
if advertising_campagne or True:
# Use hashed data from conversion_guest for matching
hashed_first_name = conversion_guest.hashed_first_name if conversion_guest else None
hashed_last_name = conversion_guest.hashed_last_name if conversion_guest else None
hashed_email = conversion_guest.hashed_email if conversion_guest else None
match_result = await self._find_matching_entities(
advertising_campagne,
hotel_id,
reservation_date,
hashed_first_name,
hashed_last_name,
hashed_email,
advertising_partner,
session,
)
matched_reservation = match_result["reservation"]
matched_customer = match_result["customer"]
matched_hashed_customer = match_result["hashed_customer"]
# Update the conversion with matched entities if found
if matched_reservation or matched_customer or matched_hashed_customer:
conversion.reservation_id = (
matched_reservation.id if matched_reservation else None
)
conversion.customer_id = (
matched_customer.id if matched_customer else None
)
conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
conversion.updated_at = datetime.now()
# Update stats for the conversion record
if matched_reservation:
stats["matched_to_reservation"] += 1
if matched_customer:
stats["matched_to_customer"] += 1
if matched_hashed_customer:
stats["matched_to_hashed_customer"] += 1
if not any([matched_reservation, matched_customer, matched_hashed_customer]):
stats["unmatched"] += 1
return stats
async def _find_matching_entities(
@@ -1028,7 +1052,10 @@ class ConversionService:
"Cache unavailable or empty, falling back to database query (hotel=%s)",
hotel_id,
)
query = select(Reservation).options(selectinload(Reservation.customer))
from sqlalchemy.orm import selectinload
query = select(Reservation).options(
selectinload(Reservation.customer).selectinload(Customer.hashed_version)
)
if hotel_id:
query = query.where(Reservation.hotel_code == hotel_id)
@@ -1049,53 +1076,67 @@ class ConversionService:
) -> Reservation | None:
"""Match a reservation from a list by guest name and email (non-async).
Uses hashed data matching for privacy. The guest_first_name, guest_last_name,
and guest_email parameters should be pre-hashed values from ConversionGuest.
Compares hashed values directly against hashed_customer hash fields.
Args:
reservations: List of reservations to search through
guest_first_name: Guest first name
guest_last_name: Guest last name
guest_email: Guest email
guest_first_name: Pre-hashed guest first name (or None)
guest_last_name: Pre-hashed guest last name (or None)
guest_email: Pre-hashed guest email (or None)
Returns:
Matched Reservation or None
"""
# Filter by guest details
# Filter by guest details using hashed data
candidates = []
for reservation in reservations:
customer = reservation.customer
if not customer:
continue
# Match by email (highest priority)
if guest_email:
if (
customer.email_address
and customer.email_address.lower() == guest_email.lower()
):
# Access hashed_version - handle both direct objects and relationships
hashed_customer = customer.hashed_version
if not hashed_customer:
continue
# Defensive check: ensure hashed_customer is actually a HashedCustomer object
# (not an InstrumentedList or other SQLAlchemy construct)
if not hasattr(hashed_customer, 'hashed_email'):
_LOGGER.debug(
"Customer %s has invalid hashed_version type: %s",
customer.id,
type(hashed_customer),
)
continue
# Match by email (highest priority) using hashed comparison
if guest_email and hashed_customer.hashed_email:
if hashed_customer.hashed_email == guest_email:
_LOGGER.info(
"Found exact email match for %s (reservation_id=%s)",
guest_email,
"Found exact email match (reservation_id=%s) via hash",
reservation.id,
)
candidates.append((reservation, 3)) # Highest score
continue
# Match by name (first + last)
# Match by name (first + last) using hashed comparison
if guest_first_name and guest_last_name:
first_match = (
customer.given_name
and customer.given_name.lower() == guest_first_name.lower()
hashed_customer.hashed_given_name
and hashed_customer.hashed_given_name == guest_first_name
)
last_match = (
customer.surname
and customer.surname.lower() == guest_last_name.lower()
hashed_customer.hashed_surname
and hashed_customer.hashed_surname == guest_last_name
)
if first_match and last_match:
_LOGGER.info(
"Found exact name match for %s %s (reservation_id=%s)",
guest_first_name,
guest_last_name,
"Found exact name match (reservation_id=%s) via hash",
reservation.id,
)
candidates.append((reservation, 2)) # Medium-high score

View File

@@ -7,20 +7,31 @@ This test module:
The test data is designed to test realistic matching scenarios:
- Matching by advertising campaign data (fbclid/gclid)
- Matching by guest name and email
- Matching by guest name and email using hashed data
- Handling unmatched conversions
- Processing daily sales revenue data
- Testing hashed matching logic and edge cases
"""
import hashlib
from pathlib import Path
import pytest
import pytest_asyncio
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from alpine_bits_python.conversion_service import ConversionService
from alpine_bits_python.csv_import import CSVImporter
from alpine_bits_python.db import Base, Conversion, ConversionRoom
from alpine_bits_python.db import (
Base,
Conversion,
ConversionGuest,
ConversionRoom,
Customer,
HashedCustomer,
Reservation,
)
@pytest_asyncio.fixture
@@ -317,5 +328,315 @@ class TestConversionServiceWithImportedData:
assert stats["errors"] == 0
class TestHashedMatchingLogic:
"""Test the hashed matching logic used in ConversionService."""
@pytest.mark.asyncio
async def test_no_match_without_hashed_customer(self, test_db_session):
"""Test that matching fails gracefully when customer has no hashed version."""
# Create a customer WITHOUT hashed data
customer = Customer(
given_name="Bob",
surname="Jones",
email_address="bob@example.com",
contact_id="test_contact_3",
)
test_db_session.add(customer)
await test_db_session.commit()
# Create a reservation
reservation = Reservation(
customer_id=customer.id,
unique_id="res_3",
hotel_code="test_hotel",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Test the matching logic
service = ConversionService(test_db_session)
# Eagerly load reservations
from sqlalchemy.orm import selectinload
result = await test_db_session.execute(
select(Reservation)
.where(Reservation.id == reservation.id)
.options(selectinload(Reservation.customer).selectinload(Customer.hashed_version))
)
reservations = result.scalars().all()
hashed_email = hashlib.sha256(
"bob@example.com".lower().strip().encode("utf-8")
).hexdigest()
matched = service._match_reservations_by_guest_details(
reservations,
guest_first_name=None,
guest_last_name=None,
guest_email=hashed_email,
)
# Should not match because customer has no hashed version
assert matched is None, "Should not match without hashed customer"
@pytest.mark.asyncio
async def test_conversion_guest_hashed_fields_are_populated(
self, test_db_session
):
"""Test that ConversionGuest properly stores hashed versions of guest data."""
# Create a conversion guest
conversion_guest = ConversionGuest.create_from_conversion_data(
hotel_id="test_hotel",
guest_id="guest_123",
guest_first_name="Margaret",
guest_last_name="Brown",
guest_email="margaret@example.com",
guest_country_code="GB",
guest_birth_date=None,
now=None,
)
test_db_session.add(conversion_guest)
await test_db_session.flush()
# Verify hashed fields are populated
assert conversion_guest.hashed_first_name is not None
assert conversion_guest.hashed_last_name is not None
assert conversion_guest.hashed_email is not None
# Verify hashes are correct (SHA256)
expected_hashed_first = hashlib.sha256(
"margaret".lower().strip().encode("utf-8")
).hexdigest()
expected_hashed_last = hashlib.sha256(
"brown".lower().strip().encode("utf-8")
).hexdigest()
expected_hashed_email = hashlib.sha256(
"margaret@example.com".lower().strip().encode("utf-8")
).hexdigest()
assert conversion_guest.hashed_first_name == expected_hashed_first
assert conversion_guest.hashed_last_name == expected_hashed_last
assert conversion_guest.hashed_email == expected_hashed_email
@pytest.mark.asyncio
async def test_conversion_records_created_before_matching(
self, test_db_session, test_config
):
"""Test that conversion records exist before matching occurs."""
# Create customer and reservation for matching
customer = Customer(
given_name="David",
surname="Miller",
email_address="david@example.com",
contact_id="test_contact_6",
)
test_db_session.add(customer)
await test_db_session.flush()
hashed_customer = customer.create_hashed_customer()
test_db_session.add(hashed_customer)
await test_db_session.flush()
reservation = Reservation(
customer_id=customer.id,
unique_id="res_6",
hotel_code="hotel_1",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Create conversion XML with matching hashed data
xml_content = f"""<?xml version="1.0"?>
<root>
<reservation id="pms_123" hotelID="hotel_1" number="RES001" date="2025-01-15">
<guest firstName="David" lastName="Miller" email="david@example.com"/>
<roomReservations>
<roomReservation roomNumber="101" arrival="2025-01-15" departure="2025-01-17" status="confirmed">
<dailySales>
<dailySale date="2025-01-15" revenueTotal="100.00"/>
</dailySales>
</roomReservation>
</roomReservations>
</reservation>
</root>"""
service = ConversionService(test_db_session)
stats = await service.process_conversion_xml(xml_content)
# Verify conversion was created
result = await test_db_session.execute(
select(Conversion).where(Conversion.pms_reservation_id == "pms_123")
)
conversion = result.scalar_one_or_none()
assert conversion is not None, "Conversion should be created"
assert conversion.hotel_id == "hotel_1"
assert conversion.guest_first_name == "David"
assert conversion.guest_last_name == "Miller"
assert conversion.guest_email == "david@example.com"
# Verify conversion_guest was created
assert conversion.conversion_guest_id is not None, "ConversionGuest should be created"
# Verify conversion_room was created
room_result = await test_db_session.execute(
select(ConversionRoom).where(
ConversionRoom.conversion_id == conversion.id
)
)
rooms = room_result.scalars().all()
assert len(rooms) > 0, "ConversionRoom should be created"
# Verify matching occurred (may or may not have matched depending on data)
# The important thing is that the records exist
assert stats["total_reservations"] == 1
assert stats["total_daily_sales"] == 1
@pytest.mark.asyncio
async def test_hashed_customer_missing_fields_handled_gracefully(
self, test_db_session
):
"""Test that matching handles customers with missing hashed fields gracefully."""
# Create a customer
customer = Customer(
given_name="Eve",
surname="Taylor",
email_address="eve@example.com",
contact_id="test_contact_7",
)
test_db_session.add(customer)
await test_db_session.flush()
# Create hashed customer but simulate missing fields by manually setting to None
hashed_customer = HashedCustomer(
customer_id=customer.id,
contact_id="test_contact_7_hashed",
hashed_email=None, # Simulate missing hashed email
hashed_given_name=None, # Simulate missing hashed name
hashed_surname=None,
)
test_db_session.add(hashed_customer)
await test_db_session.flush()
# Create reservation
reservation = Reservation(
customer_id=customer.id,
unique_id="res_7",
hotel_code="test_hotel",
)
test_db_session.add(reservation)
await test_db_session.commit()
# Test matching - should not crash even with missing hashed fields
service = ConversionService(test_db_session)
# Eagerly load reservations
from sqlalchemy.orm import selectinload
result = await test_db_session.execute(
select(Reservation)
.where(Reservation.id == reservation.id)
.options(selectinload(Reservation.customer).selectinload(Customer.hashed_version))
)
reservations = result.scalars().all()
hashed_email = hashlib.sha256(
"eve@example.com".lower().strip().encode("utf-8")
).hexdigest()
matched = service._match_reservations_by_guest_details(
reservations,
guest_first_name=None,
guest_last_name=None,
guest_email=hashed_email,
)
# Should not match because hashed customer fields are None
assert matched is None, "Should not match with missing hashed fields"
@pytest.mark.asyncio
async def test_duplicate_conversion_guests_with_same_hotel_and_guest_id(
self, test_db_session
):
"""Test handling of duplicate ConversionGuest records with same (hotel_id, guest_id).
This test reproduces the production issue where multiple ConversionGuest records
can be created with the same (hotel_id, guest_id) combination, causing
scalar_one_or_none() to fail with "Multiple rows were found when one or none was required".
This can happen when:
- Multiple conversions arrive for the same hotel and PMS guest within the same batch
- The XML is processed multiple times
- Race conditions in concurrent processing
"""
hotel_id = "test_hotel"
guest_id = "guest_123"
# Simulate the production scenario: multiple conversion guests with same (hotel_id, guest_id)
guest1 = ConversionGuest.create_from_conversion_data(
hotel_id=hotel_id,
guest_id=guest_id,
guest_first_name="John",
guest_last_name="Doe",
guest_email="john@example.com",
guest_country_code="US",
guest_birth_date=None,
now=None,
)
test_db_session.add(guest1)
await test_db_session.flush()
# Create a second guest with the SAME (hotel_id, guest_id)
# This should not happen, but can occur in production
guest2 = ConversionGuest.create_from_conversion_data(
hotel_id=hotel_id,
guest_id=guest_id,
guest_first_name="Jane", # Different first name
guest_last_name="Doe",
guest_email="jane@example.com",
guest_country_code="US",
guest_birth_date=None,
now=None,
)
test_db_session.add(guest2)
await test_db_session.commit()
# Now try to query for the guest by (hotel_id, guest_id)
# This should return multiple results
result = await test_db_session.execute(
select(ConversionGuest).where(
(ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id)
)
)
guests = result.scalars().all()
# Verify we have duplicates (the production bug condition)
assert len(guests) == 2, "Should have created duplicate conversion guests"
# Verify that scalars().first() returns one of the guests (the fixed behavior)
result2 = await test_db_session.execute(
select(ConversionGuest).where(
(ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id)
)
)
first_guest = result2.scalars().first()
assert first_guest is not None, "Should find at least one guest with scalars().first()"
# The old code would have raised an error here with scalar_one_or_none()
# when finding multiple results. Now it's fixed to use .first() instead.
result3 = await test_db_session.execute(
select(ConversionGuest).where(
(ConversionGuest.hotel_id == hotel_id)
& (ConversionGuest.guest_id == guest_id)
)
)
with pytest.raises(Exception): # MultipleResultsFound from old code path
result3.scalar_one_or_none()
if __name__ == "__main__":
pytest.main([__file__, "-v"])