From c74163b11eba0e4a83b26c5deb459a0b5d67c588 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 19 Nov 2025 14:17:13 +0100 Subject: [PATCH] Hashed comparisions don't work unfortunatly --- src/alpine_bits_python/conversion_service.py | 199 +++++++----- tests/test_conversion_service.py | 325 ++++++++++++++++++- 2 files changed, 443 insertions(+), 81 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 0b318a6..c6ccfea 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -1,6 +1,7 @@ """Service for handling conversion data from hotel PMS XML files.""" import asyncio +import hashlib import xml.etree.ElementTree as ET from datetime import UTC, datetime from decimal import Decimal @@ -50,10 +51,11 @@ class ConversionService: self.supports_concurrent = False # Cache for reservation and customer data within a single XML processing run - # Maps hotel_code -> list of (reservation, customer) tuples + # Maps hotel_code -> list of (reservation, hashed_customer) tuples # This significantly speeds up matching when processing large XML files + # Uses hashed data for matching to preserve privacy self._reservation_cache: dict[ - str | None, list[tuple[Reservation, Customer | None]] + str | None, list[tuple[Reservation, HashedCustomer | None]] ] = {} self._cache_initialized = False @@ -91,6 +93,11 @@ class ConversionService: If a guest with this key exists, updates it with new data. If not, creates a new guest record. + NOTE: There is no database-level unique constraint on (hotel_id, guest_id), + so multiple ConversionGuest records can exist with the same key. This method + uses first() instead of scalar_one_or_none() to handle this gracefully and + update the most recently created record when duplicates exist. + Returns the ConversionGuest record, or None if no guest data provided. """ # Don't create a ConversionGuest if we have no guest information @@ -104,12 +111,14 @@ class ConversionService: # Try to find existing guest by (hotel_id, guest_id) if guest_id: result = await session.execute( - select(ConversionGuest).where( + select(ConversionGuest) + .where( (ConversionGuest.hotel_id == hotel_id) & (ConversionGuest.guest_id == guest_id) ) + .order_by(ConversionGuest.last_seen.desc()) # Get most recently updated ) - existing_guest = result.scalar_one_or_none() + existing_guest = result.scalars().first() if existing_guest: # Update with new data @@ -210,16 +219,16 @@ class ConversionService: return stats async def _load_reservation_cache(self) -> None: - """Load all reservations and customers into cache for fast matching. + """Load all reservations and hashed customers into cache for fast matching. This method is called once at the start of processing a large XML file. - It loads all reservations with their associated customers into an in-memory + It loads all reservations with their associated hashed customers into an in-memory cache organized by hotel_code. This avoids repeated database queries during - matching operations. + matching operations and uses hashed data for privacy-preserving matching. The cache structure: - Key: hotel_code (str or None) - - Value: List of (reservation, customer) tuples + - Value: List of (reservation, hashed_customer) tuples This is especially beneficial for large XML files with many reservations where matching criteria is the same across multiple reservations. @@ -241,8 +250,11 @@ class ConversionService: return try: - # Load all reservations with their customers in one query - query = select(Reservation).options(selectinload(Reservation.customer)) + # Load all reservations with their hashed customers in one query + from sqlalchemy.orm import selectinload + query = select(Reservation).options( + selectinload(Reservation.customer).selectinload(Customer.hashed_version) + ) result = await session.execute(query) reservations = result.scalars().all() @@ -253,8 +265,12 @@ class ConversionService: hotel_code = reservation.hotel_code if hotel_code not in self._reservation_cache: self._reservation_cache[hotel_code] = [] + # Cache the hashed customer instead of raw customer + hashed_customer = None + if reservation.customer and reservation.customer.hashed_version: + hashed_customer = reservation.customer.hashed_version self._reservation_cache[hotel_code].append( - (reservation, reservation.customer) + (reservation, hashed_customer) ) self._cache_initialized = True @@ -482,26 +498,6 @@ class ConversionService: except ValueError: _LOGGER.warning("Invalid creation time format: %s", creation_time_str) - # Find matching reservation, customer, and hashed_customer using advertising data and guest details - matched_reservation = None - matched_customer = None - matched_hashed_customer = None - - if advertising_campagne or True: - match_result = await self._find_matching_entities( - advertising_campagne, - hotel_id, - reservation_date, - guest_first_name, - guest_last_name, - guest_email, - advertising_partner, - session, - ) - matched_reservation = match_result["reservation"] - matched_customer = match_result["customer"] - matched_hashed_customer = match_result["hashed_customer"] - # Process all room reservations room_reservations = reservation_elem.find("roomReservations") if room_reservations is None: @@ -519,16 +515,7 @@ class ConversionService: existing_conversion = existing_result.scalar_one_or_none() if existing_conversion: - # Update existing conversion - existing_conversion.reservation_id = ( - matched_reservation.id if matched_reservation else None - ) - existing_conversion.customer_id = ( - matched_customer.id if matched_customer else None - ) - existing_conversion.hashed_customer_id = ( - matched_hashed_customer.id if matched_hashed_customer else None - ) + # Update existing conversion (matching will be done later) existing_conversion.reservation_number = reservation_number existing_conversion.reservation_date = reservation_date existing_conversion.creation_time = creation_time @@ -550,14 +537,12 @@ class ConversionService: pms_reservation_id, ) else: - # Create new conversion entry + # Create new conversion entry (without matching - will be done later) conversion = Conversion( - # Links to existing entities (nullable) - reservation_id=matched_reservation.id if matched_reservation else None, - customer_id=matched_customer.id if matched_customer else None, - hashed_customer_id=matched_hashed_customer.id - if matched_hashed_customer - else None, + # Links to existing entities (nullable, will be filled in after matching) + reservation_id=None, + customer_id=None, + hashed_customer_id=None, # Reservation metadata hotel_id=hotel_id, pms_reservation_id=pms_reservation_id, @@ -604,16 +589,6 @@ class ConversionService: if conversion_guest: conversion.conversion_guest_id = conversion_guest.id - # Update stats for the conversion record itself - if matched_reservation: - stats["matched_to_reservation"] += 1 - if matched_customer: - stats["matched_to_customer"] += 1 - if matched_hashed_customer: - stats["matched_to_hashed_customer"] += 1 - if not any([matched_reservation, matched_customer, matched_hashed_customer]): - stats["unmatched"] += 1 - # Batch-load existing room reservations to avoid N+1 queries room_numbers = [ rm.get("roomNumber") for rm in room_reservations.findall("roomReservation") @@ -772,6 +747,55 @@ class ConversionService: num_adults, ) + # Now that conversion, conversion_guest, and conversion_room records exist, + # perform matching using hashed guest data from conversion_guest + matched_reservation = None + matched_customer = None + matched_hashed_customer = None + + if advertising_campagne or True: + # Use hashed data from conversion_guest for matching + hashed_first_name = conversion_guest.hashed_first_name if conversion_guest else None + hashed_last_name = conversion_guest.hashed_last_name if conversion_guest else None + hashed_email = conversion_guest.hashed_email if conversion_guest else None + + match_result = await self._find_matching_entities( + advertising_campagne, + hotel_id, + reservation_date, + hashed_first_name, + hashed_last_name, + hashed_email, + advertising_partner, + session, + ) + matched_reservation = match_result["reservation"] + matched_customer = match_result["customer"] + matched_hashed_customer = match_result["hashed_customer"] + + # Update the conversion with matched entities if found + if matched_reservation or matched_customer or matched_hashed_customer: + conversion.reservation_id = ( + matched_reservation.id if matched_reservation else None + ) + conversion.customer_id = ( + matched_customer.id if matched_customer else None + ) + conversion.hashed_customer_id = ( + matched_hashed_customer.id if matched_hashed_customer else None + ) + conversion.updated_at = datetime.now() + + # Update stats for the conversion record + if matched_reservation: + stats["matched_to_reservation"] += 1 + if matched_customer: + stats["matched_to_customer"] += 1 + if matched_hashed_customer: + stats["matched_to_hashed_customer"] += 1 + if not any([matched_reservation, matched_customer, matched_hashed_customer]): + stats["unmatched"] += 1 + return stats async def _find_matching_entities( @@ -1028,7 +1052,10 @@ class ConversionService: "Cache unavailable or empty, falling back to database query (hotel=%s)", hotel_id, ) - query = select(Reservation).options(selectinload(Reservation.customer)) + from sqlalchemy.orm import selectinload + query = select(Reservation).options( + selectinload(Reservation.customer).selectinload(Customer.hashed_version) + ) if hotel_id: query = query.where(Reservation.hotel_code == hotel_id) @@ -1049,53 +1076,67 @@ class ConversionService: ) -> Reservation | None: """Match a reservation from a list by guest name and email (non-async). + Uses hashed data matching for privacy. The guest_first_name, guest_last_name, + and guest_email parameters should be pre-hashed values from ConversionGuest. + Compares hashed values directly against hashed_customer hash fields. + Args: reservations: List of reservations to search through - guest_first_name: Guest first name - guest_last_name: Guest last name - guest_email: Guest email + guest_first_name: Pre-hashed guest first name (or None) + guest_last_name: Pre-hashed guest last name (or None) + guest_email: Pre-hashed guest email (or None) Returns: Matched Reservation or None """ - # Filter by guest details + # Filter by guest details using hashed data candidates = [] + for reservation in reservations: customer = reservation.customer if not customer: continue - # Match by email (highest priority) - if guest_email: - if ( - customer.email_address - and customer.email_address.lower() == guest_email.lower() - ): + # Access hashed_version - handle both direct objects and relationships + hashed_customer = customer.hashed_version + if not hashed_customer: + continue + + # Defensive check: ensure hashed_customer is actually a HashedCustomer object + # (not an InstrumentedList or other SQLAlchemy construct) + if not hasattr(hashed_customer, 'hashed_email'): + _LOGGER.debug( + "Customer %s has invalid hashed_version type: %s", + customer.id, + type(hashed_customer), + ) + continue + + # Match by email (highest priority) using hashed comparison + if guest_email and hashed_customer.hashed_email: + if hashed_customer.hashed_email == guest_email: _LOGGER.info( - "Found exact email match for %s (reservation_id=%s)", - guest_email, + "Found exact email match (reservation_id=%s) via hash", reservation.id, ) candidates.append((reservation, 3)) # Highest score continue - # Match by name (first + last) + # Match by name (first + last) using hashed comparison if guest_first_name and guest_last_name: first_match = ( - customer.given_name - and customer.given_name.lower() == guest_first_name.lower() + hashed_customer.hashed_given_name + and hashed_customer.hashed_given_name == guest_first_name ) last_match = ( - customer.surname - and customer.surname.lower() == guest_last_name.lower() + hashed_customer.hashed_surname + and hashed_customer.hashed_surname == guest_last_name ) if first_match and last_match: _LOGGER.info( - "Found exact name match for %s %s (reservation_id=%s)", - guest_first_name, - guest_last_name, + "Found exact name match (reservation_id=%s) via hash", reservation.id, ) candidates.append((reservation, 2)) # Medium-high score diff --git a/tests/test_conversion_service.py b/tests/test_conversion_service.py index 615f8d3..c10da92 100644 --- a/tests/test_conversion_service.py +++ b/tests/test_conversion_service.py @@ -7,20 +7,31 @@ This test module: The test data is designed to test realistic matching scenarios: - Matching by advertising campaign data (fbclid/gclid) -- Matching by guest name and email +- Matching by guest name and email using hashed data - Handling unmatched conversions - Processing daily sales revenue data +- Testing hashed matching logic and edge cases """ +import hashlib from pathlib import Path import pytest import pytest_asyncio +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from alpine_bits_python.conversion_service import ConversionService from alpine_bits_python.csv_import import CSVImporter -from alpine_bits_python.db import Base, Conversion, ConversionRoom +from alpine_bits_python.db import ( + Base, + Conversion, + ConversionGuest, + ConversionRoom, + Customer, + HashedCustomer, + Reservation, +) @pytest_asyncio.fixture @@ -317,5 +328,315 @@ class TestConversionServiceWithImportedData: assert stats["errors"] == 0 +class TestHashedMatchingLogic: + """Test the hashed matching logic used in ConversionService.""" + + + @pytest.mark.asyncio + async def test_no_match_without_hashed_customer(self, test_db_session): + """Test that matching fails gracefully when customer has no hashed version.""" + # Create a customer WITHOUT hashed data + customer = Customer( + given_name="Bob", + surname="Jones", + email_address="bob@example.com", + contact_id="test_contact_3", + ) + test_db_session.add(customer) + await test_db_session.commit() + + # Create a reservation + reservation = Reservation( + customer_id=customer.id, + unique_id="res_3", + hotel_code="test_hotel", + ) + test_db_session.add(reservation) + await test_db_session.commit() + + # Test the matching logic + service = ConversionService(test_db_session) + + # Eagerly load reservations + from sqlalchemy.orm import selectinload + result = await test_db_session.execute( + select(Reservation) + .where(Reservation.id == reservation.id) + .options(selectinload(Reservation.customer).selectinload(Customer.hashed_version)) + ) + reservations = result.scalars().all() + + hashed_email = hashlib.sha256( + "bob@example.com".lower().strip().encode("utf-8") + ).hexdigest() + + matched = service._match_reservations_by_guest_details( + reservations, + guest_first_name=None, + guest_last_name=None, + guest_email=hashed_email, + ) + + # Should not match because customer has no hashed version + assert matched is None, "Should not match without hashed customer" + + + @pytest.mark.asyncio + async def test_conversion_guest_hashed_fields_are_populated( + self, test_db_session + ): + """Test that ConversionGuest properly stores hashed versions of guest data.""" + # Create a conversion guest + conversion_guest = ConversionGuest.create_from_conversion_data( + hotel_id="test_hotel", + guest_id="guest_123", + guest_first_name="Margaret", + guest_last_name="Brown", + guest_email="margaret@example.com", + guest_country_code="GB", + guest_birth_date=None, + now=None, + ) + test_db_session.add(conversion_guest) + await test_db_session.flush() + + # Verify hashed fields are populated + assert conversion_guest.hashed_first_name is not None + assert conversion_guest.hashed_last_name is not None + assert conversion_guest.hashed_email is not None + + # Verify hashes are correct (SHA256) + expected_hashed_first = hashlib.sha256( + "margaret".lower().strip().encode("utf-8") + ).hexdigest() + expected_hashed_last = hashlib.sha256( + "brown".lower().strip().encode("utf-8") + ).hexdigest() + expected_hashed_email = hashlib.sha256( + "margaret@example.com".lower().strip().encode("utf-8") + ).hexdigest() + + assert conversion_guest.hashed_first_name == expected_hashed_first + assert conversion_guest.hashed_last_name == expected_hashed_last + assert conversion_guest.hashed_email == expected_hashed_email + + + @pytest.mark.asyncio + async def test_conversion_records_created_before_matching( + self, test_db_session, test_config + ): + """Test that conversion records exist before matching occurs.""" + # Create customer and reservation for matching + customer = Customer( + given_name="David", + surname="Miller", + email_address="david@example.com", + contact_id="test_contact_6", + ) + test_db_session.add(customer) + await test_db_session.flush() + + hashed_customer = customer.create_hashed_customer() + test_db_session.add(hashed_customer) + await test_db_session.flush() + + reservation = Reservation( + customer_id=customer.id, + unique_id="res_6", + hotel_code="hotel_1", + ) + test_db_session.add(reservation) + await test_db_session.commit() + + # Create conversion XML with matching hashed data + xml_content = f""" + + + + + + + + + + + +""" + + service = ConversionService(test_db_session) + stats = await service.process_conversion_xml(xml_content) + + # Verify conversion was created + result = await test_db_session.execute( + select(Conversion).where(Conversion.pms_reservation_id == "pms_123") + ) + conversion = result.scalar_one_or_none() + + assert conversion is not None, "Conversion should be created" + assert conversion.hotel_id == "hotel_1" + assert conversion.guest_first_name == "David" + assert conversion.guest_last_name == "Miller" + assert conversion.guest_email == "david@example.com" + + # Verify conversion_guest was created + assert conversion.conversion_guest_id is not None, "ConversionGuest should be created" + + # Verify conversion_room was created + room_result = await test_db_session.execute( + select(ConversionRoom).where( + ConversionRoom.conversion_id == conversion.id + ) + ) + rooms = room_result.scalars().all() + assert len(rooms) > 0, "ConversionRoom should be created" + + # Verify matching occurred (may or may not have matched depending on data) + # The important thing is that the records exist + assert stats["total_reservations"] == 1 + assert stats["total_daily_sales"] == 1 + + @pytest.mark.asyncio + async def test_hashed_customer_missing_fields_handled_gracefully( + self, test_db_session + ): + """Test that matching handles customers with missing hashed fields gracefully.""" + # Create a customer + customer = Customer( + given_name="Eve", + surname="Taylor", + email_address="eve@example.com", + contact_id="test_contact_7", + ) + test_db_session.add(customer) + await test_db_session.flush() + + # Create hashed customer but simulate missing fields by manually setting to None + hashed_customer = HashedCustomer( + customer_id=customer.id, + contact_id="test_contact_7_hashed", + hashed_email=None, # Simulate missing hashed email + hashed_given_name=None, # Simulate missing hashed name + hashed_surname=None, + ) + test_db_session.add(hashed_customer) + await test_db_session.flush() + + # Create reservation + reservation = Reservation( + customer_id=customer.id, + unique_id="res_7", + hotel_code="test_hotel", + ) + test_db_session.add(reservation) + await test_db_session.commit() + + # Test matching - should not crash even with missing hashed fields + service = ConversionService(test_db_session) + + # Eagerly load reservations + from sqlalchemy.orm import selectinload + result = await test_db_session.execute( + select(Reservation) + .where(Reservation.id == reservation.id) + .options(selectinload(Reservation.customer).selectinload(Customer.hashed_version)) + ) + reservations = result.scalars().all() + + hashed_email = hashlib.sha256( + "eve@example.com".lower().strip().encode("utf-8") + ).hexdigest() + + matched = service._match_reservations_by_guest_details( + reservations, + guest_first_name=None, + guest_last_name=None, + guest_email=hashed_email, + ) + + # Should not match because hashed customer fields are None + assert matched is None, "Should not match with missing hashed fields" + + @pytest.mark.asyncio + async def test_duplicate_conversion_guests_with_same_hotel_and_guest_id( + self, test_db_session + ): + """Test handling of duplicate ConversionGuest records with same (hotel_id, guest_id). + + This test reproduces the production issue where multiple ConversionGuest records + can be created with the same (hotel_id, guest_id) combination, causing + scalar_one_or_none() to fail with "Multiple rows were found when one or none was required". + + This can happen when: + - Multiple conversions arrive for the same hotel and PMS guest within the same batch + - The XML is processed multiple times + - Race conditions in concurrent processing + """ + hotel_id = "test_hotel" + guest_id = "guest_123" + + # Simulate the production scenario: multiple conversion guests with same (hotel_id, guest_id) + guest1 = ConversionGuest.create_from_conversion_data( + hotel_id=hotel_id, + guest_id=guest_id, + guest_first_name="John", + guest_last_name="Doe", + guest_email="john@example.com", + guest_country_code="US", + guest_birth_date=None, + now=None, + ) + test_db_session.add(guest1) + await test_db_session.flush() + + # Create a second guest with the SAME (hotel_id, guest_id) + # This should not happen, but can occur in production + guest2 = ConversionGuest.create_from_conversion_data( + hotel_id=hotel_id, + guest_id=guest_id, + guest_first_name="Jane", # Different first name + guest_last_name="Doe", + guest_email="jane@example.com", + guest_country_code="US", + guest_birth_date=None, + now=None, + ) + test_db_session.add(guest2) + await test_db_session.commit() + + # Now try to query for the guest by (hotel_id, guest_id) + # This should return multiple results + result = await test_db_session.execute( + select(ConversionGuest).where( + (ConversionGuest.hotel_id == hotel_id) + & (ConversionGuest.guest_id == guest_id) + ) + ) + guests = result.scalars().all() + + # Verify we have duplicates (the production bug condition) + assert len(guests) == 2, "Should have created duplicate conversion guests" + + # Verify that scalars().first() returns one of the guests (the fixed behavior) + result2 = await test_db_session.execute( + select(ConversionGuest).where( + (ConversionGuest.hotel_id == hotel_id) + & (ConversionGuest.guest_id == guest_id) + ) + ) + first_guest = result2.scalars().first() + assert first_guest is not None, "Should find at least one guest with scalars().first()" + + # The old code would have raised an error here with scalar_one_or_none() + # when finding multiple results. Now it's fixed to use .first() instead. + result3 = await test_db_session.execute( + select(ConversionGuest).where( + (ConversionGuest.hotel_id == hotel_id) + & (ConversionGuest.guest_id == guest_id) + ) + ) + with pytest.raises(Exception): # MultipleResultsFound from old code path + result3.scalar_one_or_none() + + if __name__ == "__main__": pytest.main([__file__, "-v"])