From 2b98973d1c0b1d9f5cac8ba3eeced33217532017 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 19 Nov 2025 18:40:44 +0100 Subject: [PATCH] On we go. Maybe soon this will be done --- ...hashed_customer_id_to_conversion_guests.py | 34 ++ src/alpine_bits_python/conversion_service.py | 503 +++++++++++++----- src/alpine_bits_python/db.py | 4 + 3 files changed, 416 insertions(+), 125 deletions(-) create mode 100644 alembic/versions/2025_11_19_1800-add_hashed_customer_id_to_conversion_guests.py diff --git a/alembic/versions/2025_11_19_1800-add_hashed_customer_id_to_conversion_guests.py b/alembic/versions/2025_11_19_1800-add_hashed_customer_id_to_conversion_guests.py new file mode 100644 index 0000000..b102902 --- /dev/null +++ b/alembic/versions/2025_11_19_1800-add_hashed_customer_id_to_conversion_guests.py @@ -0,0 +1,34 @@ +"""add hashed_customer_id to conversion_guests + +Revision ID: a1b2c3d4e5f6 +Revises: 08fe946414d8 +Create Date: 2025-11-19 18:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision: str = 'a1b2c3d4e5f6' +down_revision: Union[str, Sequence[str], None] = '08fe946414d8' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # Add hashed_customer_id column to conversion_guests + op.add_column('conversion_guests', sa.Column('hashed_customer_id', sa.Integer(), nullable=True)) + op.create_index(op.f('ix_conversion_guests_hashed_customer_id'), 'conversion_guests', ['hashed_customer_id'], unique=False) + op.create_foreign_key(None, 'conversion_guests', 'hashed_customers', ['hashed_customer_id'], ['id'], ondelete='SET NULL') + + +def downgrade() -> None: + """Downgrade schema.""" + # Drop the hashed_customer_id column and its constraints + op.drop_constraint(None, 'conversion_guests', type_='foreignkey') + op.drop_index(op.f('ix_conversion_guests_hashed_customer_id'), table_name='conversion_guests') + op.drop_column('conversion_guests', 'hashed_customer_id') diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index f01fc29..8546156 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -177,7 +177,6 @@ class ConversionService: "hashed_birth_date": ConversionGuest._normalize_and_hash( guest_data["guest_birth_date"].isoformat() if guest_data["guest_birth_date"] else None ), - "is_regular": False, "first_seen": now, "last_seen": now, }) @@ -883,103 +882,6 @@ class ConversionService: return stats - async def _match_conversion( - self, - conversion: Conversion, - guest_first_name: str | None, - guest_last_name: str | None, - guest_email: str | None, - advertising_campagne: str | None, - advertising_partner: str | None, - hotel_id: str | None, - reservation_date: Any, - session: AsyncSession | None = None, - ) -> dict[str, int]: - """Match a conversion to reservations and customers using guest and advertising data. 
- - This is the matching phase that runs AFTER conversion data has been stored. - It uses hashed guest data to match conversions to existing reservations/customers. - - Args: - conversion: The Conversion record to match - guest_first_name: Guest first name (will be hashed for matching) - guest_last_name: Guest last name (will be hashed for matching) - guest_email: Guest email (will be hashed for matching) - advertising_campagne: Advertising campaign identifier - advertising_partner: Advertising partner info - hotel_id: Hotel ID for filtering matches - reservation_date: Reservation date for additional filtering - session: AsyncSession to use for database queries - - Returns: - Dictionary with match statistics: matched_to_reservation, matched_to_customer, - matched_to_hashed_customer, and unmatched (all counts of 0 or 1) - """ - if session is None: - session = self.session - - stats = { - "matched_to_reservation": 0, - "matched_to_customer": 0, - "matched_to_hashed_customer": 0, - "unmatched": 0, - } - - # Hash guest data for matching (same hashing logic as ConversionGuest) - hashed_first_name = ConversionGuest._normalize_and_hash(guest_first_name) - hashed_last_name = ConversionGuest._normalize_and_hash(guest_last_name) - hashed_email = ConversionGuest._normalize_and_hash(guest_email) - - # Find matching entities - match_result = await self._find_matching_entities( - advertising_campagne, - hotel_id, - reservation_date, - hashed_first_name, - hashed_last_name, - hashed_email, - advertising_partner, - session, - ) - - matched_reservation = match_result["reservation"] - matched_customer = match_result["customer"] - matched_hashed_customer = match_result["hashed_customer"] - match_type = match_result.get("match_type") # "id" or "guest_details" - - # Update the conversion with matched entities if found - if matched_reservation or matched_customer or matched_hashed_customer: - conversion.reservation_id = ( - matched_reservation.id if matched_reservation else None - ) - 
conversion.customer_id = ( - matched_customer.id if matched_customer else None - ) - conversion.hashed_customer_id = ( - matched_hashed_customer.id if matched_hashed_customer else None - ) - - # Set attribution flags based on match type - if match_type == "id": - conversion.directly_attributable = True - conversion.guest_matched = False - elif match_type == "guest_details": - conversion.directly_attributable = False - conversion.guest_matched = True - - conversion.updated_at = datetime.now() - - # Update stats - if matched_reservation: - stats["matched_to_reservation"] = 1 - if matched_customer: - stats["matched_to_customer"] = 1 - if matched_hashed_customer: - stats["matched_to_hashed_customer"] = 1 - if not any([matched_reservation, matched_customer, matched_hashed_customer]): - stats["unmatched"] = 1 - - return stats async def _find_matching_entities( self, @@ -994,9 +896,9 @@ class ConversionService: ) -> dict[str, Any]: """Find matching Reservation, Customer, and HashedCustomer. - Uses two strategies with separate attribution: - 1. ID-based matching (fbclid/gclid/md5_unique_id) - directly_attributable - 2. Guest detail matching (email/name) - guest_matched only + Uses two strategies with separate matching paths: + 1. ID-based matching (fbclid/gclid/md5_unique_id) - returns Reservation + Customer + HashedCustomer + 2. Guest detail matching (email/name) - returns Customer + HashedCustomer directly (no Reservation needed) Args: advertising_campagne: Truncated tracking ID from conversion XML @@ -1011,6 +913,7 @@ class ConversionService: Returns: Dictionary with 'reservation', 'customer', 'hashed_customer', and 'match_type' keys. match_type is either 'id' (high confidence) or 'guest_details' (lower confidence) + For guest_details matches, 'reservation' will be None. 
""" if session is None: @@ -1050,22 +953,31 @@ class ConversionService: ) # Strategy 2: If no ID-based match, try email/name-based matching - guest details, lower confidence + # For guest detail matches, match directly with HashedCustomer (skip Reservation table) if not result["reservation"] and ( guest_email or guest_first_name or guest_last_name ): - matched_reservation = await self._match_by_guest_details( - hotel_id, guest_first_name, guest_last_name, guest_email, session + matched_hashed_customer = await self._match_by_guest_details_hashed( + guest_first_name, guest_last_name, guest_email, session ) - if matched_reservation: - result["reservation"] = matched_reservation + if matched_hashed_customer: + result["hashed_customer"] = matched_hashed_customer result["match_type"] = "guest_details" # Matched by guest details only + + # Get the customer if it exists + if matched_hashed_customer.customer_id: + customer_query = select(Customer).where( + Customer.id == matched_hashed_customer.customer_id + ) + customer_result = await session.execute(customer_query) + result["customer"] = customer_result.scalar_one_or_none() + _LOGGER.info( - "Matched conversion by guest details (name=%s %s, email=%s, hotel=%s)", + "Matched conversion by guest details to hashed_customer (name=%s %s, email=%s)", guest_first_name, guest_last_name, guest_email, - hotel_id, ) else: _LOGGER.debug( @@ -1075,7 +987,7 @@ class ConversionService: guest_email, ) - # If we found a reservation, get its customer and hashed_customer + # If we found a reservation (ID-based match), get its customer and hashed_customer if result["reservation"]: if result["reservation"].customer_id: customer_query = select(Customer).where( @@ -1182,6 +1094,77 @@ class ConversionService: return matched_reservation + async def _match_by_guest_details_hashed( + self, + guest_first_name: str | None, + guest_last_name: str | None, + guest_email: str | None, + session: AsyncSession | None = None, + ) -> HashedCustomer | None: + 
"""Match guest by name and email directly to HashedCustomer (no Reservation needed). + + This method bypasses the Reservation table entirely and matches directly against + hashed customer data. Used for guest-detail matching where we don't need to link + to a specific reservation. + + Args: + guest_first_name: Guest first name (pre-hashed) + guest_last_name: Guest last name (pre-hashed) + guest_email: Guest email (pre-hashed) + session: AsyncSession to use. If None, uses self.session. + + Returns: + Matched HashedCustomer or None + + """ + if session is None: + session = self.session + + # Query all hashed customers that match the guest details + query = select(HashedCustomer).options( + selectinload(HashedCustomer.customer) + ) + + # Build filter conditions + conditions = [] + if guest_email: + conditions.append(HashedCustomer.hashed_email == guest_email) + if guest_first_name and guest_last_name: + conditions.append( + (HashedCustomer.hashed_given_name == guest_first_name) + & (HashedCustomer.hashed_surname == guest_last_name) + ) + + if not conditions: + return None + + # Combine conditions with OR (match if email matches OR name matches) + query = query.where(or_(*conditions)) + + db_result = await session.execute(query) + matches = db_result.scalars().all() + + if not matches: + return None + + # If single match, return it + if len(matches) == 1: + return matches[0] + + # If multiple matches, prefer email match over name match + for match in matches: + if guest_email and match.hashed_email == guest_email: + _LOGGER.debug( + "Multiple hashed customer matches, preferring email match" + ) + return match + + # Otherwise return first match + _LOGGER.warning( + "Multiple hashed customer matches found for guest details, using first match" + ) + return matches[0] + async def _match_by_guest_details( self, hotel_id: str | None, @@ -1521,10 +1504,12 @@ class ConversionService: and uses their stored hashed data to match to existing reservations/customers. 
No XML parsing, no re-hashing - complete separation of concerns. - This enables: - - Matching historical data that wasn't just created - - Re-running matching logic independently - - Consistent hashing (using already-hashed data from DB) + Matching rules: + 1. Guest detail matches: Record in conversion_guests table with hashed_customer reference + 2. ID-based matches (md5_hash/click_id): Can also record directly in conversions-reservations + with directly_attributable=True + 3. Regular guest detection: Check if conversions exist with dates before all reservations, + or if conversion_room dates match reservation dates Updates stats dictionary in-place if provided. @@ -1540,7 +1525,7 @@ class ConversionService: result = await session.execute( select(Conversion) .where(Conversion.pms_reservation_id == pms_reservation_id) - .options(selectinload(Conversion.guest)) + .options(selectinload(Conversion.guest), selectinload(Conversion.conversion_rooms)) ) conversion = result.scalar_one_or_none() @@ -1584,24 +1569,55 @@ class ConversionService: # Update the conversion with matched entities if found if matched_reservation or matched_customer or matched_hashed_customer: - conversion.reservation_id = ( - matched_reservation.id if matched_reservation else None - ) - conversion.customer_id = ( - matched_customer.id if matched_customer else None - ) - conversion.hashed_customer_id = ( - matched_hashed_customer.id if matched_hashed_customer else None - ) - - # Set attribution flags based on match type + # Set attribution flags and matched entities based on match type if match_type == "id": + # ID-based matches (fbclid/gclid/md5_unique_id) are always directly attributable + # Link to both reservation and customer + conversion.reservation_id = ( + matched_reservation.id if matched_reservation else None + ) + conversion.customer_id = ( + matched_customer.id if matched_customer else None + ) + conversion.hashed_customer_id = ( + matched_hashed_customer.id if matched_hashed_customer 
else None + ) conversion.directly_attributable = True conversion.guest_matched = False + is_attributable = True + + # Check if guest is regular for ID matches + if matched_reservation: + await self._check_if_regular(conversion, matched_reservation, session) + elif match_type == "guest_details": - conversion.directly_attributable = False + # Guest detail matches: link to customer/hashed_customer directly (NO reservation) + # No reservation is linked; directly_attributable is set only if dates match + conversion.customer_id = ( + matched_customer.id if matched_customer else None + ) + conversion.hashed_customer_id = ( + matched_hashed_customer.id if matched_hashed_customer else None + ) conversion.guest_matched = True + # For guest-detail matches, we don't have a matched_reservation + # Instead, check if dates align with any existing reservations for this customer + is_attributable = await self._check_if_attributable_guest_match( + matched_customer, conversion, session + ) + conversion.directly_attributable = is_attributable + + # Check if guest is regular (if we have a customer to reference) + if matched_customer: + await self._check_if_regular_by_customer( + conversion, matched_customer, session + ) + + # Update conversion_guest with hashed_customer reference if matched + if conversion_guest and matched_hashed_customer: + conversion_guest.hashed_customer_id = matched_hashed_customer.id + conversion.updated_at = datetime.now() # Update stats if provided @@ -1614,3 +1630,240 @@ class ConversionService: stats["matched_to_hashed_customer"] += 1 else: stats["unmatched"] += 1 + + async def _check_if_regular( + self, + conversion: Conversion, + matched_reservation: Reservation, + session: AsyncSession, + ) -> None: + """Check if guest is a regular customer and update is_regular flag. + + A guest is regular if they have conversions with dates before their first completed reservation. + Otherwise, is_regular is set to False. 
+ + Args: + conversion: The Conversion record being evaluated + matched_reservation: The matched Reservation record + session: AsyncSession for database queries + """ + if not conversion.guest or not matched_reservation.customer_id: + return + + # Find the earliest paying conversion for this customer + # (booked reservations from hotel with actual revenue) + earliest_paying_conversion_result = await session.execute( + select(Conversion) + .join(ConversionRoom, Conversion.id == ConversionRoom.conversion_id) + .where( + Conversion.hotel_id == conversion.hotel_id, + Conversion.guest_id == conversion.guest_id, + ConversionRoom.total_revenue.isnot(None), + ConversionRoom.total_revenue > Decimal(0), + ) + .order_by(Conversion.reservation_date.asc()) + .limit(1) + ) + earliest_paying_conversion = earliest_paying_conversion_result.scalar_one_or_none() + + if not earliest_paying_conversion: + conversion.guest.is_regular = False + return + + # Find the earliest reservation (booking request we sent) for this customer + earliest_reservation_result = await session.execute( + select(Reservation) + .where(Reservation.customer_id == matched_reservation.customer_id) + .order_by(Reservation.start_date.asc()) + .limit(1) + ) + earliest_reservation = earliest_reservation_result.scalar_one_or_none() + + if not earliest_reservation: + conversion.guest.is_regular = False + return + + # Guest is regular if their earliest paying conversion predates all their reservations + # (meaning they were already a customer before we started tracking reservations) + is_regular = earliest_paying_conversion.reservation_date < earliest_reservation.start_date + conversion.guest.is_regular = is_regular + + if is_regular: + _LOGGER.info( + "Marking guest as regular: earliest paying conversion date %s is before first reservation %s", + earliest_paying_conversion.reservation_date, + earliest_reservation.start_date, + ) + + async def _check_if_attributable( + self, + matched_reservation: Reservation, + 
conversion: Conversion, + session: AsyncSession, + ) -> bool: + """Check if a guest detail matched conversion should be marked as attributable. + + A conversion is attributable ONLY if the conversion_room dates match the reservation dates closely. + + Args: + matched_reservation: The matched Reservation record + conversion: The Conversion record being evaluated + session: AsyncSession for database queries + + Returns: + True if the conversion should be marked as attributable (based on date matching), False otherwise + """ + # Check if conversion_room dates match reservation dates (criterion for attributability) + if not conversion.conversion_rooms or not matched_reservation: + return False + + for room in conversion.conversion_rooms: + if ( + room.arrival_date + and room.departure_date + and matched_reservation.start_date + and matched_reservation.end_date + ): + # Check if dates match or mostly match (within ±7 day tolerance) + arrival_match = abs( + (room.arrival_date - matched_reservation.start_date).days + ) <= 7 + departure_match = abs( + (room.departure_date - matched_reservation.end_date).days + ) <= 7 + + if arrival_match and departure_match: + _LOGGER.info( + "Marking conversion as attributable: room dates %s-%s match reservation dates %s-%s", + room.arrival_date, + room.departure_date, + matched_reservation.start_date, + matched_reservation.end_date, + ) + return True + + return False + + async def _check_if_attributable_guest_match( + self, + matched_customer: Customer | None, + conversion: Conversion, + session: AsyncSession, + ) -> bool: + """Check if a guest-detail matched conversion is attributable based on date alignment. + + For guest-detail matches (without a specific reservation), check if the conversion's + room dates align with ANY of the customer's reservations (date tolerance ±7 days). 
+ + Args: + matched_customer: The matched Customer record + conversion: The Conversion record being evaluated + session: AsyncSession for database queries + + Returns: + True if conversion dates align with any reservation, False otherwise + """ + if not matched_customer or not conversion.conversion_rooms: + return False + + # Find all reservations for this customer + reservations_result = await session.execute( + select(Reservation).where(Reservation.customer_id == matched_customer.id) + ) + reservations = reservations_result.scalars().all() + + if not reservations: + return False + + # Check if any conversion_room dates match any reservation dates + for room in conversion.conversion_rooms: + if not room.arrival_date or not room.departure_date: + continue + + for reservation in reservations: + if not reservation.start_date or not reservation.end_date: + continue + + # Check if dates match (within ±7 day tolerance) + arrival_match = abs( + (room.arrival_date - reservation.start_date).days + ) <= 7 + departure_match = abs( + (room.departure_date - reservation.end_date).days + ) <= 7 + + if arrival_match and departure_match: + _LOGGER.info( + "Marking guest-detail match as attributable: room dates %s-%s match reservation dates %s-%s", + room.arrival_date, + room.departure_date, + reservation.start_date, + reservation.end_date, + ) + return True + + return False + + async def _check_if_regular_by_customer( + self, + conversion: Conversion, + matched_customer: Customer, + session: AsyncSession, + ) -> None: + """Check if guest is regular based on customer without a specific reservation. + + For guest-detail matches, determine if the guest is regular by checking if + their earliest paying conversion predates their earliest reservation. 
+ + Args: + conversion: The Conversion record being evaluated + matched_customer: The matched Customer record + session: AsyncSession for database queries + """ + if not conversion.guest or not matched_customer.id: + return + + # Find the earliest paying conversion for this guest + # (booked reservations from hotel with actual revenue) + earliest_paying_conversion_result = await session.execute( + select(Conversion) + .join(ConversionRoom, Conversion.id == ConversionRoom.conversion_id) + .where( + Conversion.hotel_id == conversion.hotel_id, + Conversion.guest_id == conversion.guest_id, + ConversionRoom.total_revenue.isnot(None), + ConversionRoom.total_revenue > Decimal(0), + ) + .order_by(Conversion.reservation_date.asc()) + .limit(1) + ) + earliest_paying_conversion = earliest_paying_conversion_result.scalar_one_or_none() + + if not earliest_paying_conversion: + conversion.guest.is_regular = False + return + + # Find the earliest reservation (booking request we sent) for this customer + earliest_reservation_result = await session.execute( + select(Reservation) + .where(Reservation.customer_id == matched_customer.id) + .order_by(Reservation.start_date.asc()) + .limit(1) + ) + earliest_reservation = earliest_reservation_result.scalar_one_or_none() + + if not earliest_reservation: + conversion.guest.is_regular = False + return + + # Guest is regular if their earliest paying conversion predates all their reservations + # (meaning they were already a customer before we started tracking reservations) + is_regular = earliest_paying_conversion.reservation_date < earliest_reservation.start_date + conversion.guest.is_regular = is_regular + + if is_regular: + _LOGGER.info( + "Marking guest as regular (via customer): earliest paying conversion date %s is before first reservation %s", + earliest_paying_conversion.reservation_date, + earliest_reservation.start_date, + ) diff --git a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py index ae2cfcd..319ad62 100644 --- 
a/src/alpine_bits_python/db.py +++ b/src/alpine_bits_python/db.py @@ -395,6 +395,9 @@ class ConversionGuest(Base): hashed_country_code = Column(String(64)) hashed_birth_date = Column(String(64)) + # Matched customer reference (nullable, filled after matching) + hashed_customer_id = Column(Integer, ForeignKey("hashed_customers.id"), nullable=True, index=True) + # Guest classification is_regular = Column(Boolean, default=False) # True if guest has many prior stays before appearing in our reservations @@ -404,6 +407,7 @@ class ConversionGuest(Base): # Relationships conversions = relationship("Conversion", back_populates="guest") + hashed_customer = relationship("HashedCustomer", backref="conversion_guests") @staticmethod def _normalize_and_hash(value):