On we go. Maybe soon this will be done

This commit is contained in:
Jonas Linter
2025-11-19 18:40:44 +01:00
parent 93207c3877
commit 434dabbb7a
3 changed files with 416 additions and 125 deletions

View File

@@ -0,0 +1,34 @@
"""add hashed_customer_id to conversion_guests
Revision ID: a1b2c3d4e5f6
Revises: 08fe946414d8
Create Date: 2025-11-19 18:00:00.000000
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = 'a1b2c3d4e5f6'
down_revision: Union[str, Sequence[str], None] = '08fe946414d8'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

# Schema object names shared by upgrade() and downgrade().
# The foreign key gets an explicit name: a constraint created with name=None
# receives a backend-generated name, and downgrade() then has nothing it can
# pass to drop_constraint().
_TABLE = 'conversion_guests'
_COLUMN = 'hashed_customer_id'
_INDEX_NAME = 'ix_conversion_guests_hashed_customer_id'
_FK_NAME = 'fk_conversion_guests_hashed_customer_id_hashed_customers'


def upgrade() -> None:
    """Add the nullable, indexed ``hashed_customer_id`` FK column.

    The column references ``hashed_customers.id`` and is reset to NULL when
    the referenced hashed customer row is deleted.
    """
    op.add_column(_TABLE, sa.Column(_COLUMN, sa.Integer(), nullable=True))
    op.create_index(op.f(_INDEX_NAME), _TABLE, [_COLUMN], unique=False)
    # op.f() marks the name as final so no naming convention rewrites it.
    op.create_foreign_key(
        op.f(_FK_NAME),
        _TABLE,
        'hashed_customers',
        [_COLUMN],
        ['id'],
        ondelete='SET NULL',
    )


def downgrade() -> None:
    """Remove the foreign key, the index and the column, in that order.

    NOTE(review): a database that already ran the earlier, unnamed variant of
    this migration carries a backend-generated constraint name instead of
    ``_FK_NAME``; such a database needs that name dropped manually.
    """
    op.drop_constraint(op.f(_FK_NAME), _TABLE, type_='foreignkey')
    op.drop_index(op.f(_INDEX_NAME), table_name=_TABLE)
    op.drop_column(_TABLE, _COLUMN)

View File

@@ -177,7 +177,6 @@ class ConversionService:
"hashed_birth_date": ConversionGuest._normalize_and_hash(
guest_data["guest_birth_date"].isoformat() if guest_data["guest_birth_date"] else None
),
"is_regular": False,
"first_seen": now,
"last_seen": now,
})
@@ -883,103 +882,6 @@ class ConversionService:
return stats
async def _match_conversion(
self,
conversion: Conversion,
guest_first_name: str | None,
guest_last_name: str | None,
guest_email: str | None,
advertising_campagne: str | None,
advertising_partner: str | None,
hotel_id: str | None,
reservation_date: Any,
session: AsyncSession | None = None,
) -> dict[str, int]:
"""Match a conversion to reservations and customers using guest and advertising data.
This is the matching phase that runs AFTER conversion data has been stored.
It uses hashed guest data to match conversions to existing reservations/customers.
The passed ``conversion`` object is mutated in place (foreign keys, attribution
flags and ``updated_at``); nothing is committed or flushed by this method.
Args:
conversion: The Conversion record to match
guest_first_name: Guest first name (will be hashed for matching)
guest_last_name: Guest last name (will be hashed for matching)
guest_email: Guest email (will be hashed for matching)
advertising_campagne: Advertising campaign identifier
advertising_partner: Advertising partner info
hotel_id: Hotel ID for filtering matches
reservation_date: Reservation date for additional filtering
session: AsyncSession to use for database queries. If None, uses self.session.
Returns:
Dictionary with match statistics: matched_to_reservation, matched_to_customer,
matched_to_hashed_customer, and unmatched (all counts of 0 or 1)
"""
# Fall back to the service-wide session when the caller did not pass one.
if session is None:
session = self.session
stats = {
"matched_to_reservation": 0,
"matched_to_customer": 0,
"matched_to_hashed_customer": 0,
"unmatched": 0,
}
# Hash guest data for matching (same hashing logic as ConversionGuest)
hashed_first_name = ConversionGuest._normalize_and_hash(guest_first_name)
hashed_last_name = ConversionGuest._normalize_and_hash(guest_last_name)
hashed_email = ConversionGuest._normalize_and_hash(guest_email)
# Find matching entities
# NOTE(review): the arguments are passed positionally; confirm the order still
# matches the signature of _find_matching_entities.
match_result = await self._find_matching_entities(
advertising_campagne,
hotel_id,
reservation_date,
hashed_first_name,
hashed_last_name,
hashed_email,
advertising_partner,
session,
)
matched_reservation = match_result["reservation"]
matched_customer = match_result["customer"]
matched_hashed_customer = match_result["hashed_customer"]
match_type = match_result.get("match_type") # "id" or "guest_details"
# Update the conversion with matched entities if found
if matched_reservation or matched_customer or matched_hashed_customer:
# Each foreign key is written explicitly, so an entity that was not matched
# is stored as None.
conversion.reservation_id = (
matched_reservation.id if matched_reservation else None
)
conversion.customer_id = (
matched_customer.id if matched_customer else None
)
conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
# Set attribution flags based on match type
# The two flags are mutually exclusive; any other match_type leaves both untouched.
if match_type == "id":
conversion.directly_attributable = True
conversion.guest_matched = False
elif match_type == "guest_details":
conversion.directly_attributable = False
conversion.guest_matched = True
# NOTE(review): indentation is lost in this view; presumably the timestamp is only
# refreshed when something matched - confirm against the real file.
# datetime.now() is called without a tz argument, so the value is a naive local timestamp.
conversion.updated_at = datetime.now()
# Update stats
if matched_reservation:
stats["matched_to_reservation"] = 1
if matched_customer:
stats["matched_to_customer"] = 1
if matched_hashed_customer:
stats["matched_to_hashed_customer"] = 1
# "unmatched" is set only when none of the three entities was found.
if not any([matched_reservation, matched_customer, matched_hashed_customer]):
stats["unmatched"] = 1
return stats
async def _find_matching_entities(
self,
@@ -994,9 +896,9 @@ class ConversionService:
) -> dict[str, Any]:
"""Find matching Reservation, Customer, and HashedCustomer.
Uses two strategies with separate attribution:
1. ID-based matching (fbclid/gclid/md5_unique_id) - directly_attributable
2. Guest detail matching (email/name) - guest_matched only
Uses two strategies with separate matching paths:
1. ID-based matching (fbclid/gclid/md5_unique_id) - returns Reservation + Customer + HashedCustomer
2. Guest detail matching (email/name) - returns Customer + HashedCustomer directly (no Reservation needed)
Args:
advertising_campagne: Truncated tracking ID from conversion XML
@@ -1011,6 +913,7 @@ class ConversionService:
Returns:
Dictionary with 'reservation', 'customer', 'hashed_customer', and 'match_type' keys.
match_type is either 'id' (high confidence) or 'guest_details' (lower confidence)
For guest_details matches, 'reservation' will be None.
"""
if session is None:
@@ -1050,22 +953,31 @@ class ConversionService:
)
# Strategy 2: If no ID-based match, try email/name-based matching - guest details, lower confidence
# For guest detail matches, match directly with HashedCustomer (skip Reservation table)
if not result["reservation"] and (
guest_email or guest_first_name or guest_last_name
):
matched_reservation = await self._match_by_guest_details(
hotel_id, guest_first_name, guest_last_name, guest_email, session
matched_hashed_customer = await self._match_by_guest_details_hashed(
guest_first_name, guest_last_name, guest_email, session
)
if matched_reservation:
result["reservation"] = matched_reservation
if matched_hashed_customer:
result["hashed_customer"] = matched_hashed_customer
result["match_type"] = "guest_details" # Matched by guest details only
# Get the customer if it exists
if matched_hashed_customer.customer_id:
customer_query = select(Customer).where(
Customer.id == matched_hashed_customer.customer_id
)
customer_result = await session.execute(customer_query)
result["customer"] = customer_result.scalar_one_or_none()
_LOGGER.info(
"Matched conversion by guest details (name=%s %s, email=%s, hotel=%s)",
"Matched conversion by guest details to hashed_customer (name=%s %s, email=%s)",
guest_first_name,
guest_last_name,
guest_email,
hotel_id,
)
else:
_LOGGER.debug(
@@ -1075,7 +987,7 @@ class ConversionService:
guest_email,
)
# If we found a reservation, get its customer and hashed_customer
# If we found a reservation (ID-based match), get its customer and hashed_customer
if result["reservation"]:
if result["reservation"].customer_id:
customer_query = select(Customer).where(
@@ -1182,6 +1094,77 @@ class ConversionService:
return matched_reservation
async def _match_by_guest_details_hashed(
    self,
    guest_first_name: str | None,
    guest_last_name: str | None,
    guest_email: str | None,
    session: AsyncSession | None = None,
) -> HashedCustomer | None:
    """Match hashed guest details directly against ``HashedCustomer`` rows.

    The Reservation table is not consulted. A row qualifies when its hashed
    email equals ``guest_email`` OR both its hashed given name and hashed
    surname equal the supplied values. When several rows qualify, an email
    match is preferred; otherwise the first row is used. Rows are ordered by
    primary key so that this choice is the same on every run.

    Args:
        guest_first_name: Guest first name (pre-hashed)
        guest_last_name: Guest last name (pre-hashed)
        guest_email: Guest email (pre-hashed)
        session: AsyncSession to use. If None, uses self.session.

    Returns:
        Matched HashedCustomer, or None when no usable criteria were given
        or no row matched.
    """
    if session is None:
        session = self.session

    # A name only counts as a criterion when both parts are present.
    conditions = []
    if guest_email:
        conditions.append(HashedCustomer.hashed_email == guest_email)
    if guest_first_name and guest_last_name:
        conditions.append(
            (HashedCustomer.hashed_given_name == guest_first_name)
            & (HashedCustomer.hashed_surname == guest_last_name)
        )
    if not conditions:
        return None

    # Without ORDER BY the database may return rows in any order, which would
    # make the "first match" fallback below arbitrary.
    query = (
        select(HashedCustomer)
        .options(selectinload(HashedCustomer.customer))
        .where(or_(*conditions))
        .order_by(HashedCustomer.id)
    )
    matches = (await session.execute(query)).scalars().all()

    if not matches:
        return None
    if len(matches) == 1:
        return matches[0]

    # Several candidates: an email match is a stronger signal than a name match.
    if guest_email:
        for candidate in matches:
            if candidate.hashed_email == guest_email:
                _LOGGER.debug(
                    "Multiple hashed customer matches, preferring email match"
                )
                return candidate

    _LOGGER.warning(
        "Multiple hashed customer matches found for guest details, using first match"
    )
    return matches[0]
async def _match_by_guest_details(
self,
hotel_id: str | None,
@@ -1521,10 +1504,12 @@ class ConversionService:
and uses their stored hashed data to match to existing reservations/customers.
No XML parsing, no re-hashing - complete separation of concerns.
This enables:
- Matching historical data that wasn't just created
- Re-running matching logic independently
- Consistent hashing (using already-hashed data from DB)
Matching rules:
1. Guest detail matches: Record in conversion_guests table with hashed_customer reference
2. ID-based matches (md5_hash/click_id): Can also record directly in conversions-reservations
with directly_attributable=True
3. Regular guest detection: Check if conversions exist with dates before all reservations,
or if conversion_room dates match reservation dates
Updates stats dictionary in-place if provided.
@@ -1540,7 +1525,7 @@ class ConversionService:
result = await session.execute(
select(Conversion)
.where(Conversion.pms_reservation_id == pms_reservation_id)
.options(selectinload(Conversion.guest))
.options(selectinload(Conversion.guest), selectinload(Conversion.conversion_rooms))
)
conversion = result.scalar_one_or_none()
@@ -1584,24 +1569,55 @@ class ConversionService:
# Update the conversion with matched entities if found
if matched_reservation or matched_customer or matched_hashed_customer:
conversion.reservation_id = (
matched_reservation.id if matched_reservation else None
)
conversion.customer_id = (
matched_customer.id if matched_customer else None
)
conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
# Set attribution flags based on match type
# Set attribution flags and matched entities based on match type
if match_type == "id":
# ID-based matches (fbclid/gclid/md5_unique_id) are always directly attributable
# Link to both reservation and customer
conversion.reservation_id = (
matched_reservation.id if matched_reservation else None
)
conversion.customer_id = (
matched_customer.id if matched_customer else None
)
conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
conversion.directly_attributable = True
conversion.guest_matched = False
is_attributable = True
# Check if guest is regular for ID matches
if matched_reservation:
await self._check_if_regular(conversion, matched_reservation, session)
elif match_type == "guest_details":
conversion.directly_attributable = False
# Guest detail matches: link to customer/hashed_customer directly (NO reservation)
# Only link to reservation if dates match
conversion.customer_id = (
matched_customer.id if matched_customer else None
)
conversion.hashed_customer_id = (
matched_hashed_customer.id if matched_hashed_customer else None
)
conversion.guest_matched = True
# For guest-detail matches, we don't have a matched_reservation
# Instead, check if dates align with any existing reservations for this customer
is_attributable = await self._check_if_attributable_guest_match(
matched_customer, conversion, session
)
conversion.directly_attributable = is_attributable
# Check if guest is regular (if we have a customer to reference)
if matched_customer:
await self._check_if_regular_by_customer(
conversion, matched_customer, session
)
# Update conversion_guest with hashed_customer reference if matched
if conversion_guest and matched_hashed_customer:
conversion_guest.hashed_customer_id = matched_hashed_customer.id
conversion.updated_at = datetime.now()
# Update stats if provided
@@ -1614,3 +1630,240 @@ class ConversionService:
stats["matched_to_hashed_customer"] += 1
else:
stats["unmatched"] += 1
async def _check_if_regular(
    self,
    conversion: Conversion,
    matched_reservation: Reservation,
    session: AsyncSession,
) -> None:
    """Set ``conversion.guest.is_regular`` for a reservation-matched conversion.

    The guest is flagged as regular when the earliest paying conversion of
    this guest at this hotel (a conversion room with revenue > 0) is dated
    before the earliest reservation of the matched reservation's customer.
    If either side has no dated record, the flag is set to False. Nothing is
    changed when the conversion has no guest or the reservation has no
    customer.

    Args:
        conversion: The Conversion record being evaluated
        matched_reservation: The matched Reservation record
        session: AsyncSession for database queries
    """
    if not conversion.guest or not matched_reservation.customer_id:
        return

    # Earliest paying conversion of this guest at this hotel. Rows without a
    # reservation date are excluded: they cannot be ordered meaningfully and
    # would make the date comparison below raise TypeError.
    paying_query = (
        select(Conversion)
        .join(ConversionRoom, Conversion.id == ConversionRoom.conversion_id)
        .where(
            Conversion.hotel_id == conversion.hotel_id,
            Conversion.guest_id == conversion.guest_id,
            Conversion.reservation_date.isnot(None),
            ConversionRoom.total_revenue.isnot(None),
            ConversionRoom.total_revenue > Decimal(0),
        )
        .order_by(Conversion.reservation_date.asc())
        .limit(1)
    )
    earliest_paying_conversion = (
        await session.execute(paying_query)
    ).scalar_one_or_none()
    if not earliest_paying_conversion:
        conversion.guest.is_regular = False
        return

    # Earliest reservation of the matched customer, again ignoring undated rows.
    reservation_query = (
        select(Reservation)
        .where(
            Reservation.customer_id == matched_reservation.customer_id,
            Reservation.start_date.isnot(None),
        )
        .order_by(Reservation.start_date.asc())
        .limit(1)
    )
    earliest_reservation = (
        await session.execute(reservation_query)
    ).scalar_one_or_none()
    if not earliest_reservation:
        conversion.guest.is_regular = False
        return

    # Regular means the guest was already a paying customer before their
    # first reservation.
    is_regular = (
        earliest_paying_conversion.reservation_date < earliest_reservation.start_date
    )
    conversion.guest.is_regular = is_regular
    if is_regular:
        _LOGGER.info(
            "Marking guest as regular: earliest paying conversion date %s is before first reservation %s",
            earliest_paying_conversion.reservation_date,
            earliest_reservation.start_date,
        )
async def _check_if_attributable(
    self,
    matched_reservation: Reservation,
    conversion: Conversion,
    session: AsyncSession,
    *,
    tolerance_days: int = 7,
) -> bool:
    """Decide whether a guest-detail match is attributable via date alignment.

    The conversion is attributable when at least one of its rooms has an
    arrival date within ``tolerance_days`` of the reservation's start date
    AND a departure date within ``tolerance_days`` of its end date. Rooms or
    reservations with a missing date never match.

    Args:
        matched_reservation: The matched Reservation record
        conversion: The Conversion record being evaluated
        session: AsyncSession for database queries (not used by this check)
        tolerance_days: Maximum allowed difference in days for each of the
            two date comparisons. Defaults to 7.

    Returns:
        True if any room's dates align with the reservation, False otherwise
    """
    if not conversion.conversion_rooms or not matched_reservation:
        return False

    start_date = matched_reservation.start_date
    end_date = matched_reservation.end_date
    if not (start_date and end_date):
        return False

    for room in conversion.conversion_rooms:
        if not (room.arrival_date and room.departure_date):
            continue
        arrival_gap = abs((room.arrival_date - start_date).days)
        departure_gap = abs((room.departure_date - end_date).days)
        if arrival_gap <= tolerance_days and departure_gap <= tolerance_days:
            _LOGGER.info(
                "Marking conversion as attributable: room dates %s-%s match reservation dates %s-%s",
                room.arrival_date,
                room.departure_date,
                start_date,
                end_date,
            )
            return True
    return False
async def _check_if_attributable_guest_match(
    self,
    matched_customer: Customer | None,
    conversion: Conversion,
    session: AsyncSession,
) -> bool:
    """Tell whether a guest-detail match lines up with one of the customer's stays.

    A guest-detail match carries no specific reservation, so every reservation
    of the matched customer is compared against every room of the conversion.
    The match is attributable as soon as one room/reservation pair has both
    arrival vs. start and departure vs. end within 7 days of each other.

    Args:
        matched_customer: The matched Customer record
        conversion: The Conversion record being evaluated
        session: AsyncSession for database queries

    Returns:
        True if conversion dates align with any reservation, False otherwise
    """
    if not matched_customer or not conversion.conversion_rooms:
        return False

    stmt = select(Reservation).where(Reservation.customer_id == matched_customer.id)
    customer_reservations = (await session.execute(stmt)).scalars().all()
    if not customer_reservations:
        return False

    for room in conversion.conversion_rooms:
        # Rooms with a missing date cannot be compared.
        if not (room.arrival_date and room.departure_date):
            continue
        for reservation in customer_reservations:
            # Same for reservations with a missing date.
            if not (reservation.start_date and reservation.end_date):
                continue
            arrival_gap = abs((room.arrival_date - reservation.start_date).days)
            departure_gap = abs((room.departure_date - reservation.end_date).days)
            if arrival_gap > 7 or departure_gap > 7:
                continue
            _LOGGER.info(
                "Marking guest-detail match as attributable: room dates %s-%s match reservation dates %s-%s",
                room.arrival_date,
                room.departure_date,
                reservation.start_date,
                reservation.end_date,
            )
            return True
    return False
async def _check_if_regular_by_customer(
    self,
    conversion: Conversion,
    matched_customer: Customer,
    session: AsyncSession,
) -> None:
    """Set ``conversion.guest.is_regular`` for a customer-matched conversion.

    Used for guest-detail matches, where no specific reservation is known.
    The guest is flagged as regular when the earliest paying conversion of
    this guest at this hotel (a conversion room with revenue > 0) is dated
    before the customer's earliest reservation. If either side has no dated
    record, the flag is set to False. Nothing is changed when the conversion
    has no guest or the customer has no id.

    Args:
        conversion: The Conversion record being evaluated
        matched_customer: The matched Customer record
        session: AsyncSession for database queries
    """
    if not conversion.guest or not matched_customer.id:
        return

    # Earliest paying conversion of this guest at this hotel. Rows without a
    # reservation date are excluded: they cannot be ordered meaningfully and
    # would make the date comparison below raise TypeError.
    paying_query = (
        select(Conversion)
        .join(ConversionRoom, Conversion.id == ConversionRoom.conversion_id)
        .where(
            Conversion.hotel_id == conversion.hotel_id,
            Conversion.guest_id == conversion.guest_id,
            Conversion.reservation_date.isnot(None),
            ConversionRoom.total_revenue.isnot(None),
            ConversionRoom.total_revenue > Decimal(0),
        )
        .order_by(Conversion.reservation_date.asc())
        .limit(1)
    )
    earliest_paying_conversion = (
        await session.execute(paying_query)
    ).scalar_one_or_none()
    if not earliest_paying_conversion:
        conversion.guest.is_regular = False
        return

    # Earliest reservation of the matched customer, again ignoring undated rows.
    reservation_query = (
        select(Reservation)
        .where(
            Reservation.customer_id == matched_customer.id,
            Reservation.start_date.isnot(None),
        )
        .order_by(Reservation.start_date.asc())
        .limit(1)
    )
    earliest_reservation = (
        await session.execute(reservation_query)
    ).scalar_one_or_none()
    if not earliest_reservation:
        conversion.guest.is_regular = False
        return

    # Regular means the guest was already a paying customer before their
    # first reservation.
    is_regular = (
        earliest_paying_conversion.reservation_date < earliest_reservation.start_date
    )
    conversion.guest.is_regular = is_regular
    if is_regular:
        _LOGGER.info(
            "Marking guest as regular (via customer): earliest paying conversion date %s is before first reservation %s",
            earliest_paying_conversion.reservation_date,
            earliest_reservation.start_date,
        )

View File

@@ -395,6 +395,9 @@ class ConversionGuest(Base):
hashed_country_code = Column(String(64))
hashed_birth_date = Column(String(64))
# Matched customer reference (nullable, filled after matching)
hashed_customer_id = Column(Integer, ForeignKey("hashed_customers.id"), nullable=True, index=True)
# Guest classification
is_regular = Column(Boolean, default=False) # True if guest has many prior stays before appearing in our reservations
@@ -404,6 +407,7 @@ class ConversionGuest(Base):
# Relationships
conversions = relationship("Conversion", back_populates="guest")
hashed_customer = relationship("HashedCustomer", backref="conversion_guests")
@staticmethod
def _normalize_and_hash(value):