Fix concurrent guest creation: extract and deduplicate guests from XML (Phase 0), then bulk-upsert them with PostgreSQL ON CONFLICT (Phase 1) before processing reservations

This commit is contained in:
Jonas Linter
2025-11-19 17:27:47 +01:00
parent d81ba79256
commit 42b353d241

View File

@@ -7,7 +7,8 @@ from datetime import UTC, datetime
from decimal import Decimal
from typing import Any
from sqlalchemy import or_, select
from sqlalchemy import insert, or_, select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload
@@ -76,76 +77,138 @@ class ConversionService:
f"session must be AsyncSession or SessionMaker, got {type(session)}"
)
async def _extract_unique_guests_from_xml(
    self, reservations: list
) -> dict[tuple[str, str | None], dict]:
    """Extract and deduplicate all guest data from XML reservations.

    Phase 0: Single pass through XML to collect all unique guests.
    Uses (hotel_id, guest_id) as the key for deduplication; when the same
    key occurs more than once, the last occurrence in the XML wins.

    Declared async only for interface symmetry with the other phase
    methods — it performs no awaits itself.

    Args:
        reservations: List of XML ``<reservation>`` elements (ElementTree-like:
            must support ``.get(attr)`` and ``.find(tag)``).

    Returns:
        Dictionary mapping (hotel_id, guest_id) to a guest data dict with the
        raw attribute values, the parsed birth date, and a shared ``now``
        timestamp.
    """
    guest_data_by_key: dict[tuple[str, str | None], dict] = {}
    # One shared timestamp so first_seen/last_seen are consistent across the batch.
    now = datetime.now(UTC)
    for reservation_elem in reservations:
        hotel_id = reservation_elem.get("hotelID")
        guest_elem = reservation_elem.find("guest")
        if guest_elem is None:
            _LOGGER.debug(
                "No guest element found, skipping reservation %s "
                "(will be created with guest_id=None in Phase 2)",
                reservation_elem.get("id"),
            )
            continue
        guest_id = guest_elem.get("id")
        guest_first_name = guest_elem.get("firstName")
        guest_last_name = guest_elem.get("lastName")
        guest_email = guest_elem.get("email")
        guest_country_code = guest_elem.get("countryCode")
        guest_birth_date_str = guest_elem.get("dateOfBirth")
        guest_birth_date = None
        if guest_birth_date_str:
            try:
                guest_birth_date = datetime.strptime(guest_birth_date_str, "%Y-%m-%d").date()
            except ValueError:
                # Malformed date is tolerated: the guest is kept, only the
                # birth date is dropped.
                _LOGGER.warning("Invalid birth date format: %s", guest_birth_date_str)
        key = (hotel_id, guest_id)
        # Store guest data by key (last occurrence from the XML wins).
        guest_data_by_key[key] = {
            "hotel_id": hotel_id,
            "guest_id": guest_id,
            "guest_first_name": guest_first_name,
            "guest_last_name": guest_last_name,
            "guest_email": guest_email,
            "guest_country_code": guest_country_code,
            "guest_birth_date": guest_birth_date,
            "now": now,
        }
    return guest_data_by_key
async def _bulk_upsert_guests(
    self, session: AsyncSession, guest_data_by_key: dict[tuple[str, str | None], dict]
) -> None:
    """Bulk upsert all unique guests to database using PostgreSQL ON CONFLICT.

    Phase 1: Atomic upsert of all guests extracted in Phase 0.
    Uses ON CONFLICT DO UPDATE to handle duplicates safely without race
    conditions. Processes in batches to avoid overwhelming the database
    with large XML files.

    NOTE(review): ``on_conflict_do_update(index_elements=["hotel_id",
    "guest_id"])`` requires a unique index/constraint on exactly those
    columns; an earlier docstring in this service claimed no such constraint
    exists — confirm the migration adding it has landed. Also, under a
    standard unique constraint rows with ``guest_id`` NULL never conflict
    (NULLs are distinct), so NULL-keyed guests could still accumulate
    duplicates — TODO confirm (PostgreSQL 15+ ``NULLS NOT DISTINCT`` or a
    partial index would close this).

    Args:
        session: AsyncSession to use (caller owns commit/rollback).
        guest_data_by_key: Dictionary mapping (hotel_id, guest_id) to guest
            data, as produced by ``_extract_unique_guests_from_xml``.
    """
    if not guest_data_by_key:
        return
    # Batch the inserts so a huge XML file doesn't turn into one enormous
    # multi-row statement.
    batch_size = 1000
    items = list(guest_data_by_key.items())
    for batch_start in range(0, len(items), batch_size):
        batch_end = min(batch_start + batch_size, len(items))
        # Prepare the row values for this batch.
        values_list = []
        for _key, guest_data in items[batch_start:batch_end]:
            now = guest_data["now"]
            birth_date = guest_data["guest_birth_date"]
            values_list.append(
                {
                    "hotel_id": guest_data["hotel_id"],
                    "guest_id": guest_data["guest_id"],
                    "guest_first_name": guest_data["guest_first_name"],
                    "guest_last_name": guest_data["guest_last_name"],
                    "guest_email": guest_data["guest_email"],
                    "guest_country_code": guest_data["guest_country_code"],
                    "guest_birth_date": birth_date,
                    "hashed_first_name": ConversionGuest._normalize_and_hash(
                        guest_data["guest_first_name"]
                    ),
                    "hashed_last_name": ConversionGuest._normalize_and_hash(
                        guest_data["guest_last_name"]
                    ),
                    "hashed_email": ConversionGuest._normalize_and_hash(
                        guest_data["guest_email"]
                    ),
                    "hashed_country_code": ConversionGuest._normalize_and_hash(
                        guest_data["guest_country_code"]
                    ),
                    "hashed_birth_date": ConversionGuest._normalize_and_hash(
                        birth_date.isoformat() if birth_date else None
                    ),
                    "is_regular": False,
                    "first_seen": now,
                    "last_seen": now,
                }
            )
        # Use PostgreSQL ON CONFLICT DO UPDATE for an atomic upsert:
        # new guests are inserted, existing ones get their data and
        # last_seen refreshed. first_seen is deliberately NOT in set_,
        # so it is preserved for existing rows.
        stmt = pg_insert(ConversionGuest).values(values_list)
        stmt = stmt.on_conflict_do_update(
            index_elements=["hotel_id", "guest_id"],
            set_={
                "guest_first_name": stmt.excluded.guest_first_name,
                "guest_last_name": stmt.excluded.guest_last_name,
                "guest_email": stmt.excluded.guest_email,
                "guest_country_code": stmt.excluded.guest_country_code,
                "guest_birth_date": stmt.excluded.guest_birth_date,
                "hashed_first_name": stmt.excluded.hashed_first_name,
                "hashed_last_name": stmt.excluded.hashed_last_name,
                "hashed_email": stmt.excluded.hashed_email,
                "hashed_country_code": stmt.excluded.hashed_country_code,
                "hashed_birth_date": stmt.excluded.hashed_birth_date,
                "last_seen": stmt.excluded.last_seen,
            },
        )
        await session.execute(stmt)
        _LOGGER.info(
            "Phase 1: Upserted batch %d-%d of %d guests",
            batch_start + 1,
            batch_end,
            len(items),
        )
async def process_conversion_xml(self, xml_content: str) -> dict[str, Any]:
"""Parse conversion XML and save daily sales data to database.
@@ -203,33 +266,69 @@ class ConversionService:
if self.session_maker:
await session.close()
# Process active reservations in two phases:
# Phase 1: Create/update all conversion records
# Phase 2: Match them to existing reservations/customers
# Process active reservations in four phases:
# Phase 0: Extract and deduplicate all guest data from XML
# Phase 1: Bulk upsert all unique guests to DB (no race conditions)
# Phase 2: Create/update all conversion records (ConversionGuests already exist)
# Phase 3: Match them to existing reservations/customers
reservations = root.findall("reservation")
stats["total_reservations"] = len(reservations)
if not reservations:
return stats
# Phase 1: Create/update all conversions (no matching, no XML parsing beyond this point)
_LOGGER.info("Processing %d reservations in xml", len(reservations))
# Phase 0: Extract and deduplicate all guest data from XML
_LOGGER.debug("Phase 0: Extracting and deduplicating guest data from XML")
guest_data_by_key = await self._extract_unique_guests_from_xml(reservations)
_LOGGER.info(
"Phase 0: Extracted %d unique guests from %d reservations",
len(guest_data_by_key),
len(reservations),
)
# Phase 1: Bulk upsert all unique guests to database
if guest_data_by_key:
_LOGGER.debug("Phase 1: Bulk upserting %d unique guests to database", len(guest_data_by_key))
if self.session_maker:
session = await self.session_maker.create_session()
else:
session = self.session
try:
await self._bulk_upsert_guests(session, guest_data_by_key)
await session.commit()
_LOGGER.info("Phase 1: Successfully upserted %d guests", len(guest_data_by_key))
except Exception as e:
await session.rollback()
_LOGGER.exception("Phase 1: Error during bulk guest upsert: %s", e)
stats["errors"] += len(guest_data_by_key)
return stats
finally:
if self.session_maker:
await session.close()
# Phase 2: Create/update all conversions (no matching, ConversionGuests already exist)
# Returns list of successfully created pms_reservation_ids
_LOGGER.debug("Phase 2: Creating/updating conversions")
if self.supports_concurrent:
pms_reservation_ids = await self._process_reservations_concurrent(reservations, stats)
else:
pms_reservation_ids = await self._process_reservations_sequential(reservations, stats)
_LOGGER.debug(
"Phase 2: Found %d successfully created conversions out of %d total reservations",
"Phase 3: Found %d successfully created conversions out of %d total reservations",
len(pms_reservation_ids),
len(reservations),
)
# Phase 2: Match all conversions using database data only
# Phase 3: Match all conversions using database data only
# Only match conversions that actually exist in the database
# No XML parsing, no re-hashing - complete separation of concerns
# This also enables matching historical data that wasn't just created
if pms_reservation_ids:
_LOGGER.debug("Phase 3: Matching conversions to reservations/customers")
if self.supports_concurrent:
await self._match_conversions_from_db_concurrent(pms_reservation_ids, stats)
else:
@@ -319,13 +418,24 @@ class ConversionService:
"""
semaphore = asyncio.Semaphore(1) # Process one at a time
results = []
async with asyncio.TaskGroup() as tg:
tasks = []
for reservation in reservations:
task = tg.create_task(
self._process_reservation_safe(reservation, semaphore, stats)
)
tasks.append(task)
tasks = []
try:
async with asyncio.TaskGroup() as tg:
for reservation in reservations:
task = tg.create_task(
self._process_reservation_safe(reservation, semaphore, stats)
)
tasks.append(task)
except ExceptionGroup as eg:
# Fail fast: cancel all remaining tasks and re-raise the first exception
for task in tasks:
if not task.done():
task.cancel()
# Re-raise the first exception from the group
if eg.exceptions:
raise eg.exceptions[0] from eg
raise
# Collect results from all tasks
for task in tasks:
@@ -355,13 +465,24 @@ class ConversionService:
semaphore = asyncio.Semaphore(MAX_CONCURRENT_RESERVATIONS)
results = []
async with asyncio.TaskGroup() as tg:
tasks = []
for reservation in reservations:
task = tg.create_task(
self._process_reservation_safe(reservation, semaphore, stats)
)
tasks.append(task)
tasks = []
try:
async with asyncio.TaskGroup() as tg:
for reservation in reservations:
task = tg.create_task(
self._process_reservation_safe(reservation, semaphore, stats)
)
tasks.append(task)
except ExceptionGroup as eg:
# Fail fast: cancel all remaining tasks and re-raise the first exception
for task in tasks:
if not task.done():
task.cancel()
# Re-raise the first exception from the group
if eg.exceptions:
raise eg.exceptions[0] from eg
raise
# Collect results from all tasks
for task in tasks:
@@ -548,6 +669,8 @@ class ConversionService:
)
return stats
# ConversionGuests have already been bulk-upserted in Phase 1,
# so we can safely create/update conversions now
# Check if conversion already exists (upsert logic)
existing_result = await session.execute(
select(Conversion).where(
@@ -609,21 +732,6 @@ class ConversionService:
# Flush to ensure conversion has an ID before creating room reservations
await session.flush()
# Create or update ConversionGuest and link it to the conversion
# The conversion is linked to ConversionGuest via composite FK (hotel_id, guest_id)
# So we just need to ensure ConversionGuest exists - the FK is already set via hotel_id + guest_id
conversion_guest = await self._get_or_create_conversion_guest(
hotel_id=hotel_id,
guest_id=guest_id,
guest_first_name=guest_first_name,
guest_last_name=guest_last_name,
guest_email=guest_email,
guest_country_code=guest_country_code,
guest_birth_date=guest_birth_date,
session=session,
)
# guest_id is already set on conversion, so the composite FK relationship is established
# Batch-load existing room reservations to avoid N+1 queries
room_numbers = [
rm.get("roomNumber") for rm in room_reservations.findall("roomReservation")
@@ -724,15 +832,6 @@ class ConversionService:
# Check if room reservation already exists using batch-loaded data
existing_room_reservation = existing_rooms.get(pms_hotel_reservation_id)
if total_revenue > 0 and (
guest_first_name is None
and guest_last_name is None
and guest_email is None
):
_LOGGER.info(
"Guest info missing but total revenue > 0 for PMS ID %s",
pms_reservation_id,
)
if existing_room_reservation:
# Update existing room reservation with all fields