From 08f85d1b2656d2c16eefa6721effa1d031542e11 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 10:41:34 +0100 Subject: [PATCH 01/20] Holy db migrations batman --- ...c3_remove_composite_fk_from_conversions.py | 51 +++ config/alpinebits.log | 231 ++++++++++ database_schema_analysis.md | 396 ++++++++++++++++++ reset_database.sh | 47 --- reset_db.sh | 28 ++ src/alpine_bits_python/alpine_bits_helpers.py | 4 +- src/alpine_bits_python/api.py | 2 +- src/alpine_bits_python/conversion_service.py | 4 +- src/alpine_bits_python/csv_import.py | 2 +- src/alpine_bits_python/db.py | 33 +- src/alpine_bits_python/db_setup.py | 6 +- src/alpine_bits_python/email_monitoring.py | 4 +- src/alpine_bits_python/reservation_service.py | 2 +- src/alpine_bits_python/schemas.py | 2 +- .../util/migrate_sqlite_to_postgres.py | 2 +- src/alpine_bits_python/webhook_processor.py | 4 +- tests/test_alpine_bits_server_read.py | 12 +- tests/test_api.py | 4 +- tests/test_conversion_service.py | 2 +- 19 files changed, 752 insertions(+), 84 deletions(-) create mode 100644 alembic/versions/2025_12_03_0950-694d52a883c3_remove_composite_fk_from_conversions.py create mode 100644 database_schema_analysis.md delete mode 100644 reset_database.sh create mode 100755 reset_db.sh diff --git a/alembic/versions/2025_12_03_0950-694d52a883c3_remove_composite_fk_from_conversions.py b/alembic/versions/2025_12_03_0950-694d52a883c3_remove_composite_fk_from_conversions.py new file mode 100644 index 0000000..e26cb73 --- /dev/null +++ b/alembic/versions/2025_12_03_0950-694d52a883c3_remove_composite_fk_from_conversions.py @@ -0,0 +1,51 @@ +"""remove_composite_fk_from_conversions + +Revision ID: 694d52a883c3 +Revises: b50c0f45030a +Create Date: 2025-12-03 09:50:18.506030 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '694d52a883c3' +down_revision: Union[str, Sequence[str], None] = 'b50c0f45030a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint(op.f('conversions_hotel_id_guest_id_fkey'), 'conversions', type_='foreignkey') + + # Rename hotel_code to hotel_id (preserving data) and add FK to hotels + op.add_column('reservations', sa.Column('hotel_id', sa.String(), nullable=True)) + op.execute('UPDATE reservations SET hotel_id = hotel_code') + op.drop_column('reservations', 'hotel_code') + + # Add FK constraint without immediate validation (NOT VALID) + # This allows existing rows with non-existent hotel_ids to remain + # Future inserts/updates will still be validated + op.execute( + 'ALTER TABLE reservations ADD CONSTRAINT fk_reservations_hotel_id_hotels ' + 'FOREIGN KEY (hotel_id) REFERENCES hotels (hotel_id) ON DELETE CASCADE NOT VALID' + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + # Drop FK and rename hotel_id back to hotel_code (preserving data) + op.drop_constraint(op.f('fk_reservations_hotel_id_hotels'), 'reservations', type_='foreignkey') + op.add_column('reservations', sa.Column('hotel_code', sa.VARCHAR(), autoincrement=False, nullable=True)) + op.execute('UPDATE reservations SET hotel_code = hotel_id') + op.drop_column('reservations', 'hotel_id') + + op.create_foreign_key(op.f('conversions_hotel_id_guest_id_fkey'), 'conversions', 'conversion_guests', ['hotel_id', 'guest_id'], ['hotel_id', 'guest_id'], ondelete='SET NULL') + # ### end Alembic commands ### diff --git a/config/alpinebits.log b/config/alpinebits.log index 9994643..9b084f3 100644 --- a/config/alpinebits.log +++ b/config/alpinebits.log @@ -392994,3 +392994,234 @@ DETAIL: Key (hotel_id, guest_id)=(39054_001, 28275) is not present in table "co 2025-11-25 12:03:35 - alpine_bits_python.api - INFO - Email service shut down 2025-11-25 12:03:35 - alpine_bits_python.api - INFO - Application shutdown complete 2025-11-25 12:03:35 - alpine_bits_python.worker_coordination - INFO - Released primary worker lock (pid=22943) +2025-12-03 08:59:46 - root - INFO - Logging to file: config/alpinebits.log +2025-12-03 08:59:46 - root - INFO - Logging configured at INFO level +2025-12-03 08:59:46 - alpine_bits_python.notification_service - INFO - Registered notification backend: pushover +2025-12-03 08:59:46 - alpine_bits_python.notification_manager - INFO - Registered pushover backend with priority 0 +2025-12-03 08:59:46 - alpine_bits_python.notification_manager - INFO - Notification service configured with backends: ['pushover'] +2025-12-03 08:59:46 - alpine_bits_python.api - INFO - Application startup initiated (primary_worker=True) +2025-12-03 08:59:46 - alpine_bits_python.db - INFO - Configured database schema: alpinebits +2025-12-03 08:59:46 - alpine_bits_python.db - INFO - Setting PostgreSQL search_path to: alpinebits,public +2025-12-03 08:59:46 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_NOTIF_REPORT +2025-12-03 08:59:46 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_PING +2025-12-03 08:59:46 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_RES_NOTIF_GUEST_REQUESTS +2025-12-03 08:59:46 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_READ +2025-12-03 08:59:46 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: wix_form +2025-12-03 08:59:46 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: generic +2025-12-03 08:59:46 - alpine_bits_python.webhook_processor - INFO - Webhook processors initialized +2025-12-03 08:59:46 - alpine_bits_python.api - INFO - Webhook processors initialized +2025-12-03 08:59:46 - alpine_bits_python.api - INFO - Hotel 39054_001 has no push_endpoint configured +2025-12-03 08:59:46 - alpine_bits_python.api - INFO - Hotel 135 has no push_endpoint configured +2025-12-03 08:59:46 - alpine_bits_python.api - INFO - Hotel 39052_001 has no push_endpoint configured +2025-12-03 08:59:46 - alpine_bits_python.api - INFO - Hotel 39040_001 has no push_endpoint configured +2025-12-03 08:59:46 - alpine_bits_python.api - INFO - Running startup tasks (primary worker)... 
+2025-12-03 08:59:46 - alpine_bits_python.hotel_service - INFO - Config sync complete: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 08:59:46 - alpine_bits_python.db_setup - INFO - Config sync: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 08:59:47 - alpine_bits_python.db_setup - INFO - Backfilling advertising account IDs for existing reservations... +2025-12-03 08:59:47 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with account configurations +2025-12-03 08:59:47 - alpine_bits_python.db_setup - INFO - Backfilling usernames for existing acked_requests... +2025-12-03 08:59:47 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with usernames in config +2025-12-03 08:59:47 - alpine_bits_python.db_setup - INFO - Checking for stuck webhooks to reprocess... +2025-12-03 08:59:47 - alpine_bits_python.db_setup - INFO - No stuck webhooks found +2025-12-03 08:59:47 - alpine_bits_python.api - INFO - Startup tasks completed +2025-12-03 08:59:47 - alpine_bits_python.api - INFO - Webhook periodic cleanup task started +2025-12-03 08:59:47 - alpine_bits_python.api - INFO - Application startup complete +2025-12-03 08:59:51 - alpine_bits_python.api - INFO - Application shutdown initiated +2025-12-03 08:59:51 - alpine_bits_python.api - INFO - Webhook cleanup task cancelled +2025-12-03 08:59:51 - alpine_bits_python.api - INFO - Webhook cleanup task stopped +2025-12-03 08:59:51 - alpine_bits_python.email_service - INFO - Shutting down email service thread pool +2025-12-03 08:59:51 - alpine_bits_python.email_service - INFO - Email service thread pool shut down complete +2025-12-03 08:59:51 - alpine_bits_python.api - INFO - Email service shut down +2025-12-03 08:59:51 - alpine_bits_python.api - INFO - Application shutdown complete +2025-12-03 08:59:51 - alpine_bits_python.worker_coordination - INFO - Released primary worker lock (pid=9801) +2025-12-03 10:38:22 - root - INFO - Logging to file: config/alpinebits.log +2025-12-03 10:38:22 - root - INFO - Logging configured at INFO level +2025-12-03 10:38:22 - alpine_bits_python.notification_service - INFO - Registered notification backend: pushover +2025-12-03 10:38:22 - alpine_bits_python.notification_manager - INFO - Registered pushover backend with priority 0 +2025-12-03 10:38:22 - alpine_bits_python.notification_manager - INFO - Notification service configured with backends: ['pushover'] +2025-12-03 10:38:22 - alpine_bits_python.api - INFO - Application startup initiated (primary_worker=True) +2025-12-03 10:38:22 - alpine_bits_python.db - INFO - Configured database schema: alpinebits +2025-12-03 10:38:22 - alpine_bits_python.db - INFO - Setting PostgreSQL search_path to: alpinebits,public +2025-12-03 10:38:22 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_NOTIF_REPORT +2025-12-03 10:38:22 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_PING +2025-12-03 10:38:22 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_RES_NOTIF_GUEST_REQUESTS +2025-12-03 10:38:22 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_READ +2025-12-03 10:38:22 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: wix_form +2025-12-03 10:38:22 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: generic +2025-12-03 10:38:22 - alpine_bits_python.webhook_processor - 
INFO - Webhook processors initialized +2025-12-03 10:38:22 - alpine_bits_python.api - INFO - Webhook processors initialized +2025-12-03 10:38:22 - alpine_bits_python.api - INFO - Hotel 39054_001 has no push_endpoint configured +2025-12-03 10:38:22 - alpine_bits_python.api - INFO - Hotel 135 has no push_endpoint configured +2025-12-03 10:38:22 - alpine_bits_python.api - INFO - Hotel 39052_001 has no push_endpoint configured +2025-12-03 10:38:22 - alpine_bits_python.api - INFO - Hotel 39040_001 has no push_endpoint configured +2025-12-03 10:38:22 - alpine_bits_python.api - INFO - Running startup tasks (primary worker)... +2025-12-03 10:38:22 - alpine_bits_python.hotel_service - INFO - Created hotel: 39054_001 +2025-12-03 10:38:22 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 39054_001, type=wix_form, secret=JWreZtpYZIMDALw71zlLStFcQFdZbBXGGhVd379GX6oeDJE2iZLebCi0Sw2d8A0T +2025-12-03 10:38:22 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 39054_001, type=generic, secret=BzBT1xmoHA4EIpupE8YOY2r9dfWG4FJY7pEU4eDD_5RW3cKRRMJXLp6JRlY3Egr3 +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created hotel: 135 +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 135, type=wix_form, secret=0vbn5mCJBIRcHtK2DS9AWFebF8LncbpcR0sDJ7zctD3wWgdPZLdiIO-743HwiljT +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 135, type=generic, secret=ci12B1Q81uvSwpyHppL5n1T5tYRXeJnv2cP4OkWH2FoShlMCYWEuvkmxdLhvR50N +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created hotel: 39052_001 +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 39052_001, type=wix_form, secret=V4BcT_XGcGJg7hcHhH2IVupcW4u231R711tdI-eiv15a-cSyaMlRnqrhUqNh0csC +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 39052_001, type=generic, secret=x1M6_NYYXrHEC3aXFPkyglprNC6U5OhBFT4TW9E8SmEnpSRq0xm_ApWv4-Vl-pe3 +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created hotel: 39040_001 +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 39040_001, type=wix_form, secret=5JMgT0EI0CnRgp7jaHE1rCHQwZFMv1t9wn1yWJEBR5j_2Zrcqz_4W5g6pJBvZw4l +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Created webhook endpoint for hotel 39040_001, type=generic, secret=lrYRwnHMq5B1I_XEH7cUoOPx95zzzfrmJcRoh9C_Rd-WD3kl4F0M-UNetAlRbMVU +2025-12-03 10:38:23 - alpine_bits_python.hotel_service - INFO - Config sync complete: 4 hotels created, 0 updated, 8 endpoints created +2025-12-03 10:38:23 - alpine_bits_python.db_setup - INFO - Config sync: 4 hotels created, 0 updated, 8 endpoints created +2025-12-03 10:38:24 - alpine_bits_python.db_setup - INFO - Backfilling advertising account IDs for existing reservations... 
+2025-12-03 10:38:24 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with account configurations +2025-12-03 10:38:40 - root - INFO - Logging to file: config/alpinebits.log +2025-12-03 10:38:40 - root - INFO - Logging configured at INFO level +2025-12-03 10:38:40 - alpine_bits_python.notification_service - INFO - Registered notification backend: pushover +2025-12-03 10:38:40 - alpine_bits_python.notification_manager - INFO - Registered pushover backend with priority 0 +2025-12-03 10:38:40 - alpine_bits_python.notification_manager - INFO - Notification service configured with backends: ['pushover'] +2025-12-03 10:38:40 - alpine_bits_python.api - INFO - Application startup initiated (primary_worker=True) +2025-12-03 10:38:40 - alpine_bits_python.db - INFO - Configured database schema: alpinebits +2025-12-03 10:38:40 - alpine_bits_python.db - INFO - Setting PostgreSQL search_path to: alpinebits,public +2025-12-03 10:38:40 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_NOTIF_REPORT +2025-12-03 10:38:40 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_PING +2025-12-03 10:38:40 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_RES_NOTIF_GUEST_REQUESTS +2025-12-03 10:38:40 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_READ +2025-12-03 10:38:40 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: wix_form +2025-12-03 10:38:40 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: generic +2025-12-03 10:38:40 - alpine_bits_python.webhook_processor - INFO - Webhook processors initialized +2025-12-03 10:38:40 - alpine_bits_python.api - INFO - Webhook processors initialized +2025-12-03 10:38:40 - alpine_bits_python.api - INFO - Hotel 39054_001 has no push_endpoint configured +2025-12-03 10:38:40 - alpine_bits_python.api - INFO - Hotel 135 has no push_endpoint configured +2025-12-03 10:38:40 - alpine_bits_python.api - INFO - Hotel 39052_001 has no push_endpoint configured +2025-12-03 10:38:40 - alpine_bits_python.api - INFO - Hotel 39040_001 has no push_endpoint configured +2025-12-03 10:38:40 - alpine_bits_python.api - INFO - Running startup tasks (primary worker)... +2025-12-03 10:38:40 - alpine_bits_python.hotel_service - INFO - Config sync complete: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 10:38:40 - alpine_bits_python.db_setup - INFO - Config sync: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 10:38:41 - alpine_bits_python.db_setup - INFO - Backfilling advertising account IDs for existing reservations... 
+2025-12-03 10:38:41 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with account configurations +2025-12-03 10:38:53 - root - INFO - Logging to file: config/alpinebits.log +2025-12-03 10:38:53 - root - INFO - Logging configured at INFO level +2025-12-03 10:38:53 - alpine_bits_python.notification_service - INFO - Registered notification backend: pushover +2025-12-03 10:38:53 - alpine_bits_python.notification_manager - INFO - Registered pushover backend with priority 0 +2025-12-03 10:38:53 - alpine_bits_python.notification_manager - INFO - Notification service configured with backends: ['pushover'] +2025-12-03 10:38:53 - alpine_bits_python.api - INFO - Application startup initiated (primary_worker=True) +2025-12-03 10:38:53 - alpine_bits_python.db - INFO - Configured database schema: alpinebits +2025-12-03 10:38:53 - alpine_bits_python.db - INFO - Setting PostgreSQL search_path to: alpinebits,public +2025-12-03 10:38:53 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_NOTIF_REPORT +2025-12-03 10:38:53 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_PING +2025-12-03 10:38:53 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_RES_NOTIF_GUEST_REQUESTS +2025-12-03 10:38:53 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_READ +2025-12-03 10:38:53 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: wix_form +2025-12-03 10:38:53 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: generic +2025-12-03 10:38:53 - alpine_bits_python.webhook_processor - INFO - Webhook processors initialized +2025-12-03 10:38:53 - alpine_bits_python.api - INFO - Webhook processors initialized +2025-12-03 10:38:53 - alpine_bits_python.api - INFO - Hotel 39054_001 has no push_endpoint configured +2025-12-03 10:38:53 - alpine_bits_python.api - INFO - Hotel 135 has no push_endpoint configured +2025-12-03 10:38:53 - alpine_bits_python.api - INFO - Hotel 39052_001 has no push_endpoint configured +2025-12-03 10:38:53 - alpine_bits_python.api - INFO - Hotel 39040_001 has no push_endpoint configured +2025-12-03 10:38:53 - alpine_bits_python.api - INFO - Running startup tasks (primary worker)... +2025-12-03 10:38:53 - alpine_bits_python.hotel_service - INFO - Config sync complete: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 10:38:53 - alpine_bits_python.db_setup - INFO - Config sync: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 10:38:54 - alpine_bits_python.db_setup - INFO - Backfilling advertising account IDs for existing reservations... +2025-12-03 10:38:54 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with account configurations +2025-12-03 10:38:54 - alpine_bits_python.db_setup - INFO - Backfilling usernames for existing acked_requests... +2025-12-03 10:38:54 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with usernames in config +2025-12-03 10:38:54 - alpine_bits_python.db_setup - INFO - Checking for stuck webhooks to reprocess... 
+2025-12-03 10:38:54 - alpine_bits_python.db_setup - INFO - No stuck webhooks found +2025-12-03 10:38:54 - alpine_bits_python.api - INFO - Startup tasks completed +2025-12-03 10:38:54 - alpine_bits_python.api - INFO - Webhook periodic cleanup task started +2025-12-03 10:38:54 - alpine_bits_python.api - INFO - Application startup complete +2025-12-03 10:39:31 - alpine_bits_python.api - INFO - Application shutdown initiated +2025-12-03 10:39:31 - alpine_bits_python.api - INFO - Webhook cleanup task cancelled +2025-12-03 10:39:31 - alpine_bits_python.api - INFO - Webhook cleanup task stopped +2025-12-03 10:39:31 - alpine_bits_python.email_service - INFO - Shutting down email service thread pool +2025-12-03 10:39:31 - alpine_bits_python.email_service - INFO - Email service thread pool shut down complete +2025-12-03 10:39:31 - alpine_bits_python.api - INFO - Email service shut down +2025-12-03 10:39:31 - alpine_bits_python.api - INFO - Application shutdown complete +2025-12-03 10:39:31 - alpine_bits_python.worker_coordination - INFO - Released primary worker lock (pid=34567) +2025-12-03 10:39:34 - root - INFO - Logging to file: config/alpinebits.log +2025-12-03 10:39:34 - root - INFO - Logging configured at INFO level +2025-12-03 10:39:34 - alpine_bits_python.notification_service - INFO - Registered notification backend: pushover +2025-12-03 10:39:34 - alpine_bits_python.notification_manager - INFO - Registered pushover backend with priority 0 +2025-12-03 10:39:34 - alpine_bits_python.notification_manager - INFO - Notification service configured with backends: ['pushover'] +2025-12-03 10:39:34 - alpine_bits_python.api - INFO - Application startup initiated (primary_worker=True) +2025-12-03 10:39:34 - alpine_bits_python.db - INFO - Configured database schema: alpinebits +2025-12-03 10:39:34 - alpine_bits_python.db - INFO - Setting PostgreSQL search_path to: alpinebits,public +2025-12-03 10:39:34 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_NOTIF_REPORT +2025-12-03 10:39:34 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_PING +2025-12-03 10:39:34 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_HOTEL_RES_NOTIF_GUEST_REQUESTS +2025-12-03 10:39:34 - alpine_bits_python.alpinebits_server - INFO - Initializing action instance for AlpineBitsActionName.OTA_READ +2025-12-03 10:39:34 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: wix_form +2025-12-03 10:39:34 - alpine_bits_python.webhook_processor - INFO - Registered webhook processor: generic +2025-12-03 10:39:34 - alpine_bits_python.webhook_processor - INFO - Webhook processors initialized +2025-12-03 10:39:34 - alpine_bits_python.api - INFO - Webhook processors initialized +2025-12-03 10:39:34 - alpine_bits_python.api - INFO - Hotel 39054_001 has no push_endpoint configured +2025-12-03 10:39:34 - alpine_bits_python.api - INFO - Hotel 135 has no push_endpoint configured +2025-12-03 10:39:34 - alpine_bits_python.api - INFO - Hotel 39052_001 has no push_endpoint configured +2025-12-03 10:39:34 - alpine_bits_python.api - INFO - Hotel 39040_001 has no push_endpoint configured +2025-12-03 10:39:34 - alpine_bits_python.api - INFO - Running startup tasks (primary worker)... 
+2025-12-03 10:39:34 - alpine_bits_python.hotel_service - INFO - Config sync complete: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 10:39:34 - alpine_bits_python.db_setup - INFO - Config sync: 0 hotels created, 4 updated, 0 endpoints created +2025-12-03 10:39:35 - alpine_bits_python.db_setup - INFO - Backfilling advertising account IDs for existing reservations... +2025-12-03 10:39:35 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with account configurations +2025-12-03 10:39:35 - alpine_bits_python.db_setup - INFO - Backfilling usernames for existing acked_requests... +2025-12-03 10:39:35 - alpine_bits_python.db_setup - INFO - Found 4 hotel(s) with usernames in config +2025-12-03 10:39:35 - alpine_bits_python.db_setup - INFO - Checking for stuck webhooks to reprocess... +2025-12-03 10:39:35 - alpine_bits_python.db_setup - INFO - No stuck webhooks found +2025-12-03 10:39:35 - alpine_bits_python.api - INFO - Startup tasks completed +2025-12-03 10:39:35 - alpine_bits_python.api - INFO - Webhook periodic cleanup task started +2025-12-03 10:39:35 - alpine_bits_python.api - INFO - Application startup complete +2025-12-03 10:39:50 - alpine_bits_python.api - INFO - AlpineBits authentication successful for user: bemelman (from config) +2025-12-03 10:39:50 - alpine_bits_python.api - INFO - XML file queued for processing: logs/conversions_import/file_bemelman_20251203_103950.xml by user bemelman (original: file.xml) +2025-12-03 10:39:50 - alpine_bits_python.api - INFO - Starting database processing of file.xml +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Loaded 1764 reservations into cache +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Reservation cache initialized with 6 hotel codes +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Processing deleted reservation: Hotel 39054_001, PMS ID 74423 +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Processing 32 reservations in xml +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 0: Extracted 24 unique guests from 32 reservations +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 1: Successfully upserted 24 guests +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 32770 (pms_id=65675) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 36046 (pms_id=71642) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 34158 (pms_id=68197) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 36811 (pms_id=73332) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 35904 (pms_id=71360) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37465 (pms_id=74400) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37466 (pms_id=74401) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37471 (pms_id=74406) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37472 (pms_id=74407) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37473 (pms_id=74408) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37474 (pms_id=74409) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37475 
(pms_id=74410) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37476 (pms_id=74412) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37477 (pms_id=74411) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37478 (pms_id=74413) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37479 (pms_id=74414) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37398 (pms_id=74315) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37212 (pms_id=74028) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37480 (pms_id=74415) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37210 (pms_id=74027) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37481 (pms_id=74416) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37483 (pms_id=74417) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37446 (pms_id=74380) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37437 (pms_id=74369) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37484 (pms_id=74418) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37482 (pms_id=74419) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37486 (pms_id=74420) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37487 (pms_id=74421) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37489 (pms_id=74422) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37485 (pms_id=74424) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37488 (pms_id=74425) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Updated conversion 37490 (pms_id=74426) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 3a: Matched conversion by advertising ID (pms_id=74401, reservation_id=1736) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 3a: Matched conversion by advertising ID (pms_id=74411, reservation_id=1751) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 3a: Matched conversion by advertising ID (pms_id=74027, reservation_id=503) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 3a: Matched conversion by advertising ID (pms_id=74028, reservation_id=503) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 3a: Matched conversion by advertising ID (pms_id=74413, reservation_id=1749) +2025-12-03 10:39:50 - alpine_bits_python.conversion_service - INFO - Phase 3a: Matched conversion by advertising ID (pms_id=74424, reservation_id=1754) +2025-12-03 10:39:54 - alpine_bits_python.conversion_service - INFO - Phase 3b: Found 22138 unique guests from 34438 unmatched conversions +2025-12-03 10:40:12 - alpine_bits_python.api - INFO - Conversion processing complete for file.xml: {'total_reservations': 32, 'deleted_reservations': 1, 'total_daily_sales': 501, 'matched_to_reservation': 6, 'matched_to_customer': 0, 'matched_to_hashed_customer': 0, 'unmatched': 26, 
'errors': 0} +2025-12-03 10:41:22 - alpine_bits_python.api - INFO - Application shutdown initiated +2025-12-03 10:41:22 - alpine_bits_python.api - INFO - Webhook cleanup task cancelled +2025-12-03 10:41:22 - alpine_bits_python.api - INFO - Webhook cleanup task stopped +2025-12-03 10:41:22 - alpine_bits_python.email_service - INFO - Shutting down email service thread pool +2025-12-03 10:41:22 - alpine_bits_python.email_service - INFO - Email service thread pool shut down complete +2025-12-03 10:41:22 - alpine_bits_python.api - INFO - Email service shut down +2025-12-03 10:41:22 - alpine_bits_python.api - INFO - Application shutdown complete +2025-12-03 10:41:22 - alpine_bits_python.worker_coordination - INFO - Released primary worker lock (pid=34833) diff --git a/database_schema_analysis.md b/database_schema_analysis.md new file mode 100644 index 0000000..c2f981d --- /dev/null +++ b/database_schema_analysis.md @@ -0,0 +1,396 @@ +# Database Schema Analysis + +## Overview +This document analyzes the database schema for normalization issues, redundancy, and potential improvements. + +## Schema Summary +The database contains 13 tables organized around several core concepts: +- **Customer/Guest Management**: `customers`, `hashed_customers`, `conversion_guests` +- **Reservations**: `reservations`, `conversions`, `conversion_rooms` +- **Hotels**: `hotels`, `hotel_inventory`, `room_availability` +- **Webhooks**: `webhook_endpoints`, `webhook_requests` +- **Tracking**: `acked_requests` + +--- + +## Major Issues Identified + +### 1. **CRITICAL: Dual Customer Systems (Data Duplication)** + +**Problem**: The schema maintains two parallel customer tracking systems: +- `customers` + `hashed_customers` (from Wix forms) +- `conversion_guests` (from PMS) + +**Impact**: +- Same person can exist in both systems with no linkage +- `conversion_guests.hashed_customer_id` attempts to link but this is backward (many-to-one instead of one-to-one) +- Data inconsistency when same guest appears in both sources + +**Details**: +``` +customers (id=1, email="john@example.com") + └─ hashed_customers (id=1, customer_id=1, hashed_email="abc123...") + +conversion_guests (hotel_id="HOTEL1", guest_id=42, guest_email="john@example.com") + └─ hashed_customer_id = NULL (or points to hashed_customers.id=1 after matching) +``` + +**Recommendation**: +- Create a unified `persons` table with a `source` field ("wix", "pms", "merged") +- Both `customers` and `conversion_guests` should reference this unified entity +- Implement proper guest matching/merging logic + +--- + +### 2. **Data Redundancy: Hashed Values Stored Separately** + +**Problem**: `hashed_customers` and `conversion_guests` store hashed values in separate columns alongside originals. + +**Current Structure**: +``` +customers: + - email_address (plaintext) + - phone (plaintext) + +hashed_customers: + - customer_id (FK to customers) + - hashed_email + - hashed_phone + - hashed_given_name + ... 
+``` + +**Issues**: +- Violates 3NF (derived data stored in separate table) +- Synchronization required between `customers` and `hashed_customers` +- If customer data changes, hashed version can become stale +- Extra JOIN required for every Meta Conversion API call + +**Better Approach**: +Option A: Store hashed values directly in `customers` table as additional columns +Option B: Compute hashes on-the-fly (SHA256 is fast, ~1-2ms per hash) + +**Recommendation**: +- **Short term**: Keep current structure but add triggers to auto-update hashed values +- **Long term**: Move hashed columns into `customers` table directly + +--- + +### 3. **Advertising Account IDs Duplicated Across Tables** + +**Problem**: `meta_account_id` and `google_account_id` appear in 3 places: +- `hotels` table (canonical source) +- `reservations` table (copied at creation time) +- Derived from `fbclid`/`gclid` tracking parameters + +**Current Flow**: +``` +hotels.meta_account_id = "123456" + ↓ +reservation created with fbclid + ↓ +reservations.meta_account_id = "123456" (copied from hotels) +``` + +**Issues**: +- Denormalization without clear benefit +- If hotel's account ID changes, old reservations have stale data +- Mixed source of truth (sometimes from hotels, sometimes from tracking params) + +**Recommendation**: +- Remove `meta_account_id` and `google_account_id` from `reservations` +- Always derive from `hotels` table via JOIN +- If tracking-derived account differs from hotel's account, log a warning + +--- + +### 4. **Hotel Information Duplicated in Reservations** + +**Problem**: `reservations` table stores `hotel_code` and `hotel_name` but has no FK to `hotels` table. + +**Issues**: +- Data can become inconsistent if hotel name changes +- No referential integrity +- Unclear if `hotel_code` matches `hotels.hotel_id` + +**Recommendation**: +- Add `hotel_id` FK column to `reservations` pointing to `hotels.hotel_id` +- Remove `hotel_code` and `hotel_name` columns +- Derive hotel information via JOIN when needed + +--- + +### 5. **Weak Foreign Key Consistency** + +**Problem**: Mixed use of `ON DELETE` policies: +- Some FKs use `SET NULL` (appropriate for nullable relationships) +- Some use `CASCADE` (appropriate for child records) +- Some use `NO ACTION` (prevents deletion, may cause issues) +- `conversions` table has confusing composite FK setup with `hotel_id` and `guest_id` + +**Examples**: +```sql +-- Good: Child data should be deleted with parent +hotel_inventory.hotel_id → hotels.hotel_id (ON DELETE CASCADE) + +-- Questionable: Should webhook requests survive hotel deletion? +webhook_requests.hotel_id → hotels.hotel_id (ON DELETE NO ACTION) + +-- Inconsistent: Why SET NULL vs CASCADE? +reservations.customer_id → customers.id (ON DELETE SET NULL) +reservations.hashed_customer_id → hashed_customers.id (ON DELETE CASCADE) +``` + +**Recommendation**: +Review each FK and establish consistent policies: +- Core data (hotels, customers): SET NULL to preserve historical records +- Supporting data (hashed_customers, inventory): CASCADE +- Transactional data (webhooks, conversions): Decide on retention policy + +--- + +### 6. 
**Confusing Composite Foreign Key in Conversions** + +**Problem**: The `conversions` table has a composite FK that's incorrectly mapped: + +```python +# In db.py lines 650-655 +__table_args__ = ( + ForeignKeyConstraint( + ["hotel_id", "guest_id"], + ["conversion_guests.hotel_id", "conversion_guests.guest_id"], + ondelete="SET NULL", + ), +) +``` + +**But the database shows**: +``` +Foreign Keys: + hotel_id -> conversion_guests.hotel_id (ON DELETE SET NULL) + guest_id -> conversion_guests.hotel_id (ON DELETE SET NULL) # ← WRONG! + guest_id -> conversion_guests.guest_id (ON DELETE SET NULL) + hotel_id -> conversion_guests.guest_id (ON DELETE SET NULL) # ← WRONG! +``` + +**Impact**: +- Database has 4 FKs instead of 1 composite FK +- Mapping is incorrect (guest_id → hotel_id doesn't make sense) +- Could cause constraint violations or allow orphaned records + +**Recommendation**: +- Fix the composite FK definition in SQLAlchemy +- Run a migration to drop incorrect FKs and recreate properly + +--- + +### 7. **Unclear Relationship Between Reservations and Conversions** + +**Problem**: The relationship between `reservations` (from Wix forms) and `conversions` (from PMS) is complex: + +``` +conversions: + - reservation_id (FK to reservations) - matched by tracking IDs + - customer_id (FK to customers) - matched by guest details + - hashed_customer_id (FK to hashed_customers) - matched by hashed guest details + - guest_id (FK to conversion_guests) - the actual PMS guest +``` + +**Issues**: +- Three different FK fields to three different customer/guest tables +- Matching logic is unclear from schema alone +- `directly_attributable` and `guest_matched` flags indicate matching quality, but this should be more explicit + +**Recommendation**: +- Add a `match_confidence` enum field: "exact_id", "high_confidence", "medium_confidence", "no_match" +- Add `match_method` field to explain how the link was made +- Consider a separate `reservation_conversion_links` table to make the many-to-many relationship explicit + +--- + +### 8. **Room Type Information Scattered** + +**Problem**: Room information appears in multiple places: +- `reservations.room_type_code`, `room_classification_code`, `room_type` +- `conversion_rooms.room_type`, `room_number` +- `hotel_inventory.inv_type_code`, `inv_code`, `room_name` + +**Issues**: +- No clear master data for room types +- Room type codes not standardized across sources +- No FK between `reservations.room_type_code` and `hotel_inventory.inv_type_code` + +**Recommendation**: +- Create a `room_types` reference table linked to hotels +- Add FKs from reservations and conversion_rooms to room_types +- Standardize room type codes across all sources + +--- + +## Normalization Analysis + +### 1st Normal Form (1NF): ✅ PASS +- All columns contain atomic values +- **Exception**: `reservations.children_ages` stores comma-separated values + - Should be: separate `reservation_children` table with age column + +### 2nd Normal Form (2NF): ⚠️ MOSTLY PASS +- All non-key attributes depend on the full primary key +- **Issue**: Some denormalized data exists (hotel names, account IDs in reservations) + +### 3rd Normal Form (3NF): ❌ FAIL +Multiple violations: +- `hashed_customers` stores derived data (hashes) that depend on `customers` +- `reservations.meta_account_id` depends on `hotels` via hotel_code +- `reservations.hotel_name` depends on `hotels` via hotel_code + +--- + +## Data Integrity Issues + +### Missing Foreign Keys +1. **reservations.hotel_code** → should FK to hotels.hotel_id +2. 
**reservations.room_type_code** → should FK to hotel_inventory +3. **acked_requests.unique_id** → should FK to reservations.unique_id (or be nullable) + +### Missing Indexes +Consider adding for query performance: +1. `customers.email_address` - for lookups during conversion matching +2. `conversions.reservation_date` - for time-based queries +3. `conversion_rooms.total_revenue` - for revenue analytics +4. `reservations.start_date`, `end_date` - for date range queries + +### Missing Constraints +1. **Check constraints** for date logic: + - `reservations.end_date > start_date` + - `conversion_rooms.departure_date > arrival_date` + +2. **Check constraints** for counts: + - `num_adults >= 0`, `num_children >= 0` + +3. **NOT NULL constraints** on critical fields: + - `customers.contact_id` should be NOT NULL (it's the natural key) + - `conversions.hotel_id` is NOT NULL ✓ (good) + +--- + +## Recommendations Priority + +### HIGH PRIORITY (Data Integrity) +1. Fix composite FK in `conversions` table (lines 650-655 in db.py) +2. Add `hotel_id` FK to `reservations` table +3. Add missing NOT NULL constraints on natural keys +4. Add check constraints for date ranges and counts + +### MEDIUM PRIORITY (Normalization) +5. Unify customer/guest systems into a single `persons` entity +6. Remove duplicate account ID fields from `reservations` +7. Remove `hotel_name` from `reservations` (derive via JOIN) +8. Create `reservation_children` table for children_ages + +### LOW PRIORITY (Performance & Cleanup) +9. Move hashed fields into `customers` table (remove `hashed_customers`) +10. Add indexes for common query patterns +11. Create `room_types` reference table +12. Add `match_confidence` and `match_method` to `conversions` + +--- + +## Positive Aspects + +✅ Good use of composite keys (`conversion_guests`, `hotel_inventory`) +✅ Unique constraints on natural keys (`contact_id`, `webhook_secret`) +✅ Proper use of indexes on frequently queried fields +✅ Cascade deletion for child records (inventory, rooms) +✅ Tracking metadata (created_at, updated_at, first_seen, last_seen) +✅ Webhook deduplication via `payload_hash` +✅ JSON storage for flexible data (`conversion_rooms.daily_sales`) + +--- + +## Suggested Refactoring Path + +### Phase 1: Fix Critical Issues (1-2 days) +- Fix composite FK in conversions +- Add hotel_id FK to reservations +- Add missing constraints + +### Phase 2: Normalize Customer Data (3-5 days) +- Create unified persons/guests table +- Migrate existing data +- Update matching logic + +### Phase 3: Clean Up Redundancy (2-3 days) +- Remove duplicate account IDs +- Merge hashed_customers into customers +- Create room_types reference + +### Phase 4: Enhance Tracking (1-2 days) +- Add match_confidence fields +- Improve conversion attribution +- Add missing indexes + +--- + +## Query Examples Affected by Current Issues + +### Issue: Duplicate Customer Data +```sql +-- Current: Find all reservations for a guest (requires checking both systems) +SELECT r.* FROM reservations r +WHERE r.customer_id = ? + OR r.hashed_customer_id IN ( + SELECT id FROM hashed_customers WHERE contact_id = ? 
+ ); + +-- After fix: Simple unified query +SELECT r.* FROM reservations r +WHERE r.person_id = ?; +``` + +### Issue: Missing Hotel FK +```sql +-- Current: Get hotel info for reservation (unreliable) +SELECT r.*, r.hotel_name +FROM reservations r +WHERE r.id = ?; + +-- After fix: Reliable JOIN +SELECT r.*, h.hotel_name, h.meta_account_id +FROM reservations r +JOIN hotels h ON r.hotel_id = h.hotel_id +WHERE r.id = ?; +``` + +### Issue: Hashed Data in Separate Table +```sql +-- Current: Get customer for Meta API (requires JOIN) +SELECT hc.hashed_email, hc.hashed_phone +FROM reservations r +JOIN hashed_customers hc ON r.hashed_customer_id = hc.id +WHERE r.id = ?; + +-- After fix: Direct access +SELECT c.hashed_email, c.hashed_phone +FROM reservations r +JOIN customers c ON r.customer_id = c.id +WHERE r.id = ?; +``` + +--- + +## Conclusion + +The schema is **functional but has significant normalization and consistency issues**. The main problems are: + +1. **Dual customer tracking systems** that should be unified +2. **Redundant storage of derived data** (hashes, account IDs) +3. **Missing foreign key relationships** (hotels, room types) +4. **Inconsistent deletion policies** across foreign keys +5. **Broken composite foreign key** in conversions table + +The database violates 3NF in several places and could benefit from a refactoring effort. However, the issues are primarily architectural rather than critical bugs, so the system can continue operating while improvements are made incrementally. + +**Estimated effort to fix all issues**: 1-2 weeks of development + testing +**Risk level**: Medium (requires data migration and careful FK updates) +**Recommended approach**: Incremental fixes starting with high-priority items diff --git a/reset_database.sh b/reset_database.sh deleted file mode 100644 index a43691c..0000000 --- a/reset_database.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# Reset database and initialize Alembic from scratch - -echo "=== Database Reset Script ===" -echo "This will drop all tables and reinitialize with Alembic" -echo "" -read -p "Are you sure? (type 'yes' to continue): " confirm - -if [ "$confirm" != "yes" ]; then - echo "Aborted." - exit 1 -fi - -echo "" -echo "Step 1: Dropping all tables in the database..." -echo "Connect to your database and run:" -echo "" -echo " -- For PostgreSQL:" -echo " DROP SCHEMA public CASCADE;" -echo " CREATE SCHEMA public;" -echo " GRANT ALL ON SCHEMA public TO ;" -echo " GRANT ALL ON SCHEMA public TO public;" -echo "" -echo " -- Or if using a custom schema (e.g., alpinebits):" -echo " DROP SCHEMA alpinebits CASCADE;" -echo " CREATE SCHEMA alpinebits;" -echo "" -echo "Press Enter after you've run the SQL commands..." -read - -echo "" -echo "Step 2: Running Alembic migrations..." -uv run alembic upgrade head - -if [ $? -eq 0 ]; then - echo "" - echo "=== Success! ===" - echo "Database has been reset and migrations applied." - echo "" - echo "Current migration status:" - uv run alembic current -else - echo "" - echo "=== Error ===" - echo "Migration failed. Check the error messages above." - exit 1 -fi diff --git a/reset_db.sh b/reset_db.sh new file mode 100755 index 0000000..85e0dfb --- /dev/null +++ b/reset_db.sh @@ -0,0 +1,28 @@ +#!/bin/bash + + +# Recreate the database: run DROP and CREATE in separate psql calls (DROP DATABASE cannot run inside a transaction block) +if ! 
docker exec -i meta_timescaledb psql -U meta_user -d postgres -c "DROP DATABASE IF EXISTS meta_insights;"; then + echo "Error: failed to drop database 'meta_insights'." >&2 + exit 1 +fi + +if ! docker exec -i meta_timescaledb psql -U meta_user -d postgres -c "CREATE DATABASE meta_insights;"; then + echo "Error: failed to create database 'meta_insights'." >&2 + exit 1 +fi + +# then import dump specified by argument only if previous commands succeeded +if [ -n "$1" ]; then + DUMP_FILE="$1" + if [ ! -r "$DUMP_FILE" ]; then + echo "Error: dump file '$DUMP_FILE' does not exist or is not readable." >&2 + exit 2 + fi + + echo "Importing dump from $DUMP_FILE" + if ! docker exec -i meta_timescaledb psql -U meta_user -d meta_insights < "$DUMP_FILE"; then + echo "Error: failed to import dump '$DUMP_FILE' into 'meta_insights'." >&2 + exit 3 + fi +fi \ No newline at end of file diff --git a/src/alpine_bits_python/alpine_bits_helpers.py b/src/alpine_bits_python/alpine_bits_helpers.py index 27fad1e..ced4dcd 100644 --- a/src/alpine_bits_python/alpine_bits_helpers.py +++ b/src/alpine_bits_python/alpine_bits_helpers.py @@ -768,9 +768,9 @@ def _process_single_reservation( hotel_reservation_id=[hotel_res_id] ) - if reservation.hotel_code is None: + if reservation.hotel_id is None: raise ValueError("Reservation hotel_code is None") - hotel_code = str(reservation.hotel_code) + hotel_code = str(reservation.hotel_id) hotel_name = None if reservation.hotel_name is None else str(reservation.hotel_name) basic_property_info = HotelReservation.ResGlobalInfo.BasicPropertyInfo( diff --git a/src/alpine_bits_python/api.py b/src/alpine_bits_python/api.py index faab88f..27ef88a 100644 --- a/src/alpine_bits_python/api.py +++ b/src/alpine_bits_python/api.py @@ -138,7 +138,7 @@ async def push_listener(customer: DBCustomer, reservation: DBReservation, hotel) server: AlpineBitsServer = app.state.alpine_bits_server hotel_id = hotel["hotel_id"] - reservation_hotel_id = reservation.hotel_code + reservation_hotel_id = reservation.hotel_id # Double-check hotel matching (should be guaranteed by dispatcher) if hotel_id != reservation_hotel_id: diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 1b78b98..43847f6 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -642,7 +642,7 @@ class ConversionService: # Organize by hotel_code for efficient lookup for reservation in reservations: - hotel_code = reservation.hotel_code + hotel_code = reservation.hotel_id if hotel_code not in self._reservation_cache: self._reservation_cache[hotel_code] = [] # Cache the hashed customer - prefer direct relationship, fall back to customer relationship @@ -1095,7 +1095,7 @@ class ConversionService: # Add hotel filter if available if hotel_id: - query = query.where(Reservation.hotel_code == hotel_id) + query = query.where(Reservation.hotel_id == hotel_id) # Execute query db_result = await session.execute(query) diff --git a/src/alpine_bits_python/csv_import.py b/src/alpine_bits_python/csv_import.py index 22539e5..cf50d0c 100644 --- a/src/alpine_bits_python/csv_import.py +++ b/src/alpine_bits_python/csv_import.py @@ -472,7 +472,7 @@ class CSVImporter: num_adults=num_adults, num_children=num_children, children_ages=children_ages, - hotel_code=final_hotel_code, + hotel_id=final_hotel_code, hotel_name=final_hotel_name, offer=str(row.get("room_offer", "")).strip() or None, user_comment=str(row.get("message", "")).strip() or None, diff --git 
a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py index ec8a499..f367e8c 100644 --- a/src/alpine_bits_python/db.py +++ b/src/alpine_bits_python/db.py @@ -27,7 +27,7 @@ from sqlalchemy.ext.asyncio import ( async_sessionmaker, create_async_engine, ) -from sqlalchemy.orm import backref, declarative_base, relationship +from sqlalchemy.orm import backref, declarative_base, foreign, relationship from .const import WebhookStatus from .logging_config import get_logger @@ -435,7 +435,13 @@ class ConversionGuest(Base): last_seen = Column(DateTime(timezone=True)) # Relationships - conversions = relationship("Conversion", back_populates="guest") + conversions = relationship( + "Conversion", + back_populates="guest", + foreign_keys="[Conversion.hotel_id, Conversion.guest_id]", + primaryjoin="and_(ConversionGuest.hotel_id == foreign(Conversion.hotel_id), " + "ConversionGuest.guest_id == foreign(Conversion.guest_id))", + ) hashed_customer = relationship("HashedCustomer", backref="conversion_guests") @staticmethod @@ -541,8 +547,8 @@ class Reservation(Base): # Advertising account IDs (stored conditionally based on fbclid/gclid presence) meta_account_id = Column(String) google_account_id = Column(String) - # Add hotel_code and hotel_name for XML - hotel_code = Column(String) + # Add hotel_id and hotel_name for XML + hotel_id = Column(String, ForeignKey("hotels.hotel_id", ondelete="CASCADE")) hotel_name = Column(String) # RoomTypes fields (optional) room_type_code = Column(String) @@ -569,7 +575,7 @@ class AckedRequest(Base): ) # Username of the client making the request unique_id = Column( String, index=True - ) # Should match Reservation.form_id or another unique field + ) # Matches the md5_unique_id in Reservation timestamp = Column(DateTime(timezone=True)) @@ -646,13 +652,10 @@ class Conversion(Base): created_at = Column(DateTime(timezone=True)) # When this record was imported updated_at = Column(DateTime(timezone=True)) # When this record was last updated - # Composite foreign key constraint for ConversionGuest (hotel_id, guest_id) + # Table constraints + # Note: The relationship to ConversionGuest is handled via SQLAlchemy ORM + # by matching (hotel_id, guest_id) pairs, no DB-level FK constraint needed __table_args__ = ( - ForeignKeyConstraint( - ["hotel_id", "guest_id"], - ["conversion_guests.hotel_id", "conversion_guests.guest_id"], - ondelete="SET NULL", - ), UniqueConstraint( "hotel_id", "pms_reservation_id", name="uq_conversion_hotel_reservation" ), @@ -662,7 +665,13 @@ class Conversion(Base): reservation = relationship("Reservation", backref="conversions") customer = relationship("Customer", backref="conversions") hashed_customer = relationship("HashedCustomer", backref="conversions") - guest = relationship("ConversionGuest", back_populates="conversions") + guest = relationship( + "ConversionGuest", + back_populates="conversions", + foreign_keys="[Conversion.hotel_id, Conversion.guest_id]", + primaryjoin="and_(Conversion.hotel_id == ConversionGuest.hotel_id, " + "Conversion.guest_id == ConversionGuest.guest_id)", + ) conversion_rooms = relationship( "ConversionRoom", back_populates="conversion", cascade="all, delete-orphan" ) diff --git a/src/alpine_bits_python/db_setup.py b/src/alpine_bits_python/db_setup.py index b56a28a..eaf3538 100644 --- a/src/alpine_bits_python/db_setup.py +++ b/src/alpine_bits_python/db_setup.py @@ -115,7 +115,7 @@ async def backfill_advertising_account_ids( sql = text( "UPDATE reservations " "SET meta_account_id = :meta_account " - "WHERE hotel_code = 
:hotel_id " + "WHERE hotel_id = :hotel_id " "AND fbclid IS NOT NULL " "AND fbclid != '' " "AND (meta_account_id IS NULL OR meta_account_id = '')" @@ -141,7 +141,7 @@ async def backfill_advertising_account_ids( sql = text( "UPDATE reservations " "SET google_account_id = :google_account " - "WHERE hotel_code = :hotel_id " + "WHERE hotel_id = :hotel_id " "AND gclid IS NOT NULL " "AND gclid != '' " "AND (google_account_id IS NULL OR google_account_id = '')" @@ -215,7 +215,7 @@ async def backfill_acked_requests_username( UPDATE acked_requests SET username = :username WHERE unique_id IN ( - SELECT md5_unique_id FROM reservations WHERE hotel_code = :hotel_id + SELECT md5_unique_id FROM reservations WHERE hotel_id = :hotel_id ) AND username IS NULL """ diff --git a/src/alpine_bits_python/email_monitoring.py b/src/alpine_bits_python/email_monitoring.py index 0c7ce55..cb004a9 100644 --- a/src/alpine_bits_python/email_monitoring.py +++ b/src/alpine_bits_python/email_monitoring.py @@ -523,10 +523,10 @@ class ReservationStatsCollector: async with self.async_sessionmaker() as session: # Query reservations created in the reporting period result = await session.execute( - select(Reservation.hotel_code, func.count(Reservation.id)) + select(Reservation.hotel_id, func.count(Reservation.id)) .where(Reservation.created_at >= period_start) .where(Reservation.created_at < period_end) - .group_by(Reservation.hotel_code) + .group_by(Reservation.hotel_id) ) hotel_counts = dict(result.all()) diff --git a/src/alpine_bits_python/reservation_service.py b/src/alpine_bits_python/reservation_service.py index 12c0a2a..c04752e 100644 --- a/src/alpine_bits_python/reservation_service.py +++ b/src/alpine_bits_python/reservation_service.py @@ -181,7 +181,7 @@ class ReservationService: if end_date: filters.append(Reservation.created_at <= end_date) if hotel_code: - filters.append(Reservation.hotel_code == hotel_code) + filters.append(Reservation.hotel_id == hotel_code) if filters: query = query.where(and_(*filters)) diff --git a/src/alpine_bits_python/schemas.py b/src/alpine_bits_python/schemas.py index 852cf84..638c546 100644 --- a/src/alpine_bits_python/schemas.py +++ b/src/alpine_bits_python/schemas.py @@ -131,7 +131,7 @@ class ReservationData(BaseModel): num_adults: int = Field(..., ge=1) num_children: int = Field(0, ge=0, le=10) children_ages: list[int] = Field(default_factory=list) - hotel_code: str = Field(..., min_length=1, max_length=50) + hotel_id: str = Field(..., min_length=1, max_length=50) hotel_name: str | None = Field(None, max_length=200) offer: str | None = Field(None, max_length=500) user_comment: str | None = Field(None, max_length=2000) diff --git a/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py b/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py index 5eefc1b..10094f5 100644 --- a/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py +++ b/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py @@ -306,7 +306,7 @@ async def migrate_data( user_comment=reservation.user_comment, fbclid=reservation.fbclid, gclid=reservation.gclid, - hotel_code=reservation.hotel_code, + hotel_code=reservation.hotel_id, hotel_name=reservation.hotel_name, room_type_code=reservation.room_type_code, room_classification_code=reservation.room_classification_code, diff --git a/src/alpine_bits_python/webhook_processor.py b/src/alpine_bits_python/webhook_processor.py index e051c7e..633e916 100644 --- a/src/alpine_bits_python/webhook_processor.py +++ b/src/alpine_bits_python/webhook_processor.py @@ -247,7 +247,7 
@@ async def process_wix_form_submission( num_adults=num_adults, num_children=num_children, children_ages=children_ages, - hotel_code=hotel_code, + hotel_id=hotel_code, hotel_name=hotel_name, offer=offer, created_at=submissionTime, @@ -575,7 +575,7 @@ async def process_generic_webhook_submission( "num_adults": num_adults, "num_children": num_children, "children_ages": children_ages, - "hotel_code": hotel_code, + "hotel_id": hotel_code, "hotel_name": hotel_name, "offer": selected_offers_str, "utm_source": utm_source, diff --git a/tests/test_alpine_bits_server_read.py b/tests/test_alpine_bits_server_read.py index e2ba282..5ca83a3 100644 --- a/tests/test_alpine_bits_server_read.py +++ b/tests/test_alpine_bits_server_read.py @@ -98,7 +98,7 @@ def sample_reservation(sample_customer): user_comment="Late check-in requested", fbclid="PAZXh0bgNhZW0BMABhZGlkAasmYBTNE3QBp1jWuJ9zIpfEGRJMP63fMAMI405yvG5EtH-OT0PxSkAbBJaudFHR6cMtkdHu_aem_fopaFtECyVPNW9fmWfEkyA", gclid="", - hotel_code="HOTEL123", + hotel_id="HOTEL123", hotel_name="Alpine Paradise Resort", ) data = reservation.model_dump(exclude_none=True) @@ -136,7 +136,7 @@ def minimal_reservation(minimal_customer): num_adults=1, num_children=0, children_ages=[], - hotel_code="HOTEL123", + hotel_id="HOTEL123", created_at=datetime(2024, 12, 2, 12, 0, 0, tzinfo=UTC), hotel_name="Alpine Paradise Resort", ) @@ -403,7 +403,7 @@ class TestEdgeCases: num_adults=1, num_children=0, children_ages="", - hotel_code="HOTEL123", + hotel_id="HOTEL123", created_at=datetime.now(UTC), ) @@ -434,7 +434,7 @@ class TestEdgeCases: num_adults=2, num_children=0, children_ages=[], - hotel_code="HOTEL123", + hotel_id="HOTEL123", created_at=datetime.now(UTC), utm_source="facebook", utm_medium="social", @@ -851,7 +851,7 @@ class TestAcknowledgments: num_adults=2, num_children=0, children_ages=[], - hotel_code="HOTEL123", + hotel_id="HOTEL123", hotel_name="Alpine Paradise Resort", created_at=datetime(2024, 11, 1, 12, 0, 0, tzinfo=UTC), ) @@ -863,7 +863,7 @@ class TestAcknowledgments: num_adults=2, num_children=1, children_ages=[10], - hotel_code="HOTEL123", + hotel_id="HOTEL123", hotel_name="Alpine Paradise Resort", created_at=datetime(2024, 11, 15, 10, 0, 0, tzinfo=UTC), ) diff --git a/tests/test_api.py b/tests/test_api.py index b67c89b..1fcafac 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -523,7 +523,7 @@ class TestGenericWebhookEndpoint: (r for r in reservations if r.customer_id == customer.id), None ) assert reservation is not None, "Reservation should be created" - assert reservation.hotel_code == "HOTEL123" + assert reservation.hotel_id == "HOTEL123" assert reservation.hotel_name == "Test Hotel" assert reservation.num_adults == 2 assert reservation.num_children == 1 @@ -614,7 +614,7 @@ class TestGenericWebhookEndpoint: result = await session.execute(select(Reservation)) reservations = result.scalars().all() reservation = next( - (r for r in reservations if r.hotel_code == "HOTEL123"), None + (r for r in reservations if r.hotel_id == "HOTEL123"), None ) assert reservation is not None, "Reservation should be created" assert reservation.num_children == 3 diff --git a/tests/test_conversion_service.py b/tests/test_conversion_service.py index 9aa1ec2..8df6f34 100644 --- a/tests/test_conversion_service.py +++ b/tests/test_conversion_service.py @@ -747,7 +747,7 @@ class TestHashedMatchingLogic: reservation = Reservation( customer_id=customer.id, unique_id="res_6", - hotel_code="hotel_1", + hotel_id="hotel_1", ) test_db_session.add(reservation) await 
test_db_session.commit() From 34cb2131c4692f98274d351e129e0dae6b028f7a Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 10:51:18 +0100 Subject: [PATCH 02/20] Migration to single customer table works but conversion_service still needs updating --- ...c_merge_hashed_customers_into_customers.py | 104 ++++++++++++++ src/alpine_bits_python/customer_service.py | 136 ++++++++---------- src/alpine_bits_python/db.py | 44 +++--- 3 files changed, 188 insertions(+), 96 deletions(-) create mode 100644 alembic/versions/2025_12_03_1044-0fbeb40dbb2c_merge_hashed_customers_into_customers.py diff --git a/alembic/versions/2025_12_03_1044-0fbeb40dbb2c_merge_hashed_customers_into_customers.py b/alembic/versions/2025_12_03_1044-0fbeb40dbb2c_merge_hashed_customers_into_customers.py new file mode 100644 index 0000000..a9f2382 --- /dev/null +++ b/alembic/versions/2025_12_03_1044-0fbeb40dbb2c_merge_hashed_customers_into_customers.py @@ -0,0 +1,104 @@ +"""merge_hashed_customers_into_customers + +Revision ID: 0fbeb40dbb2c +Revises: 694d52a883c3 +Create Date: 2025-12-03 10:44:32.243220 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '0fbeb40dbb2c' +down_revision: Union[str, Sequence[str], None] = '694d52a883c3' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + # Add hashed columns to customers table + op.add_column('customers', sa.Column('hashed_email', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_phone', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_given_name', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_surname', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_city', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_postal_code', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_country_code', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_gender', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('hashed_birth_date', sa.String(length=64), nullable=True)) + op.add_column('customers', sa.Column('created_at', sa.DateTime(timezone=True), nullable=True)) + + # Migrate data from hashed_customers to customers + op.execute(''' + UPDATE customers c + SET + hashed_email = hc.hashed_email, + hashed_phone = hc.hashed_phone, + hashed_given_name = hc.hashed_given_name, + hashed_surname = hc.hashed_surname, + hashed_city = hc.hashed_city, + hashed_postal_code = hc.hashed_postal_code, + hashed_country_code = hc.hashed_country_code, + hashed_gender = hc.hashed_gender, + hashed_birth_date = hc.hashed_birth_date, + created_at = COALESCE(c.created_at, hc.created_at) + FROM hashed_customers hc + WHERE c.id = hc.customer_id + ''') + + # Update reservations to point to customers instead of hashed_customers + # First, update reservations.customer_id from reservations.hashed_customer_id + op.execute(''' + UPDATE reservations r + SET customer_id = hc.customer_id + FROM hashed_customers hc + WHERE r.hashed_customer_id = hc.id + AND r.customer_id IS NULL + ''') + + # Update conversions to point to customers instead of hashed_customers + 
op.execute(''' + UPDATE conversions c + SET customer_id = hc.customer_id + FROM hashed_customers hc + WHERE c.hashed_customer_id = hc.id + AND c.customer_id IS NULL + ''') + + # Update conversion_guests to point to customers instead of hashed_customers + op.execute(''' + UPDATE conversion_guests cg + SET hashed_customer_id = NULL + WHERE hashed_customer_id IS NOT NULL + ''') + + # Now safe to drop the FK and column from reservations + op.drop_constraint(op.f('reservations_hashed_customer_id_fkey'), 'reservations', type_='foreignkey') + op.drop_column('reservations', 'hashed_customer_id') + + # Note: We're keeping the hashed_customers table for now since conversion_service.py still uses it + # It can be dropped in a future migration after updating the application code + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('reservations', sa.Column('hashed_customer_id', sa.INTEGER(), autoincrement=False, nullable=True)) + op.create_foreign_key(op.f('reservations_hashed_customer_id_fkey'), 'reservations', 'hashed_customers', ['hashed_customer_id'], ['id'], ondelete='CASCADE') + op.drop_column('customers', 'created_at') + op.drop_column('customers', 'hashed_birth_date') + op.drop_column('customers', 'hashed_gender') + op.drop_column('customers', 'hashed_country_code') + op.drop_column('customers', 'hashed_postal_code') + op.drop_column('customers', 'hashed_city') + op.drop_column('customers', 'hashed_surname') + op.drop_column('customers', 'hashed_given_name') + op.drop_column('customers', 'hashed_phone') + op.drop_column('customers', 'hashed_email') + # ### end Alembic commands ### diff --git a/src/alpine_bits_python/customer_service.py b/src/alpine_bits_python/customer_service.py index 5fa8d4c..ff73a83 100644 --- a/src/alpine_bits_python/customer_service.py +++ b/src/alpine_bits_python/customer_service.py @@ -53,13 +53,13 @@ class CustomerService: if "phone" in customer_data: customer.phone = customer_data["phone"] - self.session.add(customer) - await self.session.flush() # Flush to get the customer.id + # Set creation timestamp + customer.created_at = datetime.now(UTC) - # Create hashed version - hashed_customer = customer.create_hashed_customer() - hashed_customer.created_at = datetime.now(UTC) - self.session.add(hashed_customer) + # Update hashed fields + customer.update_hashed_fields() + + self.session.add(customer) if auto_commit: await self.session.commit() @@ -130,29 +130,8 @@ class CustomerService: if "phone" in update_data: customer.phone = update_data["phone"] - # Update or create hashed version - result = await self.session.execute( - select(HashedCustomer).where(HashedCustomer.customer_id == customer.id) - ) - hashed_customer = result.scalar_one_or_none() - - if hashed_customer: - # Update existing hashed customer - new_hashed = customer.create_hashed_customer() - hashed_customer.hashed_email = new_hashed.hashed_email - hashed_customer.hashed_phone = new_hashed.hashed_phone - hashed_customer.hashed_given_name = new_hashed.hashed_given_name - hashed_customer.hashed_surname = new_hashed.hashed_surname - hashed_customer.hashed_city = new_hashed.hashed_city - hashed_customer.hashed_postal_code = new_hashed.hashed_postal_code - hashed_customer.hashed_country_code = new_hashed.hashed_country_code - hashed_customer.hashed_gender = new_hashed.hashed_gender - hashed_customer.hashed_birth_date = new_hashed.hashed_birth_date - else: - # Create new hashed customer if it doesn't 
exist - hashed_customer = customer.create_hashed_customer() - hashed_customer.created_at = datetime.now(UTC) - self.session.add(hashed_customer) + # Update hashed fields + customer.update_hashed_fields() if auto_commit: await self.session.commit() @@ -216,10 +195,11 @@ class CustomerService: return result.scalar_one_or_none() async def hash_existing_customers(self) -> int: - """Hash all existing customers that don't have a hashed version yet. + """Hash all existing customers that don't have hashed fields populated yet. This is useful for backfilling hashed data for customers created - before the hashing system was implemented. + before the hashing system was implemented, or after migrating from + the separate hashed_customers table. Also validates and sanitizes customer data (e.g., normalizes country codes to uppercase). Customers with invalid data that cannot be fixed @@ -229,62 +209,64 @@ class CustomerService: Number of customers that were hashed """ - # Get all customers - result = await self.session.execute(select(Customer)) + # Get all customers without hashed data + result = await self.session.execute( + select(Customer).where(Customer.hashed_email.is_(None)) + ) customers = result.scalars().all() hashed_count = 0 skipped_count = 0 for customer in customers: - # Check if this customer already has a hashed version - existing_hashed = await self.get_hashed_customer(customer.id) - if not existing_hashed: - # Validate and sanitize customer data before hashing - customer_dict = { - "given_name": customer.given_name, - "surname": customer.surname, - "name_prefix": customer.name_prefix, - "email_address": customer.email_address, - "phone": customer.phone, - "email_newsletter": customer.email_newsletter, - "address_line": customer.address_line, - "city_name": customer.city_name, - "postal_code": customer.postal_code, - "country_code": customer.country_code, - "gender": customer.gender, - "birth_date": customer.birth_date, - "language": customer.language, - "address_catalog": customer.address_catalog, - "name_title": customer.name_title, - } + # Validate and sanitize customer data before hashing + customer_dict = { + "given_name": customer.given_name, + "surname": customer.surname, + "name_prefix": customer.name_prefix, + "email_address": customer.email_address, + "phone": customer.phone, + "email_newsletter": customer.email_newsletter, + "address_line": customer.address_line, + "city_name": customer.city_name, + "postal_code": customer.postal_code, + "country_code": customer.country_code, + "gender": customer.gender, + "birth_date": customer.birth_date, + "language": customer.language, + "address_catalog": customer.address_catalog, + "name_title": customer.name_title, + } - try: - # Validate through Pydantic (normalizes country code) - validated = CustomerData(**customer_dict) + try: + # Validate through Pydantic (normalizes country code) + validated = CustomerData(**customer_dict) - # Update customer with sanitized data - # Exclude 'phone_numbers' as Customer model uses 'phone' field - for key, value in validated.model_dump( - exclude_none=True, exclude={"phone_numbers"} - ).items(): - if hasattr(customer, key): - setattr(customer, key, value) + # Update customer with sanitized data + # Exclude 'phone_numbers' as Customer model uses 'phone' field + for key, value in validated.model_dump( + exclude_none=True, exclude={"phone_numbers"} + ).items(): + if hasattr(customer, key): + setattr(customer, key, value) - # Create hashed version with sanitized data - hashed_customer = 
customer.create_hashed_customer() - hashed_customer.created_at = datetime.now(UTC) - self.session.add(hashed_customer) - hashed_count += 1 + # Update hashed fields with sanitized data + customer.update_hashed_fields() - except ValidationError as e: - # Skip customers with invalid data and log - skipped_count += 1 - _LOGGER.warning( - "Skipping customer ID %s due to validation error: %s", - customer.id, - e, - ) + # Set created_at if not already set + if not customer.created_at: + customer.created_at = datetime.now(UTC) + + hashed_count += 1 + + except ValidationError as e: + # Skip customers with invalid data and log + skipped_count += 1 + _LOGGER.warning( + "Skipping customer ID %s due to validation error: %s", + customer.id, + e, + ) if hashed_count > 0: await self.session.commit() diff --git a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py index f367e8c..303e4bc 100644 --- a/src/alpine_bits_python/db.py +++ b/src/alpine_bits_python/db.py @@ -311,6 +311,20 @@ class Customer(Base): language = Column(String) address_catalog = Column(Boolean) # Added for XML name_title = Column(String) # Added for XML + + # Hashed fields for Meta Conversion API (SHA256) + hashed_email = Column(String(64)) + hashed_phone = Column(String(64)) + hashed_given_name = Column(String(64)) + hashed_surname = Column(String(64)) + hashed_city = Column(String(64)) + hashed_postal_code = Column(String(64)) + hashed_country_code = Column(String(64)) + hashed_gender = Column(String(64)) + hashed_birth_date = Column(String(64)) + + created_at = Column(DateTime(timezone=True)) + reservations = relationship("Reservation", back_populates="customer") def __repr__(self): @@ -335,21 +349,17 @@ class Customer(Base): # SHA256 hash return hashlib.sha256(normalized.encode("utf-8")).hexdigest() - def create_hashed_customer(self): - """Create a HashedCustomer instance from this Customer.""" - return HashedCustomer( - customer_id=self.id, - contact_id=self.contact_id, - hashed_email=self._normalize_and_hash(self.email_address), - hashed_phone=self._normalize_and_hash(self.phone), - hashed_given_name=self._normalize_and_hash(self.given_name), - hashed_surname=self._normalize_and_hash(self.surname), - hashed_city=self._normalize_and_hash(self.city_name), - hashed_postal_code=self._normalize_and_hash(self.postal_code), - hashed_country_code=self._normalize_and_hash(self.country_code), - hashed_gender=self._normalize_and_hash(self.gender), - hashed_birth_date=self._normalize_and_hash(self.birth_date), - ) + def update_hashed_fields(self): + """Update the hashed fields based on current plaintext values.""" + self.hashed_email = self._normalize_and_hash(self.email_address) + self.hashed_phone = self._normalize_and_hash(self.phone) + self.hashed_given_name = self._normalize_and_hash(self.given_name) + self.hashed_surname = self._normalize_and_hash(self.surname) + self.hashed_city = self._normalize_and_hash(self.city_name) + self.hashed_postal_code = self._normalize_and_hash(self.postal_code) + self.hashed_country_code = self._normalize_and_hash(self.country_code) + self.hashed_gender = self._normalize_and_hash(self.gender) + self.hashed_birth_date = self._normalize_and_hash(self.birth_date) class HashedCustomer(Base): @@ -523,9 +533,6 @@ class Reservation(Base): __tablename__ = "reservations" id = Column(Integer, primary_key=True) customer_id = Column(Integer, ForeignKey("customers.id", ondelete="SET NULL")) - hashed_customer_id = Column( - Integer, ForeignKey("hashed_customers.id", ondelete="CASCADE") - ) unique_id = 
Column(String, unique=True) md5_unique_id = Column(String(32), unique=True) # max length 32 guaranteed start_date = Column(Date) @@ -555,7 +562,6 @@ class Reservation(Base): room_classification_code = Column(String) room_type = Column(String) customer = relationship("Customer", back_populates="reservations") - hashed_customer = relationship("HashedCustomer", backref="reservations") # Table for tracking acknowledged requests by client From df7d3c6543c645bdb02b824efa94cff363e935d6 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 11:10:27 +0100 Subject: [PATCH 03/20] HashedCustomer is now no longer necessary --- src/alpine_bits_python/conversion_service.py | 53 ++++++++++---------- src/alpine_bits_python/customer_service.py | 6 +-- test_capi.py | 2 +- tests/test_conversion_service.py | 5 +- tests/test_customer_service.py | 40 +++++++++------ 5 files changed, 56 insertions(+), 50 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 43847f6..5a3de97 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -17,7 +17,6 @@ from .db import ( ConversionGuest, ConversionRoom, Customer, - HashedCustomer, Reservation, SessionMaker, ) @@ -95,11 +94,11 @@ class ConversionService: self.hotel_id = hotel_id # Cache for reservation and customer data within a single XML processing run - # Maps hotel_code -> list of (reservation, hashed_customer) tuples + # Maps hotel_code -> list of (reservation, customer) tuples # This significantly speeds up matching when processing large XML files # Uses hashed data for matching to preserve privacy self._reservation_cache: dict[ - str | None, list[tuple[Reservation, HashedCustomer | None]] + str | None, list[tuple[Reservation, Customer | None]] ] = {} self._cache_initialized = False @@ -1146,8 +1145,8 @@ class ConversionService: guest_last_name: str | None, guest_email: str | None, session: AsyncSession | None = None, - ) -> HashedCustomer | None: - """Match guest by name and email directly to HashedCustomer (no Reservation needed). + ) -> Customer | None: + """Match guest by name and email directly to Customer (no Reservation needed). This method bypasses the Reservation table entirely and matches directly against hashed customer data. Used for guest-detail matching where we don't need to link @@ -1160,23 +1159,23 @@ class ConversionService: session: AsyncSession to use. If None, uses self.session. Returns: - Matched HashedCustomer or None + Matched Customer or None """ if session is None: session = self.session # Query all hashed customers that match the guest details - query = select(HashedCustomer).options(selectinload(HashedCustomer.customer)) + query = select(Customer) # Build filter conditions conditions = [] if guest_email: - conditions.append(HashedCustomer.hashed_email == guest_email) + conditions.append(Customer.hashed_email == guest_email) if guest_first_name and guest_last_name: conditions.append( - (HashedCustomer.hashed_given_name == guest_first_name) - & (HashedCustomer.hashed_surname == guest_last_name) + (Customer.hashed_given_name == guest_first_name) + & (Customer.hashed_surname == guest_last_name) ) if not conditions: @@ -1288,10 +1287,10 @@ class ConversionService: async def _extract_unmatched_guests( self, session: AsyncSession - ) -> dict[str, HashedCustomer]: + ) -> dict[str, Customer]: """Phase 3b: Extract unique guests from unmatched conversions and match them to customers. 
- Returns a mapping of guest_id -> HashedCustomer for all unique guests found in + Returns a mapping of guest_id -> Customer for all unique guests found in unmatched conversions. Only processes each guest once. This includes: @@ -1303,7 +1302,7 @@ class ConversionService: session: AsyncSession for database queries Returns: - Dictionary mapping guest_id to matched HashedCustomer (or None if no match) + Dictionary mapping guest_id to matched Customer (or None if no match) """ # Find all conversions that either: @@ -1336,7 +1335,7 @@ class ConversionService: ) # Match each unique guest to a hashed customer - guest_to_hashed_customer: dict[str, HashedCustomer] = {} + guest_to_hashed_customer: dict[str, Customer] = {} for guest_id, conversion in unique_guests.items(): conversion_guest = conversion.guest if not conversion_guest: @@ -1365,7 +1364,7 @@ class ConversionService: async def _link_matched_guests_to_reservations( self, - guest_to_hashed_customer: dict[str, HashedCustomer], + guest_to_customer_dict: dict[str, Customer], session: AsyncSession, stats: dict[str, int], ) -> None: @@ -1383,13 +1382,13 @@ class ConversionService: by looking at whether they have paying conversions that predate any reservations. Args: - guest_to_hashed_customer: Mapping from guest_id to matched HashedCustomer + guest_to_customer: Mapping from guest_id to matched Customer session: AsyncSession for database queries stats: Shared stats dictionary to update """ - for guest_id, matched_hashed_customer in guest_to_hashed_customer.items(): - if not matched_hashed_customer or not matched_hashed_customer.customer_id: + for guest_id, matched_hashed_customer in guest_to_customer_dict.items(): + if not matched_hashed_customer or not matched_hashed_customer.id: continue # Find all conversions from this guest that don't have a reservation @@ -1414,7 +1413,7 @@ class ConversionService: "Phase 3c: Processing %d conversions for guest %s (customer_id=%d)", len(conversions), guest_id, - matched_hashed_customer.customer_id, + matched_hashed_customer.id, ) # Try to link each conversion to a reservation for this customer @@ -1423,7 +1422,7 @@ class ConversionService: matched_reservation, is_attributable, ) = await self._check_if_attributable( - matched_hashed_customer.customer_id, conversion, session + matched_hashed_customer.id, conversion, session ) if matched_reservation and is_attributable: @@ -1431,7 +1430,7 @@ class ConversionService: was_previously_matched = conversion.customer_id is not None conversion.reservation_id = matched_reservation.id - conversion.customer_id = matched_hashed_customer.customer_id + conversion.customer_id = matched_hashed_customer.id conversion.hashed_customer_id = matched_hashed_customer.id conversion.directly_attributable = True conversion.guest_matched = True @@ -1447,7 +1446,7 @@ class ConversionService: ) elif matched_hashed_customer and conversion.customer_id is None: # Only count new customer matches (conversions that didn't have a customer before) - conversion.customer_id = matched_hashed_customer.customer_id + conversion.customer_id = matched_hashed_customer.id conversion.hashed_customer_id = matched_hashed_customer.id conversion.directly_attributable = False conversion.guest_matched = True @@ -1458,7 +1457,7 @@ class ConversionService: # Look at ALL conversions from this guest to see if there are pre-dated payments if conversions and conversions[0].guest: await self._check_if_guest_is_regular( - guest_id, matched_hashed_customer.customer_id, session + guest_id, matched_hashed_customer.id, 
session ) async def _check_regularity_for_all_matched_guests( @@ -1503,15 +1502,15 @@ class ConversionService: # Get the customer ID from the hashed_customer hashed_customer_result = await session.execute( - select(HashedCustomer).where( - HashedCustomer.id == conversion_guest.hashed_customer_id + select(Customer).where( + Customer.id == conversion_guest.hashed_customer_id ) ) hashed_customer = hashed_customer_result.scalar_one_or_none() - if hashed_customer and hashed_customer.customer_id: + if hashed_customer and hashed_customer.id: await self._check_if_guest_is_regular( - conversion_guest.guest_id, hashed_customer.customer_id, session + conversion_guest.guest_id, hashed_customer.id, session ) async def _match_conversions_from_db_sequential( diff --git a/src/alpine_bits_python/customer_service.py b/src/alpine_bits_python/customer_service.py index ff73a83..80c4dc0 100644 --- a/src/alpine_bits_python/customer_service.py +++ b/src/alpine_bits_python/customer_service.py @@ -179,18 +179,18 @@ class CustomerService: # Create new customer (either no contact_id or customer doesn't exist) return await self.create_customer(customer_data, auto_commit=auto_commit) - async def get_hashed_customer(self, customer_id: int) -> HashedCustomer | None: + async def get_customer(self, customer_id: int) -> Customer | None: """Get the hashed version of a customer. Args: customer_id: The customer ID Returns: - HashedCustomer instance if found, None otherwise + Customer instance if found, None otherwise """ result = await self.session.execute( - select(HashedCustomer).where(HashedCustomer.customer_id == customer_id) + select(Customer).where(Customer.id == customer_id) ) return result.scalar_one_or_none() diff --git a/test_capi.py b/test_capi.py index 6eed501..1615991 100644 --- a/test_capi.py +++ b/test_capi.py @@ -59,7 +59,7 @@ async def load_test_data_from_db(): result = [] for reservation, customer in reservations_with_customers: # Get hashed customer data - hashed_customer = await customer_service.get_hashed_customer(customer.id) + hashed_customer = await customer_service.get_customer(customer.id) result.append( { diff --git a/tests/test_conversion_service.py b/tests/test_conversion_service.py index 8df6f34..c180198 100644 --- a/tests/test_conversion_service.py +++ b/tests/test_conversion_service.py @@ -740,9 +740,8 @@ class TestHashedMatchingLogic: test_db_session.add(customer) await test_db_session.flush() - hashed_customer = customer.create_hashed_customer() - test_db_session.add(hashed_customer) - await test_db_session.flush() + customer.update_hashed_fields() + reservation = Reservation( customer_id=customer.id, diff --git a/tests/test_customer_service.py b/tests/test_customer_service.py index 05376cf..5578e97 100644 --- a/tests/test_customer_service.py +++ b/tests/test_customer_service.py @@ -42,9 +42,9 @@ async def test_create_customer_creates_hashed_version(async_session: AsyncSessio assert customer.given_name == "John" # Check that hashed version was created - hashed = await service.get_hashed_customer(customer.id) + hashed = await service.get_customer(customer.id) assert hashed is not None - assert hashed.customer_id == customer.id + assert hashed.id == customer.id assert hashed.hashed_email is not None assert hashed.hashed_phone is not None assert hashed.hashed_given_name is not None @@ -66,7 +66,7 @@ async def test_update_customer_updates_hashed_version(async_session: AsyncSessio customer = await service.create_customer(customer_data) # Get initial hashed email - hashed = await 
service.get_hashed_customer(customer.id) + hashed = await service.get_customer(customer.id) original_hashed_email = hashed.hashed_email # Update customer email @@ -74,7 +74,7 @@ async def test_update_customer_updates_hashed_version(async_session: AsyncSessio updated_customer = await service.update_customer(customer, update_data) # Check that hashed version was updated - updated_hashed = await service.get_hashed_customer(updated_customer.id) + updated_hashed = await service.get_customer(updated_customer.id) assert updated_hashed.hashed_email != original_hashed_email @@ -95,7 +95,7 @@ async def test_get_or_create_customer_creates_new(async_session: AsyncSession): assert customer.contact_id == "new123" # Verify hashed version exists - hashed = await service.get_hashed_customer(customer.id) + hashed = await service.get_customer(customer.id) assert hashed is not None @@ -145,10 +145,13 @@ async def test_hash_existing_customers_backfills(async_session: AsyncSession): # Verify no hashed version exists result = await async_session.execute( - select(HashedCustomer).where(HashedCustomer.customer_id == customer.id) + select(Customer).where(Customer.id == customer.id) ) hashed = result.scalar_one_or_none() - assert hashed is None + assert hashed, "Customer should exist." + + assert hashed.hashed_given_name is None, "Hashed given name should be None." + assert hashed.hashed_email is None, "Hashed email should be None." # Run backfill service = CustomerService(async_session) @@ -158,11 +161,12 @@ async def test_hash_existing_customers_backfills(async_session: AsyncSession): # Verify hashed version now exists result = await async_session.execute( - select(HashedCustomer).where(HashedCustomer.customer_id == customer.id) + select(Customer).where(Customer.id == customer.id) ) hashed = result.scalar_one_or_none() - assert hashed is not None - assert hashed.hashed_email is not None + assert hashed is not None, "Customer should still exist after backfill." + assert hashed.hashed_email is not None, "Hashed email should be populated." + assert hashed.hashed_given_name is not None, "Hashed given name should be populated." @pytest.mark.asyncio @@ -201,7 +205,7 @@ async def test_hashing_normalization(async_session: AsyncSession): } customer = await service.create_customer(customer_data) - hashed = await service.get_hashed_customer(customer.id) + hashed = await service.get_customer(customer.id) # Verify hashes exist (normalization should have occurred) assert hashed.hashed_email is not None @@ -244,13 +248,17 @@ async def test_hash_existing_customers_normalizes_country_code( # Verify no hashed version exists yet result = await async_session.execute( - select(HashedCustomer).where(HashedCustomer.customer_id == customer.id) + select(Customer).where(Customer.id == customer.id) ) hashed = result.scalar_one_or_none() - assert hashed is None + assert hashed is not None, "Customer should exist." + + assert hashed.hashed_given_name is None, "Hashed given name should be None." + assert hashed.hashed_email is None, "Hashed email should be None." + assert hashed.hashed_country_code is None, "Hashed country code should be None." 
# Verify the customer has the invalid country code stored in the DB - assert customer.country_code == "Italy" + assert hashed.country_code == "Italy" # Run hash_existing_customers - this should normalize "Italy" to "IT" # during validation and successfully create a hashed customer @@ -263,7 +271,7 @@ async def test_hash_existing_customers_normalizes_country_code( # Verify hashed version was created await async_session.refresh(customer) result = await async_session.execute( - select(HashedCustomer).where(HashedCustomer.customer_id == customer.id) + select(Customer).where(Customer.id == customer.id) ) hashed = result.scalar_one_or_none() assert hashed is not None @@ -302,7 +310,7 @@ async def test_hash_existing_customers_normalizes_country_code( # Verify hashed version was created with correct hash result = await async_session.execute( - select(HashedCustomer).where(HashedCustomer.customer_id == customer2.id) + select(Customer).where(Customer.id == customer2.id) ) hashed = result.scalar_one_or_none() assert hashed is not None From 78f81d6b97c164c798d384c685695c37a9255895 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 11:32:24 +0100 Subject: [PATCH 04/20] Fixed greenlet error on rollback --- src/alpine_bits_python/api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/alpine_bits_python/api.py b/src/alpine_bits_python/api.py index 27ef88a..db4b521 100644 --- a/src/alpine_bits_python/api.py +++ b/src/alpine_bits_python/api.py @@ -923,6 +923,10 @@ async def handle_webhook_unified( ) # 8. Update status and link created entities when available + # Re-add to session in case processor called rollback (e.g., for duplicates) + if webhook_request not in db_session: + webhook_request = await db_session.merge(webhook_request) + webhook_request.status = WebhookStatus.COMPLETED webhook_request.processing_completed_at = datetime.now(UTC) From ca1f4a010b5d72da0cddf37d99f6768be5313e0e Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 12:00:02 +0100 Subject: [PATCH 05/20] Not quite done but mostly starting to remove hashed_customer references --- ...1bc47_removed_hashed_customer_completly.py | 63 +++++++++++++++++++ src/alpine_bits_python/conversion_service.py | 26 +++----- src/alpine_bits_python/customer_service.py | 2 +- src/alpine_bits_python/db.py | 40 +----------- src/alpine_bits_python/reservation_service.py | 13 +--- .../util/migrate_sqlite_to_postgres.py | 1 - src/alpine_bits_python/webhook_processor.py | 2 +- 7 files changed, 75 insertions(+), 72 deletions(-) create mode 100644 alembic/versions/2025_12_03_1142-3147e421bc47_removed_hashed_customer_completly.py diff --git a/alembic/versions/2025_12_03_1142-3147e421bc47_removed_hashed_customer_completly.py b/alembic/versions/2025_12_03_1142-3147e421bc47_removed_hashed_customer_completly.py new file mode 100644 index 0000000..6895007 --- /dev/null +++ b/alembic/versions/2025_12_03_1142-3147e421bc47_removed_hashed_customer_completly.py @@ -0,0 +1,63 @@ +"""removed hashed_customer completly + +Revision ID: 3147e421bc47 +Revises: 0fbeb40dbb2c +Create Date: 2025-12-03 11:42:05.722690 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision: str = '3147e421bc47' +down_revision: Union[str, Sequence[str], None] = '0fbeb40dbb2c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + + op.drop_index(op.f('ix_conversion_guests_hashed_customer_id'), table_name='conversion_guests') + op.drop_constraint(op.f('fk_conversion_guests_hashed_customer_id_hashed_customers'), 'conversion_guests', type_='foreignkey') + op.drop_column('conversion_guests', 'hashed_customer_id') + op.drop_index(op.f('ix_conversions_hashed_customer_id'), table_name='conversions') + op.drop_constraint(op.f('conversions_hashed_customer_id_fkey'), 'conversions', type_='foreignkey') + op.drop_column('conversions', 'hashed_customer_id') + op.drop_table('hashed_customers') + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('conversions', sa.Column('hashed_customer_id', sa.INTEGER(), autoincrement=False, nullable=True)) + op.create_foreign_key(op.f('conversions_hashed_customer_id_fkey'), 'conversions', 'hashed_customers', ['hashed_customer_id'], ['id']) + op.create_index(op.f('ix_conversions_hashed_customer_id'), 'conversions', ['hashed_customer_id'], unique=False) + op.add_column('conversion_guests', sa.Column('hashed_customer_id', sa.INTEGER(), autoincrement=False, nullable=True)) + op.create_foreign_key(op.f('fk_conversion_guests_hashed_customer_id_hashed_customers'), 'conversion_guests', 'hashed_customers', ['hashed_customer_id'], ['id']) + op.create_index(op.f('ix_conversion_guests_hashed_customer_id'), 'conversion_guests', ['hashed_customer_id'], unique=False) + op.create_table('hashed_customers', + sa.Column('id', sa.INTEGER(), autoincrement=True, nullable=False), + sa.Column('customer_id', sa.INTEGER(), autoincrement=False, nullable=True), + sa.Column('contact_id', sa.VARCHAR(), autoincrement=False, nullable=True), + sa.Column('hashed_email', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_phone', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_given_name', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_surname', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_city', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_postal_code', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_country_code', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_gender', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('hashed_birth_date', sa.VARCHAR(length=64), autoincrement=False, nullable=True), + sa.Column('created_at', postgresql.TIMESTAMP(timezone=True), autoincrement=False, nullable=True), + sa.ForeignKeyConstraint(['customer_id'], ['customers.id'], name=op.f('hashed_customers_customer_id_fkey'), ondelete='SET NULL'), + sa.PrimaryKeyConstraint('id', name=op.f('hashed_customers_pkey')), + sa.UniqueConstraint('contact_id', name=op.f('uq_hashed_customers_contact_id'), postgresql_include=[], postgresql_nulls_not_distinct=False), + sa.UniqueConstraint('customer_id', name=op.f('uq_hashed_customers_customer_id'), postgresql_include=[], postgresql_nulls_not_distinct=False) + ) + # ### end Alembic commands ### diff --git 
a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 5a3de97..3049721 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -629,10 +629,8 @@ class ConversionService: from sqlalchemy.orm import selectinload query = select(Reservation).options( - selectinload(Reservation.customer).selectinload( - Customer.hashed_version - ), - selectinload(Reservation.hashed_customer), + selectinload(Reservation.customer), + ) result = await session.execute(query) reservations = result.scalars().all() @@ -645,13 +643,11 @@ class ConversionService: if hotel_code not in self._reservation_cache: self._reservation_cache[hotel_code] = [] # Cache the hashed customer - prefer direct relationship, fall back to customer relationship - hashed_customer = None - if reservation.hashed_customer: - hashed_customer = reservation.hashed_customer - elif reservation.customer and reservation.customer.hashed_version: - hashed_customer = reservation.customer.hashed_version + customer = None + if reservation.customer: + customer = reservation.customer self._reservation_cache[hotel_code].append( - (reservation, hashed_customer) + (reservation, customer) ) self._cache_initialized = True @@ -1431,7 +1427,6 @@ class ConversionService: conversion.reservation_id = matched_reservation.id conversion.customer_id = matched_hashed_customer.id - conversion.hashed_customer_id = matched_hashed_customer.id conversion.directly_attributable = True conversion.guest_matched = True conversion.updated_at = datetime.now() @@ -1447,7 +1442,6 @@ class ConversionService: elif matched_hashed_customer and conversion.customer_id is None: # Only count new customer matches (conversions that didn't have a customer before) conversion.customer_id = matched_hashed_customer.id - conversion.hashed_customer_id = matched_hashed_customer.id conversion.directly_attributable = False conversion.guest_matched = True conversion.updated_at = datetime.now() @@ -1742,8 +1736,6 @@ class ConversionService: if matched_reservation: matched_customer = matched_reservation.customer - if matched_customer and matched_customer.hashed_version: - matched_hashed_customer = matched_customer.hashed_version _LOGGER.info( "Phase 3a: Matched conversion by advertising ID (pms_id=%s, reservation_id=%d)", @@ -1752,14 +1744,12 @@ class ConversionService: ) # Update the conversion with matched entities if found - if matched_reservation or matched_customer or matched_hashed_customer: + if matched_reservation or matched_customer: conversion.reservation_id = ( matched_reservation.id if matched_reservation else None ) conversion.customer_id = matched_customer.id if matched_customer else None - conversion.hashed_customer_id = ( - matched_hashed_customer.id if matched_hashed_customer else None - ) + # ID-based matches are always directly attributable conversion.directly_attributable = True diff --git a/src/alpine_bits_python/customer_service.py b/src/alpine_bits_python/customer_service.py index 80c4dc0..e4c3781 100644 --- a/src/alpine_bits_python/customer_service.py +++ b/src/alpine_bits_python/customer_service.py @@ -6,7 +6,7 @@ from pydantic import ValidationError from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from .db import Customer, HashedCustomer +from .db import Customer from .logging_config import get_logger from .schemas import CustomerData diff --git a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py index 303e4bc..a287165 100644 --- 
a/src/alpine_bits_python/db.py +++ b/src/alpine_bits_python/db.py @@ -362,36 +362,6 @@ class Customer(Base): self.hashed_birth_date = self._normalize_and_hash(self.birth_date) -class HashedCustomer(Base): - """Hashed customer data for Meta Conversion API. - - Stores SHA256 hashed versions of customer PII according to Meta's requirements. - This allows sending conversion events without exposing raw customer data. - """ - - __tablename__ = "hashed_customers" - id = Column(Integer, primary_key=True) - customer_id = Column( - Integer, - ForeignKey("customers.id", ondelete="SET NULL"), - unique=True, - nullable=True, - ) - contact_id = Column(String, unique=True) # Keep unhashed for reference - hashed_email = Column(String(64)) # SHA256 produces 64 hex chars - hashed_phone = Column(String(64)) - hashed_given_name = Column(String(64)) - hashed_surname = Column(String(64)) - hashed_city = Column(String(64)) - hashed_postal_code = Column(String(64)) - hashed_country_code = Column(String(64)) - hashed_gender = Column(String(64)) - hashed_birth_date = Column(String(64)) - created_at = Column(DateTime(timezone=True)) - - customer = relationship( - "Customer", backref=backref("hashed_version", uselist=False, lazy="joined") - ) class ConversionGuest(Base): @@ -430,10 +400,7 @@ class ConversionGuest(Base): hashed_country_code = Column(String(64)) hashed_birth_date = Column(String(64)) - # Matched customer reference (nullable, filled after matching) - hashed_customer_id = Column( - Integer, ForeignKey("hashed_customers.id"), nullable=True, index=True - ) + # Guest classification is_regular = Column( @@ -452,7 +419,6 @@ class ConversionGuest(Base): primaryjoin="and_(ConversionGuest.hotel_id == foreign(Conversion.hotel_id), " "ConversionGuest.guest_id == foreign(Conversion.guest_id))", ) - hashed_customer = relationship("HashedCustomer", backref="conversion_guests") @staticmethod def _normalize_and_hash(value): @@ -613,9 +579,6 @@ class Conversion(Base): Integer, ForeignKey("reservations.id"), nullable=True, index=True ) customer_id = Column(Integer, ForeignKey("customers.id"), nullable=True, index=True) - hashed_customer_id = Column( - Integer, ForeignKey("hashed_customers.id"), nullable=True, index=True - ) # Reservation metadata from XML hotel_id = Column( @@ -670,7 +633,6 @@ class Conversion(Base): # Relationships reservation = relationship("Reservation", backref="conversions") customer = relationship("Customer", backref="conversions") - hashed_customer = relationship("HashedCustomer", backref="conversions") guest = relationship( "ConversionGuest", back_populates="conversions", diff --git a/src/alpine_bits_python/reservation_service.py b/src/alpine_bits_python/reservation_service.py index c04752e..816d075 100644 --- a/src/alpine_bits_python/reservation_service.py +++ b/src/alpine_bits_python/reservation_service.py @@ -7,7 +7,7 @@ from typing import Optional from sqlalchemy import and_, select from sqlalchemy.ext.asyncio import AsyncSession -from .db import AckedRequest, Customer, HashedCustomer, Reservation +from .db import AckedRequest, Customer, Reservation from .schemas import ReservationData @@ -64,17 +64,6 @@ class ReservationService: reservation_data, customer_id ) - # Automatically populate hashed_customer_id from the customer - # Since hashed_customer is always created when a customer is created, - # we can get it by querying for the hashed_customer with matching customer_id - hashed_customer_result = await self.session.execute( - select(HashedCustomer).where( - HashedCustomer.customer_id == 
customer_id - ) - ) - hashed_customer = hashed_customer_result.scalar_one_or_none() - if hashed_customer: - reservation.hashed_customer_id = hashed_customer.id self.session.add(reservation) diff --git a/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py b/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py index 10094f5..3976b4f 100644 --- a/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py +++ b/src/alpine_bits_python/util/migrate_sqlite_to_postgres.py @@ -51,7 +51,6 @@ from alpine_bits_python.db import ( AckedRequest, Base, Customer, - HashedCustomer, Reservation, get_database_url, ) diff --git a/src/alpine_bits_python/webhook_processor.py b/src/alpine_bits_python/webhook_processor.py index 633e916..b929b75 100644 --- a/src/alpine_bits_python/webhook_processor.py +++ b/src/alpine_bits_python/webhook_processor.py @@ -203,7 +203,7 @@ async def process_wix_form_submission( "name_title": None, } - # This automatically creates/updates both Customer and HashedCustomer + # This automatically creates/updates Customer db_customer = await customer_service.get_or_create_customer(customer_data) # Determine hotel_code and hotel_name From 8c0909453517dbb6f3e1b7b8e9f97de4fd415f3b Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 12:12:37 +0100 Subject: [PATCH 06/20] Fixed removing hashed_customer --- src/alpine_bits_python/conversion_service.py | 57 +++++++++----------- src/alpine_bits_python/schemas.py | 3 +- 2 files changed, 25 insertions(+), 35 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 3049721..869a6f7 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -471,7 +471,6 @@ class ConversionService: "total_daily_sales": 0, "matched_to_reservation": 0, "matched_to_customer": 0, - "matched_to_hashed_customer": 0, "unmatched": 0, "errors": 0, } @@ -1471,41 +1470,40 @@ class ConversionService: session: AsyncSession for database queries """ - # Get all ConversionGuests that have ANY customer link - # This includes: - # 1. Guests matched via guest-details (hashed_customer_id is not null) - # 2. Guests matched via ID-based matching (customer_id is not null via conversion) + # Collect every guest/customer pair derived from conversions. result = await session.execute( - select(ConversionGuest).where( - ConversionGuest.hashed_customer_id.isnot(None) + select(Conversion.guest_id, Conversion.customer_id).where( + Conversion.guest_id.isnot(None), Conversion.customer_id.isnot(None) ) ) - matched_guests = result.scalars().all() + guest_customer_rows = result.all() - if not matched_guests: + if not guest_customer_rows: _LOGGER.debug("Phase 3d: No matched guests to check for regularity") return + # Deduplicate by guest_id to avoid recalculating when multiple conversions share the same guest. 
+ guest_to_customer: dict[int, int] = {} + for guest_id, customer_id in guest_customer_rows: + if guest_id is None or customer_id is None: + continue + if guest_id not in guest_to_customer: + guest_to_customer[guest_id] = customer_id + elif guest_to_customer[guest_id] != customer_id: + _LOGGER.warning( + "Guest %s linked to multiple customers (%s, %s); keeping first match", + guest_id, + guest_to_customer[guest_id], + customer_id, + ) + _LOGGER.debug( - "Phase 3d: Checking regularity for %d matched guests", len(matched_guests) + "Phase 3d: Checking regularity for %d matched guests", + len(guest_to_customer), ) - for conversion_guest in matched_guests: - if not conversion_guest.hashed_customer_id: - continue - - # Get the customer ID from the hashed_customer - hashed_customer_result = await session.execute( - select(Customer).where( - Customer.id == conversion_guest.hashed_customer_id - ) - ) - hashed_customer = hashed_customer_result.scalar_one_or_none() - - if hashed_customer and hashed_customer.id: - await self._check_if_guest_is_regular( - conversion_guest.guest_id, hashed_customer.id, session - ) + for guest_id, customer_id in guest_to_customer.items(): + await self._check_if_guest_is_regular(guest_id, customer_id, session) async def _match_conversions_from_db_sequential( self, pms_reservation_ids: list[str], stats: dict[str, int] @@ -1721,7 +1719,6 @@ class ConversionService: # Guest detail matching is deferred to Phase 3b/3c matched_reservation = None matched_customer = None - matched_hashed_customer = None if conversion.advertising_campagne: matched_reservation = await self._match_by_advertising( @@ -1755,10 +1752,6 @@ class ConversionService: conversion.directly_attributable = True conversion.guest_matched = False - # Update conversion_guest with hashed_customer reference if matched - if conversion_guest and matched_hashed_customer: - conversion_guest.hashed_customer_id = matched_hashed_customer.id - conversion.updated_at = datetime.now() # Update stats if provided @@ -1767,8 +1760,6 @@ class ConversionService: stats["matched_to_reservation"] += 1 elif matched_customer: stats["matched_to_customer"] += 1 - elif matched_hashed_customer: - stats["matched_to_hashed_customer"] += 1 else: stats["unmatched"] += 1 diff --git a/src/alpine_bits_python/schemas.py b/src/alpine_bits_python/schemas.py index 638c546..1ce1f05 100644 --- a/src/alpine_bits_python/schemas.py +++ b/src/alpine_bits_python/schemas.py @@ -562,7 +562,6 @@ class ConversionData(BaseModel): # Foreign key references (nullable - matched after creation) reservation_id: int | None = Field(None, gt=0) customer_id: int | None = Field(None, gt=0) - hashed_customer_id: int | None = Field(None, gt=0) # Required reservation metadata from PMS hotel_id: str = Field(..., min_length=1, max_length=50) @@ -591,7 +590,7 @@ class ConversionData(BaseModel): @field_validator( "pms_reservation_id", "guest_id", "reservation_id", "customer_id", - "hashed_customer_id", mode="before" + mode="before" ) @classmethod def convert_int_fields(cls, v: Any) -> int | None: From bf7b8ac427e23ba5644292a5829bd217c82ec2a0 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 12:27:17 +0100 Subject: [PATCH 07/20] Readded fk constraint for conversion_guests --- ...4f_add_conversions_conversion_guests_fk.py | 32 +++++++++++++++++++ src/alpine_bits_python/db.py | 7 ++-- 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 alembic/versions/2025_12_03_1225-263bed87114f_add_conversions_conversion_guests_fk.py diff --git 
a/alembic/versions/2025_12_03_1225-263bed87114f_add_conversions_conversion_guests_fk.py b/alembic/versions/2025_12_03_1225-263bed87114f_add_conversions_conversion_guests_fk.py
new file mode 100644
index 0000000..80e16c0
--- /dev/null
+++ b/alembic/versions/2025_12_03_1225-263bed87114f_add_conversions_conversion_guests_fk.py
@@ -0,0 +1,32 @@
+"""add conversions→conversion_guests fk
+
+Revision ID: 263bed87114f
+Revises: 3147e421bc47
+Create Date: 2025-12-03 12:25:12.820232
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = '263bed87114f'
+down_revision: Union[str, Sequence[str], None] = '3147e421bc47'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Upgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_foreign_key('fk_conversions_guest', 'conversions', 'conversion_guests', ['hotel_id', 'guest_id'], ['hotel_id', 'guest_id'])
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Downgrade schema."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_constraint('fk_conversions_guest', 'conversions', type_='foreignkey')
+    # ### end Alembic commands ###
diff --git a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py
index a287165..a25cfa9 100644
--- a/src/alpine_bits_python/db.py
+++ b/src/alpine_bits_python/db.py
@@ -622,12 +622,15 @@ class Conversion(Base):
     updated_at = Column(DateTime(timezone=True))  # When this record was last updated
 
     # Table constraints
-    # Note: The relationship to ConversionGuest is handled via SQLAlchemy ORM
-    # by matching (hotel_id, guest_id) pairs, no DB-level FK constraint needed
     __table_args__ = (
         UniqueConstraint(
             "hotel_id", "pms_reservation_id", name="uq_conversion_hotel_reservation"
         ),
+        ForeignKeyConstraint(
+            ["hotel_id", "guest_id"],
+            ["conversion_guests.hotel_id", "conversion_guests.guest_id"],
+            name="fk_conversions_guest",
+        ),
     )
 
     # Relationships

From 50ce0ec48623bb9333f51f9da8a217811450beb8 Mon Sep 17 00:00:00 2001
From: Jonas Linter <{email_address}>
Date: Wed, 3 Dec 2025 14:13:20 +0100
Subject: [PATCH 08/20] Finally fixed greenlet_spawn SQLAlchemy error. The horror

---
 src/alpine_bits_python/api.py  | 68 ++++++++++++++++++++++------------
 tests/test_customer_service.py |  2 +-
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/src/alpine_bits_python/api.py b/src/alpine_bits_python/api.py
index db4b521..20a3b98 100644
--- a/src/alpine_bits_python/api.py
+++ b/src/alpine_bits_python/api.py
@@ -28,7 +28,7 @@ from fastapi.security import (
 from pydantic import BaseModel
 from slowapi.errors import RateLimitExceeded
 from sqlalchemy import and_, select, update
-from sqlalchemy.ext.asyncio import async_sessionmaker
+from sqlalchemy.ext.asyncio import async_sessionmaker, AsyncSession
 from sqlalchemy.orm import selectinload
 
 from alpine_bits_python.hotel_service import HotelService
@@ -704,7 +704,7 @@ async def handle_webhook_unified(
     request: Request,
     webhook_secret: str,
-    db_session=Depends(get_async_session),
+    db_session: AsyncSession = Depends(get_async_session),
 ):
     """Unified webhook handler with deduplication and routing.
@@ -831,6 +831,9 @@ async def handle_webhook_unified( if not webhook_endpoint: raise HTTPException(status_code=404, detail="Webhook not found") + webhook_endpoint_id = webhook_endpoint.id + webhook_hotel_id = webhook_endpoint.hotel_id + # Verify hotel is active if not webhook_endpoint.hotel.is_active: raise HTTPException(status_code=404, detail="Hotel is not active") @@ -845,8 +848,8 @@ async def handle_webhook_unified( webhook_request_data = WebhookRequestData( payload_json=payload, - webhook_endpoint_id=webhook_endpoint.id, - hotel_id=webhook_endpoint.hotel_id, + webhook_endpoint_id=webhook_endpoint_id, + hotel_id=webhook_hotel_id, status=WebhookStatus.PROCESSING, processing_started_at=timestamp, created_at=timestamp, @@ -908,12 +911,17 @@ async def handle_webhook_unified( db_session.add(webhook_request) await db_session.flush() + webhook_request_id = webhook_request.id + try: # 6. Get processor for webhook_type processor = webhook_registry.get_processor(webhook_endpoint.webhook_type) if not processor: raise ValueError(f"No processor for type: {webhook_endpoint.webhook_type}") + # Persist the webhook row before handing off to processors + await db_session.commit() + # 7. Process webhook with simplified interface result = await processor.process( webhook_request=webhook_request, @@ -922,38 +930,50 @@ async def handle_webhook_unified( event_dispatcher=request.app.state.event_dispatcher, ) - # 8. Update status and link created entities when available - # Re-add to session in case processor called rollback (e.g., for duplicates) - if webhook_request not in db_session: - webhook_request = await db_session.merge(webhook_request) + if not db_session.in_transaction(): + await db_session.begin() - webhook_request.status = WebhookStatus.COMPLETED - webhook_request.processing_completed_at = datetime.now(UTC) + completion_values = { + "status": WebhookStatus.COMPLETED, + "processing_completed_at": datetime.now(UTC), + } - created_customer_id = result.get("customer_id") if isinstance(result, dict) else None - created_reservation_id = ( - result.get("reservation_id") if isinstance(result, dict) else None + if isinstance(result, dict): + created_customer_id = result.get("customer_id") + created_reservation_id = result.get("reservation_id") + if created_customer_id: + completion_values["created_customer_id"] = created_customer_id + if created_reservation_id: + completion_values["created_reservation_id"] = created_reservation_id + + await db_session.execute( + update(WebhookRequest) + .where(WebhookRequest.id == webhook_request_id) + .values(**completion_values) ) - - if created_customer_id: - webhook_request.created_customer_id = created_customer_id - if created_reservation_id: - webhook_request.created_reservation_id = created_reservation_id - await db_session.commit() return { **result, - "webhook_id": webhook_request.id, - "hotel_id": webhook_endpoint.hotel_id, + "webhook_id": webhook_request_id, + "hotel_id": webhook_hotel_id, } except Exception as e: _LOGGER.exception("Error processing webhook: %s", e) - webhook_request.status = WebhookStatus.FAILED - webhook_request.last_error = str(e)[:2000] - webhook_request.processing_completed_at = datetime.now(UTC) + await db_session.rollback() + if not db_session.in_transaction(): + await db_session.begin() + await db_session.execute( + update(WebhookRequest) + .where(WebhookRequest.id == webhook_request_id) + .values( + status=WebhookStatus.FAILED, + last_error=str(e)[:2000], + processing_completed_at=datetime.now(UTC), + ) + ) await db_session.commit() 
raise HTTPException(status_code=500, detail="Error processing webhook") diff --git a/tests/test_customer_service.py b/tests/test_customer_service.py index 5578e97..e87166e 100644 --- a/tests/test_customer_service.py +++ b/tests/test_customer_service.py @@ -6,7 +6,7 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from alpine_bits_python.customer_service import CustomerService -from alpine_bits_python.db import Base, Customer, HashedCustomer +from alpine_bits_python.db import Base, Customer @pytest_asyncio.fixture From e1bbefb9a3dbbf695246070b11fa67dee4455b2c Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 15:23:10 +0100 Subject: [PATCH 09/20] Significant matching fix. --- src/alpine_bits_python/conversion_service.py | 395 +++++++++++++++---- 1 file changed, 316 insertions(+), 79 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 869a6f7..e76db31 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -631,6 +631,8 @@ class ConversionService: selectinload(Reservation.customer), ) + + query = query.filter(Reservation.hotel_id == self.hotel_id) if self.hotel_id else query result = await session.execute(query) reservations = result.scalars().all() @@ -1050,22 +1052,28 @@ class ConversionService: self, advertising_campagne: str, hotel_id: str | None, - guest_first_name: str | None, - guest_last_name: str | None, - guest_email: str | None, + hashed_first_name: str | None, + hashed_last_name: str | None, + hashed_email: str | None, advertising_partner: str | None, session: AsyncSession | None = None, + raw_first_name: str | None = None, + raw_last_name: str | None = None, + raw_email: str | None = None, ) -> Reservation | None: """Match reservation by advertising tracking data (fbclid/gclid/md5_unique_id). Args: advertising_campagne: Tracking ID from PMS (could be truncated click_id or md5_unique_id) hotel_id: Hotel ID for filtering - guest_first_name: Guest first name for disambiguation - guest_last_name: Guest last name for disambiguation - guest_email: Guest email for disambiguation + hashed_first_name: Guest first name (hashed) for disambiguation + hashed_last_name: Guest last name (hashed) for disambiguation + hashed_email: Guest email (hashed) for disambiguation advertising_partner: Partner info (matches utm_medium) session: AsyncSession to use. If None, uses self.session. + raw_first_name: Plain guest first name (optional fallback) + raw_last_name: Plain guest last name (optional fallback) + raw_email: Plain guest email (optional fallback) Returns: Matched Reservation or None @@ -1098,25 +1106,36 @@ class ConversionService: if not reservations: return None - # If single match, return it - if len(reservations) == 1: + # Determine if this looks like a md5_unique_id (32 hex characters) or a click_id + is_md5_lookup = len(advertising_campagne or "") == 32 + + needs_filtering = len(reservations) > 1 or not is_md5_lookup + + if not needs_filtering: + # Confident single match via md5_unique_id return reservations[0] - # If multiple matches, try to narrow down using guest details + # If multiple matches or click-id matches, try to narrow down using hashed guest details _LOGGER.debug( - "Multiple reservations match advertisingCampagne %s (hotel=%s): found %d matches. 
" - "Attempting to narrow down using guest details.", + ( + "Ambiguous advertising match for %s (hotel=%s, candidates=%d, md5_lookup=%s). " + "Applying guest detail filtering." + ), advertising_campagne, hotel_id, len(reservations), + is_md5_lookup, ) matched_reservation = self._filter_reservations_by_guest_details( reservations, - guest_first_name, - guest_last_name, - guest_email, + raw_first_name, + raw_last_name, + raw_email, advertising_partner, + hashed_first_name=hashed_first_name, + hashed_last_name=hashed_last_name, + hashed_email=hashed_email, ) if matched_reservation is None: @@ -1126,9 +1145,9 @@ class ConversionService: "(hotel=%s, guest=%s %s, email=%s). Using first match.", advertising_campagne, hotel_id, - guest_first_name, - guest_last_name, - guest_email, + raw_first_name, + raw_last_name, + raw_email, ) matched_reservation = reservations[0] @@ -1210,18 +1229,26 @@ class ConversionService: guest_last_name: str | None, guest_email: str | None, advertising_partner: str | None, + *, + hashed_first_name: str | None = None, + hashed_last_name: str | None = None, + hashed_email: str | None = None, ) -> Reservation | None: """Filter reservations using guest details to find a single match. - First tries to match by guest name and email. If that doesn't yield a single match, - tries matching by advertising_partner against utm_medium. + Prefers hashed comparisons (exact match on hashed email or hashed name pair) and + falls back to plaintext comparison if hashes are unavailable. Finally tries + advertising partner vs utm_medium. Args: reservations: List of candidate reservations - guest_first_name: Guest first name - guest_last_name: Guest last name - guest_email: Guest email + guest_first_name: Guest first name (plaintext, optional) + guest_last_name: Guest last name (plaintext, optional) + guest_email: Guest email (plaintext, optional) advertising_partner: Partner info (e.g., "Facebook_Mobile_Feed") + hashed_first_name: Hashed first name for cross-checking + hashed_last_name: Hashed last name for cross-checking + hashed_email: Hashed email for cross-checking Returns: Single best-match Reservation, or None if no good match found @@ -1229,40 +1256,71 @@ class ConversionService: """ candidates = reservations - # Try to narrow down by guest name and email + # Attempt hashed email match first + if hashed_email: + email_matches = [ + reservation + for reservation in candidates + if reservation.customer + and reservation.customer.hashed_email + and reservation.customer.hashed_email == hashed_email + ] + if len(email_matches) == 1: + _LOGGER.debug("Found unique match via hashed email") + return email_matches[0] + if email_matches: + candidates = email_matches + + # Attempt hashed name match (first + last) + if hashed_first_name and hashed_last_name: + name_matches = [ + reservation + for reservation in candidates + if reservation.customer + and reservation.customer.hashed_given_name == hashed_first_name + and reservation.customer.hashed_surname == hashed_last_name + ] + if len(name_matches) == 1: + _LOGGER.debug("Found unique match via hashed names") + return name_matches[0] + if name_matches: + candidates = name_matches + + # Fallback to plaintext comparison if provided if guest_first_name or guest_last_name or guest_email: - # First try exact match on all available fields for reservation in candidates: customer = reservation.customer - if customer: - name_match = True - email_match = True + if not customer: + continue - if guest_first_name: - name_match = name_match and ( - 
customer.given_name - and customer.given_name.lower() == guest_first_name.lower() - ) + name_match = True + email_match = True - if guest_last_name: - name_match = name_match and ( - customer.surname - and customer.surname.lower() == guest_last_name.lower() - ) + if guest_first_name: + name_match = name_match and ( + customer.given_name + and customer.given_name.lower() == guest_first_name.lower() + ) - if guest_email: - email_match = ( - customer.email_address - and customer.email_address.lower() == guest_email.lower() - ) + if guest_last_name: + name_match = name_match and ( + customer.surname + and customer.surname.lower() == guest_last_name.lower() + ) - if name_match and email_match: - _LOGGER.debug( - "Found exact match on guest name/email for %s %s", - guest_first_name, - guest_last_name, - ) - return reservation + if guest_email: + email_match = email_match and ( + customer.email_address + and customer.email_address.lower() == guest_email.lower() + ) + + if name_match and email_match: + _LOGGER.debug( + "Found exact plaintext match on guest details for %s %s", + guest_first_name, + guest_last_name, + ) + return reservation # Try to narrow down by advertising_partner matching utm_medium if advertising_partner: @@ -1470,9 +1528,9 @@ class ConversionService: session: AsyncSession for database queries """ - # Collect every guest/customer pair derived from conversions. + # Collect every (hotel, guest) -> customer pair derived from conversions. result = await session.execute( - select(Conversion.guest_id, Conversion.customer_id).where( + select(Conversion.hotel_id, Conversion.guest_id, Conversion.customer_id).where( Conversion.guest_id.isnot(None), Conversion.customer_id.isnot(None) ) ) @@ -1482,27 +1540,54 @@ class ConversionService: _LOGGER.debug("Phase 3d: No matched guests to check for regularity") return - # Deduplicate by guest_id to avoid recalculating when multiple conversions share the same guest. - guest_to_customer: dict[int, int] = {} - for guest_id, customer_id in guest_customer_rows: - if guest_id is None or customer_id is None: + # Group by (hotel_id, guest_id) to detect conflicts. 
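# A minimal sketch of the conflict-detection step that follows: the
# (hotel_id, guest_id, customer_id) triples selected above are grouped per guest
# so that any guest linked to more than one customer can be flagged for
# deduplication. Plain in-memory tuples and the example values below are
# illustrative only; the real code reads the triples from the conversions table.
def find_conflicting_guests(
    rows: list[tuple[str | None, int | None, int | None]],
) -> dict[tuple[str, int], set[int]]:
    """Return the (hotel_id, guest_id) keys that are linked to more than one customer."""
    guest_customers: dict[tuple[str, int], set[int]] = {}
    for hotel_id, guest_id, customer_id in rows:
        if hotel_id is None or guest_id is None or customer_id is None:
            continue  # incomplete links cannot conflict
        guest_customers.setdefault((hotel_id, guest_id), set()).add(customer_id)
    return {key: ids for key, ids in guest_customers.items() if len(ids) > 1}


# Example: guest 101 of hotel "H1" is linked to two different customers and is
# therefore reported; guest 102 has a single consistent link and is not.
assert find_conflicting_guests(
    [("H1", 101, 11), ("H1", 101, 12), ("H1", 102, 11)]
) == {("H1", 101): {11, 12}}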
+ guest_customer_sets: dict[tuple[str | None, int], set[int]] = {} + for hotel_id, guest_id, customer_id in guest_customer_rows: + if hotel_id is None or guest_id is None or customer_id is None: continue - if guest_id not in guest_to_customer: - guest_to_customer[guest_id] = customer_id - elif guest_to_customer[guest_id] != customer_id: - _LOGGER.warning( - "Guest %s linked to multiple customers (%s, %s); keeping first match", - guest_id, - guest_to_customer[guest_id], - customer_id, + key = (hotel_id, guest_id) + guest_customer_sets.setdefault(key, set()).add(customer_id) + + if not guest_customer_sets: + _LOGGER.debug("Phase 3d: No matched guests to check for regularity") + return + + duplicates = { + key: customer_ids + for key, customer_ids in guest_customer_sets.items() + if len(customer_ids) > 1 + } + if duplicates: + await self._deduplicate_guest_customer_links(duplicates, session) + + guest_to_customer: dict[tuple[str | None, int], int] = {} + for key, customer_ids in guest_customer_sets.items(): + hotel_id, guest_id = key + # After deduplication, reload conversions for this guest to find the remaining customer (if any) + result = await session.execute( + select(Conversion.customer_id) + .where( + Conversion.hotel_id == hotel_id, + Conversion.guest_id == guest_id, + Conversion.customer_id.isnot(None), ) + .limit(1) + ) + chosen_customer = result.scalar_one_or_none() + if chosen_customer: + guest_to_customer[key] = chosen_customer + + if not guest_to_customer: + _LOGGER.debug( + "Phase 3d: No guests remained linked to customers after deduplication" + ) + return _LOGGER.debug( - "Phase 3d: Checking regularity for %d matched guests", - len(guest_to_customer), + "Phase 3d: Checking regularity for %d matched guests", len(guest_to_customer) ) - for guest_id, customer_id in guest_to_customer.items(): + for (hotel_id, guest_id), customer_id in guest_to_customer.items(): await self._check_if_guest_is_regular(guest_id, customer_id, session) async def _match_conversions_from_db_sequential( @@ -1705,15 +1790,21 @@ class ConversionService: # Get conversion_guest if it exists (has the hashed data) conversion_guest = conversion.guest - # Extract hashed data from conversion_guest (already hashed) + # Extract hashed and raw data from conversion_guest hashed_first_name = None hashed_last_name = None hashed_email = None + raw_first_name = None + raw_last_name = None + raw_email = None if conversion_guest: hashed_first_name = conversion_guest.hashed_first_name hashed_last_name = conversion_guest.hashed_last_name hashed_email = conversion_guest.hashed_email + raw_first_name = conversion_guest.guest_first_name + raw_last_name = conversion_guest.guest_last_name + raw_email = conversion_guest.guest_email # Phase 3a: Only try ID-based matching (fbclid/gclid/md5_unique_id) # Guest detail matching is deferred to Phase 3b/3c @@ -1729,6 +1820,9 @@ class ConversionService: hashed_email, conversion.advertising_partner, session, + raw_first_name=raw_first_name, + raw_last_name=raw_last_name, + raw_email=raw_email, ) if matched_reservation: @@ -1838,21 +1932,164 @@ class ConversionService: ) conversion_guest.is_regular = is_regular - if is_regular: - _LOGGER.debug( - "Marking guest %s as regular: earliest paying conversion %s predates first reservation created at %s", - guest_id, - earliest_paying_conversion.reservation_date, - earliest_reservation.created_at, + async def _deduplicate_guest_customer_links( + self, + duplicates: dict[tuple[str | None, int], set[int]], + session: AsyncSession, + ) -> None: + 
"""Resolve guest/customer conflicts by comparing hashed details and severing bad links.""" + for (hotel_id, guest_id), customer_ids in duplicates.items(): + guest_result = await session.execute( + select(ConversionGuest).where( + ConversionGuest.hotel_id == hotel_id, + ConversionGuest.guest_id == guest_id, + ) ) - else: - _LOGGER.debug( - "Guest %s is not regular: first paying conversion %s is after/equal to first reservation created at %s", - guest_id, - earliest_paying_conversion.reservation_date, - earliest_reservation.created_at, + conversion_guest = guest_result.scalar_one_or_none() + + if not conversion_guest: + _LOGGER.warning( + "Guest %s (hotel=%s) missing when resolving duplicates; removing links to customers %s", + guest_id, + hotel_id, + sorted(customer_ids), + ) + for customer_id in customer_ids: + await self._sever_guest_customer_link( + hotel_id, guest_id, customer_id, session + ) + continue + + preferred_customer_id = await self._choose_best_customer_for_guest( + conversion_guest, customer_ids, session ) + if preferred_customer_id: + _LOGGER.warning( + "Guest %s (hotel=%s) linked to multiple customers %s; keeping %s based on hashed data", + guest_id, + hotel_id, + sorted(customer_ids), + preferred_customer_id, + ) + else: + _LOGGER.warning( + "Guest %s (hotel=%s) linked to multiple customers %s but none matched hashed data. Removing all links.", + guest_id, + hotel_id, + sorted(customer_ids), + ) + + for customer_id in customer_ids: + if customer_id != preferred_customer_id: + await self._sever_guest_customer_link( + hotel_id, guest_id, customer_id, session + ) + + async def _choose_best_customer_for_guest( + self, + conversion_guest: ConversionGuest, + candidate_customer_ids: set[int], + session: AsyncSession, + ) -> int | None: + """Pick the most likely customer based on hashed data.""" + if not candidate_customer_ids: + return None + + result = await session.execute( + select(Customer).where(Customer.id.in_(candidate_customer_ids)) + ) + candidates = result.scalars().all() + + if not candidates: + return None + + def score_customer(customer: Customer) -> int: + score = 0 + if ( + conversion_guest.hashed_email + and customer.hashed_email == conversion_guest.hashed_email + ): + score += 100 + if ( + conversion_guest.hashed_first_name + and customer.hashed_given_name == conversion_guest.hashed_first_name + ): + score += 10 + if ( + conversion_guest.hashed_last_name + and customer.hashed_surname == conversion_guest.hashed_last_name + ): + score += 10 + if ( + conversion_guest.hashed_country_code + and customer.hashed_country_code == conversion_guest.hashed_country_code + ): + score += 2 + if ( + conversion_guest.hashed_birth_date + and customer.hashed_birth_date == conversion_guest.hashed_birth_date + ): + score += 1 + return score + + best_customer_id = None + best_score = -1 + is_tied = False + + for customer in candidates: + score = score_customer(customer) + if score > best_score: + best_score = score + best_customer_id = customer.id + is_tied = False + elif score == best_score and score != -1: + is_tied = True + + if best_score <= 0 or is_tied: + return None + + return best_customer_id + + async def _sever_guest_customer_link( + self, + hotel_id: str | None, + guest_id: int, + customer_id: int, + session: AsyncSession, + ) -> None: + """Remove incorrect guest/customer links from conversions.""" + result = await session.execute( + select(Conversion) + .where( + Conversion.hotel_id == hotel_id, + Conversion.guest_id == guest_id, + Conversion.customer_id == customer_id, + 
) + .options(selectinload(Conversion.conversion_rooms)) + ) + conversions = result.scalars().all() + + if not conversions: + return + + for conversion in conversions: + conversion.customer_id = None + conversion.reservation_id = None + conversion.directly_attributable = False + conversion.guest_matched = False + conversion.updated_at = datetime.now() + + _LOGGER.warning( + "Removed %d conversion links for guest %s (hotel=%s) customer %s", + len(conversions), + guest_id, + hotel_id, + customer_id, + ) + + + async def _check_if_attributable( self, matched_customer_id: int, From c4ecf3802fe3937b5316eb75e339ffde6e9370ab Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 16:12:07 +0100 Subject: [PATCH 10/20] Fine this needs more work --- src/alpine_bits_python/conversion_service.py | 242 ++++++++++++++----- 1 file changed, 182 insertions(+), 60 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index e76db31..7b53558 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -1116,7 +1116,7 @@ class ConversionService: return reservations[0] # If multiple matches or click-id matches, try to narrow down using hashed guest details - _LOGGER.debug( + _LOGGER.info( ( "Ambiguous advertising match for %s (hotel=%s, candidates=%d, md5_lookup=%s). " "Applying guest detail filtering." @@ -1159,6 +1159,8 @@ class ConversionService: guest_last_name: str | None, guest_email: str | None, session: AsyncSession | None = None, + *, + conversion_guest: ConversionGuest | None = None, ) -> Customer | None: """Match guest by name and email directly to Customer (no Reservation needed). @@ -1208,15 +1210,33 @@ class ConversionService: if len(matches) == 1: return matches[0] - # If multiple matches, prefer email match over name match - for match in matches: - if guest_email and match.hashed_email == guest_email: - _LOGGER.debug( - "Multiple hashed customer matches, preferring email match" - ) - return match + best_customer: Customer | None = None + best_score = -1 + tie = False + + for candidate in matches: + candidate_score = self._score_guest_customer_match( + conversion_guest, + candidate, + hashed_first_name=guest_first_name, + hashed_last_name=guest_last_name, + hashed_email=guest_email, + ) + if candidate_score > best_score: + best_score = candidate_score + best_customer = candidate + tie = False + elif candidate_score == best_score: + tie = True + + if best_customer and best_score > 0 and not tie: + _LOGGER.debug( + "Multiple hashed customer matches; selected candidate %s via score %s", + best_customer.id, + best_score, + ) + return best_customer - # Otherwise return first match _LOGGER.warning( "Multiple hashed customer matches found for guest details, using first match" ) @@ -1400,6 +1420,7 @@ class ConversionService: conversion_guest.hashed_last_name, conversion_guest.hashed_email, session, + conversion_guest=conversion_guest, ) if matched_hashed_customer: @@ -1540,42 +1561,46 @@ class ConversionService: _LOGGER.debug("Phase 3d: No matched guests to check for regularity") return - # Group by (hotel_id, guest_id) to detect conflicts. + # Group by guest and by customer to detect conflicts in both directions. 
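# The conflicts collected below are resolved elsewhere in this patch by comparing
# hashed guest and customer fields with a weighted score (email 200, matching
# first+last name 50, a single matching name 10, country 5, birth date 2); a tie
# or a zero score means no link is kept. A standalone sketch of that ranking,
# with a plain dataclass standing in for both ConversionGuest and Customer
# (illustrative field names, not the ORM models):
from dataclasses import dataclass


@dataclass
class _HashedPerson:
    hashed_email: str | None = None
    hashed_first_name: str | None = None
    hashed_last_name: str | None = None
    hashed_country_code: str | None = None
    hashed_birth_date: str | None = None


def score_match(guest: _HashedPerson, customer: _HashedPerson) -> int:
    """Higher score = stronger evidence that guest and customer are the same person."""
    score = 0
    if guest.hashed_email and guest.hashed_email == customer.hashed_email:
        score += 200
    if guest.hashed_first_name and guest.hashed_last_name:
        if (
            guest.hashed_first_name == customer.hashed_first_name
            and guest.hashed_last_name == customer.hashed_last_name
        ):
            score += 50
    elif guest.hashed_first_name and guest.hashed_first_name == customer.hashed_first_name:
        score += 10
    elif guest.hashed_last_name and guest.hashed_last_name == customer.hashed_last_name:
        score += 10
    if guest.hashed_country_code and guest.hashed_country_code == customer.hashed_country_code:
        score += 5
    if guest.hashed_birth_date and guest.hashed_birth_date == customer.hashed_birth_date:
        score += 2
    return score


def pick_best_customer(guest: _HashedPerson, candidates: dict[int, _HashedPerson]) -> int | None:
    """Return the winning customer id, or None on a tie or when nothing scores above zero."""
    best_id, best_score, tied = None, -1, False
    for customer_id, customer in candidates.items():
        score = score_match(guest, customer)
        if score > best_score:
            best_id, best_score, tied = customer_id, score, False
        elif score == best_score:
            tied = True
    return best_id if best_score > 0 and not tied else None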
guest_customer_sets: dict[tuple[str | None, int], set[int]] = {} + customer_guest_sets: dict[int, set[tuple[str | None, int]]] = {} for hotel_id, guest_id, customer_id in guest_customer_rows: if hotel_id is None or guest_id is None or customer_id is None: continue - key = (hotel_id, guest_id) - guest_customer_sets.setdefault(key, set()).add(customer_id) + guest_key = (hotel_id, guest_id) + guest_customer_sets.setdefault(guest_key, set()).add(customer_id) + customer_guest_sets.setdefault(customer_id, set()).add(guest_key) if not guest_customer_sets: _LOGGER.debug("Phase 3d: No matched guests to check for regularity") return - duplicates = { + guest_duplicates = { key: customer_ids for key, customer_ids in guest_customer_sets.items() if len(customer_ids) > 1 } - if duplicates: - await self._deduplicate_guest_customer_links(duplicates, session) + if guest_duplicates: + await self._deduplicate_guest_customer_links(guest_duplicates, session) + customer_duplicates = { + customer_id: guest_keys + for customer_id, guest_keys in customer_guest_sets.items() + if len(guest_keys) > 1 + } + if customer_duplicates: + await self._deduplicate_customer_guest_links(customer_duplicates, session) + + refreshed = await session.execute( + select( + Conversion.hotel_id, Conversion.guest_id, Conversion.customer_id + ).where(Conversion.guest_id.isnot(None), Conversion.customer_id.isnot(None)) + ) guest_to_customer: dict[tuple[str | None, int], int] = {} - for key, customer_ids in guest_customer_sets.items(): - hotel_id, guest_id = key - # After deduplication, reload conversions for this guest to find the remaining customer (if any) - result = await session.execute( - select(Conversion.customer_id) - .where( - Conversion.hotel_id == hotel_id, - Conversion.guest_id == guest_id, - Conversion.customer_id.isnot(None), - ) - .limit(1) - ) - chosen_customer = result.scalar_one_or_none() - if chosen_customer: - guest_to_customer[key] = chosen_customer + for hotel_id, guest_id, customer_id in refreshed.all(): + if hotel_id is None or guest_id is None or customer_id is None: + continue + guest_to_customer[(hotel_id, guest_id)] = customer_id if not guest_to_customer: _LOGGER.debug( @@ -1986,6 +2011,60 @@ class ConversionService: hotel_id, guest_id, customer_id, session ) + def _score_guest_customer_match( + self, + conversion_guest: ConversionGuest | None, + customer: Customer | None, + *, + hashed_first_name: str | None = None, + hashed_last_name: str | None = None, + hashed_email: str | None = None, + ) -> int: + """Score how well a guest matches a customer using hashed data.""" + if not customer: + return -1 + + score = 0 + guest_email_hash = ( + hashed_email or (conversion_guest.hashed_email if conversion_guest else None) + ) + guest_first_hash = ( + hashed_first_name + or (conversion_guest.hashed_first_name if conversion_guest else None) + ) + guest_last_hash = ( + hashed_last_name + or (conversion_guest.hashed_last_name if conversion_guest else None) + ) + + if guest_email_hash and customer.hashed_email == guest_email_hash: + score += 200 + if guest_first_hash and guest_last_hash: + if ( + customer.hashed_given_name == guest_first_hash + and customer.hashed_surname == guest_last_hash + ): + score += 50 + elif guest_first_hash and customer.hashed_given_name == guest_first_hash: + score += 10 + elif guest_last_hash and customer.hashed_surname == guest_last_hash: + score += 10 + + if conversion_guest: + if ( + conversion_guest.hashed_country_code + and customer.hashed_country_code + == conversion_guest.hashed_country_code 
+ ): + score += 5 + if ( + conversion_guest.hashed_birth_date + and customer.hashed_birth_date == conversion_guest.hashed_birth_date + ): + score += 2 + + return score + async def _choose_best_customer_for_guest( self, conversion_guest: ConversionGuest, @@ -2004,41 +2083,12 @@ class ConversionService: if not candidates: return None - def score_customer(customer: Customer) -> int: - score = 0 - if ( - conversion_guest.hashed_email - and customer.hashed_email == conversion_guest.hashed_email - ): - score += 100 - if ( - conversion_guest.hashed_first_name - and customer.hashed_given_name == conversion_guest.hashed_first_name - ): - score += 10 - if ( - conversion_guest.hashed_last_name - and customer.hashed_surname == conversion_guest.hashed_last_name - ): - score += 10 - if ( - conversion_guest.hashed_country_code - and customer.hashed_country_code == conversion_guest.hashed_country_code - ): - score += 2 - if ( - conversion_guest.hashed_birth_date - and customer.hashed_birth_date == conversion_guest.hashed_birth_date - ): - score += 1 - return score - best_customer_id = None best_score = -1 is_tied = False for customer in candidates: - score = score_customer(customer) + score = self._score_guest_customer_match(conversion_guest, customer) if score > best_score: best_score = score best_customer_id = customer.id @@ -2051,6 +2101,78 @@ class ConversionService: return best_customer_id + async def _deduplicate_customer_guest_links( + self, + duplicates: dict[int, set[tuple[str | None, int]]], + session: AsyncSession, + ) -> None: + """Ensure each customer is linked to at most one guest.""" + for customer_id, guest_keys in duplicates.items(): + customer_result = await session.execute( + select(Customer).where(Customer.id == customer_id) + ) + customer = customer_result.scalar_one_or_none() + + guest_records: list[tuple[str | None, int, ConversionGuest | None]] = [] + for hotel_id, guest_id in guest_keys: + guest_result = await session.execute( + select(ConversionGuest).where( + ConversionGuest.hotel_id == hotel_id, + ConversionGuest.guest_id == guest_id, + ) + ) + guest_records.append((hotel_id, guest_id, guest_result.scalar_one_or_none())) + + if not customer: + _LOGGER.warning( + "Customer %s missing while deduplicating guests; severing links %s", + customer_id, + guest_keys, + ) + for hotel_id, guest_id, _ in guest_records: + await self._sever_guest_customer_link( + hotel_id, guest_id, customer_id, session + ) + continue + + best_key: tuple[str | None, int] | None = None + best_score = -1 + is_tied = False + for hotel_id, guest_id, guest in guest_records: + score = self._score_guest_customer_match(guest, customer) + if score > best_score: + best_score = score + best_key = (hotel_id, guest_id) + is_tied = False + elif score == best_score: + is_tied = True + + if not best_key or best_score <= 0 or is_tied: + _LOGGER.warning( + "Customer %s linked to guests %s but no clear match; removing all links", + customer_id, + guest_keys, + ) + for hotel_id, guest_id, _ in guest_records: + await self._sever_guest_customer_link( + hotel_id, guest_id, customer_id, session + ) + continue + + _LOGGER.warning( + "Customer %s linked to multiple guests %s; keeping guest %s (hotel=%s, score=%s)", + customer_id, + guest_keys, + best_key[1], + best_key[0], + best_score, + ) + for hotel_id, guest_id, _ in guest_records: + if (hotel_id, guest_id) != best_key: + await self._sever_guest_customer_link( + hotel_id, guest_id, customer_id, session + ) + async def _sever_guest_customer_link( self, hotel_id: str | None, From 
24d64bf28f3be65e2deed44e5dc88600ed0ac778 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 17:05:58 +0100 Subject: [PATCH 11/20] Seems to mostly work now. Regular matching is still wrong --- sql_analysis.md | 2 +- src/alpine_bits_python/api.py | 2 +- src/alpine_bits_python/conversion_service.py | 1245 ++++-------------- tests/test_conversion_service.py | 20 +- 4 files changed, 301 insertions(+), 968 deletions(-) diff --git a/sql_analysis.md b/sql_analysis.md index 786f848..61a572e 100644 --- a/sql_analysis.md +++ b/sql_analysis.md @@ -42,7 +42,7 @@ select res.id, res.created_at, con.created_at as "Con Created at", con.updated_a left join alpinebits.conversions as con on con.reservation_id = res.id left join alpinebits.conversion_guests as g on g.guest_id = con.guest_id - where hotel_code = '39054_001' + where hotel_id = '39054_001' order by res.created_at desc limit 400 diff --git a/src/alpine_bits_python/api.py b/src/alpine_bits_python/api.py index 20a3b98..c6ba9e2 100644 --- a/src/alpine_bits_python/api.py +++ b/src/alpine_bits_python/api.py @@ -1197,7 +1197,7 @@ async def _process_conversion_xml_background( # Now process the conversion XML _LOGGER.info("Starting database processing of %s", filename) conversion_service = ConversionService(session_maker, hotel.hotel_id) - processing_stats = await conversion_service.process_conversion_xml(xml_content) + processing_stats = await conversion_service.process_conversion_xml(xml_content, run_full_guest_matching=True) _LOGGER.info( "Conversion processing complete for %s: %s", filename, processing_stats diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 7b53558..4130ebc 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -446,11 +446,16 @@ class ConversionService: len(items), ) - async def process_conversion_xml(self, xml_content: str) -> dict[str, Any]: + async def process_conversion_xml( + self, xml_content: str, *, run_full_guest_matching: bool = False + ) -> dict[str, Any]: """Parse conversion XML and save daily sales data to database. Args: xml_content: XML string containing reservation and daily sales data + run_full_guest_matching: If True, run guest-based matching for all unmatched + conversions of this hotel after processing the XML. When False (default), + guest matching only runs for the conversions touched in this XML. 
Returns: Dictionary with processing statistics @@ -575,10 +580,7 @@ class ConversionService: len(reservations), ) - # Phase 3: Match all conversions using database data only - # Only match conversions that actually exist in the database - # No XML parsing, no re-hashing - complete separation of concerns - # This also enables matching historical data that wasn't just created + # Phase 3: Match all conversions using database data only via advertising IDs if pms_reservation_ids: _LOGGER.debug("Phase 3: Matching conversions to reservations/customers") if self.supports_concurrent: @@ -590,6 +592,11 @@ class ConversionService: pms_reservation_ids, stats ) + # Guest matching (optional full scan or limited to current XML) + await self._run_guest_matching( + pms_reservation_ids, stats, run_full_guest_matching + ) + return stats async def _load_reservation_cache(self) -> None: @@ -667,7 +674,7 @@ class ConversionService: async def _process_reservations_sequential( self, reservations: list, stats: dict[str, int] - ) -> list[str]: + ) -> list[int]: """Process reservations one at a time (original behavior). Returns: @@ -705,7 +712,7 @@ class ConversionService: async def _process_reservations_concurrent( self, reservations: list, stats: dict[str, int] - ) -> list[str]: + ) -> list[int]: """Process reservations concurrently with semaphore limiting. Each concurrent task gets its own independent database session @@ -1055,25 +1062,17 @@ class ConversionService: hashed_first_name: str | None, hashed_last_name: str | None, hashed_email: str | None, - advertising_partner: str | None, session: AsyncSession | None = None, - raw_first_name: str | None = None, - raw_last_name: str | None = None, - raw_email: str | None = None, ) -> Reservation | None: """Match reservation by advertising tracking data (fbclid/gclid/md5_unique_id). Args: - advertising_campagne: Tracking ID from PMS (could be truncated click_id or md5_unique_id) + advertising_campagne: Tracking ID from PMS (typically md5_unique_id) hotel_id: Hotel ID for filtering hashed_first_name: Guest first name (hashed) for disambiguation hashed_last_name: Guest last name (hashed) for disambiguation hashed_email: Guest email (hashed) for disambiguation - advertising_partner: Partner info (matches utm_medium) session: AsyncSession to use. If None, uses self.session. 
- raw_first_name: Plain guest first name (optional fallback) - raw_last_name: Plain guest last name (optional fallback) - raw_email: Plain guest email (optional fallback) Returns: Matched Reservation or None @@ -1081,548 +1080,117 @@ class ConversionService: """ if session is None: session = self.session - # Find reservations where: - # - fbclid/gclid starts with the advertising_campagne value, OR - # - md5_unique_id matches exactly (for direct ID matching) - query = select(Reservation).where( + + if not advertising_campagne: + return None + + base_query = select(Reservation).options(selectinload(Reservation.customer)) + if hotel_id: + base_query = base_query.where(Reservation.hotel_id == hotel_id) + + # Normal flow: md5 hash matches exactly + is_md5_lookup = len(advertising_campagne or "") == 32 + if is_md5_lookup: + md5_query = base_query.where( + Reservation.md5_unique_id == advertising_campagne + ) + md5_result = await session.execute(md5_query) + md5_matches = md5_result.scalars().all() + if md5_matches: + return self._select_reservation_by_guest_hashes( + md5_matches, + hashed_first_name, + hashed_last_name, + hashed_email, + ) + + # Fallback: advertising ids (fbclid/gclid) are truncated, so only match prefix + like_pattern = f"{advertising_campagne}%" + advertising_query = base_query.where( or_( - Reservation.fbclid.like(f"{advertising_campagne}%"), - Reservation.gclid.like(f"{advertising_campagne}%"), - Reservation.md5_unique_id == advertising_campagne, + Reservation.fbclid.like(like_pattern), + Reservation.gclid.like(like_pattern), ) ) - - # Eagerly load the customer relationship - query = query.options(selectinload(Reservation.customer)) - - # Add hotel filter if available - if hotel_id: - query = query.where(Reservation.hotel_id == hotel_id) - - # Execute query - db_result = await session.execute(query) - reservations = db_result.scalars().all() + advertising_result = await session.execute(advertising_query) + reservations = advertising_result.scalars().all() if not reservations: return None - # Determine if this looks like a md5_unique_id (32 hex characters) or a click_id - is_md5_lookup = len(advertising_campagne or "") == 32 - - needs_filtering = len(reservations) > 1 or not is_md5_lookup - - if not needs_filtering: - # Confident single match via md5_unique_id - return reservations[0] - - # If multiple matches or click-id matches, try to narrow down using hashed guest details - _LOGGER.info( - ( - "Ambiguous advertising match for %s (hotel=%s, candidates=%d, md5_lookup=%s). " - "Applying guest detail filtering." - ), - advertising_campagne, - hotel_id, - len(reservations), - is_md5_lookup, - ) - - matched_reservation = self._filter_reservations_by_guest_details( - reservations, - raw_first_name, - raw_last_name, - raw_email, - advertising_partner, - hashed_first_name=hashed_first_name, - hashed_last_name=hashed_last_name, - hashed_email=hashed_email, - ) - - if matched_reservation is None: - # If we still can't narrow it down, use the first match and log warning - _LOGGER.warning( - "Could not narrow down multiple reservations for advertisingCampagne %s " - "(hotel=%s, guest=%s %s, email=%s). Using first match.", + if len(reservations) > 1: + _LOGGER.info( + ( + "Ambiguous advertising match for %s (hotel=%s, candidates=%d). " + "Using hashed guest data to deduplicate." 
+ ), advertising_campagne, hotel_id, - raw_first_name, - raw_last_name, - raw_email, - ) - matched_reservation = reservations[0] - - return matched_reservation - - async def _match_by_guest_details_hashed( - self, - guest_first_name: str | None, - guest_last_name: str | None, - guest_email: str | None, - session: AsyncSession | None = None, - *, - conversion_guest: ConversionGuest | None = None, - ) -> Customer | None: - """Match guest by name and email directly to Customer (no Reservation needed). - - This method bypasses the Reservation table entirely and matches directly against - hashed customer data. Used for guest-detail matching where we don't need to link - to a specific reservation. - - Args: - guest_first_name: Guest first name (pre-hashed) - guest_last_name: Guest last name (pre-hashed) - guest_email: Guest email (pre-hashed) - session: AsyncSession to use. If None, uses self.session. - - Returns: - Matched Customer or None - - """ - if session is None: - session = self.session - - # Query all hashed customers that match the guest details - query = select(Customer) - - # Build filter conditions - conditions = [] - if guest_email: - conditions.append(Customer.hashed_email == guest_email) - if guest_first_name and guest_last_name: - conditions.append( - (Customer.hashed_given_name == guest_first_name) - & (Customer.hashed_surname == guest_last_name) + len(reservations), ) - if not conditions: - return None - - # Combine conditions with OR (match if email matches OR name matches) - query = query.where(or_(*conditions)) - - db_result = await session.execute(query) - matches = db_result.scalars().all() - - if not matches: - return None - - # If single match, return it - if len(matches) == 1: - return matches[0] - - best_customer: Customer | None = None - best_score = -1 - tie = False - - for candidate in matches: - candidate_score = self._score_guest_customer_match( - conversion_guest, - candidate, - hashed_first_name=guest_first_name, - hashed_last_name=guest_last_name, - hashed_email=guest_email, - ) - if candidate_score > best_score: - best_score = candidate_score - best_customer = candidate - tie = False - elif candidate_score == best_score: - tie = True - - if best_customer and best_score > 0 and not tie: - _LOGGER.debug( - "Multiple hashed customer matches; selected candidate %s via score %s", - best_customer.id, - best_score, - ) - return best_customer - - _LOGGER.warning( - "Multiple hashed customer matches found for guest details, using first match" + return self._select_reservation_by_guest_hashes( + reservations, + hashed_first_name, + hashed_last_name, + hashed_email, ) - return matches[0] - def _filter_reservations_by_guest_details( + def _select_reservation_by_guest_hashes( self, reservations: list[Reservation], - guest_first_name: str | None, - guest_last_name: str | None, - guest_email: str | None, - advertising_partner: str | None, - *, - hashed_first_name: str | None = None, - hashed_last_name: str | None = None, - hashed_email: str | None = None, + hashed_first_name: str | None, + hashed_last_name: str | None, + hashed_email: str | None, ) -> Reservation | None: - """Filter reservations using guest details to find a single match. + """Select the best matching reservation using hashed guest info.""" + if not reservations: + return None - Prefers hashed comparisons (exact match on hashed email or hashed name pair) and - falls back to plaintext comparison if hashes are unavailable. Finally tries - advertising partner vs utm_medium. 
- - Args: - reservations: List of candidate reservations - guest_first_name: Guest first name (plaintext, optional) - guest_last_name: Guest last name (plaintext, optional) - guest_email: Guest email (plaintext, optional) - advertising_partner: Partner info (e.g., "Facebook_Mobile_Feed") - hashed_first_name: Hashed first name for cross-checking - hashed_last_name: Hashed last name for cross-checking - hashed_email: Hashed email for cross-checking - - Returns: - Single best-match Reservation, or None if no good match found - - """ - candidates = reservations - - # Attempt hashed email match first - if hashed_email: - email_matches = [ - reservation - for reservation in candidates - if reservation.customer - and reservation.customer.hashed_email + def _matches_email(reservation: Reservation) -> bool: + return ( + hashed_email is not None + and reservation.customer is not None and reservation.customer.hashed_email == hashed_email - ] - if len(email_matches) == 1: - _LOGGER.debug("Found unique match via hashed email") - return email_matches[0] - if email_matches: - candidates = email_matches + ) - # Attempt hashed name match (first + last) - if hashed_first_name and hashed_last_name: - name_matches = [ - reservation - for reservation in candidates - if reservation.customer + def _matches_full_name(reservation: Reservation) -> bool: + return ( + hashed_first_name is not None + and hashed_last_name is not None + and reservation.customer is not None and reservation.customer.hashed_given_name == hashed_first_name and reservation.customer.hashed_surname == hashed_last_name - ] - if len(name_matches) == 1: - _LOGGER.debug("Found unique match via hashed names") - return name_matches[0] - if name_matches: - candidates = name_matches - - # Fallback to plaintext comparison if provided - if guest_first_name or guest_last_name or guest_email: - for reservation in candidates: - customer = reservation.customer - if not customer: - continue - - name_match = True - email_match = True - - if guest_first_name: - name_match = name_match and ( - customer.given_name - and customer.given_name.lower() == guest_first_name.lower() - ) - - if guest_last_name: - name_match = name_match and ( - customer.surname - and customer.surname.lower() == guest_last_name.lower() - ) - - if guest_email: - email_match = email_match and ( - customer.email_address - and customer.email_address.lower() == guest_email.lower() - ) - - if name_match and email_match: - _LOGGER.debug( - "Found exact plaintext match on guest details for %s %s", - guest_first_name, - guest_last_name, - ) - return reservation - - # Try to narrow down by advertising_partner matching utm_medium - if advertising_partner: - for reservation in candidates: - if ( - reservation.utm_medium - and reservation.utm_medium.lower() == advertising_partner.lower() - ): - _LOGGER.debug( - "Found match on advertising_partner=%s matching utm_medium", - advertising_partner, - ) - return reservation - - # No single clear match found - return None - - async def _extract_unmatched_guests( - self, session: AsyncSession - ) -> dict[str, Customer]: - """Phase 3b: Extract unique guests from unmatched conversions and match them to customers. - - Returns a mapping of guest_id -> Customer for all unique guests found in - unmatched conversions. Only processes each guest once. - - This includes: - 1. Conversions with no customer match at all (reservation_id IS NULL AND customer_id IS NULL) - 2. 
Conversions matched to a customer but not a reservation (reservation_id IS NULL AND customer_id IS NOT NULL) - - These may have been matched in a previous run and need re-evaluation for reservation linking - - Args: - session: AsyncSession for database queries - - Returns: - Dictionary mapping guest_id to matched Customer (or None if no match) - - """ - # Find all conversions that either: - # - Have no match at all (reservation_id IS NULL AND customer_id IS NULL), OR - # - Have a customer but no reservation (for re-linking in case new reservations were added) - result = await session.execute( - select(Conversion) - .where( - (Conversion.reservation_id.is_(None)) - & (Conversion.guest_id.isnot(None)) - ) - .options(selectinload(Conversion.guest)) - ) - unmatched_conversions = result.scalars().all() - - if not unmatched_conversions: - _LOGGER.debug("Phase 3b: No unmatched conversions with guests") - return {} - - # Extract unique guests (by guest_id) - unique_guests: dict[str, Conversion] = {} - for conversion in unmatched_conversions: - if conversion.guest_id not in unique_guests: - unique_guests[conversion.guest_id] = conversion - - _LOGGER.info( - "Phase 3b: Found %d unique guests from %d unmatched conversions", - len(unique_guests), - len(unmatched_conversions), - ) - - # Match each unique guest to a hashed customer - guest_to_hashed_customer: dict[str, Customer] = {} - for guest_id, conversion in unique_guests.items(): - conversion_guest = conversion.guest - if not conversion_guest: - continue - - # Try to match by guest details - matched_hashed_customer = await self._match_by_guest_details_hashed( - conversion_guest.hashed_first_name, - conversion_guest.hashed_last_name, - conversion_guest.hashed_email, - session, - conversion_guest=conversion_guest, ) - if matched_hashed_customer: - guest_to_hashed_customer[guest_id] = matched_hashed_customer - _LOGGER.debug( - "Phase 3b: Matched guest %s to hashed_customer %d", - guest_id, - matched_hashed_customer.id, + email_matches = [res for res in reservations if _matches_email(res)] + if email_matches: + if len(email_matches) > 1: + _LOGGER.warning( + "Multiple reservations matched hashed email; using first candidate" ) - else: - guest_to_hashed_customer[guest_id] = None - _LOGGER.debug("Phase 3b: No match found for guest %s", guest_id) + return email_matches[0] - return guest_to_hashed_customer - - async def _link_matched_guests_to_reservations( - self, - guest_to_customer_dict: dict[str, Customer], - session: AsyncSession, - stats: dict[str, int], - ) -> None: - """Phase 3c: Link conversions from matched guests to reservations based on dates. - - For each guest matched to a hashed_customer, find all conversions from that guest - that don't have a reservation yet, and try to link them to reservations based on - date matching. - - This includes: - 1. Conversions with no customer match (will link customer first) - 2. Conversions already linked to a customer from a previous run (will try to link to reservation) - - After all conversions for a guest are processed, check if the guest is a regular - by looking at whether they have paying conversions that predate any reservations. 
- - Args: - guest_to_customer: Mapping from guest_id to matched Customer - session: AsyncSession for database queries - stats: Shared stats dictionary to update - - """ - for guest_id, matched_hashed_customer in guest_to_customer_dict.items(): - if not matched_hashed_customer or not matched_hashed_customer.id: - continue - - # Find all conversions from this guest that don't have a reservation - # (whether or not they have a customer match - we might be re-running after new reservations added) - result = await session.execute( - select(Conversion) - .where( - (Conversion.guest_id == guest_id) - & (Conversion.reservation_id.is_(None)) + name_matches = [res for res in reservations if _matches_full_name(res)] + if name_matches: + if len(name_matches) > 1: + _LOGGER.warning( + "Multiple reservations matched hashed name; using first candidate" ) - .options( - selectinload(Conversion.conversion_rooms), - selectinload(Conversion.guest), - ) - ) - conversions = result.scalars().all() + return name_matches[0] - if not conversions: - continue - - _LOGGER.debug( - "Phase 3c: Processing %d conversions for guest %s (customer_id=%d)", - len(conversions), - guest_id, - matched_hashed_customer.id, + if len(reservations) > 1: + _LOGGER.warning( + "Unable to disambiguate %d reservations without hashed guest data; using first match", + len(reservations), ) - # Try to link each conversion to a reservation for this customer - for conversion in conversions: - ( - matched_reservation, - is_attributable, - ) = await self._check_if_attributable( - matched_hashed_customer.id, conversion, session - ) - - if matched_reservation and is_attributable: - # Only update stats if this is a NEW match (wasn't matched before) - was_previously_matched = conversion.customer_id is not None - - conversion.reservation_id = matched_reservation.id - conversion.customer_id = matched_hashed_customer.id - conversion.directly_attributable = True - conversion.guest_matched = True - conversion.updated_at = datetime.now() - - if not was_previously_matched: - stats["matched_to_reservation"] += 1 - - _LOGGER.info( - "Phase 3c: Linked conversion (pms_id=%s) to reservation %d via guest matching", - conversion.pms_reservation_id, - matched_reservation.id, - ) - elif matched_hashed_customer and conversion.customer_id is None: - # Only count new customer matches (conversions that didn't have a customer before) - conversion.customer_id = matched_hashed_customer.id - conversion.directly_attributable = False - conversion.guest_matched = True - conversion.updated_at = datetime.now() - stats["matched_to_customer"] += 1 - - # After all conversions for this guest are processed, check if guest is regular - # Look at ALL conversions from this guest to see if there are pre-dated payments - if conversions and conversions[0].guest: - await self._check_if_guest_is_regular( - guest_id, matched_hashed_customer.id, session - ) - - async def _check_regularity_for_all_matched_guests( - self, session: AsyncSession - ) -> None: - """Phase 3d: Check regularity for ALL matched guests (both ID-matched and guest-detail-matched). - - This is called after all matching is complete to evaluate every guest that has been - matched to a customer, regardless of match type. This ensures consistent regularity - evaluation across all matched conversions. - - This is run on ALL matched guests, not just newly matched ones, to ensure that if - the regularity logic changes, it gets re-applied to all guests on the next run. - This maintains idempotency of the matching process. 
- - Args: - session: AsyncSession for database queries - - """ - # Collect every (hotel, guest) -> customer pair derived from conversions. - result = await session.execute( - select(Conversion.hotel_id, Conversion.guest_id, Conversion.customer_id).where( - Conversion.guest_id.isnot(None), Conversion.customer_id.isnot(None) - ) - ) - guest_customer_rows = result.all() - - if not guest_customer_rows: - _LOGGER.debug("Phase 3d: No matched guests to check for regularity") - return - - # Group by guest and by customer to detect conflicts in both directions. - guest_customer_sets: dict[tuple[str | None, int], set[int]] = {} - customer_guest_sets: dict[int, set[tuple[str | None, int]]] = {} - for hotel_id, guest_id, customer_id in guest_customer_rows: - if hotel_id is None or guest_id is None or customer_id is None: - continue - guest_key = (hotel_id, guest_id) - guest_customer_sets.setdefault(guest_key, set()).add(customer_id) - customer_guest_sets.setdefault(customer_id, set()).add(guest_key) - - if not guest_customer_sets: - _LOGGER.debug("Phase 3d: No matched guests to check for regularity") - return - - guest_duplicates = { - key: customer_ids - for key, customer_ids in guest_customer_sets.items() - if len(customer_ids) > 1 - } - if guest_duplicates: - await self._deduplicate_guest_customer_links(guest_duplicates, session) - - customer_duplicates = { - customer_id: guest_keys - for customer_id, guest_keys in customer_guest_sets.items() - if len(guest_keys) > 1 - } - if customer_duplicates: - await self._deduplicate_customer_guest_links(customer_duplicates, session) - - refreshed = await session.execute( - select( - Conversion.hotel_id, Conversion.guest_id, Conversion.customer_id - ).where(Conversion.guest_id.isnot(None), Conversion.customer_id.isnot(None)) - ) - guest_to_customer: dict[tuple[str | None, int], int] = {} - for hotel_id, guest_id, customer_id in refreshed.all(): - if hotel_id is None or guest_id is None or customer_id is None: - continue - guest_to_customer[(hotel_id, guest_id)] = customer_id - - if not guest_to_customer: - _LOGGER.debug( - "Phase 3d: No guests remained linked to customers after deduplication" - ) - return - - _LOGGER.debug( - "Phase 3d: Checking regularity for %d matched guests", len(guest_to_customer) - ) - - for (hotel_id, guest_id), customer_id in guest_to_customer.items(): - await self._check_if_guest_is_regular(guest_id, customer_id, session) + return reservations[0] async def _match_conversions_from_db_sequential( - self, pms_reservation_ids: list[str], stats: dict[str, int] + self, pms_reservation_ids: list[int], stats: dict[str, int] ) -> None: - """Phase 3a: Match conversions sequentially using database data only. - - Performs ID-based matching for all conversions. Then extracts unique guests - and performs guest detail matching in phases 3b and 3c. 
- """ + """Match conversions sequentially using database data only.""" semaphore = asyncio.Semaphore(1) # Process one at a time async with asyncio.TaskGroup() as tg: for pms_id in pms_reservation_ids: @@ -1630,44 +1198,10 @@ class ConversionService: self._match_conversion_from_db_safe(pms_id, semaphore, stats) ) - # After Phase 3a (ID-based matching), do Phase 3b and 3c (guest detail matching) - if self.session_maker: - session = await self.session_maker.create_session() - else: - session = self.session - - try: - # Phase 3b: Extract and cache unique guests from unmatched conversions - guest_to_hashed_customer = await self._extract_unmatched_guests(session) - - # Phase 3c: Link matched guests to reservations - if guest_to_hashed_customer: - await self._link_matched_guests_to_reservations( - guest_to_hashed_customer, session, stats - ) - - # Phase 3d: Check regularity for all matched guests (both ID and guest-detail matched) - await self._check_regularity_for_all_matched_guests(session) - - await session.commit() - except Exception as e: - await session.rollback() - _LOGGER.exception("Error in Phase 3b/3c/3d guest matching: %s", e) - finally: - if self.session_maker: - await session.close() - async def _match_conversions_from_db_concurrent( - self, pms_reservation_ids: list[str], stats: dict[str, int] + self, pms_reservation_ids: list[int], stats: dict[str, int] ) -> None: - """Phase 3a: Match conversions concurrently using database data only. - - Performs ID-based matching for all conversions concurrently. Then extracts unique guests - and performs guest detail matching sequentially in phases 3b and 3c. - - Each concurrent task gets its own independent database session - from the SessionMaker. - """ + """Match conversions concurrently using database data only.""" if not self.session_maker: _LOGGER.error( "Concurrent matching requested but SessionMaker not available. " @@ -1683,30 +1217,6 @@ class ConversionService: self._match_conversion_from_db_safe(pms_id, semaphore, stats) ) - # After Phase 3a (ID-based matching), do Phase 3b and 3c (guest detail matching) - # Use sequential processing for guest matching to avoid duplicating work - session = await self.session_maker.create_session() - - try: - # Phase 3b: Extract and cache unique guests from unmatched conversions - guest_to_hashed_customer = await self._extract_unmatched_guests(session) - - # Phase 3c: Link matched guests to reservations - if guest_to_hashed_customer: - await self._link_matched_guests_to_reservations( - guest_to_hashed_customer, session, stats - ) - - # Phase 3d: Check regularity for all matched guests (both ID and guest-detail matched) - await self._check_regularity_for_all_matched_guests(session) - - await session.commit() - except Exception as e: - await session.rollback() - _LOGGER.exception("Error in Phase 3b/3c/3d guest matching: %s", e) - finally: - await session.close() - async def _match_conversion_from_db_safe( self, pms_reservation_id: int, @@ -1764,16 +1274,10 @@ class ConversionService: session: AsyncSession | None = None, stats: dict[str, int] | None = None, ) -> None: - """Phase 3a: Match a conversion using ID-based matching only. + """Match a conversion to an existing reservation using tracking IDs. - This method reads both the conversion and conversion_guest from the database - and uses their stored hashed data to match to existing reservations via - advertising data (fbclid/gclid/md5_unique_id). - - Guest detail matching is deferred to Phase 3b/3c to avoid reprocessing the same - guest multiple times. 
- - Updates stats dictionary in-place if provided. + Uses the stored advertising tracking ID (md5_unique_id/fbclid/gclid) plus hashed + guest details for deduplication when multiple reservations share the same tracking ID. Args: pms_reservation_id: PMS reservation ID to match @@ -1819,20 +1323,12 @@ class ConversionService: hashed_first_name = None hashed_last_name = None hashed_email = None - raw_first_name = None - raw_last_name = None - raw_email = None if conversion_guest: hashed_first_name = conversion_guest.hashed_first_name hashed_last_name = conversion_guest.hashed_last_name hashed_email = conversion_guest.hashed_email - raw_first_name = conversion_guest.guest_first_name - raw_last_name = conversion_guest.guest_last_name - raw_email = conversion_guest.guest_email - # Phase 3a: Only try ID-based matching (fbclid/gclid/md5_unique_id) - # Guest detail matching is deferred to Phase 3b/3c matched_reservation = None matched_customer = None @@ -1843,30 +1339,22 @@ class ConversionService: hashed_first_name, hashed_last_name, hashed_email, - conversion.advertising_partner, session, - raw_first_name=raw_first_name, - raw_last_name=raw_last_name, - raw_email=raw_email, ) if matched_reservation: matched_customer = matched_reservation.customer _LOGGER.info( - "Phase 3a: Matched conversion by advertising ID (pms_id=%s, reservation_id=%d)", + "Matched conversion by advertising ID (pms_id=%s, reservation_id=%d)", pms_reservation_id, matched_reservation.id, ) # Update the conversion with matched entities if found - if matched_reservation or matched_customer: - conversion.reservation_id = ( - matched_reservation.id if matched_reservation else None - ) + if matched_reservation: + conversion.reservation_id = matched_reservation.id conversion.customer_id = matched_customer.id if matched_customer else None - - # ID-based matches are always directly attributable conversion.directly_attributable = True conversion.guest_matched = False @@ -1877,383 +1365,239 @@ class ConversionService: if stats is not None: if matched_reservation: stats["matched_to_reservation"] += 1 - elif matched_customer: - stats["matched_to_customer"] += 1 else: stats["unmatched"] += 1 - async def _check_if_guest_is_regular( + async def _run_guest_matching( self, - guest_id: str, - customer_id: int, - session: AsyncSession, + pms_reservation_ids: list[int], + stats: dict[str, int], + run_full_guest_matching: bool, ) -> None: - """Check if a guest is a regular customer based on conversion and reservation history. - - A guest is regular if they have conversions with paying bookings that predate their first - reservation sent by us. This indicates they were already a customer before we started tracking. - - This check is done AFTER all conversions for a guest have been processed and matched, - so it can evaluate the complete picture of their payment history vs our reservation history. + """Run guest-detail matching for conversions. 
Args: - guest_id: The guest ID to evaluate - customer_id: The matched customer ID - session: AsyncSession for database queries + pms_reservation_ids: Reservation IDs processed in the current XML run + stats: Shared stats dictionary + run_full_guest_matching: If True, scan all guest conversions (including those already matched) """ - # Get the ConversionGuest record - guest_result = await session.execute( - select(ConversionGuest).where(ConversionGuest.guest_id == guest_id) - ) - conversion_guest = guest_result.scalar_one_or_none() - - if not conversion_guest: + if not self.hotel_id: + _LOGGER.warning("Guest matching skipped: hotel_id not set") return - # Find the earliest paying conversion for this guest (across all hotels) - # Look for conversions with actual revenue - earliest_paying_conversion_result = await session.execute( - select(Conversion) - .join(ConversionRoom, Conversion.id == ConversionRoom.conversion_id) - .where( - Conversion.guest_id == guest_id, - ConversionRoom.total_revenue.isnot(None), - ConversionRoom.total_revenue > Decimal(0), - ) - .order_by(Conversion.reservation_date.asc()) - .limit(1) - ) - earliest_paying_conversion = ( - earliest_paying_conversion_result.scalar_one_or_none() - ) - - if not earliest_paying_conversion: - # No paying conversions found for this guest - conversion_guest.is_regular = False + if not run_full_guest_matching and not pms_reservation_ids: return - # Find the earliest reservation sent to this customer - earliest_reservation_result = await session.execute( - select(Reservation) - .where(Reservation.customer_id == customer_id) - .order_by(Reservation.start_date.asc()) - .limit(1) - ) - earliest_reservation = earliest_reservation_result.scalar_one_or_none() + if self.session_maker: + session = await self.session_maker.create_session() + close_session = True + else: + session = self.session + close_session = False - if not earliest_reservation: - # No reservations for this customer yet, can't determine regularity - conversion_guest.is_regular = False + if not session: + _LOGGER.warning("Guest matching skipped: no active session") return - # Guest is regular if their earliest paying conversion predates our first reservation - # (meaning they were already a customer before we sent them a reservation) - # Compare against the reservation's creation date (when WE created/sent it), not check-in date - # Convert created_at to date for comparison with reservation_date (both are dates) - is_regular = ( - earliest_paying_conversion.reservation_date - < earliest_reservation.created_at.date() - ) - conversion_guest.is_regular = is_regular + try: + base_conditions = [ + Conversion.hotel_id == self.hotel_id, + Conversion.guest_id.isnot(None), + ] + mode_label = "full-scan" if run_full_guest_matching else "new-only" - async def _deduplicate_guest_customer_links( - self, - duplicates: dict[tuple[str | None, int], set[int]], - session: AsyncSession, - ) -> None: - """Resolve guest/customer conflicts by comparing hashed details and severing bad links.""" - for (hotel_id, guest_id), customer_ids in duplicates.items(): - guest_result = await session.execute( - select(ConversionGuest).where( - ConversionGuest.hotel_id == hotel_id, - ConversionGuest.guest_id == guest_id, - ) - ) - conversion_guest = guest_result.scalar_one_or_none() - - if not conversion_guest: - _LOGGER.warning( - "Guest %s (hotel=%s) missing when resolving duplicates; removing links to customers %s", - guest_id, - hotel_id, - sorted(customer_ids), - ) - for customer_id in customer_ids: - await 
self._sever_guest_customer_link( - hotel_id, guest_id, customer_id, session - ) - continue - - preferred_customer_id = await self._choose_best_customer_for_guest( - conversion_guest, customer_ids, session + query = select(Conversion).where(*base_conditions).options( + selectinload(Conversion.guest), + selectinload(Conversion.conversion_rooms), + selectinload(Conversion.customer), ) - if preferred_customer_id: - _LOGGER.warning( - "Guest %s (hotel=%s) linked to multiple customers %s; keeping %s based on hashed data", - guest_id, - hotel_id, - sorted(customer_ids), - preferred_customer_id, + target_ids: set[int] = set() + if run_full_guest_matching: + _LOGGER.info( + "Guest matching (%s): scanning all conversions with guests for hotel %s", + mode_label, + self.hotel_id, ) else: - _LOGGER.warning( - "Guest %s (hotel=%s) linked to multiple customers %s but none matched hashed data. Removing all links.", - guest_id, - hotel_id, - sorted(customer_ids), + query = query.where(Conversion.reservation_id.is_(None)) + if pms_reservation_ids: + target_ids = { + int(pms_id) + for pms_id in pms_reservation_ids + if pms_id is not None + } + if not target_ids: + return + query = query.where(Conversion.pms_reservation_id.in_(target_ids)) + _LOGGER.debug( + "Guest matching (%s): scanning %d recent conversions", + mode_label, + len(target_ids), ) - for customer_id in customer_ids: - if customer_id != preferred_customer_id: - await self._sever_guest_customer_link( - hotel_id, guest_id, customer_id, session - ) + result = await session.execute(query) + conversions = result.scalars().all() - def _score_guest_customer_match( + if not conversions: + return + + for conversion in conversions: + had_reservation = conversion.reservation_id is not None + had_customer = conversion.customer_id is not None + + matched_reservation, matched_customer = await self._apply_guest_matching( + conversion, session, rematch_existing=run_full_guest_matching + ) + + if matched_reservation and not had_reservation: + stats["matched_to_reservation"] += 1 + if stats["unmatched"] > 0: + stats["unmatched"] -= 1 + elif matched_customer and not had_customer: + stats["matched_to_customer"] += 1 + if stats["unmatched"] > 0: + stats["unmatched"] -= 1 + + await session.commit() + except Exception as exc: + await session.rollback() + _LOGGER.exception("Guest matching failed: %s", exc) + finally: + if close_session: + await session.close() + + async def _apply_guest_matching( self, - conversion_guest: ConversionGuest | None, - customer: Customer | None, - *, - hashed_first_name: str | None = None, - hashed_last_name: str | None = None, - hashed_email: str | None = None, - ) -> int: - """Score how well a guest matches a customer using hashed data.""" - if not customer: - return -1 - - score = 0 - guest_email_hash = ( - hashed_email or (conversion_guest.hashed_email if conversion_guest else None) - ) - guest_first_hash = ( - hashed_first_name - or (conversion_guest.hashed_first_name if conversion_guest else None) - ) - guest_last_hash = ( - hashed_last_name - or (conversion_guest.hashed_last_name if conversion_guest else None) - ) - - if guest_email_hash and customer.hashed_email == guest_email_hash: - score += 200 - if guest_first_hash and guest_last_hash: - if ( - customer.hashed_given_name == guest_first_hash - and customer.hashed_surname == guest_last_hash - ): - score += 50 - elif guest_first_hash and customer.hashed_given_name == guest_first_hash: - score += 10 - elif guest_last_hash and customer.hashed_surname == guest_last_hash: - score += 10 
- - if conversion_guest: - if ( - conversion_guest.hashed_country_code - and customer.hashed_country_code - == conversion_guest.hashed_country_code - ): - score += 5 - if ( - conversion_guest.hashed_birth_date - and customer.hashed_birth_date == conversion_guest.hashed_birth_date - ): - score += 2 - - return score - - async def _choose_best_customer_for_guest( - self, - conversion_guest: ConversionGuest, - candidate_customer_ids: set[int], + conversion: Conversion, session: AsyncSession, - ) -> int | None: - """Pick the most likely customer based on hashed data.""" - if not candidate_customer_ids: + *, + rematch_existing: bool, + ) -> tuple[Reservation | None, Customer | None]: + """Apply guest-detail matching to a single conversion.""" + guest = conversion.guest + if not guest: + return None, None + + existing_customer = conversion.customer + matched_customer = existing_customer + + if rematch_existing or matched_customer is None: + candidate = await self._find_customer_for_guest(guest, session) + if candidate is not None: + matched_customer = candidate + elif matched_customer is None: + return None, None + + matched_reservation = None + if matched_customer: + matched_reservation = await self._find_reservation_for_customer( + matched_customer.id, conversion, session + ) + + if matched_customer: + conversion.customer_id = matched_customer.id + conversion.guest_matched = True + + if matched_reservation: + conversion.reservation_id = matched_reservation.id + conversion.directly_attributable = True + elif conversion.reservation_id is None: + conversion.directly_attributable = False + + conversion.updated_at = datetime.now() + + return matched_reservation, matched_customer + + async def _find_customer_for_guest( + self, guest: ConversionGuest, session: AsyncSession + ) -> Customer | None: + """Locate the best matching customer for the given guest hashes.""" + conditions = [] + if guest.hashed_email: + conditions.append(Customer.hashed_email == guest.hashed_email) + if guest.hashed_first_name and guest.hashed_last_name: + conditions.append( + (Customer.hashed_given_name == guest.hashed_first_name) + & (Customer.hashed_surname == guest.hashed_last_name) + ) + + if not conditions: return None - result = await session.execute( - select(Customer).where(Customer.id.in_(candidate_customer_ids)) - ) + query = select(Customer).where(or_(*conditions)) + result = await session.execute(query) candidates = result.scalars().all() if not candidates: return None - best_customer_id = None - best_score = -1 - is_tied = False + if len(candidates) == 1: + return candidates[0] - for customer in candidates: - score = self._score_guest_customer_match(conversion_guest, customer) + best_customer: Customer | None = None + best_score = -1 + tie = False + for candidate in candidates: + score = self._score_guest_customer_match(guest, candidate) if score > best_score: best_score = score - best_customer_id = customer.id - is_tied = False - elif score == best_score and score != -1: - is_tied = True + best_customer = candidate + tie = False + elif score == best_score: + tie = True - if best_score <= 0 or is_tied: - return None + if best_customer and best_score > 0 and not tie: + return best_customer - return best_customer_id + return candidates[0] - async def _deduplicate_customer_guest_links( + def _score_guest_customer_match( + self, guest: ConversionGuest, customer: Customer + ) -> int: + """Score how well a guest matches a given customer using hashed data.""" + score = 0 + + if guest.hashed_email and customer.hashed_email 
== guest.hashed_email: + score += 200 + + if guest.hashed_first_name and guest.hashed_last_name: + if ( + customer.hashed_given_name == guest.hashed_first_name + and customer.hashed_surname == guest.hashed_last_name + ): + score += 100 + else: + if guest.hashed_first_name and customer.hashed_given_name == guest.hashed_first_name: + score += 40 + if guest.hashed_last_name and customer.hashed_surname == guest.hashed_last_name: + score += 40 + + if guest.hashed_country_code and customer.hashed_country_code == guest.hashed_country_code: + score += 5 + if guest.hashed_birth_date and customer.hashed_birth_date == guest.hashed_birth_date: + score += 2 + + return score + + async def _find_reservation_for_customer( self, - duplicates: dict[int, set[tuple[str | None, int]]], - session: AsyncSession, - ) -> None: - """Ensure each customer is linked to at most one guest.""" - for customer_id, guest_keys in duplicates.items(): - customer_result = await session.execute( - select(Customer).where(Customer.id == customer_id) - ) - customer = customer_result.scalar_one_or_none() - - guest_records: list[tuple[str | None, int, ConversionGuest | None]] = [] - for hotel_id, guest_id in guest_keys: - guest_result = await session.execute( - select(ConversionGuest).where( - ConversionGuest.hotel_id == hotel_id, - ConversionGuest.guest_id == guest_id, - ) - ) - guest_records.append((hotel_id, guest_id, guest_result.scalar_one_or_none())) - - if not customer: - _LOGGER.warning( - "Customer %s missing while deduplicating guests; severing links %s", - customer_id, - guest_keys, - ) - for hotel_id, guest_id, _ in guest_records: - await self._sever_guest_customer_link( - hotel_id, guest_id, customer_id, session - ) - continue - - best_key: tuple[str | None, int] | None = None - best_score = -1 - is_tied = False - for hotel_id, guest_id, guest in guest_records: - score = self._score_guest_customer_match(guest, customer) - if score > best_score: - best_score = score - best_key = (hotel_id, guest_id) - is_tied = False - elif score == best_score: - is_tied = True - - if not best_key or best_score <= 0 or is_tied: - _LOGGER.warning( - "Customer %s linked to guests %s but no clear match; removing all links", - customer_id, - guest_keys, - ) - for hotel_id, guest_id, _ in guest_records: - await self._sever_guest_customer_link( - hotel_id, guest_id, customer_id, session - ) - continue - - _LOGGER.warning( - "Customer %s linked to multiple guests %s; keeping guest %s (hotel=%s, score=%s)", - customer_id, - guest_keys, - best_key[1], - best_key[0], - best_score, - ) - for hotel_id, guest_id, _ in guest_records: - if (hotel_id, guest_id) != best_key: - await self._sever_guest_customer_link( - hotel_id, guest_id, customer_id, session - ) - - async def _sever_guest_customer_link( - self, - hotel_id: str | None, - guest_id: int, customer_id: int, - session: AsyncSession, - ) -> None: - """Remove incorrect guest/customer links from conversions.""" - result = await session.execute( - select(Conversion) - .where( - Conversion.hotel_id == hotel_id, - Conversion.guest_id == guest_id, - Conversion.customer_id == customer_id, - ) - .options(selectinload(Conversion.conversion_rooms)) - ) - conversions = result.scalars().all() - - if not conversions: - return - - for conversion in conversions: - conversion.customer_id = None - conversion.reservation_id = None - conversion.directly_attributable = False - conversion.guest_matched = False - conversion.updated_at = datetime.now() - - _LOGGER.warning( - "Removed %d conversion links for guest %s 
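
As an aside on the weighting above: the hashed-field scores combine additively, with a hashed e-mail match dominating everything else. Below is a standalone sketch of the same weighting, using plain dataclasses as stand-ins for the ConversionGuest and Customer ORM models (the GuestHashes/CustomerHashes names and the sample hash strings are illustrative only):

from dataclasses import dataclass

@dataclass
class GuestHashes:  # illustrative stand-in for ConversionGuest
    hashed_email: str | None = None
    hashed_first_name: str | None = None
    hashed_last_name: str | None = None
    hashed_country_code: str | None = None
    hashed_birth_date: str | None = None

@dataclass
class CustomerHashes:  # illustrative stand-in for Customer
    hashed_email: str | None = None
    hashed_given_name: str | None = None
    hashed_surname: str | None = None
    hashed_country_code: str | None = None
    hashed_birth_date: str | None = None

def score(guest: GuestHashes, customer: CustomerHashes) -> int:
    """Mirror of the weights used in _score_guest_customer_match."""
    total = 0
    if guest.hashed_email and customer.hashed_email == guest.hashed_email:
        total += 200
    if guest.hashed_first_name and guest.hashed_last_name:
        if (customer.hashed_given_name == guest.hashed_first_name
                and customer.hashed_surname == guest.hashed_last_name):
            total += 100
        else:
            # At most one of these can fire, since a full match is handled above.
            if customer.hashed_given_name == guest.hashed_first_name:
                total += 40
            if customer.hashed_surname == guest.hashed_last_name:
                total += 40
    if guest.hashed_country_code and customer.hashed_country_code == guest.hashed_country_code:
        total += 5
    if guest.hashed_birth_date and customer.hashed_birth_date == guest.hashed_birth_date:
        total += 2
    return total

guest = GuestHashes("e1", "f1", "l1", "IT")
# E-mail + full name + country: 200 + 100 + 5 = 305
assert score(guest, CustomerHashes("e1", "f1", "l1", "IT")) == 305
# Surname only, different e-mail and country: 40
assert score(guest, CustomerHashes("e2", "fX", "l1", "DE")) == 40

Note that _find_customer_for_guest only consults this score to break ties between multiple candidates: a single candidate is returned as-is, and a tie or zero best score falls back to the first candidate.
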
(hotel=%s) customer %s", - len(conversions), - guest_id, - hotel_id, - customer_id, - ) - - - - async def _check_if_attributable( - self, - matched_customer_id: int, conversion: Conversion, session: AsyncSession, - exclude_reservation_id: int | None = None, - ) -> tuple[Reservation | None, bool]: - """Search for and check if a customer has an attributable reservation for this conversion. - - Finds reservations linked to the customer and checks if any have dates matching this conversion. - A conversion is attributable ONLY if the conversion_room dates match a reservation's dates closely. - - Args: - matched_customer_id: The customer ID to search reservations for - conversion: The Conversion record being evaluated - session: AsyncSession for database queries - exclude_reservation_id: Reservation ID to exclude from search (if already matched) - - Returns: - Tuple of (matched_reservation, is_attributable) where: - - matched_reservation: The Reservation that matches (if any) - - is_attributable: True if the reservation's dates match this conversion - - """ - # Check if conversion_room dates exist (criterion for attributability) + ) -> Reservation | None: + """Find a reservation for the customer that matches this conversion's stay dates.""" if not conversion.conversion_rooms: - return None, False + return None - # Find reservations for this customer - query = select(Reservation).where( - Reservation.customer_id == matched_customer_id - ) - if exclude_reservation_id: - query = query.where(Reservation.id != exclude_reservation_id) - - db_result = await session.execute(query) - reservations = db_result.scalars().all() + query = select(Reservation).where(Reservation.customer_id == customer_id) + result = await session.execute(query) + reservations = result.scalars().all() if not reservations: - return None, False + return None - # Check each reservation for date match for reservation in reservations: for room in conversion.conversion_rooms: if ( @@ -2262,24 +1606,13 @@ class ConversionService: and reservation.start_date and reservation.end_date ): - # Check if dates match or mostly match (within 7 day tolerance) arrival_match = ( abs((room.arrival_date - reservation.start_date).days) <= 7 ) departure_match = ( abs((room.departure_date - reservation.end_date).days) <= 7 ) - if arrival_match and departure_match: - _LOGGER.info( - "Found attributable reservation for customer %d: " - "room dates %s-%s match reservation dates %s-%s", - matched_customer_id, - room.arrival_date, - room.departure_date, - reservation.start_date, - reservation.end_date, - ) - return reservation, True + return reservation - return None, False + return None diff --git a/tests/test_conversion_service.py b/tests/test_conversion_service.py index c180198..b1b5ace 100644 --- a/tests/test_conversion_service.py +++ b/tests/test_conversion_service.py @@ -142,7 +142,7 @@ class TestConversionServiceWithImportedData: ## Need to check if reservations and customers are now actually available in the db before proceeding - conversion_service = ConversionService(test_db_session) + conversion_service = ConversionService(test_db_session, hotel_id="39054_001") stats = await conversion_service.process_conversion_xml(xml_content) # BASELINE ASSERTIONS: @@ -224,7 +224,7 @@ class TestConversionServiceWithImportedData: # File already has proper XML structure, just use it as-is xml_content = xml_content.strip() - conversion_service = ConversionService(test_db_session) + conversion_service = ConversionService(test_db_session, hotel_id="39054_001") stats 
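
For clarity on the stay-date comparison used when attaching a reservation to a matched customer: both arrival and departure may drift by up to a week. A minimal sketch of that check follows (the helper name and the sample dates are illustrative; the 7-day tolerance mirrors _find_reservation_for_customer):

from datetime import date

def dates_roughly_match(
    room_arrival: date,
    room_departure: date,
    res_start: date,
    res_end: date,
    tolerance_days: int = 7,
) -> bool:
    """Return True if both stay boundaries fall within the tolerance window."""
    return (
        abs((room_arrival - res_start).days) <= tolerance_days
        and abs((room_departure - res_end).days) <= tolerance_days
    )

assert dates_roughly_match(date(2025, 4, 2), date(2025, 4, 9),
                           date(2025, 4, 1), date(2025, 4, 8))
assert not dates_roughly_match(date(2025, 4, 20), date(2025, 4, 27),
                               date(2025, 4, 1), date(2025, 4, 8))

The first reservation whose dates fall inside this window wins; reservations missing either a start or an end date are skipped.
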
= await conversion_service.process_conversion_xml(xml_content) # Verify conversions were created @@ -300,7 +300,7 @@ class TestConversionServiceWithImportedData: # File already has proper XML structure, just use it as-is xml_content = xml_content.strip() - conversion_service = ConversionService(test_db_session) + conversion_service = ConversionService(test_db_session, hotel_id="39054_001") stats = await conversion_service.process_conversion_xml(xml_content) # Verify conversions were processed @@ -332,7 +332,7 @@ class TestConversionServiceWithImportedData: """Test ConversionService handles invalid XML gracefully.""" invalid_xml = "unclosed tag" - conversion_service = ConversionService(test_db_session) + conversion_service = ConversionService(test_db_session, hotel_id="39054_001") with pytest.raises(ValueError, match="Invalid XML"): await conversion_service.process_conversion_xml(invalid_xml) @@ -342,7 +342,7 @@ class TestConversionServiceWithImportedData: """Test ConversionService handles empty/minimal XML.""" minimal_xml = '' - conversion_service = ConversionService(test_db_session) + conversion_service = ConversionService(test_db_session, hotel_id="39054_001") stats = await conversion_service.process_conversion_xml(minimal_xml) assert stats["total_reservations"] == 0 @@ -421,7 +421,7 @@ class TestConversionServiceWithImportedData: xml_content1 = multi_builder1.build_xml() # Process first batch - service = ConversionService(test_db_session) + service = ConversionService(test_db_session, hotel_id="39054_001") stats1 = await service.process_conversion_xml(xml_content1) assert stats1["total_reservations"] == 2 @@ -577,7 +577,7 @@ class TestXMLBuilderUsage: ) # Process the XML - service = ConversionService(test_db_session) + service = ConversionService(test_db_session, hotel_id="39054_001") stats = await service.process_conversion_xml(xml_content) assert stats["total_reservations"] == 1 @@ -616,7 +616,7 @@ class TestXMLBuilderUsage: .build_xml() ) - service = ConversionService(test_db_session) + service = ConversionService(test_db_session, hotel_id="39054_001") stats = await service.process_conversion_xml(xml_content) assert stats["total_reservations"] == 1 @@ -677,7 +677,7 @@ class TestXMLBuilderUsage: xml_content = multi_builder.build_xml() # Process the XML - service = ConversionService(test_db_session) + service = ConversionService(test_db_session, hotel_id="39054_001") stats = await service.process_conversion_xml(xml_content) assert stats["total_reservations"] == 2 @@ -768,7 +768,7 @@ class TestHashedMatchingLogic: """ - service = ConversionService(test_db_session, hotel_id="hotel_1") + service = ConversionService(test_db_session, hotel_id="39054_001") stats = await service.process_conversion_xml(xml_content) # Verify conversion was created From ea1d207438cb9bc90c832dfa961960924c86de77 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 17:43:51 +0100 Subject: [PATCH 12/20] Added awarnesss flag for conversion_guests --- src/alpine_bits_python/db.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py index a25cfa9..7899f15 100644 --- a/src/alpine_bits_python/db.py +++ b/src/alpine_bits_python/db.py @@ -406,6 +406,8 @@ class ConversionGuest(Base): is_regular = Column( Boolean, default=False ) # True if guest has many prior stays before appearing in our reservations + # Guest classification + # Metadata first_seen = Column(DateTime(timezone=True)) From 325f64fbc3d7b1c3ab4ca659d83cad8a2f33a680 Mon Sep 
17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 17:44:05 +0100 Subject: [PATCH 13/20] Awarness guest now for real --- src/alpine_bits_python/db.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py index 7899f15..56394aa 100644 --- a/src/alpine_bits_python/db.py +++ b/src/alpine_bits_python/db.py @@ -407,6 +407,9 @@ class ConversionGuest(Base): Boolean, default=False ) # True if guest has many prior stays before appearing in our reservations # Guest classification + is_awarness_guest = Column( + Boolean, default=False + ) # True if guests first stay was from our campaigns # Metadata From 859716d0eaef2d3d61dfa486cfd4c163494faa20 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 17:47:23 +0100 Subject: [PATCH 14/20] Added migration for awarness guest column --- ...3_boolean_to_signify_awarness_match_in_.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 alembic/versions/2025_12_03_1744-1daea5172a03_boolean_to_signify_awarness_match_in_.py diff --git a/alembic/versions/2025_12_03_1744-1daea5172a03_boolean_to_signify_awarness_match_in_.py b/alembic/versions/2025_12_03_1744-1daea5172a03_boolean_to_signify_awarness_match_in_.py new file mode 100644 index 0000000..932c822 --- /dev/null +++ b/alembic/versions/2025_12_03_1744-1daea5172a03_boolean_to_signify_awarness_match_in_.py @@ -0,0 +1,32 @@ +"""boolean to signify awarness match in guests + +Revision ID: 1daea5172a03 +Revises: 263bed87114f +Create Date: 2025-12-03 17:44:29.657898 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '1daea5172a03' +down_revision: Union[str, Sequence[str], None] = '263bed87114f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('conversion_guests', sa.Column('is_awareness_guest', sa.Boolean(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('conversion_guests', 'is_awareness_guest') + # ### end Alembic commands ### From 4a9c0b91a122288a61504ce699c9623d43331a15 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 17:47:30 +0100 Subject: [PATCH 15/20] And fixed typo --- src/alpine_bits_python/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alpine_bits_python/db.py b/src/alpine_bits_python/db.py index 56394aa..9333459 100644 --- a/src/alpine_bits_python/db.py +++ b/src/alpine_bits_python/db.py @@ -407,7 +407,7 @@ class ConversionGuest(Base): Boolean, default=False ) # True if guest has many prior stays before appearing in our reservations # Guest classification - is_awarness_guest = Column( + is_awareness_guest = Column( Boolean, default=False ) # True if guests first stay was from our campaigns From 16de553095d69e0a72075db7ac79ba453b5c5463 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 17:59:30 +0100 Subject: [PATCH 16/20] Added is_regular and awarness detection --- src/alpine_bits_python/api.py | 2 + src/alpine_bits_python/conversion_service.py | 162 ++++++++++++++++++- 2 files changed, 162 insertions(+), 2 deletions(-) diff --git a/src/alpine_bits_python/api.py b/src/alpine_bits_python/api.py index c6ba9e2..5a053db 100644 --- a/src/alpine_bits_python/api.py +++ b/src/alpine_bits_python/api.py @@ -1199,6 +1199,8 @@ async def _process_conversion_xml_background( conversion_service = ConversionService(session_maker, hotel.hotel_id) processing_stats = await conversion_service.process_conversion_xml(xml_content, run_full_guest_matching=True) + await conversion_service.classify_regular_guests(24) + _LOGGER.info( "Conversion processing complete for %s: %s", filename, processing_stats ) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 4130ebc..526a19f 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -3,8 +3,8 @@ import asyncio import xml.etree.ElementTree as ET from dataclasses import dataclass, field -from datetime import UTC, date, datetime -from decimal import Decimal +from datetime import UTC, date, datetime, timedelta +from decimal import Decimal, InvalidOperation from typing import Any from sqlalchemy import or_, select @@ -1510,6 +1510,164 @@ class ConversionService: return matched_reservation, matched_customer + async def classify_regular_guests( + self, updated_within_hours: int | None = 24 + ) -> dict[str, int]: + """Classify recently updated guests as regular/awareness guests. + + Args: + updated_within_hours: Only evaluate guests whose last_seen timestamp + is within this many hours. Use None to scan all guests for this hotel. + + Returns: + Dictionary with counts of processed guests and updates performed. 
+ + """ + if not self.hotel_id: + _LOGGER.warning("Cannot classify regular guests without hotel_id context") + return { + "processed": 0, + "updated": 0, + "regular": 0, + "awareness": 0, + } + + cutoff: datetime | None = None + if updated_within_hours is not None and updated_within_hours > 0: + cutoff = datetime.now(UTC) - timedelta(hours=updated_within_hours) + + if self.session_maker: + session = await self.session_maker.create_session() + close_session = True + else: + session = self.session + close_session = False + + if not session: + _LOGGER.warning("Cannot classify regular guests without an active session") + return { + "processed": 0, + "updated": 0, + "regular": 0, + "awareness": 0, + } + + stats = { + "processed": 0, + "updated": 0, + "regular": 0, + "awareness": 0, + } + + try: + query = ( + select(ConversionGuest) + .where(ConversionGuest.hotel_id == self.hotel_id) + .options( + selectinload(ConversionGuest.conversions).selectinload( + Conversion.conversion_rooms + ) + ) + ) + if cutoff is not None: + query = query.where(ConversionGuest.last_seen >= cutoff) + + result = await session.execute(query) + guests = result.scalars().all() + + for guest in guests: + stats["processed"] += 1 + is_regular, is_awareness = self._evaluate_guest_regularity(guest) + + if guest.is_regular == is_regular and ( + guest.is_awareness_guest or False + ) == is_awareness: + if is_regular: + stats["regular"] += 1 + if is_awareness: + stats["awareness"] += 1 + continue + + guest.is_regular = is_regular + guest.is_awareness_guest = is_awareness + stats["updated"] += 1 + if is_regular: + stats["regular"] += 1 + if is_awareness: + stats["awareness"] += 1 + + await session.commit() + except Exception as exc: + await session.rollback() + _LOGGER.exception("Failed to classify regular guests: %s", exc) + finally: + if close_session: + await session.close() + + return stats + + def _evaluate_guest_regularity( + self, guest: ConversionGuest + ) -> tuple[bool, bool]: + """Determine whether a guest qualifies as regular/awareness.""" + paying_stays: list[tuple[date, bool]] = [] + + for conversion in guest.conversions: + if not conversion.conversion_rooms: + continue + + for room in conversion.conversion_rooms: + if not self._is_paying_room(room): + continue + + stay_date = self._stay_reference_date(conversion, room) + paying_stays.append( + ( + stay_date, + bool(conversion.directly_attributable), + ) + ) + + if not paying_stays: + return False, False + + paying_stays.sort(key=lambda item: item[0]) + earliest_date, earliest_direct = paying_stays[0] + + if not earliest_direct: + return True, False + + if len(paying_stays) > 1: + return True, True + + return False, False + + @staticmethod + def _is_paying_room(room: ConversionRoom) -> bool: + """Return True if the room entry represents a paying, completed stay.""" + if not room.room_status or room.room_status.lower() != "departed": + return False + if room.total_revenue in (None, 0): + return False + try: + revenue_value = Decimal(str(room.total_revenue)) + except (InvalidOperation, ValueError, TypeError): + return False + return revenue_value > 0 + + @staticmethod + def _stay_reference_date(conversion: Conversion, room: ConversionRoom) -> date: + """Pick the best available date for ordering stays chronologically.""" + if conversion.reservation_date: + return conversion.reservation_date + if room.arrival_date: + return room.arrival_date + if room.departure_date: + return room.departure_date + if conversion.creation_time: + return conversion.creation_time.date() + 
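
Spelling out the decision rule in _evaluate_guest_regularity: only paying, departed stays count, stays are ordered by their reference date, and the attributability of the earliest one decides the outcome. A compact standalone sketch using plain tuples instead of the ORM objects (the classify name is illustrative):

from datetime import date

def classify(paying_stays: list[tuple[date, bool]]) -> tuple[bool, bool]:
    """Return (is_regular, is_awareness) following _evaluate_guest_regularity.

    Each entry is (stay_date, directly_attributable) for a paying, departed stay.
    """
    if not paying_stays:
        return False, False
    paying_stays = sorted(paying_stays, key=lambda item: item[0])
    _, earliest_direct = paying_stays[0]
    if not earliest_direct:
        # Guest was already paying before any campaign-attributed stay.
        return True, False
    if len(paying_stays) > 1:
        # First paying stay came from a campaign and the guest returned.
        return True, True
    # A single attributed stay so far: neither regular nor awareness yet.
    return False, False

assert classify([]) == (False, False)
assert classify([(date(2025, 6, 1), False), (date(2025, 7, 1), True)]) == (True, False)
assert classify([(date(2025, 6, 1), True), (date(2025, 7, 1), False)]) == (True, True)
assert classify([(date(2025, 6, 1), True)]) == (False, False)

classify_regular_guests then persists these flags to ConversionGuest.is_regular and ConversionGuest.is_awareness_guest and reports processed/updated/regular/awareness counts.
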
return date.min + async def _find_customer_for_guest( self, guest: ConversionGuest, session: AsyncSession ) -> Customer | None: From a6c5d0b9f50eb67d54fdd7e46d28696b6557f50e Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 18:44:32 +0100 Subject: [PATCH 17/20] Added a logging statement to better see where the child dies --- src/alpine_bits_python/conversion_service.py | 2 +- tests/test_conversion_service.py | 337 ++++++++++++++++++- 2 files changed, 328 insertions(+), 11 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 526a19f..0c6a0a6 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -1558,7 +1558,7 @@ class ConversionService: "regular": 0, "awareness": 0, } - + _LOGGER.info("Classifying regular guests for hotel %s", self.hotel_id) try: query = ( select(ConversionGuest) diff --git a/tests/test_conversion_service.py b/tests/test_conversion_service.py index b1b5ace..e0bde40 100644 --- a/tests/test_conversion_service.py +++ b/tests/test_conversion_service.py @@ -20,6 +20,7 @@ import pytest import pytest_asyncio from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine +from sqlalchemy.orm import selectinload from alpine_bits_python.conversion_service import ConversionService from alpine_bits_python.csv_import import CSVImporter @@ -542,6 +543,187 @@ class TestConversionServiceWithImportedData: ) +class TestConversionUpdatesAndMatching: + """Tests covering conversion updates and core matching logic.""" + + @pytest.mark.asyncio + async def test_reprocessing_conversion_updates_metadata(self, test_db_session): + """Ensure reprocessing a reservation updates metadata instead of duplicating.""" + def build_xml( + *, + booking_channel: str, + advertising_medium: str, + advertising_partner: str, + room_number: str, + arrival: str, + departure: str, + revenue: float, + ) -> str: + return f""" + + + + + + + + + + + + +""" + + first_xml = build_xml( + booking_channel="OTA", + advertising_medium="META", + advertising_partner="cpc", + room_number="33", + arrival="2025-02-01", + departure="2025-02-03", + revenue=120.0, + ) + + service = ConversionService(test_db_session, hotel_id="39054_001") + stats_first = await service.process_conversion_xml(first_xml) + assert stats_first["total_reservations"] == 1 + + result = await test_db_session.execute( + select(Conversion) + .where( + Conversion.hotel_id == "39054_001", + Conversion.pms_reservation_id == 2001, + ) + .options(selectinload(Conversion.conversion_rooms)) + ) + conversion = result.scalar_one() + assert conversion.booking_channel == "OTA" + assert conversion.advertising_partner == "cpc" + original_room_count = len(conversion.conversion_rooms) + assert original_room_count == 1 + assert conversion.conversion_rooms[0].room_number == "33" + + updated_xml = build_xml( + booking_channel="DIRECT", + advertising_medium="WEBSITE", + advertising_partner="organic", + room_number="44", + arrival="2025-02-02", + departure="2025-02-04", + revenue=150.0, + ) + + stats_second = await service.process_conversion_xml(updated_xml) + assert stats_second["total_reservations"] == 1 + + test_db_session.expire_all() + result = await test_db_session.execute( + select(Conversion) + .where( + Conversion.hotel_id == "39054_001", + Conversion.pms_reservation_id == 2001, + ) + .options(selectinload(Conversion.conversion_rooms)) + ) + updated_conversion = 
result.scalar_one() + assert updated_conversion.booking_channel == "DIRECT" + assert updated_conversion.advertising_medium == "WEBSITE" + assert updated_conversion.advertising_partner == "organic" + assert len(updated_conversion.conversion_rooms) == 1 + assert updated_conversion.conversion_rooms[0].room_number == "44" + assert updated_conversion.conversion_rooms[0].arrival_date.strftime( + "%Y-%m-%d" + ) == "2025-02-02" + + @pytest.mark.asyncio + async def test_advertising_match_uses_hashed_email_for_disambiguation( + self, test_db_session + ): + """Ensure hashed email filters ambiguous advertising matches.""" + # Create two customers/reservations sharing the same click-id prefix + customer_a = Customer( + given_name="Lara", + surname="North", + email_address="lara@example.com", + contact_id="contact_a", + ) + customer_a.update_hashed_fields() + customer_b = Customer( + given_name="Mia", + surname="West", + email_address="mia@example.com", + contact_id="contact_b", + ) + customer_b.update_hashed_fields() + + test_db_session.add_all([customer_a, customer_b]) + await test_db_session.flush() + + reservation_a = Reservation( + customer_id=customer_a.id, + unique_id="res_a", + md5_unique_id="A" * 32, + hotel_id="39054_001", + fbclid="click-prefix-111", + ) + reservation_b = Reservation( + customer_id=customer_b.id, + unique_id="res_b", + md5_unique_id="B" * 32, + hotel_id="39054_001", + fbclid="click-prefix-222", + ) + test_db_session.add_all([reservation_a, reservation_b]) + await test_db_session.commit() + + from tests.helpers import ReservationXMLBuilder + + xml_content = ( + ReservationXMLBuilder( + hotel_id="39054_001", + reservation_id="3001", + reservation_number="B-1", + reservation_date="2025-03-10", + advertising_campagne="click-prefix", + ) + .set_guest( + guest_id="701", + first_name="Mia", + last_name="West", + email="mia@example.com", + ) + .add_room( + arrival="2025-04-01", + departure="2025-04-03", + room_number="55", + status="reserved", + revenue_logis_per_day=180.0, + ) + .build_xml() + ) + + service = ConversionService(test_db_session, hotel_id="39054_001") + stats = await service.process_conversion_xml(xml_content) + + result = await test_db_session.execute( + select(Conversion) + .where( + Conversion.hotel_id == "39054_001", + Conversion.pms_reservation_id == 3001, + ) + .options(selectinload(Conversion.guest)) + ) + conversion = result.scalar_one() + assert conversion.reservation_id == reservation_b.id + assert conversion.customer_id == customer_b.id + assert stats["matched_to_reservation"] == 1 + assert stats["matched_to_customer"] == 0 + + class TestXMLBuilderUsage: """Demonstrate usage of XML builder helpers for creating test data.""" @@ -799,17 +981,152 @@ class TestHashedMatchingLogic: assert conversion_with_guest.guest.guest_last_name == "Miller" assert conversion_with_guest.guest.guest_email == "david@example.com" - # Verify conversion_room was created - room_result = await test_db_session.execute( - select(ConversionRoom).where(ConversionRoom.conversion_id == conversion.id) - ) - rooms = room_result.scalars().all() - assert len(rooms) > 0, "ConversionRoom should be created" - # Verify matching occurred (may or may not have matched depending on data) - # The important thing is that the records exist - assert stats["total_reservations"] == 1 - assert stats["total_daily_sales"] == 1 +class TestRegularGuestClassification: + """Tests for the classify_regular_guests helper.""" + + @pytest.mark.asyncio + async def test_classify_regular_guest_with_unattributable_history( 
+ self, test_db_session + ): + """Guests with unattributable paying stays become regulars.""" + from tests.helpers import MultiReservationXMLBuilder, ReservationXMLBuilder + + multi = MultiReservationXMLBuilder() + base_builder = ReservationXMLBuilder( + hotel_id="39054_001", + reservation_id="4001", + reservation_number="REG-1", + reservation_date="2025-05-01", + ).set_guest( + guest_id="888", + first_name="Regular", + last_name="Guest", + email="regular@example.com", + ) + base_builder.add_room( + arrival="2025-06-01", + departure="2025-06-03", + room_number="71", + status="departed", + revenue_logis_per_day=220.0, + ) + multi.add_reservation(base_builder) + + second = ReservationXMLBuilder( + hotel_id="39054_001", + reservation_id="4002", + reservation_number="REG-2", + reservation_date="2025-05-10", + ).set_guest( + guest_id="888", + first_name="Regular", + last_name="Guest", + email="regular@example.com", + ) + second.add_room( + arrival="2025-07-01", + departure="2025-07-04", + room_number="72", + status="departed", + revenue_logis_per_day=210.0, + ) + multi.add_reservation(second) + + service = ConversionService(test_db_session, hotel_id="39054_001") + await service.process_conversion_xml(multi.build_xml()) + + stats = await service.classify_regular_guests(updated_within_hours=None) + assert stats["regular"] == 1 + + guest = await test_db_session.execute( + select(ConversionGuest).where( + ConversionGuest.hotel_id == "39054_001", + ConversionGuest.guest_id == 888, + ) + ) + guest_record = guest.scalar_one() + assert guest_record.is_regular is True + assert guest_record.is_awareness_guest is False + + @pytest.mark.asyncio + async def test_classify_awareness_guest_when_first_stay_attributable( + self, test_db_session + ): + """If the earliest paying stay is attributable, mark awareness guests.""" + from tests.helpers import MultiReservationXMLBuilder, ReservationXMLBuilder + + multi = MultiReservationXMLBuilder() + first = ReservationXMLBuilder( + hotel_id="39054_001", + reservation_id="4101", + reservation_number="AW-1", + reservation_date="2025-08-01", + ).set_guest( + guest_id="889", + first_name="Aware", + last_name="Guest", + email="aware@example.com", + ) + first.add_room( + arrival="2025-09-01", + departure="2025-09-03", + room_number="81", + status="departed", + revenue_logis_per_day=250.0, + ) + multi.add_reservation(first) + + second = ReservationXMLBuilder( + hotel_id="39054_001", + reservation_id="4102", + reservation_number="AW-2", + reservation_date="2025-08-10", + ).set_guest( + guest_id="889", + first_name="Aware", + last_name="Guest", + email="aware@example.com", + ) + second.add_room( + arrival="2025-10-05", + departure="2025-10-08", + room_number="82", + status="departed", + revenue_logis_per_day=260.0, + ) + multi.add_reservation(second) + + service = ConversionService(test_db_session, hotel_id="39054_001") + await service.process_conversion_xml(multi.build_xml()) + + # Mark earliest stay as attributable to simulate campaign match + result = await test_db_session.execute( + select(Conversion) + .where( + Conversion.hotel_id == "39054_001", + Conversion.guest_id == 889, + ) + .order_by(Conversion.reservation_date.asc()) + ) + conversions = result.scalars().all() + conversions[0].directly_attributable = True + conversions[1].directly_attributable = False + await test_db_session.commit() + + stats = await service.classify_regular_guests(updated_within_hours=None) + assert stats["regular"] == 1 + assert stats["awareness"] == 1 + + guest = await test_db_session.execute( 
+ select(ConversionGuest).where( + ConversionGuest.hotel_id == "39054_001", + ConversionGuest.guest_id == 889, + ) + ) + guest_record = guest.scalar_one() + assert guest_record.is_regular is True + assert guest_record.is_awareness_guest is True @pytest.mark.asyncio async def test_conversion_guest_composite_key_prevents_duplicates( From a2ac51e5db19cf3113440a477cb6cdfb9fc29d41 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 18:52:53 +0100 Subject: [PATCH 18/20] Another logging message to see how the classification is doing --- src/alpine_bits_python/conversion_service.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 0c6a0a6..c48f0d3 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -1575,6 +1575,10 @@ class ConversionService: result = await session.execute(query) guests = result.scalars().all() + _LOGGER.info( + "Evaluating %d guests for regular/awareness classification", len(guests) + ) + for guest in guests: stats["processed"] += 1 is_regular, is_awareness = self._evaluate_guest_regularity(guest) From a0c795d0e441e505a1ff576d03bf339b56a48574 Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 19:02:34 +0100 Subject: [PATCH 19/20] Updated classification scheme --- src/alpine_bits_python/conversion_service.py | 21 ++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index c48f0d3..5aa6d50 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -10,7 +10,7 @@ from typing import Any from sqlalchemy import or_, select from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload +from sqlalchemy.orm import selectinload, with_loader_criteria from .db import ( Conversion, @@ -1568,6 +1568,18 @@ class ConversionService: Conversion.conversion_rooms ) ) + .options( + with_loader_criteria( + Conversion, + lambda cls: cls.reservation_type == "reservation", + include_aliases=True, + ), + with_loader_criteria( + ConversionRoom, + lambda cls: ConversionRoom.total_revenue.isnot(None), + include_aliases=True, + ), + ) ) if cutoff is not None: query = query.where(ConversionGuest.last_seen >= cutoff) @@ -1583,9 +1595,10 @@ class ConversionService: stats["processed"] += 1 is_regular, is_awareness = self._evaluate_guest_regularity(guest) - if guest.is_regular == is_regular and ( - guest.is_awareness_guest or False - ) == is_awareness: + current_regular = bool(guest.is_regular) + current_awareness = bool(guest.is_awareness_guest) + + if current_regular == is_regular and current_awareness == is_awareness: if is_regular: stats["regular"] += 1 if is_awareness: From 0c2254544d0323d2646d1c6adec15d94ad00781c Mon Sep 17 00:00:00 2001 From: Jonas Linter <{email_address}> Date: Wed, 3 Dec 2025 19:15:21 +0100 Subject: [PATCH 20/20] Bit hamfisted but updates old records --- src/alpine_bits_python/conversion_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alpine_bits_python/conversion_service.py b/src/alpine_bits_python/conversion_service.py index 5aa6d50..d798bf0 100644 --- a/src/alpine_bits_python/conversion_service.py +++ b/src/alpine_bits_python/conversion_service.py @@ -1598,7 +1598,7 @@ 
class ConversionService: current_regular = bool(guest.is_regular) current_awareness = bool(guest.is_awareness_guest) - if current_regular == is_regular and current_awareness == is_awareness: + if current_regular == is_regular and current_awareness == is_awareness and guest.is_regular is not None and guest.is_awareness_guest is not None: if is_regular: stats["regular"] += 1 if is_awareness:
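
A note on the loader options introduced for classify_regular_guests: with_loader_criteria is a SQLAlchemy ORM option that attaches extra WHERE criteria to the relationship loads themselves, so the eagerly loaded conversions and conversion_rooms collections only contain rows passing the filter, without changing which guests are selected. A minimal, synchronous sketch of the pattern (the Parent/Child models and the in-memory SQLite engine are illustrative and not part of this codebase; the real code applies the same idea with async sessions and lambda criteria):

from sqlalchemy import ForeignKey, create_engine, select
from sqlalchemy.orm import (
    DeclarativeBase, Mapped, Session, mapped_column,
    relationship, selectinload, with_loader_criteria,
)

class Base(DeclarativeBase):
    pass

class Parent(Base):
    __tablename__ = "parent"
    id: Mapped[int] = mapped_column(primary_key=True)
    children: Mapped[list["Child"]] = relationship(back_populates="parent")

class Child(Base):
    __tablename__ = "child"
    id: Mapped[int] = mapped_column(primary_key=True)
    parent_id: Mapped[int] = mapped_column(ForeignKey("parent.id"))
    active: Mapped[bool] = mapped_column(default=True)
    parent: Mapped[Parent] = relationship(back_populates="children")

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Parent(id=1, children=[Child(id=1, active=True), Child(id=2, active=False)]))
    session.commit()

    # The criteria option filters the rows populated by the selectin load,
    # so parent.children only contains the active child.
    stmt = (
        select(Parent)
        .options(
            selectinload(Parent.children),
            with_loader_criteria(Child, Child.active.is_(True), include_aliases=True),
        )
    )
    parent = session.execute(stmt).scalar_one()
    assert [c.id for c in parent.children] == [1]

Filtering at load time keeps _evaluate_guest_regularity simple: it never sees non-"reservation" conversion types or NULL-revenue room rows, while _is_paying_room still enforces the departed/positive-revenue check.
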