"""Unified monitoring with support for multiple notification backends.

This module provides alert handlers and schedulers that work with the
unified notification service to send alerts through multiple channels.
"""

import asyncio
import concurrent.futures
import logging
import threading
from collections import deque
from datetime import datetime, timedelta
from typing import Any

from .email_monitoring import ErrorRecord, ReservationStatsCollector
from .logging_config import get_logger
from .notification_service import NotificationService

_LOGGER = get_logger(__name__)


class UnifiedAlertHandler(logging.Handler):
    """Custom logging handler that sends alerts through unified notification service.

    This handler uses a hybrid approach:
    - Accumulates errors in a buffer
    - Sends immediately if error threshold is reached
    - Otherwise sends after buffer duration expires
    - Always sends buffered errors (no minimum threshold for time-based flush)
    - Implements cooldown to prevent alert spam

    The handler is thread-safe and works with asyncio event loops.
    """

    def __init__(
        self,
        notification_service: NotificationService,
        config: dict[str, Any],
        loop: asyncio.AbstractEventLoop | None = None,
    ):
        """Initialize the unified alert handler.

        Args:
            notification_service: Unified notification service
            config: Configuration dictionary for error alerts
            loop: Asyncio event loop (will use current loop if not provided)
        """
        super().__init__()
        self.notification_service = notification_service
        self.config = config
        self.loop = loop  # Will be set when first error occurs if not provided

        # Configuration
        self.error_threshold = config.get("error_threshold", 5)
        self.buffer_minutes = config.get("buffer_minutes", 15)
        self.cooldown_minutes = config.get("cooldown_minutes", 15)
        self.log_levels = config.get("log_levels", ["ERROR", "CRITICAL"])

        # State
        self.error_buffer: deque[ErrorRecord] = deque()
        self.last_sent = datetime.min  # Last time we sent an alert
        # FIX: asyncio.run_coroutine_threadsafe returns a
        # concurrent.futures.Future, not an asyncio.Task — the previous
        # `asyncio.Task | None` annotation was incorrect.
        self._flush_task: concurrent.futures.Future | None = None
        # Retry task scheduled on the event loop when a flush is blocked by
        # the cooldown, so buffered errors are not stranded (see _flush_buffer).
        self._retry_task: asyncio.Task | None = None
        self._lock = threading.Lock()  # Thread-safe for multi-threaded logging

        _LOGGER.info(
            "UnifiedAlertHandler initialized: threshold=%d, buffer=%dmin, cooldown=%dmin",
            self.error_threshold,
            self.buffer_minutes,
            self.cooldown_minutes,
        )

    def emit(self, record: logging.LogRecord) -> None:
        """Handle a log record.

        This is called automatically by the logging system when an error
        is logged. It's important that this method is fast and doesn't block.

        Args:
            record: The log record to handle
        """
        # Only handle configured log levels
        if record.levelname not in self.log_levels:
            return

        try:
            # Ensure we have an event loop
            if self.loop is None:
                try:
                    self.loop = asyncio.get_running_loop()
                except RuntimeError:
                    # No running loop, we'll need to handle this differently
                    _LOGGER.warning("No asyncio event loop available for alerts")
                    return

            # Add error to buffer (thread-safe)
            with self._lock:
                error_record = ErrorRecord(record)
                self.error_buffer.append(error_record)
                buffer_size = len(self.error_buffer)

                # Determine if we should send immediately
                should_send_immediately = buffer_size >= self.error_threshold

                if should_send_immediately:
                    # Best-effort cancel of any pending flush. NOTE: this is a
                    # concurrent.futures.Future — cancel() is a no-op once the
                    # delayed-flush coroutine has started; in that case the
                    # cooldown check in _flush_buffer deduplicates the alert.
                    if self._flush_task and not self._flush_task.done():
                        self._flush_task.cancel()

                    # Schedule immediate flush
                    self._flush_task = asyncio.run_coroutine_threadsafe(
                        self._flush_buffer(immediate=True),
                        self.loop,
                    )

                # Schedule delayed flush if not already scheduled
                elif not self._flush_task or self._flush_task.done():
                    self._flush_task = asyncio.run_coroutine_threadsafe(
                        self._schedule_delayed_flush(),
                        self.loop,
                    )

        except Exception:
            # Never let the handler crash - just log and continue
            _LOGGER.exception("Error in UnifiedAlertHandler.emit")

    async def _schedule_delayed_flush(self) -> None:
        """Schedule a delayed buffer flush after buffer duration."""
        await asyncio.sleep(self.buffer_minutes * 60)
        await self._flush_buffer(immediate=False)

    async def _retry_after_cooldown(self, delay_seconds: float) -> None:
        """Retry a buffer flush once the cooldown period has elapsed."""
        await asyncio.sleep(delay_seconds)
        await self._flush_buffer(immediate=False)

    async def _flush_buffer(self, *, immediate: bool) -> None:
        """Flush the error buffer and send alert.

        Args:
            immediate: Whether this is an immediate flush (threshold hit)
        """
        # Check cooldown period
        now = datetime.now()
        time_since_last = (now - self.last_sent).total_seconds() / 60
        if time_since_last < self.cooldown_minutes:
            remaining_minutes = self.cooldown_minutes - time_since_last
            _LOGGER.info(
                "Alert cooldown active (%.1f min remaining), buffering errors",
                remaining_minutes,
            )
            # Don't clear buffer - let errors accumulate until cooldown expires.
            # BUG FIX: previously these errors were stranded until the *next*
            # emit() call scheduled a flush; if no further errors arrived they
            # were never sent. Schedule a retry for when the cooldown ends.
            if self._retry_task is None or self._retry_task.done():
                self._retry_task = asyncio.get_running_loop().create_task(
                    self._retry_after_cooldown(remaining_minutes * 60)
                )
            return

        # Get all buffered errors (thread-safe)
        with self._lock:
            if not self.error_buffer:
                return
            errors = list(self.error_buffer)
            self.error_buffer.clear()

        # Update last sent time
        self.last_sent = now

        # Format alert
        error_count = len(errors)
        time_range = (
            f"{errors[0].timestamp.strftime('%H:%M:%S')} to "
            f"{errors[-1].timestamp.strftime('%H:%M:%S')}"
        )

        # Determine alert type
        alert_type = "Immediate Alert" if immediate else "Scheduled Alert"
        if immediate:
            reason = f"(threshold of {self.error_threshold} exceeded)"
        else:
            reason = f"({self.buffer_minutes} minute buffer)"

        title = f"AlpineBits Error {alert_type}: {error_count} errors {reason}"

        # Build message. FIX: use the single `now` timestamp captured above so
        # the header and footer of one alert never disagree (previously the
        # header called datetime.now() a second time).
        message = f"Error Alert - {now.strftime('%Y-%m-%d %H:%M:%S')}\n"
        message += "=" * 70 + "\n\n"
        message += f"Alert Type: {alert_type}\n"
        message += f"Error Count: {error_count}\n"
        message += f"Time Range: {time_range}\n"
        message += f"Reason: {reason}\n"
        message += "\n" + "=" * 70 + "\n\n"

        # Add individual errors
        message += "Errors:\n"
        message += "-" * 70 + "\n\n"
        for error in errors:
            message += error.format_plain_text()
            message += "\n"
        message += "-" * 70 + "\n"
        message += f"Generated by AlpineBits Monitoring at {now.strftime('%Y-%m-%d %H:%M:%S')}\n"

        # Send through unified notification service
        try:
            results = await self.notification_service.send_alert(
                title=title,
                message=message,
                backends=None,  # Send to all backends
            )

            success_count = sum(1 for success in results.values() if success)
            if success_count > 0:
                _LOGGER.info(
                    "Alert sent successfully through %d/%d backend(s): %d errors",
                    success_count,
                    len(results),
                    error_count,
                )
            else:
                _LOGGER.error("Failed to send alert through any backend: %d errors", error_count)

        except Exception:
            _LOGGER.exception("Exception while sending alert")

    def close(self) -> None:
        """Close the handler and flush any remaining errors.

        This is called when the logging system shuts down.
        """
        # Cancel any pending flush/retry tasks
        if self._flush_task and not self._flush_task.done():
            self._flush_task.cancel()
        if self._retry_task and not self._retry_task.done():
            self._retry_task.cancel()

        # Flush any remaining errors immediately. NOTE: the flush is still
        # subject to the cooldown check, so errors logged during an active
        # cooldown may be dropped at shutdown.
        if self.error_buffer and self.loop:
            try:
                # Check if the loop is still running
                if not self.loop.is_closed():
                    future = asyncio.run_coroutine_threadsafe(
                        self._flush_buffer(immediate=False),
                        self.loop,
                    )
                    future.result(timeout=5)
                else:
                    _LOGGER.warning(
                        "Event loop closed, cannot flush %d remaining errors",
                        len(self.error_buffer),
                    )
            except Exception:
                _LOGGER.exception("Error flushing buffer on close")

        super().close()


class UnifiedDailyReportScheduler:
    """Scheduler for sending daily reports through unified notification service.

    This runs as a background task and sends daily reports containing
    statistics and error summaries through all configured notification
    backends.
    """

    def __init__(
        self,
        notification_service: NotificationService,
        config: dict[str, Any],
    ):
        """Initialize the unified daily report scheduler.

        Args:
            notification_service: Unified notification service
            config: Configuration for daily reports
        """
        self.notification_service = notification_service
        self.config = config
        self.send_time = config.get("send_time", "08:00")  # Default 8 AM
        self.include_stats = config.get("include_stats", True)
        self.include_errors = config.get("include_errors", True)
        self._task: asyncio.Task | None = None
        self._stats_collector = None  # Will be set by application
        self._error_log: list[dict[str, Any]] = []

        _LOGGER.info(
            "UnifiedDailyReportScheduler initialized: send_time=%s",
            self.send_time,
        )

    def start(self) -> None:
        """Start the daily report scheduler."""
        if self._task is None or self._task.done():
            self._task = asyncio.create_task(self._run())
            _LOGGER.info("Daily report scheduler started")

    def stop(self) -> None:
        """Stop the daily report scheduler."""
        if self._task and not self._task.done():
            self._task.cancel()
            _LOGGER.info("Daily report scheduler stopped")

    def log_error(self, error: dict[str, Any]) -> None:
        """Log an error for inclusion in daily report.

        Args:
            error: Error information dictionary
        """
        self._error_log.append(error)

    async def _run(self) -> None:
        """Run the daily report scheduler loop."""
        while True:
            try:
                # Calculate time until next report
                now = datetime.now()
                target_hour, target_minute = map(int, self.send_time.split(":"))

                # Calculate next send time
                next_send = now.replace(
                    hour=target_hour,
                    minute=target_minute,
                    second=0,
                    microsecond=0,
                )

                # If time has passed today, schedule for tomorrow
                if next_send <= now:
                    next_send += timedelta(days=1)

                # Calculate sleep duration
                sleep_seconds = (next_send - now).total_seconds()

                _LOGGER.info(
                    "Next daily report scheduled for %s (in %.1f hours)",
                    next_send.strftime("%Y-%m-%d %H:%M:%S"),
                    sleep_seconds / 3600,
                )

                # Wait until send time
                await asyncio.sleep(sleep_seconds)

                # Send report
                await self._send_report()

            except asyncio.CancelledError:
                _LOGGER.info("Daily report scheduler cancelled")
                break
            except Exception:
                _LOGGER.exception("Error in daily report scheduler")
                # Sleep a bit before retrying
                await asyncio.sleep(60)

    async def _send_report(self) -> None:
        """Send the daily report."""
        stats = {}

        # Collect statistics if enabled
        if self.include_stats and self._stats_collector:
            try:
                stats = await self._stats_collector()
            except Exception:
                _LOGGER.exception("Error collecting statistics for daily report")

        # Get errors if enabled (None means "omit errors section" downstream)
        errors = self._error_log.copy() if self.include_errors else None

        # Send report through unified notification service
        try:
            results = await self.notification_service.send_daily_report(
                stats=stats,
                errors=errors,
                backends=None,  # Send to all backends
            )

            success_count = sum(1 for success in results.values() if success)
            if success_count > 0:
                _LOGGER.info(
                    "Daily report sent successfully through %d/%d backend(s)",
                    success_count,
                    len(results),
                )
                # Clear error log after successful send
                self._error_log.clear()
            else:
                _LOGGER.error("Failed to send daily report through any backend")

        except Exception:
            _LOGGER.exception("Exception while sending daily report")

    def set_stats_collector(self, collector) -> None:
        """Set the statistics collector function.

        Args:
            collector: Async function that returns statistics dictionary
        """
        self._stats_collector = collector