alpinebits_python/src/alpine_bits_python/unified_monitoring.py

"""Unified monitoring with support for multiple notification backends.

This module provides alert handlers and schedulers that work with the
unified notification service to send alerts through multiple channels.
"""

import asyncio
import logging
import threading
from collections import deque
from datetime import datetime, timedelta
from typing import Any

from .email_monitoring import ErrorRecord, ReservationStatsCollector
from .logging_config import get_logger
from .notification_service import NotificationService

_LOGGER = get_logger(__name__)


class UnifiedAlertHandler(logging.Handler):
    """Custom logging handler that sends alerts through unified notification service.

    This handler uses a hybrid approach:
    - Accumulates errors in a buffer
    - Sends immediately if error threshold is reached
    - Otherwise sends after buffer duration expires
    - Always sends buffered errors (no minimum threshold for time-based flush)
    - Implements cooldown to prevent alert spam

    The handler is thread-safe and works with asyncio event loops.
    """

    def __init__(
        self,
        notification_service: NotificationService,
        config: dict[str, Any],
        loop: asyncio.AbstractEventLoop | None = None,
    ):
        """Initialize the unified alert handler.

        Args:
            notification_service: Unified notification service
            config: Configuration dictionary for error alerts
            loop: Asyncio event loop (will use current loop if not provided)

        """
        super().__init__()
        self.notification_service = notification_service
        self.config = config
        self.loop = loop  # Will be set when first error occurs if not provided

        # Configuration
        self.error_threshold = config.get("error_threshold", 5)
        self.buffer_minutes = config.get("buffer_minutes", 15)
        self.cooldown_minutes = config.get("cooldown_minutes", 15)
        self.log_levels = config.get("log_levels", ["ERROR", "CRITICAL"])

        # State
        self.error_buffer: deque[ErrorRecord] = deque()
        self.last_sent = datetime.min  # Last time we sent an alert
        self._flush_task: asyncio.Task | None = None
        self._lock = threading.Lock()  # Thread-safe for multi-threaded logging

        _LOGGER.info(
            "UnifiedAlertHandler initialized: threshold=%d, buffer=%dmin, cooldown=%dmin",
            self.error_threshold,
            self.buffer_minutes,
            self.cooldown_minutes,
        )

    def emit(self, record: logging.LogRecord) -> None:
        """Handle a log record.

        This is called automatically by the logging system when an error is logged.
        It's important that this method is fast and doesn't block.

        Args:
            record: The log record to handle

        """
        # Only handle configured log levels
        if record.levelname not in self.log_levels:
            return

        try:
            # Ensure we have an event loop
            if self.loop is None:
                try:
                    self.loop = asyncio.get_running_loop()
                except RuntimeError:
                    # No running loop, we'll need to handle this differently
                    _LOGGER.warning("No asyncio event loop available for alerts")
                    return

            # Add error to buffer (thread-safe)
            with self._lock:
                error_record = ErrorRecord(record)
                self.error_buffer.append(error_record)
                buffer_size = len(self.error_buffer)

            # Determine if we should send immediately
            should_send_immediately = buffer_size >= self.error_threshold

            if should_send_immediately:
                # Cancel any pending flush task
                if self._flush_task and not self._flush_task.done():
                    self._flush_task.cancel()

                # Schedule immediate flush
                self._flush_task = asyncio.run_coroutine_threadsafe(
                    self._flush_buffer(immediate=True),
                    self.loop,
                )
            # Schedule delayed flush if not already scheduled
            elif not self._flush_task or self._flush_task.done():
                self._flush_task = asyncio.run_coroutine_threadsafe(
                    self._schedule_delayed_flush(),
                    self.loop,
                )

        except Exception:
            # Never let the handler crash - just log and continue
            _LOGGER.exception("Error in UnifiedAlertHandler.emit")

    async def _schedule_delayed_flush(self) -> None:
        """Schedule a delayed buffer flush after buffer duration."""
        await asyncio.sleep(self.buffer_minutes * 60)
        await self._flush_buffer(immediate=False)

    async def _flush_buffer(self, *, immediate: bool) -> None:
        """Flush the error buffer and send alert.

        Args:
            immediate: Whether this is an immediate flush (threshold hit)

        """
        # Check cooldown period
        now = datetime.now()
        time_since_last = (now - self.last_sent).total_seconds() / 60

        if time_since_last < self.cooldown_minutes:
            _LOGGER.info(
                "Alert cooldown active (%.1f min remaining), buffering errors",
                self.cooldown_minutes - time_since_last,
            )
            # Don't clear buffer - let errors accumulate until cooldown expires
            return

        # Get all buffered errors (thread-safe)
        with self._lock:
            if not self.error_buffer:
                return

            errors = list(self.error_buffer)
            self.error_buffer.clear()

        # Update last sent time
        self.last_sent = now

        # Format alert
        error_count = len(errors)
        time_range = (
            f"{errors[0].timestamp.strftime('%H:%M:%S')} to "
            f"{errors[-1].timestamp.strftime('%H:%M:%S')}"
        )

        # Determine alert type
        alert_type = "Immediate Alert" if immediate else "Scheduled Alert"
        if immediate:
            reason = f"(threshold of {self.error_threshold} exceeded)"
        else:
            reason = f"({self.buffer_minutes} minute buffer)"

        title = f"AlpineBits Error {alert_type}: {error_count} errors {reason}"

        # Build message
        message = f"Error Alert - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        message += "=" * 70 + "\n\n"
        message += f"Alert Type: {alert_type}\n"
        message += f"Error Count: {error_count}\n"
        message += f"Time Range: {time_range}\n"
        message += f"Reason: {reason}\n"
        message += "\n" + "=" * 70 + "\n\n"

        # Add individual errors
        message += "Errors:\n"
        message += "-" * 70 + "\n\n"
        for error in errors:
            message += error.format_plain_text()
            message += "\n"

        message += "-" * 70 + "\n"
        message += f"Generated by AlpineBits Monitoring at {now.strftime('%Y-%m-%d %H:%M:%S')}\n"

        # Send through unified notification service
        try:
            results = await self.notification_service.send_alert(
                title=title,
                message=message,
                backends=None,  # Send to all backends
            )

            success_count = sum(1 for success in results.values() if success)
            if success_count > 0:
                _LOGGER.info(
                    "Alert sent successfully through %d/%d backend(s): %d errors",
                    success_count,
                    len(results),
                    error_count,
                )
            else:
                _LOGGER.error("Failed to send alert through any backend: %d errors", error_count)

        except Exception:
            _LOGGER.exception("Exception while sending alert")

    def close(self) -> None:
        """Close the handler and flush any remaining errors.

        This is called when the logging system shuts down.
        """
        # Cancel any pending flush tasks
        if self._flush_task and not self._flush_task.done():
            self._flush_task.cancel()

        # Flush any remaining errors immediately
        if self.error_buffer and self.loop:
            try:
                # Check if the loop is still running
                if not self.loop.is_closed():
                    future = asyncio.run_coroutine_threadsafe(
                        self._flush_buffer(immediate=False),
                        self.loop,
                    )
                    future.result(timeout=5)
                else:
                    _LOGGER.warning(
                        "Event loop closed, cannot flush %d remaining errors",
                        len(self.error_buffer),
                    )
            except Exception:
                _LOGGER.exception("Error flushing buffer on close")

        super().close()


class UnifiedDailyReportScheduler:
    """Scheduler for sending daily reports through unified notification service.

    This runs as a background task and sends daily reports containing
    statistics and error summaries through all configured notification backends.
    """

    def __init__(
        self,
        notification_service: NotificationService,
        config: dict[str, Any],
    ):
        """Initialize the unified daily report scheduler.

        Args:
            notification_service: Unified notification service
            config: Configuration for daily reports

        """
        self.notification_service = notification_service
        self.config = config
        self.send_time = config.get("send_time", "08:00")  # Default 8 AM
        self.include_stats = config.get("include_stats", True)
        self.include_errors = config.get("include_errors", True)

        self._task: asyncio.Task | None = None
        self._stats_collector = None  # Will be set by application
        self._error_log: list[dict[str, Any]] = []

        _LOGGER.info(
            "UnifiedDailyReportScheduler initialized: send_time=%s",
            self.send_time,
        )

    def start(self) -> None:
        """Start the daily report scheduler."""
        if self._task is None or self._task.done():
            self._task = asyncio.create_task(self._run())
            _LOGGER.info("Daily report scheduler started")

    def stop(self) -> None:
        """Stop the daily report scheduler."""
        if self._task and not self._task.done():
            self._task.cancel()
            _LOGGER.info("Daily report scheduler stopped")

    def log_error(self, error: dict[str, Any]) -> None:
        """Log an error for inclusion in daily report.

        Args:
            error: Error information dictionary

        """
        self._error_log.append(error)

    async def _run(self) -> None:
        """Run the daily report scheduler loop."""
        while True:
            try:
                # Calculate time until next report
                now = datetime.now()
                target_hour, target_minute = map(int, self.send_time.split(":"))

                # Calculate next send time
                next_send = now.replace(
                    hour=target_hour,
                    minute=target_minute,
                    second=0,
                    microsecond=0,
                )

                # If time has passed today, schedule for tomorrow
                if next_send <= now:
                    next_send += timedelta(days=1)

                # Calculate sleep duration
                sleep_seconds = (next_send - now).total_seconds()

                _LOGGER.info(
                    "Next daily report scheduled for %s (in %.1f hours)",
                    next_send.strftime("%Y-%m-%d %H:%M:%S"),
                    sleep_seconds / 3600,
                )

                # Wait until send time
                await asyncio.sleep(sleep_seconds)

                # Send report
                await self._send_report()

            except asyncio.CancelledError:
                _LOGGER.info("Daily report scheduler cancelled")
                break
            except Exception:
                _LOGGER.exception("Error in daily report scheduler")
                # Sleep a bit before retrying
                await asyncio.sleep(60)

    async def _send_report(self) -> None:
        """Send the daily report."""
        stats = {}

        # Collect statistics if enabled
        if self.include_stats and self._stats_collector:
            try:
                stats = await self._stats_collector()
            except Exception:
                _LOGGER.exception("Error collecting statistics for daily report")

        # Get errors if enabled
        errors = self._error_log.copy() if self.include_errors else None

        # Send report through unified notification service
        try:
            results = await self.notification_service.send_daily_report(
                stats=stats,
                errors=errors,
                backends=None,  # Send to all backends
            )

            success_count = sum(1 for success in results.values() if success)
            if success_count > 0:
                _LOGGER.info(
                    "Daily report sent successfully through %d/%d backend(s)",
                    success_count,
                    len(results),
                )
                # Clear error log after successful send
                self._error_log.clear()
            else:
                _LOGGER.error("Failed to send daily report through any backend")

        except Exception:
            _LOGGER.exception("Exception while sending daily report")

    def set_stats_collector(self, collector) -> None:
        """Set the statistics collector function.

        Args:
            collector: Async function that returns statistics dictionary

        """
        self._stats_collector = collector