Importing mailbox leads now works

2025-11-19 09:55:54 +01:00
parent 57dac8514c
commit e8cdc75421
8 changed files with 111063 additions and 32 deletions
--- a/extract_leads.py
+++ b/extract_leads.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""
+Extract lead information from MBOX email file.
+Parses email entries and extracts structured lead data.
+"""
+
+import re
+from dataclasses import dataclass, field, asdict
+from typing import List, Optional
+from datetime import datetime
+import json
+
+
+@dataclass
+class Lead:
+    """One lead record extracted from a single e-mail.
+
+    Every scalar field defaults to ``None`` and both list fields default to
+    a fresh empty list, so ``Lead()`` is a valid "nothing found" record.
+
+    Attributes:
+        name: First name.
+        lastname: Last name.
+        mail: E-mail address.
+        tel: Telephone number.
+        anreise: Check-in date, kept as text.
+        abreise: Check-out date, kept as text.
+        erwachsene: Number of adults.
+        kinder: Number of children.
+        kind_ages: Ages of the children.
+        apartments: Apartment entries.
+        verpflegung: Meal plan.
+        sprache: Language.
+        device: Device value.
+        anrede: Salutation.
+        land: Country.
+        privacy: Privacy flag.
+    """
+
+    # --- contact ---
+    name: Optional[str] = None
+    lastname: Optional[str] = None
+    mail: Optional[str] = None
+    tel: Optional[str] = None
+
+    # --- stay ---
+    anreise: Optional[str] = None
+    abreise: Optional[str] = None
+    erwachsene: Optional[int] = None
+    kinder: Optional[int] = None
+    kind_ages: List[int] = field(default_factory=list)
+    apartments: List[str] = field(default_factory=list)
+    verpflegung: Optional[str] = None
+
+    # --- metadata ---
+    sprache: Optional[str] = None
+    device: Optional[str] = None
+    anrede: Optional[str] = None
+    land: Optional[str] = None
+    privacy: Optional[bool] = None
+
+
+def parse_mbox_file(filepath: str) -> List[Lead]:
+    """
+    Parse MBOX file and extract lead information.
+
+    The file is read completely into memory and cut into messages at every
+    line that starts with ``From <digits>@``. Each message is split at its
+    first blank line; the part after it is handed to ``parse_email_body``.
+
+    Args:
+        filepath: Path to the MBOX file
+
+    Returns:
+        List of Lead objects, one per message that yielded at least a
+        name or a mail address. Messages without a blank line (no body)
+        are skipped.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+    """
+    # errors='replace': a single byte that is not valid UTF-8 must not abort
+    # the whole import; it is substituted with U+FFFD instead.
+    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
+        content = f.read()
+
+    # The first element is whatever precedes the first separator line and is
+    # therefore not a message; drop it.
+    email_blocks = re.split(r'^From \d+@', content, flags=re.MULTILINE)[1:]
+
+    leads = []
+    for email_block in email_blocks:
+        # Headers end at the first blank line; the form data follows it.
+        _headers, blank_line, body = email_block.partition('\n\n')
+        if not blank_line:
+            continue
+
+        lead = parse_email_body(body)
+
+        # Only keep records that carry some identifying data.
+        if lead.name or lead.mail:
+            leads.append(lead)
+
+    return leads
+
+
+def _parse_count(value: str) -> Optional[int]:
+    """Return *value* as an int if it consists only of digits, else None."""
+    if not value.isdigit():
+        return None
+    try:
+        return int(value)
+    except ValueError:
+        # str.isdigit() accepts characters such as superscript digits ('²')
+        # that int() rejects; treat those as "no usable number".
+        return None
+
+
+# Form labels whose value is stored verbatim in the Lead attribute named here.
+_TEXT_FIELDS = {
+    'Name': 'name',
+    'Nachname': 'lastname',
+    'Mail': 'mail',
+    'Tel': 'tel',
+    'Anreise': 'anreise',
+    'Abreise': 'abreise',
+    'Verpflegung': 'verpflegung',
+    'Sprache': 'sprache',
+    'Device': 'device',
+    'Anrede': 'anrede',
+    'Land': 'land',
+}
+
+
+def parse_email_body(body: str) -> Lead:
+    """
+    Parse the body of an email to extract lead information.
+
+    The body is read line by line as ``Label: value`` pairs, split at the
+    first colon. Lines without a colon and unknown labels are ignored.
+    When a label occurs more than once the last value wins, except for
+    ``Apartment`` and ``Alter Kind ...`` whose values accumulate.
+
+    Args:
+        body: Email body content
+
+    Returns:
+        Lead object with extracted data. ``erwachsene`` / ``kinder`` are
+        ``None`` when their value is not a plain digit string; child ages
+        that are not integers are skipped; ``privacy`` is ``True`` only
+        for the value ``on`` (case-insensitive). ``kind_ages`` is returned
+        in ascending order, not in ``Alter Kind N`` order.
+    """
+    lead = Lead()
+
+    for raw_line in body.split('\n'):
+        line = raw_line.strip()
+
+        # Also skips empty lines, which contain no colon.
+        if ':' not in line:
+            continue
+
+        key, _, value = line.partition(':')
+        key = key.strip()
+        value = value.strip()
+
+        if key in _TEXT_FIELDS:
+            setattr(lead, _TEXT_FIELDS[key], value)
+        elif key == 'Erwachsene':
+            lead.erwachsene = _parse_count(value)
+        elif key == 'Kinder':
+            lead.kinder = _parse_count(value)
+        elif key.startswith('Alter Kind'):
+            # Labels look like "Alter Kind 1", "Alter Kind 2", ...
+            try:
+                lead.kind_ages.append(int(value))
+            except ValueError:
+                # Deliberately best-effort: an unreadable age is dropped.
+                pass
+        elif key == 'Apartment':
+            lead.apartments.append(value)
+        elif key == 'Privacy':
+            lead.privacy = value.lower() == 'on'
+
+    # Ages are stored sorted ascending.
+    lead.kind_ages.sort()
+
+    return lead
+
+
+def export_to_json(leads: List[Lead], output_file: str) -> None:
+    """Write *leads* to *output_file* as a JSON array of objects.
+
+    Each lead is converted with ``dataclasses.asdict``. The file is UTF-8,
+    indented by two spaces, and non-ASCII characters are written as-is.
+    A one-line confirmation is printed afterwards.
+    """
+    records = []
+    for entry in leads:
+        records.append(asdict(entry))
+
+    with open(output_file, 'w', encoding='utf-8') as sink:
+        json.dump(records, sink, ensure_ascii=False, indent=2)
+
+    print(f"Exported {len(leads)} leads to {output_file}")
+
+
+def export_to_csv(leads: List[Lead], output_file: str) -> None:
+    """Write *leads* to *output_file* as CSV with a header row.
+
+    Nothing is written or printed when *leads* is empty. List fields are
+    flattened to comma-joined text, and ``privacy`` becomes ``Yes``,
+    ``No`` or an empty cell. A one-line confirmation is printed afterwards.
+    """
+    import csv
+
+    if len(leads) == 0:
+        return
+
+    # Column order of the output file.
+    columns = [
+        'name', 'lastname', 'mail', 'tel',
+        'anreise', 'abreise', 'erwachsene', 'kinder',
+        'kind_ages', 'apartments', 'verpflegung', 'sprache',
+        'device', 'anrede', 'land', 'privacy',
+    ]
+
+    with open(output_file, 'w', newline='', encoding='utf-8') as sink:
+        out = csv.DictWriter(sink, fieldnames=columns)
+        out.writeheader()
+
+        for entry in leads:
+            record = asdict(entry)
+
+            # Lists cannot live in a single CSV cell; join them with commas.
+            record['kind_ages'] = ','.join(str(age) for age in record['kind_ages'])
+            record['apartments'] = ','.join(record['apartments'])
+
+            # Truthy -> Yes, exactly False -> No, anything else -> blank.
+            flag = record['privacy']
+            if flag:
+                record['privacy'] = 'Yes'
+            elif flag is False:
+                record['privacy'] = 'No'
+            else:
+                record['privacy'] = ''
+
+            out.writerow(record)
+
+    print(f"Exported {len(leads)} leads to {output_file}")
+
+
+def print_summary(leads: List[Lead]) -> None:
+    """Print a banner with the lead count, then one short block per lead.
+
+    The children-ages and apartments lines are only printed when the
+    corresponding list is non-empty. Every block ends with a blank line.
+    """
+    banner = '=' * 60
+    print(f"\n{banner}")
+    print(f"Total leads extracted: {len(leads)}")
+    print(f"{banner}\n")
+
+    for position, entry in enumerate(leads, start=1):
+        block = [
+            f"Lead {position}:",
+            f"  Name: {entry.name} {entry.lastname}",
+            f"  Email: {entry.mail}",
+            f"  Phone: {entry.tel}",
+            f"  Check-in: {entry.anreise}, Check-out: {entry.abreise}",
+            f"  Adults: {entry.erwachsene}, Children: {entry.kinder}",
+        ]
+        if entry.kind_ages:
+            block.append(f"  Children ages: {entry.kind_ages}")
+        if entry.apartments:
+            block.append(f"  Apartments: {', '.join(entry.apartments)}")
+
+        for text in block:
+            print(text)
+        print()
+
+
+if __name__ == '__main__':
+    import sys
+
+    # Script entry point: parse one mailbox, print a summary, then write
+    # leads_export.json and leads_export.csv into the working directory.
+    #
+    # An optional first command-line argument selects the mailbox; without
+    # it the original hard-coded location is used.
+    default_mbox_file = '/home/divusjulius/repos/alpinebits_python/Leads-Bemelmans Apartments.mbox'
+    mbox_file = sys.argv[1] if len(sys.argv) > 1 else default_mbox_file
+
+    print(f"Parsing {mbox_file}...")
+    leads = parse_mbox_file(mbox_file)
+
+    # Print summary
+    print_summary(leads)
+
+    # Export to JSON
+    export_to_json(leads, 'leads_export.json')
+
+    # Export to CSV
+    export_to_csv(leads, 'leads_export.csv')