#!/usr/bin/env python3
"""
Extract lead information from MBOX email file.

Parses email entries and extracts structured lead data.
"""

import csv
import json
import re
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import List, Optional

# Default input used when no path is given on the command line.
DEFAULT_MBOX_FILE = '/home/divusjulius/repos/alpinebits_python/Leads-Bemelmans Apartments.mbox'

# Message separator: a line starting with "From <digits>@".
# NOTE(review): this matches only separators whose sender starts with digits;
# messages introduced by a different "From " line are not split off — confirm
# that this is the only form present in the input files.
_MBOX_SEPARATOR = re.compile(r'^From \d+@', flags=re.MULTILINE)

# Form keys whose value is stored verbatim on the Lead attribute of that name.
_TEXT_FIELDS = {
    'Name': 'name',
    'Nachname': 'lastname',
    'Mail': 'mail',
    'Tel': 'tel',
    'Anreise': 'anreise',
    'Abreise': 'abreise',
    'Verpflegung': 'verpflegung',
    'Sprache': 'sprache',
    'Device': 'device',
    'Anrede': 'anrede',
    'Land': 'land',
}

# Form keys whose value is a non-negative integer count.
_COUNT_FIELDS = {
    'Erwachsene': 'erwachsene',
    'Kinder': 'kinder',
}

# Column order of the CSV export.
_CSV_HEADERS = [
    'name', 'lastname', 'mail', 'tel', 'anreise', 'abreise',
    'erwachsene', 'kinder', 'kind_ages', 'apartments',
    'verpflegung', 'sprache', 'device', 'anrede', 'land', 'privacy',
]


@dataclass
class Lead:
    """Represents a single lead extracted from email."""

    name: Optional[str] = None
    lastname: Optional[str] = None
    mail: Optional[str] = None
    tel: Optional[str] = None
    anreise: Optional[str] = None  # Check-in date
    abreise: Optional[str] = None  # Check-out date
    erwachsene: Optional[int] = None  # Adults
    kinder: Optional[int] = None  # Children
    kind_ages: List[int] = field(default_factory=list)  # Children ages
    apartments: List[str] = field(default_factory=list)
    verpflegung: Optional[str] = None  # Meal plan
    sprache: Optional[str] = None  # Language
    device: Optional[str] = None
    anrede: Optional[str] = None  # Salutation
    land: Optional[str] = None  # Country
    privacy: Optional[bool] = None


def parse_mbox_file(filepath: str) -> List[Lead]:
    """
    Parse MBOX file and extract lead information.

    Args:
        filepath: Path to the MBOX file

    Returns:
        List of Lead objects with extracted data. Messages without a body,
        or whose body yields neither a name nor a mail address, are skipped.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # errors='replace' keeps a single non-UTF-8 byte from aborting the whole
    # extraction; undecodable bytes become U+FFFD instead.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    leads = []
    # Element 0 is whatever precedes the first separator, not a message.
    for email_block in _MBOX_SEPARATOR.split(content)[1:]:
        # Headers end at the first blank line; the form data follows it.
        _, blank_line, body = email_block.partition('\n\n')
        if not blank_line:
            continue

        lead = parse_email_body(body)
        if lead.name or lead.mail:  # Only add if we have some data
            leads.append(lead)

    return leads


def _parse_count(value: str) -> Optional[int]:
    """Return *value* as an int if it consists only of decimal digits, else None."""
    # str.isdigit() also accepts characters such as '²' that int() rejects
    # with ValueError; str.isdecimal() matches exactly what int() can parse.
    return int(value) if value.isdecimal() else None


def parse_email_body(body: str) -> Lead:
    """
    Parse the body of an email to extract lead information.

    Each line of the form "Key: value" is matched against the known form
    keys; lines without a colon and unknown keys are ignored. If a key
    occurs more than once the last value wins, except for "Apartment" and
    "Alter Kind ..." lines, which accumulate.

    Args:
        body: Email body content

    Returns:
        Lead object with extracted data
    """
    lead = Lead()

    for raw_line in body.split('\n'):
        key, colon, value = raw_line.strip().partition(':')
        if not colon:
            continue
        key = key.strip()
        value = value.strip()

        if key in _TEXT_FIELDS:
            setattr(lead, _TEXT_FIELDS[key], value)
        elif key in _COUNT_FIELDS:
            setattr(lead, _COUNT_FIELDS[key], _parse_count(value))
        elif key.startswith('Alter Kind'):
            # "Alter Kind 1", "Alter Kind 2", etc.; non-numeric ages are
            # skipped on purpose.
            try:
                lead.kind_ages.append(int(value))
            except ValueError:
                pass
        elif key == 'Apartment':
            lead.apartments.append(value)
        elif key == 'Privacy':
            lead.privacy = value.lower() == 'on'

    # Ages are stored in ascending order, not in the order of the
    # "Alter Kind N" lines.
    lead.kind_ages.sort()

    return lead


def export_to_json(leads: List[Lead], output_file: str) -> None:
    """
    Export leads to JSON file.

    Args:
        leads: Leads to export; an empty list produces an empty JSON array.
        output_file: Path of the JSON file to write (overwritten if present).
    """
    data = [asdict(lead) for lead in leads]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Exported {len(leads)} leads to {output_file}")


def _privacy_label(privacy: Optional[bool]) -> str:
    """Return the CSV text for a privacy flag: 'Yes', 'No', or '' if unset."""
    if privacy:
        return 'Yes'
    if privacy is False:
        return 'No'
    return ''


def export_to_csv(leads: List[Lead], output_file: str) -> None:
    """
    Export leads to CSV file.

    Args:
        leads: Leads to export. If empty, nothing is written and no file
            is created.
        output_file: Path of the CSV file to write (overwritten if present).
    """
    if not leads:
        return

    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_HEADERS)
        writer.writeheader()

        for lead in leads:
            row = asdict(lead)
            # Convert lists to comma-separated strings for CSV
            row['kind_ages'] = ','.join(map(str, row['kind_ages']))
            row['apartments'] = ','.join(row['apartments'])
            row['privacy'] = _privacy_label(row['privacy'])
            writer.writerow(row)

    print(f"Exported {len(leads)} leads to {output_file}")


def print_summary(leads: List[Lead]) -> None:
    """
    Print a summary of extracted leads.

    Args:
        leads: Leads to list; child ages and apartments are printed only
            when present.
    """
    print(f"\n{'='*60}")
    print(f"Total leads extracted: {len(leads)}")
    print(f"{'='*60}\n")

    for i, lead in enumerate(leads, 1):
        print(f"Lead {i}:")
        print(f" Name: {lead.name} {lead.lastname}")
        print(f" Email: {lead.mail}")
        print(f" Phone: {lead.tel}")
        print(f" Check-in: {lead.anreise}, Check-out: {lead.abreise}")
        print(f" Adults: {lead.erwachsene}, Children: {lead.kinder}")
        if lead.kind_ages:
            print(f" Children ages: {lead.kind_ages}")
        if lead.apartments:
            print(f" Apartments: {', '.join(lead.apartments)}")
        print()


def main(argv: Optional[List[str]] = None) -> None:
    """
    Parse an MBOX file, print a summary and write the JSON and CSV exports.

    Args:
        argv: Command-line arguments without the program name; defaults to
            sys.argv[1:]. The first argument, if given, is the MBOX path;
            otherwise DEFAULT_MBOX_FILE is used.
    """
    args = sys.argv[1:] if argv is None else argv
    mbox_file = args[0] if args else DEFAULT_MBOX_FILE

    print(f"Parsing {mbox_file}...")
    leads = parse_mbox_file(mbox_file)

    # Print summary
    print_summary(leads)

    # Export to JSON
    export_to_json(leads, 'leads_export.json')

    # Export to CSV
    export_to_csv(leads, 'leads_export.csv')


if __name__ == '__main__':
    main()