Importing mailbox leads now works

This commit is contained in:
Jonas Linter
2025-11-19 09:55:54 +01:00
parent 57dac8514c
commit e8cdc75421
8 changed files with 111063 additions and 32 deletions

228
extract_leads.py Normal file
View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
Extract lead information from MBOX email file.
Parses email entries and extracts structured lead data.
"""
import re
from dataclasses import dataclass, field, asdict
from typing import List, Optional
from datetime import datetime
import json
@dataclass
class Lead:
    """One lead record extracted from a single e-mail.

    All scalar fields start out as ``None`` and both list fields start out
    as their own empty list, so ``Lead()`` is a valid "nothing found yet"
    record that a parser can fill in field by field.
    """

    # --- Contact ---------------------------------------------------------
    name: Optional[str] = None
    lastname: Optional[str] = None
    mail: Optional[str] = None
    tel: Optional[str] = None

    # --- Stay ------------------------------------------------------------
    anreise: Optional[str] = None  # check-in date, kept as the raw string
    abreise: Optional[str] = None  # check-out date, kept as the raw string
    erwachsene: Optional[int] = None  # number of adults
    kinder: Optional[int] = None  # number of children
    kind_ages: List[int] = field(default_factory=list)  # ages of the children
    apartments: List[str] = field(default_factory=list)
    verpflegung: Optional[str] = None  # meal plan

    # --- Metadata --------------------------------------------------------
    sprache: Optional[str] = None  # language
    device: Optional[str] = None
    anrede: Optional[str] = None  # salutation
    land: Optional[str] = None  # country
    privacy: Optional[bool] = None
def parse_mbox_file(filepath: str) -> List[Lead]:
    """
    Parse an MBOX file and extract lead information.

    The whole file is read into memory and cut into messages at every line
    that starts with ``From <digits>@``. Each message is split at its first
    blank line; the text after that blank line is passed to
    ``parse_email_body``. Messages without a blank line are skipped, and so
    are leads that end up with neither a name nor a mail address.

    Args:
        filepath: Path to the MBOX file

    Returns:
        List of Lead objects with extracted data
    """
    # An mbox concatenates raw messages that need not all be valid UTF-8.
    # Replacing undecodable bytes keeps one odd message from aborting the
    # whole import with a UnicodeDecodeError.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Element 0 is whatever precedes the first separator line, so drop it.
    email_blocks = re.split(r'^From \d+@', content, flags=re.MULTILINE)[1:]

    leads = []
    for email_block in email_blocks:
        # Headers end at the first blank line; the form data follows it.
        _headers, blank_line, body = email_block.partition('\n\n')
        if not blank_line:
            continue

        lead = parse_email_body(body)
        if lead.name or lead.mail:  # Only keep leads that carry some data
            leads.append(lead)
    return leads
def parse_email_body(body: str) -> Lead:
    """
    Parse the body of an email to extract lead information.

    The body is read line by line. Lines without a colon are ignored; every
    other line is split at its first colon into a key and a value, both
    stripped of surrounding whitespace. Recognised keys:

    * ``Name``, ``Nachname``, ``Mail``, ``Tel``, ``Anreise``, ``Abreise``,
      ``Verpflegung``, ``Sprache``, ``Device``, ``Anrede``, ``Land`` are
      stored as text; a repeated key overwrites the earlier value.
    * ``Erwachsene`` and ``Kinder`` are stored as ``int`` when the value
      consists only of decimal digits, otherwise as ``None``.
    * Any key starting with ``Alter Kind`` appends an age to ``kind_ages``
      if the value parses as an integer; other values are ignored.
    * ``Apartment`` appends the value to ``apartments``.
    * ``Privacy`` becomes ``True`` when the value is ``on`` (case-insensitive)
      and ``False`` otherwise.

    Unknown keys are ignored.

    Args:
        body: Email body content

    Returns:
        Lead object with extracted data
    """
    # Form label -> Lead attribute, for fields copied over verbatim.
    text_fields = {
        'Name': 'name',
        'Nachname': 'lastname',
        'Mail': 'mail',
        'Tel': 'tel',
        'Anreise': 'anreise',
        'Abreise': 'abreise',
        'Verpflegung': 'verpflegung',
        'Sprache': 'sprache',
        'Device': 'device',
        'Anrede': 'anrede',
        'Land': 'land',
    }
    # Form label -> Lead attribute, for non-negative head counts.
    count_fields = {
        'Erwachsene': 'erwachsene',
        'Kinder': 'kinder',
    }

    lead = Lead()
    for line in body.split('\n'):
        key, colon, value = line.strip().partition(':')
        if not colon:
            # Blank line or a line that is not a "key: value" pair.
            continue
        key = key.strip()
        value = value.strip()

        if key in text_fields:
            setattr(lead, text_fields[key], value)
        elif key in count_fields:
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits that int() rejects, which
            # would raise ValueError here instead of yielding None.
            setattr(lead, count_fields[key], int(value) if value.isdecimal() else None)
        elif key.startswith('Alter Kind'):
            # Keys look like "Alter Kind 1", "Alter Kind 2", ...
            try:
                lead.kind_ages.append(int(value))
            except ValueError:
                pass  # best effort: skip an age that is not a number
        elif key == 'Apartment':
            lead.apartments.append(value)
        elif key == 'Privacy':
            lead.privacy = value.lower() == 'on'

    # Ages are stored in ascending order.
    # NOTE(review): this discards which age belonged to which "Alter Kind N"
    # entry; kept unchanged so existing exports stay the same.
    lead.kind_ages.sort()
    return lead
def export_to_json(leads: List[Lead], output_file: str) -> None:
    """Write all leads to ``output_file`` as a JSON array and report the count.

    Each lead is converted to a plain dict first. The file is written as
    UTF-8 with two-space indentation and non-ASCII characters left unescaped.
    """
    payload = list(map(asdict, leads))
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(payload, sink, ensure_ascii=False, indent=2)
    print(f"Exported {len(leads)} leads to {output_file}")
def export_to_csv(leads: List[Lead], output_file: str) -> None:
    """Write all leads to ``output_file`` as CSV and report the count.

    With no leads, the function returns immediately: no file is created and
    nothing is printed. Otherwise a header row is written, followed by one
    row per lead. List fields are flattened into comma-joined strings and
    ``privacy`` is rendered as ``Yes``, ``No`` or an empty cell.
    """
    import csv

    if not leads:
        return

    # Column order of the output file.
    columns = [
        'name', 'lastname', 'mail', 'tel',
        'anreise', 'abreise', 'erwachsene', 'kinder',
        'kind_ages', 'apartments', 'verpflegung', 'sprache',
        'device', 'anrede', 'land', 'privacy',
    ]

    with open(output_file, 'w', newline='', encoding='utf-8') as sink:
        writer = csv.DictWriter(sink, fieldnames=columns)
        writer.writeheader()
        for record in map(asdict, leads):
            # A CSV cell holds one string, so lists are joined with commas.
            record['kind_ages'] = ','.join(str(age) for age in record['kind_ages'])
            record['apartments'] = ','.join(record['apartments'])

            # Three states: truthy -> Yes, exactly False -> No, else blank.
            consent = record['privacy']
            if consent:
                record['privacy'] = 'Yes'
            elif consent is False:
                record['privacy'] = 'No'
            else:
                record['privacy'] = ''

            writer.writerow(record)

    print(f"Exported {len(leads)} leads to {output_file}")
def print_summary(leads: List[Lead]) -> None:
    """Print a banner with the lead count, then a short block per lead.

    Child ages and apartments are only printed for leads that have any.
    Every lead block is followed by an empty line.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Total leads extracted: {len(leads)}")
    print(f"{rule}\n")

    for number, lead in enumerate(leads, start=1):
        print(f"Lead {number}:")
        print(f" Name: {lead.name} {lead.lastname}")
        print(f" Email: {lead.mail}")
        print(f" Phone: {lead.tel}")
        print(f" Check-in: {lead.anreise}, Check-out: {lead.abreise}")
        print(f" Adults: {lead.erwachsene}, Children: {lead.kinder}")
        if lead.kind_ages:
            print(f" Children ages: {lead.kind_ages}")
        if lead.apartments:
            joined_apartments = ', '.join(lead.apartments)
            print(f" Apartments: {joined_apartments}")
        print()
if __name__ == '__main__':
    import sys

    # The input path may be given as the first command-line argument. The
    # previous hard-coded location stays as the default so running the script
    # without arguments behaves as before.
    default_mbox_file = '/home/divusjulius/repos/alpinebits_python/Leads-Bemelmans Apartments.mbox'
    mbox_file = sys.argv[1] if len(sys.argv) > 1 else default_mbox_file

    print(f"Parsing {mbox_file}...")
    leads = parse_mbox_file(mbox_file)

    # Print summary
    print_summary(leads)

    # Export to JSON
    export_to_json(leads, 'leads_export.json')

    # Export to CSV
    export_to_csv(leads, 'leads_export.csv')