Importing mailbox leads now works
This commit is contained in:
228
extract_leads.py
Normal file
228
extract_leads.py
Normal file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract lead information from MBOX email file.
|
||||
Parses email entries and extracts structured lead data.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
|
||||
@dataclass
class Lead:
    """A single booking enquiry extracted from one email.

    Scalar fields default to ``None`` and list fields to a fresh empty
    list, so ``Lead()`` is a valid "nothing extracted yet" record.
    """

    # --- contact details ---
    name: Optional[str] = None
    lastname: Optional[str] = None
    mail: Optional[str] = None
    tel: Optional[str] = None

    # --- stay ---
    anreise: Optional[str] = None  # check-in date, kept as a string
    abreise: Optional[str] = None  # check-out date, kept as a string

    # --- party ---
    erwachsene: Optional[int] = None  # number of adults
    kinder: Optional[int] = None  # number of children
    kind_ages: List[int] = field(default_factory=list)  # ages of the children

    # --- request details ---
    apartments: List[str] = field(default_factory=list)
    verpflegung: Optional[str] = None  # meal plan
    sprache: Optional[str] = None  # language
    device: Optional[str] = None
    anrede: Optional[str] = None  # salutation
    land: Optional[str] = None  # country
    privacy: Optional[bool] = None
|
||||
|
||||
|
||||
def parse_mbox_file(filepath: str) -> List[Lead]:
    """
    Parse MBOX file and extract lead information.

    The whole file is read into memory and cut into messages at every line
    that starts with ``From <digits>@``. Each message is split at its first
    blank line; the part after it is handed to ``parse_email_body``.
    Messages without a blank line, and messages whose parsed lead has
    neither a name nor a mail address, are skipped.

    Args:
        filepath: Path to the MBOX file

    Returns:
        List of Lead objects with extracted data
    """
    # Bytes that are not valid UTF-8 are replaced rather than raising, so a
    # single badly encoded message cannot abort the whole import.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Whatever precedes the first separator line is not a message: drop it.
    email_blocks = re.split(r'^From \d+@', content, flags=re.MULTILINE)[1:]

    leads = []
    for email_block in email_blocks:
        # Headers end at the first blank line; the form data follows it.
        _headers, blank_line, body = email_block.partition('\n\n')
        if not blank_line:
            continue

        lead = parse_email_body(body)

        # Only keep leads that carry at least a name or a mail address.
        if lead.name or lead.mail:
            leads.append(lead)

    return leads
|
||||
|
||||
|
||||
def parse_email_body(body: str) -> Lead:
    """
    Parse the body of an email to extract lead information.

    The body is read line by line. Lines without a ``:`` are ignored; all
    other lines are split at the first ``:`` into a label and a value, both
    stripped of surrounding whitespace. Labels that are not recognised are
    ignored. When a single-valued label occurs more than once, the last
    occurrence wins.

    Args:
        body: Email body content

    Returns:
        Lead object with extracted data
    """
    # Labels whose value is stored verbatim on the named attribute.
    text_fields = {
        'Name': 'name',
        'Nachname': 'lastname',
        'Mail': 'mail',
        'Tel': 'tel',
        'Anreise': 'anreise',
        'Abreise': 'abreise',
        'Verpflegung': 'verpflegung',
        'Sprache': 'sprache',
        'Device': 'device',
        'Anrede': 'anrede',
        'Land': 'land',
    }
    # Labels whose value is a non-negative head count.
    count_fields = {
        'Erwachsene': 'erwachsene',
        'Kinder': 'kinder',
    }

    lead = Lead()

    for raw_line in body.split('\n'):
        key, colon, value = raw_line.strip().partition(':')
        if not colon:
            continue

        key = key.strip()
        value = value.strip()

        if key in text_fields:
            setattr(lead, text_fields[key], value)
        elif key in count_fields:
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits, which int() rejects
            # with a ValueError.
            count = int(value) if value.isdecimal() else None
            setattr(lead, count_fields[key], count)
        elif key.startswith('Alter Kind'):
            # "Alter Kind 1", "Alter Kind 2", ...: one age per child.
            try:
                lead.kind_ages.append(int(value))
            except ValueError:
                # Best effort: an age that is not an integer is skipped.
                pass
        elif key == 'Apartment':
            lead.apartments.append(value)
        elif key == 'Privacy':
            lead.privacy = value.lower() == 'on'

    # Ages are returned in ascending numeric order, which is not
    # necessarily the order of the "Alter Kind N" lines in the email.
    lead.kind_ages.sort()

    return lead
|
||||
|
||||
|
||||
def export_to_json(leads: List[Lead], output_file: str) -> None:
    """Write the leads to ``output_file`` as an indented JSON array.

    Every lead is converted with ``dataclasses.asdict``. The file is written
    as UTF-8 with non-ASCII characters left unescaped, and a one-line
    confirmation is printed once the file has been written.
    """
    records = list(map(asdict, leads))

    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"Exported {len(leads)} leads to {output_file}")
|
||||
|
||||
|
||||
def export_to_csv(leads: List[Lead], output_file: str) -> None:
    """Write the leads to ``output_file`` as a CSV file with a header row.

    Nothing is written or printed when ``leads`` is empty. List fields are
    flattened to comma-separated strings, and ``privacy`` is rendered as
    ``Yes`` / ``No``, or left blank when it is unset.
    """
    import csv

    if not leads:
        return

    # Column order of the output file.
    columns = [
        'name', 'lastname', 'mail', 'tel',
        'anreise', 'abreise',
        'erwachsene', 'kinder', 'kind_ages',
        'apartments', 'verpflegung', 'sprache',
        'device', 'anrede', 'land', 'privacy',
    ]

    def to_row(lead):
        """Turn one lead into a dict of CSV-ready cell values."""
        record = asdict(lead)
        record['kind_ages'] = ','.join(str(age) for age in record['kind_ages'])
        record['apartments'] = ','.join(record['apartments'])

        consent = record['privacy']
        if consent:
            record['privacy'] = 'Yes'
        elif consent is False:
            record['privacy'] = 'No'
        else:
            record['privacy'] = ''
        return record

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for lead in leads:
            writer.writerow(to_row(lead))

    print(f"Exported {len(leads)} leads to {output_file}")
|
||||
|
||||
|
||||
def print_summary(leads: List[Lead]) -> None:
    """Print a banner with the lead count, then one short block per lead.

    The children-ages and apartments lines are only printed for leads that
    have at least one entry in the respective list. Each lead block is
    followed by an empty line.
    """
    print(f"\n{'='*60}")
    print(f"Total leads extracted: {len(leads)}")
    print(f"{'='*60}\n")

    for position, lead in enumerate(leads, start=1):
        block = [
            f"Lead {position}:",
            f" Name: {lead.name} {lead.lastname}",
            f" Email: {lead.mail}",
            f" Phone: {lead.tel}",
            f" Check-in: {lead.anreise}, Check-out: {lead.abreise}",
            f" Adults: {lead.erwachsene}, Children: {lead.kinder}",
        ]
        if lead.kind_ages:
            block.append(f" Children ages: {lead.kind_ages}")
        if lead.apartments:
            block.append(f" Apartments: {', '.join(lead.apartments)}")

        for text in block:
            print(text)
        print()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import sys

    # The mailbox path may be given as the first command-line argument.
    # Without one, fall back to the location the script was written for.
    default_mbox_file = '/home/divusjulius/repos/alpinebits_python/Leads-Bemelmans Apartments.mbox'
    mbox_file = sys.argv[1] if len(sys.argv) > 1 else default_mbox_file

    print(f"Parsing {mbox_file}...")
    leads = parse_mbox_file(mbox_file)

    # Print summary
    print_summary(leads)

    # Export to JSON
    export_to_json(leads, 'leads_export.json')

    # Export to CSV
    export_to_csv(leads, 'leads_export.csv')
|
||||
Reference in New Issue
Block a user