Script now extracts from the '_all' CSV, skipping the dumb filter in Notion entirely

This commit is contained in:
Jonas Linter
2025-11-05 11:00:28 +01:00
parent 8d3a48b34c
commit 623e45861d

View File

@@ -19,27 +19,57 @@ def extract_csvs_from_zip(zip_path, extract_dir):
# NOTE(review): this span appears to be a rendered commit diff rather than a
# runnable script: pre-change and post-change lines are interleaved without
# +/- markers and the original indentation is missing. The comments below
# describe individual statements only; confirm ordering and nesting against
# the real file before relying on them.
# Pass each zip archive in input/ to extract_csvs_from_zip, using input/ as the
# extraction directory.
for zip_file in glob.glob("input/*.zip"):
extract_csvs_from_zip(zip_file, "input")
# Find all csv files, but only use those without '_all' in filename
all_files = [f for f in glob.glob("input/*.csv") if '_all' not in os.path.basename(f)]
# check if input folder exists
if not os.path.exists('input'):
os.makedirs('input')
print("Created 'input' directory. Please add your zip or csv files there and rerun the script.")
exit()
for file in all_files:
print(f"Processing file: {file}")
# Find the _all csv file
# (same glob as above, but keeping only basenames that contain '_all')
all_files = [f for f in glob.glob("input/*.csv") if '_all' in os.path.basename(f)]
df = pd.read_csv(file)
# Extract date from 'Date 1' (format: 'DD/MM/YYYY HH:MM (MESZ)')
df['Datum'] = df['Date 1'].str.extract(r'(\d{2}/\d{2}/\d{4})')
# Convert 'Stunden' to float (replace comma with dot)
df['Stunden'] = df['Stunden'].str.replace(',', '.', regex=False).astype(float)
# Group by date and sum hours
result = df.groupby('Datum', as_index=False)['Stunden'].sum()
# round to 2 decimal places to my benefit
# Abort when no '_all' CSV was found; warn and take the first match when
# several were found.
if not all_files:
print("Error: No '_all' file found in input folder. Please ensure your Notion export contains an '_all' CSV file.")
exit()
if len(all_files) > 1:
print(f"Warning: Found multiple '_all' files. Using the first one: {all_files[0]}")
file = all_files[0]
print(f"Processing file: {file}")
df = pd.read_csv(file)
# Extract date from 'Date 1' (format: 'DD/MM/YYYY HH:MM (MESZ)')
df['Datum'] = df['Date 1'].str.extract(r'(\d{2}/\d{2}/\d{4})')
# Convert 'Stunden' to float (replace comma with dot)
df['Stunden'] = df['Stunden'].str.replace(',', '.', regex=False).astype(float)
# Convert 'Datum' to datetime
df['Datum'] = pd.to_datetime(df['Datum'], format='%d/%m/%Y')
# Group by date and sum hours
daily_result = df.groupby('Datum', as_index=False)['Stunden'].sum()
# round to 2 decimal places
daily_result['Stunden'] = daily_result['Stunden'].round(2)
# NOTE(review): daily_result is not referenced again in the visible lines.
# Group by year-month and create separate files for each month
df['YearMonth'] = df['Datum'].dt.to_period('M')
monthly_groups = df.groupby('YearMonth')
# check if output directory exists, if not create it
if not os.path.exists('output'):
os.makedirs('output')
# Process each month
for period, group_df in monthly_groups:
year = period.year
month = period.month
# Group by date within this month and sum hours
result = group_df.groupby('Datum', as_index=False)['Stunden'].sum()
result['Stunden'] = result['Stunden'].round(2)
# Convert 'Datum' to datetime
# NOTE(review): 'Datum' is already converted with the same format further up;
# this conversion and the month/year reassignment below are presumably
# leftovers from the pre-change version -- confirm against the real file.
result['Datum'] = pd.to_datetime(result['Datum'], format='%d/%m/%Y')
first_day = result['Datum'].iloc[0]
month = first_day.month
year = first_day.year
# Output only date and sum
total_hours = result['Stunden'].sum().round(2)
print(f"Processing {period}: Monthly sum of hours: {total_hours}")
# file_prefix is not defined in the visible lines; presumably set elsewhere in
# the file -- verify. The CSV is written ';'-separated with the two columns
# relabelled 'Datum' and 'Summe'.
result.to_csv(f'output/{file_prefix}{month}_{year}.csv', sep=';', index=False, header=['Datum', 'Summe'])
# NOTE(review): prints the same monthly total as the f-string print above;
# presumably the pre-change line it replaced -- confirm.
print("Monthly sum of hours:", result['Stunden'].sum().round(2))
# Write the same two-column table as an Excel workbook next to the CSV.
result.to_excel(f'output/{file_prefix}{month}_{year}.xlsx', index=False, header=['Datum', 'Summe'])