Script now extracts from the _all file, skipping the dumb filter in Notion entirely

2025-11-05 11:00:28 +01:00
parent 8d3a48b34c
commit 623e45861d
1 changed files with 49 additions and 19 deletions
--- a/convert_to_exel.py
+++ b/convert_to_exel.py
@@ -19,27 +19,57 @@ def extract_csvs_from_zip(zip_path, extract_dir):
 for zip_file in glob.glob("input/*.zip"):
    extract_csvs_from_zip(zip_file, "input")

-# Find all csv files, but only use those without '_all' in filename
-all_files = [f for f in glob.glob("input/*.csv") if '_all' not in os.path.basename(f)]
+# check if input folder exists
+if not os.path.exists('input'):
+    os.makedirs('input')
+    print("Created 'input' directory. Please add your zip or csv files there and rerun the script.")
+    exit()

-for file in all_files:
-    print(f"Processing file: {file}")
+# Find the _all csv file
+all_files = [f for f in glob.glob("input/*.csv") if '_all' in os.path.basename(f)]

-    df = pd.read_csv(file)
-    # Extract date from 'Date 1' (format: 'DD/MM/YYYY HH:MM (MESZ)')
-    df['Datum'] = df['Date 1'].str.extract(r'(\d{2}/\d{2}/\d{4})')
-    # Convert 'Stunden' to float (replace comma with dot)
-    df['Stunden'] = df['Stunden'].str.replace(',', '.', regex=False).astype(float)
-    # Group by date and sum hours
-    result = df.groupby('Datum', as_index=False)['Stunden'].sum()
-    # round to 2 decimal places to my benefit
+if not all_files:
+    print("Error: No '_all' file found in input folder. Please ensure your Notion export contains an '_all' CSV file.")
+    exit()
+
+if len(all_files) > 1:
+    print(f"Warning: Found multiple '_all' files. Using the first one: {all_files[0]}")
+
+file = all_files[0]
+print(f"Processing file: {file}")
+
+df = pd.read_csv(file)
+# Extract date from 'Date 1' (format: 'DD/MM/YYYY HH:MM (MESZ)')
+df['Datum'] = df['Date 1'].str.extract(r'(\d{2}/\d{2}/\d{4})')
+# Convert 'Stunden' to float (replace comma with dot)
+df['Stunden'] = df['Stunden'].str.replace(',', '.', regex=False).astype(float)
+# Convert 'Datum' to datetime
+df['Datum'] = pd.to_datetime(df['Datum'], format='%d/%m/%Y')
+
+# Group by date and sum hours
+daily_result = df.groupby('Datum', as_index=False)['Stunden'].sum()
+# round to 2 decimal places
+daily_result['Stunden'] = daily_result['Stunden'].round(2)
+
+# Group by year-month and create separate files for each month
+df['YearMonth'] = df['Datum'].dt.to_period('M')
+monthly_groups = df.groupby('YearMonth')
+
+# check if output directory exists, if not create it
+if not os.path.exists('output'):
+    os.makedirs('output')
+
+# Process each month
+for period, group_df in monthly_groups:
+    year = period.year
+    month = period.month
+
+    # Group by date within this month and sum hours
+    result = group_df.groupby('Datum', as_index=False)['Stunden'].sum()
    result['Stunden'] = result['Stunden'].round(2)
-    # Convert 'Datum' to datetime
-    result['Datum'] = pd.to_datetime(result['Datum'], format='%d/%m/%Y')
-    first_day = result['Datum'].iloc[0]
-    month = first_day.month
-    year = first_day.year
-    # Output only date and sum
+
+    total_hours = result['Stunden'].sum().round(2)
+    print(f"Processing {period}: Monthly sum of hours: {total_hours}")
+
    result.to_csv(f'output/{file_prefix}{month}_{year}.csv', sep=';', index=False, header=['Datum', 'Summe'])
-    print("Monthly sum of hours:", result['Stunden'].sum().round(2))
    result.to_excel(f'output/{file_prefix}{month}_{year}.xlsx', index=False, header=['Datum', 'Summe'])