Script now extracts from the _all file, skipping the dumb filter in Notion entirely
This commit is contained in:
@@ -19,27 +19,57 @@ def extract_csvs_from_zip(zip_path, extract_dir):
|
||||
for zip_file in glob.glob("input/*.zip"):
|
||||
extract_csvs_from_zip(zip_file, "input")
|
||||
|
||||
# Find all csv files, but only use those without '_all' in filename
|
||||
all_files = [f for f in glob.glob("input/*.csv") if '_all' not in os.path.basename(f)]
|
||||
# check if input folder exists
|
||||
if not os.path.exists('input'):
|
||||
os.makedirs('input')
|
||||
print("Created 'input' directory. Please add your zip or csv files there and rerun the script.")
|
||||
exit()
|
||||
|
||||
for file in all_files:
|
||||
print(f"Processing file: {file}")
|
||||
# Find the _all csv file
|
||||
all_files = [f for f in glob.glob("input/*.csv") if '_all' in os.path.basename(f)]
|
||||
|
||||
df = pd.read_csv(file)
|
||||
# Extract date from 'Date 1' (format: 'DD/MM/YYYY HH:MM (MESZ)')
|
||||
df['Datum'] = df['Date 1'].str.extract(r'(\d{2}/\d{2}/\d{4})')
|
||||
# Convert 'Stunden' to float (replace comma with dot)
|
||||
df['Stunden'] = df['Stunden'].str.replace(',', '.', regex=False).astype(float)
|
||||
# Group by date and sum hours
|
||||
result = df.groupby('Datum', as_index=False)['Stunden'].sum()
|
||||
# round to 2 decimal places to my benefit
|
||||
if not all_files:
|
||||
print("Error: No '_all' file found in input folder. Please ensure your Notion export contains an '_all' CSV file.")
|
||||
exit()
|
||||
|
||||
if len(all_files) > 1:
|
||||
print(f"Warning: Found multiple '_all' files. Using the first one: {all_files[0]}")
|
||||
|
||||
file = all_files[0]
|
||||
print(f"Processing file: {file}")
|
||||
|
||||
df = pd.read_csv(file)
|
||||
# Extract date from 'Date 1' (format: 'DD/MM/YYYY HH:MM (MESZ)')
|
||||
df['Datum'] = df['Date 1'].str.extract(r'(\d{2}/\d{2}/\d{4})')
|
||||
# Convert 'Stunden' to float (replace comma with dot)
|
||||
df['Stunden'] = df['Stunden'].str.replace(',', '.', regex=False).astype(float)
|
||||
# Convert 'Datum' to datetime
|
||||
df['Datum'] = pd.to_datetime(df['Datum'], format='%d/%m/%Y')
|
||||
|
||||
# Group by date and sum hours
|
||||
daily_result = df.groupby('Datum', as_index=False)['Stunden'].sum()
|
||||
# round to 2 decimal places
|
||||
daily_result['Stunden'] = daily_result['Stunden'].round(2)
|
||||
|
||||
# Group by year-month and create separate files for each month
|
||||
df['YearMonth'] = df['Datum'].dt.to_period('M')
|
||||
monthly_groups = df.groupby('YearMonth')
|
||||
|
||||
# check if output directory exists, if not create it
|
||||
if not os.path.exists('output'):
|
||||
os.makedirs('output')
|
||||
|
||||
# Process each month
|
||||
for period, group_df in monthly_groups:
|
||||
year = period.year
|
||||
month = period.month
|
||||
|
||||
# Group by date within this month and sum hours
|
||||
result = group_df.groupby('Datum', as_index=False)['Stunden'].sum()
|
||||
result['Stunden'] = result['Stunden'].round(2)
|
||||
# Convert 'Datum' to datetime
|
||||
result['Datum'] = pd.to_datetime(result['Datum'], format='%d/%m/%Y')
|
||||
first_day = result['Datum'].iloc[0]
|
||||
month = first_day.month
|
||||
year = first_day.year
|
||||
# Output only date and sum
|
||||
|
||||
total_hours = result['Stunden'].sum().round(2)
|
||||
print(f"Processing {period}: Monthly sum of hours: {total_hours}")
|
||||
|
||||
result.to_csv(f'output/{file_prefix}{month}_{year}.csv', sep=';', index=False, header=['Datum', 'Summe'])
|
||||
print("Monthly sum of hours:", result['Stunden'].sum().round(2))
|
||||
result.to_excel(f'output/{file_prefix}{month}_{year}.xlsx', index=False, header=['Datum', 'Summe'])
|
||||
Reference in New Issue
Block a user