import pandas as pd
from scripts.io.clear_input import fill_total_and_new_cases as clear_input
from scripts.io.fill_nans import fill


# Step 1: load the raw Germany dataset and report its size.
df = pd.read_csv("data/germany_raw.csv")

print("Working with germany_raw.csv data")
print(f"Shape of original raw DF, before doing anything: {df.shape}")

# Step 2: let the project helper fill the gaps in the 'total_cases' and
# 'new_cases' columns (it is also given the 'date' column name).
df = clear_input(
    df,
    total_col='total_cases',
    new_col='new_cases',
    date_col='date',
)

# Step 3: drop the rows that still have no 'total_cases' value and renumber
# the index from 0. According to the original author's note, the remaining
# NaNs sit only at the start (3 rows) and at the end (~33 rows) of the data,
# so removing them does not cut holes into the middle of the series.
df = df.dropna(subset=['total_cases'])
df = df.reset_index(drop=True)
print("Shape of germany_raw.csv after removing nans in 'total_cases' column: " + str(df.shape))

# Build three DataFrames restricted to the columns the author considers
# interesting:
#   whole - every remaining row,
#   early - rows dated on or before the split date,
#   late  - rows dated after the split date.

# Columns treated as prediction targets.
cols_target_interesting = [
    'total_cases',
    'new_cases',
    'total_deaths',
    'new_deaths',
]
# Columns treated as features (the date column is included here).
cols_feature_interesting = [
    'date',
    'weekly_hosp_admissions',
    'icu_patients',
    'weekly_icu_admissions',
    'stringency_index',
    'reproduction_rate',
    'total_tests',
    'positive_rate',
    'tests_per_case',
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'excess_mortality_cumulative_per_million',
]

# Keep targets first, then features, in the listed order.
selected_columns = cols_target_interesting + cols_feature_interesting
df = df[selected_columns]

# Fill the missing values that remain in the reduced DataFrame
# (delegated to the project helper).
df = fill(df)

# NOTE(review): the comparison is made against a plain string, so this
# presumably relies on 'date' holding ISO 'YYYY-MM-DD' values - confirm.
split_date = '2022-03-27'
dates = df['date']
df_early = df[dates <= split_date]
df_late = df[dates > split_date]


# Print a short preview of each of the three DataFrames.
# Every preview shows exactly 3 rows, matching its "First 3 rows" heading
# (the whole-dataset preview previously used head() and printed 5 rows).
separator = "=" * 130
previews = (
    ("\nFirst 3 rows of whole germany_raw dataset:", df),
    ("\nFirst 3 rows of germany_early", df_early),
    ("\nFirst 3 rows of germany_late", df_late),
)
for heading, frame in previews:
    # Two separator lines precede every heading.
    print(separator)
    print(separator)
    print(heading)
    print(frame.head(3))
# Single closing separator after the last preview.
print(separator)
# Write the three DataFrames to CSV. Each one gets a fresh 0-based index
# first, and the index itself is not written to the file.
outputs = (
    (df, "data/germany_whole.csv"),
    (df_early, "data/germany_early.csv"),
    (df_late, "data/germany_late.csv"),
)
for frame, csv_path in outputs:
    renumbered = frame.reset_index(drop=True)
    renumbered.to_csv(csv_path, index=False)

# Final summary for the user.
print("=" * 130)
print("\nAll datasets saved in data/germany_{..}.csv files {whole/early/late}")
print("PS: the highest pick was at '2022-03-27' and it is the splitting point for both early/late .csv's\n")
