import pandas as pd
from scripts.io.clear_input import fill_total_and_new_cases as clear_input
from scripts.io.fill_nans import fill


# Locations of the raw input and of the cleaned output produced by this script.
RAW_CSV_PATH = "data/poland_raw.csv"
WHOLE_CSV_PATH = "data/poland_whole.csv"

# Load the raw data, then write it straight back to the same path with a
# fresh 0..n-1 index and without an index column.
# NOTE(review): this rewrites the input file in place on every run — confirm
# that this normalisation step is intentional.
df = pd.read_csv(RAW_CSV_PATH)
raw_snapshot = df.reset_index(drop=True)
raw_snapshot.to_csv(RAW_CSV_PATH, index=False)

print("Working with poland_raw.csv data")
print("Shape of original raw DF, before doing anything:", df.shape)

# Hand the frame to the project helper `fill_total_and_new_cases` (imported as
# `clear_input`), telling it which columns hold cumulative cases, daily cases
# and the date. Presumably it reconciles gaps between the two case columns —
# see the helper for the exact rules.
df = clear_input(
    df,
    date_col='date',
    total_col='total_cases',
    new_col='new_cases',
)

# Discard every row that still has no 'total_cases' value, then renumber the
# remaining rows from zero.
df = df.dropna(subset=['total_cases'])
df = df.reset_index(drop=True)
print(f"Shape of poland_raw.csv after removing nans in 'total_cases' column: {df.shape}")

# Columns kept as prediction targets.
cols_target_interesting = [
    'total_cases',
    'new_cases',
    'total_deaths',
    'new_deaths',
]

# Columns kept as candidate features (the date plus the selected indicators).
cols_feature_interesting = [
    'date',
    'weekly_hosp_admissions',
    'icu_patients',
    'weekly_icu_admissions',
    'stringency_index',
    'reproduction_rate',
    'total_tests',
    'tests_per_case',
    'new_vaccinations',
    'positive_rate',
    'total_vaccinations',
    'people_vaccinated',
    'people_fully_vaccinated',
    'excess_mortality_cumulative_per_million',
]

# Narrow the frame to targets followed by features, in that order.
selected_columns = [*cols_target_interesting, *cols_feature_interesting]
df = df[selected_columns]

# Pass the reduced frame through the project helper `fill`
# (scripts.io.fill_nans); presumably it imputes the remaining missing values.
df = fill(df)

# Persist the result with a clean 0..n-1 index and no index column.
cleaned = df.reset_index(drop=True)
cleaned.to_csv(WHOLE_CSV_PATH, index=False)
