import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from datetime import timedelta

# Load the Norway COVID-19 dataset and parse the date column into real
# timestamps so it can be filtered, sorted and used as a time index.
df = pd.read_csv("norway_covid_2020_2024.csv")
df['date'] = pd.to_datetime(df['date'])

# Keep only the modelled series, drop rows without a smoothed value, and
# order the result chronologically.
df_full = df[['date', 'new_cases_smoothed']].dropna().sort_values('date')

# Use data from 2020 to 2022 for training
df_train = df_full[(df_full['date'] >= '2020-01-01') & (df_full['date'] <= '2022-12-31')].copy()
if df_train.empty:
    # Fail early with a readable message instead of an opaque error from
    # the model fit further down.
    raise ValueError(
        "No rows with 'new_cases_smoothed' between 2020-01-01 and 2022-12-31; "
        "cannot fit the model."
    )
df_train.set_index('date', inplace=True)

# Put the training series on an explicit daily grid. The forecast below is
# laid out in one-day steps, so the model must see daily-spaced observations
# too. Without a frequency on the index, statsmodels has to guess one (with a
# warning) or, for irregular dates, fall back to integer positions and treat
# the rows as evenly spaced. Days absent from the file or removed by dropna()
# become NaN here, which the state-space ARIMA fit treats as missing
# observations. asfreq() needs unique dates, so a file with several rows per
# date raises here instead of being fitted as if it were a single series.
df_train = df_train.asfreq('D')

# ARIMA model: 2 autoregressive lags, first-order differencing, 2 moving-average lags
model = ARIMA(df_train['new_cases_smoothed'], order=(2, 1, 2))
model_fit = model.fit()

# Forecast 180 steps (about 6 months of daily values) past the training data.
forecast_horizon = 180
forecast = model_fit.forecast(steps=forecast_horizon)

# Calendar dates for the forecast: one per day, starting the day after the
# last training observation.
last_date = df_train.index[-1]
forecast_index = pd.date_range(
    start=last_date + timedelta(days=1),
    periods=forecast_horizon,
    freq='D',
).tolist()

start_forecast, end_forecast = forecast_index[0], forecast_index[-1]

# Observed values over the same window (both ends inclusive), indexed by date.
in_forecast_window = df_full['date'].between(start_forecast, end_forecast)
df_actual = df_full[in_forecast_window].set_index('date')

# Draw the training history, the forecast and the observed values on a single
# set of axes, then save the figure to disk and display it.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    df_train.index,
    df_train['new_cases_smoothed'],
    color='blue',
    label='Training data (2020–2022)',
)
ax.plot(
    forecast_index,
    forecast,
    color='red',
    linestyle='--',
    label='ARIMA forecast (Jan–Jun 2023)',
)
ax.plot(
    df_actual.index,
    df_actual['new_cases_smoothed'],
    color='green',
    linestyle=':',
    label='Actual values (Jan–Jun 2023)',
)

ax.set_xlabel('Date')
ax.set_ylabel('New COVID cases (smoothed)')
ax.set_title('COVID-19 Cases in Norway: 6-Month Forecast vs. Actual (Based on 2020–2022)')
ax.legend()
ax.grid(True)

fig.tight_layout()
fig.savefig('arima_forecast_2020_2022.png', dpi=400)
plt.show()
