In this Jupyter Notebook, I use XGBoost (a machine learning model) to predict weekly sales at Walmart across different stores and departments.
My model achieved a mean absolute percentage error of 2.36% over 7 months of hold-out data.
The data consist of 421,570 records of weekly sales from stores, spanning 05-Feb-2010 to 26-Oct-2012 (143 weeks of sales data).
You can read more about the data, and download all of it, at the link below.
Data Source: https://www.kaggle.com/datasets/divyajeetthakur/walmart-sales-prediction
Link to Github repo: https://github.com/kevinchtsang/walmart-sales
import sklearn
from sklearn.model_selection import train_test_split
import xgboost as xgb
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# read walmart sales
features_df = pd.read_csv("features.csv")
stores_df = pd.read_csv("stores.csv")
#test_df = pd.read_csv("test.csv") # only for competition
train_df = pd.read_csv("train.csv")
features_df.head()
| | Store | Date | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | IsHoliday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2010-02-05 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 | False |
1 | 1 | 2010-02-12 | 38.51 | 2.548 | NaN | NaN | NaN | NaN | NaN | 211.242170 | 8.106 | True |
2 | 1 | 2010-02-19 | 39.93 | 2.514 | NaN | NaN | NaN | NaN | NaN | 211.289143 | 8.106 | False |
3 | 1 | 2010-02-26 | 46.63 | 2.561 | NaN | NaN | NaN | NaN | NaN | 211.319643 | 8.106 | False |
4 | 1 | 2010-03-05 | 46.50 | 2.625 | NaN | NaN | NaN | NaN | NaN | 211.350143 | 8.106 | False |
# fill MarkDown missing with 0
# for col_name in features_df.columns:
# if 'MarkDown' in col_name:
# features_df.loc[:,col_name].fillna(0,inplace=True)
features_df.head()
| | Store | Date | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | IsHoliday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2010-02-05 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 | False |
1 | 1 | 2010-02-12 | 38.51 | 2.548 | NaN | NaN | NaN | NaN | NaN | 211.242170 | 8.106 | True |
2 | 1 | 2010-02-19 | 39.93 | 2.514 | NaN | NaN | NaN | NaN | NaN | 211.289143 | 8.106 | False |
3 | 1 | 2010-02-26 | 46.63 | 2.561 | NaN | NaN | NaN | NaN | NaN | 211.319643 | 8.106 | False |
4 | 1 | 2010-03-05 | 46.50 | 2.625 | NaN | NaN | NaN | NaN | NaN | 211.350143 | 8.106 | False |
stores_df.head()
| | Store | Type | Size |
|---|---|---|---|
0 | 1 | A | 151315 |
1 | 2 | A | 202307 |
2 | 3 | B | 37392 |
3 | 4 | A | 205863 |
4 | 5 | B | 34875 |
#test_df.head()
train_df.head()
| | Store | Dept | Date | Weekly_Sales | IsHoliday |
|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | False |
1 | 1 | 1 | 2010-02-12 | 46039.49 | True |
2 | 1 | 1 | 2010-02-19 | 41595.55 | False |
3 | 1 | 1 | 2010-02-26 | 19403.54 | False |
4 | 1 | 1 | 2010-03-05 | 21827.90 | False |
sum(train_df.Weekly_Sales<0)
1285
# remove negatives in sales data
train_df.loc[train_df.Weekly_Sales < 0, "Weekly_Sales"] = 0
I will use the $\log_{10}$ of the sales values, as there are some very large values. Large outliers will affect the model's training process and its ability to model smaller values.
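As a quick sanity check (a minimal sketch with made-up numbers), the forward transform and the inverse used later on the predictions round-trip exactly:
# log10(x+1) and its inverse, as used for the training targets and predictions
demo_sales = np.array([0.0, 9.0, 99999.0])
demo_log = np.log10(demo_sales + 1)
demo_back = np.power(10, demo_log) - 1
assert np.allclose(demo_sales, demo_back)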
wk_sales = train_df.Weekly_Sales.copy()
wk_sales[wk_sales<0] = 0
np.log10(wk_sales+1).hist()
wk_sales.hist()
Based on the sales values, I will calculate the total weekly, monthly, and annual sales at the store/dept, store, and dept levels. I will also calculate rolling averages.
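Before building these features on the full data, here is a minimal sketch of the lag-and-average idea on a toy frame (all toy_* names are illustrative only):
toy = pd.DataFrame({
    'Store': [1, 1, 1, 1],
    'Dept':  [1, 1, 1, 1],
    'Week':  pd.date_range('2010-02-01', periods=4, freq='W-MON'),
    'Weekly_Sales': [100.0, 120.0, 90.0, 110.0],
})
# total sales per store/dept/week, as in the groupbys below
toy_weekly = toy.groupby(['Store', 'Dept', 'Week'])['Weekly_Sales'].sum()
# trailing mean of the two previous weeks; shift(1) keeps the current week out,
# mirroring the Week_B1..Week_B8 lag columns averaged later
toy_roll = toy_weekly.groupby(level=['Store', 'Dept']).transform(
    lambda s: s.shift(1).rolling(2).mean())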
# Total monthly, yearly sales per store and per dept
tot_sales_df = train_df.copy()
tot_sales_df["Date"] = pd.to_datetime(tot_sales_df["Date"]) # convert to datetime
tot_sales_df['Week'] = tot_sales_df['Date'].dt.to_period('W').apply(lambda r: r.start_time)
tot_sales_df['Month'] = tot_sales_df['Date'].to_numpy().astype('datetime64[M]')
tot_sales_df['Year'] = tot_sales_df['Date'].to_numpy().astype('datetime64[Y]')
# Date and IsHoliday are not needed for the totals below, so dropping them is optional
# tot_sales_df = tot_sales_df.drop(columns = ["Date", "IsHoliday"])
tot_sales_df.head()
| | Store | Dept | Date | Weekly_Sales | IsHoliday | Week | Month | Year |
|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | False | 2010-02-01 | 2010-02-01 | 2010-01-01 |
1 | 1 | 1 | 2010-02-12 | 46039.49 | True | 2010-02-08 | 2010-02-01 | 2010-01-01 |
2 | 1 | 1 | 2010-02-19 | 41595.55 | False | 2010-02-15 | 2010-02-01 | 2010-01-01 |
3 | 1 | 1 | 2010-02-26 | 19403.54 | False | 2010-02-22 | 2010-02-01 | 2010-01-01 |
4 | 1 | 1 | 2010-03-05 | 21827.90 | False | 2010-03-01 | 2010-03-01 | 2010-01-01 |
# Total sales per month per store per dept
tot_sales_month_df = tot_sales_df.groupby(by = ['Store','Dept','Month'])['Weekly_Sales'].sum()
tot_sales_month_df = tot_sales_month_df.rename('Tot_Month_Sales')
# Total sales per year per store per dept
tot_sales_year_df = tot_sales_df.groupby(by = ['Store','Dept','Year'])['Weekly_Sales'].sum()
tot_sales_year_df = tot_sales_year_df.rename('Tot_Year_Sales')
# Total sales per week per store per dept
tot_sales_week_df = tot_sales_df.groupby(by = ['Store','Dept','Week'])['Weekly_Sales'].sum()
tot_sales_week_df = tot_sales_week_df.rename('Week_Sales')
# Total sales per month per store
tot_sales_month_store_df = tot_sales_month_df.groupby(by = ['Store','Month']).sum()
tot_sales_month_store_df = tot_sales_month_store_df.rename('Tot_Month_Sales_Store')
# Total sales per week per store
tot_sales_week_store_df = tot_sales_week_df.groupby(by = ['Store','Week']).sum()
tot_sales_week_store_df = tot_sales_week_store_df.rename('Tot_Week_Sales_Store')
# Total sales per month per dept
tot_sales_month_dept_df = tot_sales_month_df.groupby(by = ['Dept','Month']).sum()
tot_sales_month_dept_df = tot_sales_month_dept_df.rename('Tot_Month_Sales_Dept')
# Total sales per week per dept
tot_sales_week_dept_df = tot_sales_week_df.groupby(by = ['Dept','Week']).sum()
tot_sales_week_dept_df = tot_sales_week_dept_df.rename('Tot_Week_Sales_Dept')
X_df definition begins here:
X_df = train_df.copy().merge(stores_df, left_on="Store", right_on="Store")
X_df = pd.merge(X_df, features_df.loc[:, features_df.columns != "IsHoliday"], how="left")
X_df.head()
| | Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
1 | 1 | 1 | 2010-02-12 | 46039.49 | True | A | 151315 | 38.51 | 2.548 | NaN | NaN | NaN | NaN | NaN | 211.242170 | 8.106 |
2 | 1 | 1 | 2010-02-19 | 41595.55 | False | A | 151315 | 39.93 | 2.514 | NaN | NaN | NaN | NaN | NaN | 211.289143 | 8.106 |
3 | 1 | 1 | 2010-02-26 | 19403.54 | False | A | 151315 | 46.63 | 2.561 | NaN | NaN | NaN | NaN | NaN | 211.319643 | 8.106 |
4 | 1 | 1 | 2010-03-05 | 21827.90 | False | A | 151315 | 46.50 | 2.625 | NaN | NaN | NaN | NaN | NaN | 211.350143 | 8.106 |
print(f"there are {X_df.shape[0]} rows")
print(f"the date is from {min(X_df.Date)} to {max(X_df.Date)}")
for name in X_df.columns:
    print(f"column {name} has {len(set(X_df.loc[:, name]))} unique values")
there are 421570 rows
the date is from 2010-02-05 to 2012-10-26
column Store has 45 unique values
column Dept has 81 unique values
column Date has 143 unique values
column Weekly_Sales has 358786 unique values
column IsHoliday has 2 unique values
column Type has 3 unique values
column Size has 40 unique values
column Temperature has 3528 unique values
column Fuel_Price has 892 unique values
column MarkDown1 has 273166 unique values
column MarkDown2 has 311821 unique values
column MarkDown3 has 286141 unique values
column MarkDown4 has 288547 unique values
column MarkDown5 has 272431 unique values
column CPI has 2145 unique values
column Unemployment has 349 unique values
# convert date to 3 cols year month day
X_df["Date"] = pd.to_datetime(X_df["Date"]) # convert to datetime
X_df['Week_Int'] = X_df['Date'].dt.isocalendar().week.astype(int)
X_df['Month_Int'] = X_df['Date'].dt.month.astype(int)
X_df['Year_Int'] = X_df['Date'].dt.year.astype(int)
X_df['Week'] = X_df['Date'].dt.to_period('W').apply(lambda r: r.start_time) # find Monday of week
X_df['Month'] = X_df['Date'].to_numpy().astype('datetime64[M]') # find first day of the month
X_df['Year'] = X_df['Date'].to_numpy().astype('datetime64[Y]') # find first day of the year
# X_df = X_df.drop(columns=["Date"])
X_df.head()
| | Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | ... | MarkDown4 | MarkDown5 | CPI | Unemployment | Week_Int | Month_Int | Year_Int | Week | Month | Year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | False | A | 151315 | 42.31 | 2.572 | NaN | ... | NaN | NaN | 211.096358 | 8.106 | 5 | 2 | 2010 | 2010-02-01 | 2010-02-01 | 2010-01-01 |
1 | 1 | 1 | 2010-02-12 | 46039.49 | True | A | 151315 | 38.51 | 2.548 | NaN | ... | NaN | NaN | 211.242170 | 8.106 | 6 | 2 | 2010 | 2010-02-08 | 2010-02-01 | 2010-01-01 |
2 | 1 | 1 | 2010-02-19 | 41595.55 | False | A | 151315 | 39.93 | 2.514 | NaN | ... | NaN | NaN | 211.289143 | 8.106 | 7 | 2 | 2010 | 2010-02-15 | 2010-02-01 | 2010-01-01 |
3 | 1 | 1 | 2010-02-26 | 19403.54 | False | A | 151315 | 46.63 | 2.561 | NaN | ... | NaN | NaN | 211.319643 | 8.106 | 8 | 2 | 2010 | 2010-02-22 | 2010-02-01 | 2010-01-01 |
4 | 1 | 1 | 2010-03-05 | 21827.90 | False | A | 151315 | 46.50 | 2.625 | NaN | ... | NaN | NaN | 211.350143 | 8.106 | 9 | 3 | 2010 | 2010-03-01 | 2010-03-01 | 2010-01-01 |
5 rows × 22 columns
# Sales information about previous weeks/months/years
# create date lag
# create columns Week_B1 to Week_B8
for week_i in range(1, 9):
    X_df.loc[:, 'Week_B' + str(week_i)] = (
        X_df['Week'] - pd.DateOffset(weeks=week_i)).dt.to_period('W').apply(lambda r: r.start_time)
X_df['Week_B1Y'] = (X_df['Week'] - pd.DateOffset(years=1)).dt.to_period('W').apply(lambda r: r.start_time)
# create columns Month_B1 to Month_B12
for month_i in range(1, 13):
    X_df.loc[:, 'Month_B' + str(month_i)] = X_df['Month'] - pd.DateOffset(months=month_i)
X_df['Year_B1'] = X_df['Year'] - pd.DateOffset(years=1)
# join sales information from previous periods
# week
for week_i in list(range(1, 9)) + ['1Y']:
    X_df = pd.merge(X_df,
                    tot_sales_week_df.rename('Week_B' + str(week_i) + '_Sales'),
                    how = "left",
                    left_on = ['Store', 'Dept', 'Week_B' + str(week_i)],
                    right_on = ['Store', 'Dept', 'Week'])
    # store total
    X_df = pd.merge(X_df,
                    tot_sales_week_store_df.rename('Week_B' + str(week_i) + '_Store_Sales'),
                    how = "left",
                    left_on = ['Store', 'Week_B' + str(week_i)],
                    right_on = ['Store', 'Week'])
    # dept total
    X_df = pd.merge(X_df,
                    tot_sales_week_dept_df.rename('Week_B' + str(week_i) + '_Dept_Sales'),
                    how = "left",
                    left_on = ['Dept', 'Week_B' + str(week_i)],
                    right_on = ['Dept', 'Week'])
# month
for month_i in range(1, 13):
    X_df = pd.merge(X_df,
                    tot_sales_month_df.rename('Month_B' + str(month_i) + '_Sales'),
                    how = "left",
                    left_on = ['Store', 'Dept', 'Month_B' + str(month_i)],
                    right_on = ['Store', 'Dept', 'Month'])
    # store total
    X_df = pd.merge(X_df,
                    tot_sales_month_store_df.rename('Month_B' + str(month_i) + '_Store_Sales'),
                    how = "left",
                    left_on = ['Store', 'Month_B' + str(month_i)],
                    right_on = ['Store', 'Month'])
    # dept total
    X_df = pd.merge(X_df,
                    tot_sales_month_dept_df.rename('Month_B' + str(month_i) + '_Dept_Sales'),
                    how = "left",
                    left_on = ['Dept', 'Month_B' + str(month_i)],
                    right_on = ['Dept', 'Month'])
# year
X_df = pd.merge(X_df,tot_sales_year_df.rename('Year_B1_Sales'), how="left", left_on=['Store','Dept','Year_B1'], right_on = ['Store','Dept','Year'])
# reset fragmentation
X_df = X_df.copy()
# rolling 3-8 week average
for week_i in range(3, 9):
    # rolling average per store per dept
    X_df.loc[:, 'Roll_B1_B' + str(week_i) + '_Week_Sales'] = X_df[
        ['Week_B' + str(n) + '_Sales' for n in range(1, week_i)]].mean(axis=1)
    # rolling average per store
    X_df.loc[:, 'Roll_B1_B' + str(week_i) + '_Week_Store_Sales'] = X_df[
        ['Week_B' + str(n) + '_Store_Sales' for n in range(1, week_i)]].mean(axis=1)
    # rolling average per dept
    X_df.loc[:, 'Roll_B1_B' + str(week_i) + '_Week_Dept_Sales'] = X_df[
        ['Week_B' + str(n) + '_Dept_Sales' for n in range(1, week_i)]].mean(axis=1)
X_df.tail()
| | Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | ... | Roll_B1_B5_Week_Dept_Sales | Roll_B1_B6_Week_Sales | Roll_B1_B6_Week_Store_Sales | Roll_B1_B6_Week_Dept_Sales | Roll_B1_B7_Week_Sales | Roll_B1_B7_Week_Store_Sales | Roll_B1_B7_Week_Dept_Sales | Roll_B1_B8_Week_Sales | Roll_B1_B8_Week_Store_Sales | Roll_B1_B8_Week_Dept_Sales |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
421565 | 45 | 98 | 2012-09-28 | 508.37 | False | B | 118221 | 64.88 | 3.997 | 4556.61 | ... | 254476.6550 | 437.428 | 728874.818 | 252133.060 | 447.883333 | 727811.836667 | 252516.461667 | 487.827143 | 728558.334286 | 252569.062857 |
421566 | 45 | 98 | 2012-10-05 | 628.10 | False | B | 118221 | 64.89 | 3.985 | 5046.74 | ... | 260393.8850 | 456.022 | 727863.156 | 256254.768 | 449.251667 | 726258.006667 | 254005.420000 | 456.524286 | 725720.710000 | 254066.570000 |
421567 | 45 | 98 | 2012-10-12 | 1061.02 | False | B | 118221 | 54.47 | 4.000 | 1956.28 | ... | 266779.0525 | 512.434 | 727694.596 | 264820.822 | 484.701667 | 728795.141667 | 260633.735000 | 474.801429 | 727286.158571 | 258080.155714 |
421568 | 45 | 98 | 2012-10-19 | 760.01 | False | B | 118221 | 56.47 | 3.969 | 2004.02 | ... | 275613.6650 | 654.150 | 721284.936 | 272192.028 | 603.865000 | 728822.890000 | 269658.006667 | 567.032857 | 729605.030000 | 265378.048571 |
421569 | 45 | 98 | 2012-10-26 | 1076.80 | False | B | 118221 | 58.85 | 3.882 | 4018.91 | ... | 283038.7275 | 684.960 | 724461.022 | 278973.970 | 671.793333 | 720758.368333 | 275562.555000 | 626.171429 | 727294.695714 | 272909.032857 |
5 rows × 126 columns
# convert Type to one hot
X_df = pd.get_dummies(X_df, columns=['Type'])
# convert Store to one hot
X_df = pd.get_dummies(X_df, columns=['Store'])
# convert Dept to one hot
X_df = pd.get_dummies(X_df, columns=['Dept'])
# convert IsHoliday to int
X_df["IsHoliday"] = X_df["IsHoliday"].astype(int)
# normalise Size
# X_df["Size"] = (X_df["Size"] - X_df["Size"].min()) / (X_df["Size"].max() - X_df["Size"].min())
# drop na values
# only lagged sales should be missing
#X_df = X_df.dropna()
# date columns to drop
date_cols = ["Date","Week","Month","Year"] +\
['Week_B'+str(n) for n in list(range(1,9)) + ['1Y']] +\
['Month_B'+str(n) for n in range(1,13)] +\
["Year_B1"]
X_date_df = X_df[date_cols].copy()
X_df = X_df.drop(columns = date_cols)
# set everything to numeric
#X_df = pd.to_numeric(X_df)
# log10 the sales data and markdown
X_df = X_df.apply(lambda x: np.log10(x + 1) if 'Sales' in x.name else x)
#X_df = X_df.apply(lambda x: np.log10(x + 1) if 'MarkDown' in x.name else x)
X_df.head()
| | Weekly_Sales | IsHoliday | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.396644 | 0 | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | False |
1 | 4.663140 | 1 | 151315 | 38.51 | 2.548 | NaN | NaN | NaN | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | False |
2 | 4.619057 | 0 | 151315 | 39.93 | 2.514 | NaN | NaN | NaN | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | False |
3 | 4.287903 | 0 | 151315 | 46.63 | 2.561 | NaN | NaN | NaN | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | False |
4 | 4.339032 | 0 | 151315 | 46.50 | 2.625 | NaN | NaN | NaN | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 226 columns
y_df = X_df["Weekly_Sales"]
y_df.head()
0    4.396644
1    4.663140
2    4.619057
3    4.287903
4    4.339032
Name: Weekly_Sales, dtype: float64
X_df = X_df.drop(columns=["Weekly_Sales"])
Here, I choose to hold out data from April 2012 onwards.
# hold out test set
# date after April 2012
hold_out_idx = (X_df.Year_Int==2012) & (X_df.Month_Int>=4)
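Since the data end in October 2012, this mask should match a direct date comparison; a sketch using the saved date columns (hold_out_idx_alt is an illustrative name):
hold_out_idx_alt = X_date_df['Date'] >= pd.Timestamp('2012-04-01')
assert (hold_out_idx_alt == hold_out_idx).all()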
I fit the XGBoost model with an 80%-20% training-testing data split. Since this is a regression task, I choose the squared error as the objective function.
XGBoost (eXtreme Gradient Boosting) extends the idea of decision tree models. It trains an ensemble of decision trees iteratively, each one fit to correct the errors of the previous trees (this is the gradient boosting aspect).
XGBoost is a machine learning algorithm, and it follows a different modelling and forecasting paradigm from statistical time series methods (e.g. autoregressive models, exponential smoothing, ARIMA, and the Holt-Winters method). Whereas Holt-Winters models three specific aspects of the time series (average, trend, and seasonality), XGBoost models the data using features of each observation. In general machine learning, features can represent many things, from the pixels in an image to the sales of a product to the age of a customer. For time series forecasting, features can capture the average, trend, and seasonality, as well as external information such as customer base, annual revenue, temperature, and market trends. This means that machine learning algorithms often have more flexibility (many more parameters) to fit the model to the data as closely as possible.
It is important to note that manual feature extraction/engineering involves data pre-processing and requires domain expertise, e.g. extracting rolling averages and joining suitable market trends as features of the data.
Overall, XGBoost is generally more accurate and robust to overfitting than Holt-Winters, but at the cost of interpretability.
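To make the boosting idea concrete, here is a minimal sketch of the core loop on synthetic data, using plain sklearn decision trees. This illustrates gradient boosting with squared error, not XGBoost's exact internals (which add regularisation and second-order gradient information); all names here are illustrative.
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X_demo = rng.uniform(0, 10, size=(200, 1))
y_demo = np.sin(X_demo[:, 0]) + rng.normal(0, 0.1, size=200)

pred = np.full_like(y_demo, y_demo.mean())  # start from the mean, like base_score below
eta = 0.3                                   # learning rate, like eta below
for _ in range(50):
    # each new shallow tree is fit to the residuals of the current ensemble
    tree = DecisionTreeRegressor(max_depth=3).fit(X_demo, y_demo - pred)
    pred += eta * tree.predict(X_demo)
print(f"boosted train RMSE: {np.sqrt(np.mean((y_demo - pred)**2)):.4f}")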
X_df_train, X_df_test, y_df_train, y_df_test = train_test_split(X_df[~hold_out_idx],
y_df[~hold_out_idx],
test_size = 0.2,
random_state = 42)
dtrain = xgb.DMatrix(X_df_train, label=y_df_train)
dtest = xgb.DMatrix(X_df_test, label=y_df_test)
dtest.num_row()
66556
# parameters
param = {'max_depth': 10, 'eta': 0.3, 'objective': 'reg:squarederror'}
#param['nthread'] = 10
param['eval_metric'] = 'rmse'
# param['base_score'] = 0
param['base_score'] = y_df_train.mean()
# eval
evallist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 10
xgb_model = xgb.train(param, dtrain, num_round, evals=evallist)
[0] train-rmse:0.64008 eval-rmse:0.64209
[1] train-rmse:0.46550 eval-rmse:0.47026
[2] train-rmse:0.34758 eval-rmse:0.35630
[3] train-rmse:0.26972 eval-rmse:0.28300
[4] train-rmse:0.21992 eval-rmse:0.23862
[5] train-rmse:0.18887 eval-rmse:0.21239
[6] train-rmse:0.17084 eval-rmse:0.19832
[7] train-rmse:0.15860 eval-rmse:0.18961
[8] train-rmse:0.15178 eval-rmse:0.18488
[9] train-rmse:0.14729 eval-rmse:0.18232
# continue training
num_round = 90
xgb_model = xgb.train(param, dtrain, num_round, evals=evallist, xgb_model=xgb_model)
[0] train-rmse:0.14273 eval-rmse:0.17963
[1] train-rmse:0.14075 eval-rmse:0.17809
[2] train-rmse:0.13936 eval-rmse:0.17724
[3] train-rmse:0.13782 eval-rmse:0.17634
[4] train-rmse:0.13681 eval-rmse:0.17577
[5] train-rmse:0.13578 eval-rmse:0.17544
[6] train-rmse:0.13398 eval-rmse:0.17496
[7] train-rmse:0.13256 eval-rmse:0.17468
[8] train-rmse:0.13090 eval-rmse:0.17422
[9] train-rmse:0.12955 eval-rmse:0.17405
[10] train-rmse:0.12866 eval-rmse:0.17399
[11] train-rmse:0.12663 eval-rmse:0.17337
[12] train-rmse:0.12575 eval-rmse:0.17314
[13] train-rmse:0.12434 eval-rmse:0.17298
[14] train-rmse:0.12353 eval-rmse:0.17287
[15] train-rmse:0.12217 eval-rmse:0.17276
[16] train-rmse:0.12170 eval-rmse:0.17243
[17] train-rmse:0.12050 eval-rmse:0.17216
[18] train-rmse:0.11938 eval-rmse:0.17209
[19] train-rmse:0.11783 eval-rmse:0.17192
[20] train-rmse:0.11710 eval-rmse:0.17181
[21] train-rmse:0.11649 eval-rmse:0.17171
[22] train-rmse:0.11583 eval-rmse:0.17167
[23] train-rmse:0.11523 eval-rmse:0.17161
[24] train-rmse:0.11423 eval-rmse:0.17132
[25] train-rmse:0.11312 eval-rmse:0.17138
[26] train-rmse:0.11240 eval-rmse:0.17117
[27] train-rmse:0.11132 eval-rmse:0.17116
[28] train-rmse:0.11035 eval-rmse:0.17122
[29] train-rmse:0.10981 eval-rmse:0.17102
[30] train-rmse:0.10895 eval-rmse:0.17088
[31] train-rmse:0.10822 eval-rmse:0.17086
[32] train-rmse:0.10747 eval-rmse:0.17079
[33] train-rmse:0.10670 eval-rmse:0.17080
[34] train-rmse:0.10547 eval-rmse:0.17045
[35] train-rmse:0.10456 eval-rmse:0.17024
[36] train-rmse:0.10313 eval-rmse:0.16990
[37] train-rmse:0.10261 eval-rmse:0.16984
[38] train-rmse:0.10214 eval-rmse:0.16965
[39] train-rmse:0.10169 eval-rmse:0.16961
[40] train-rmse:0.10102 eval-rmse:0.16960
[41] train-rmse:0.10041 eval-rmse:0.16943
[42] train-rmse:0.09970 eval-rmse:0.16933
[43] train-rmse:0.09912 eval-rmse:0.16923
[44] train-rmse:0.09821 eval-rmse:0.16931
[45] train-rmse:0.09761 eval-rmse:0.16938
[46] train-rmse:0.09678 eval-rmse:0.16942
[47] train-rmse:0.09602 eval-rmse:0.16942
[48] train-rmse:0.09565 eval-rmse:0.16926
[49] train-rmse:0.09510 eval-rmse:0.16924
[50] train-rmse:0.09470 eval-rmse:0.16916
[51] train-rmse:0.09425 eval-rmse:0.16908
[52] train-rmse:0.09394 eval-rmse:0.16904
[53] train-rmse:0.09345 eval-rmse:0.16906
[54] train-rmse:0.09336 eval-rmse:0.16905
[55] train-rmse:0.09293 eval-rmse:0.16899
[56] train-rmse:0.09245 eval-rmse:0.16895
[57] train-rmse:0.09201 eval-rmse:0.16895
[58] train-rmse:0.09148 eval-rmse:0.16897
[59] train-rmse:0.09090 eval-rmse:0.16888
[60] train-rmse:0.09045 eval-rmse:0.16882
[61] train-rmse:0.08969 eval-rmse:0.16877
[62] train-rmse:0.08940 eval-rmse:0.16885
[63] train-rmse:0.08893 eval-rmse:0.16873
[64] train-rmse:0.08807 eval-rmse:0.16861
[65] train-rmse:0.08771 eval-rmse:0.16845
[66] train-rmse:0.08746 eval-rmse:0.16850
[67] train-rmse:0.08708 eval-rmse:0.16842
[68] train-rmse:0.08660 eval-rmse:0.16829
[69] train-rmse:0.08633 eval-rmse:0.16823
[70] train-rmse:0.08588 eval-rmse:0.16815
[71] train-rmse:0.08542 eval-rmse:0.16807
[72] train-rmse:0.08466 eval-rmse:0.16798
[73] train-rmse:0.08422 eval-rmse:0.16795
[74] train-rmse:0.08371 eval-rmse:0.16793
[75] train-rmse:0.08342 eval-rmse:0.16785
[76] train-rmse:0.08313 eval-rmse:0.16784
[77] train-rmse:0.08271 eval-rmse:0.16783
[78] train-rmse:0.08257 eval-rmse:0.16783
[79] train-rmse:0.08214 eval-rmse:0.16776
[80] train-rmse:0.08185 eval-rmse:0.16768
[81] train-rmse:0.08151 eval-rmse:0.16758
[82] train-rmse:0.08116 eval-rmse:0.16754
[83] train-rmse:0.08092 eval-rmse:0.16746
[84] train-rmse:0.08047 eval-rmse:0.16740
[85] train-rmse:0.08035 eval-rmse:0.16738
[86] train-rmse:0.08000 eval-rmse:0.16737
[87] train-rmse:0.07974 eval-rmse:0.16738
[88] train-rmse:0.07950 eval-rmse:0.16741
[89] train-rmse:0.07900 eval-rmse:0.16739
# feature importance of xgboost
xgb.plot_importance(xgb_model)
plt.show()
feature_rank = xgb_model.get_score(importance_type='gain')
feature_rank_df = pd.DataFrame(feature_rank.items(), columns=['feature', 'gain'])
# feature_rank = xgb_model.get_fscore()
# feature_rank_df = pd.DataFrame(feature_rank.items(), columns=['feature', 'fscore'])
feature_rank_df.sort_values('gain',ascending=False)
| | feature | gain |
|---|---|---|
14 | Week_B1_Sales | 168.435791 |
93 | Roll_B1_B8_Week_Sales | 154.591888 |
78 | Roll_B1_B3_Week_Sales | 18.082771 |
90 | Roll_B1_B7_Week_Sales | 13.840375 |
205 | Dept_78 | 12.012506 |
... | ... | ... |
161 | Dept_21 | 0.069904 |
127 | Store_31 | 0.059445 |
136 | Store_40 | 0.059141 |
101 | Store_3 | 0.033977 |
115 | Store_19 | 0.028973 |
223 rows × 2 columns
According to the gain metric, the two most important features are the previous week's sales (Week_B1_Sales) and the rolling average of sales over the past 8 weeks (Roll_B1_B8_Week_Sales).
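To view the gain-based ranking graphically (plot_importance defaults to the weight metric), plot_importance accepts an importance_type argument:
# top ten features by gain
xgb.plot_importance(xgb_model, importance_type='gain', max_num_features=10)
plt.show()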
y_hold_out_pred = xgb_model.predict(xgb.DMatrix(X_df[hold_out_idx]))
X_df[hold_out_idx].head()
| | IsHoliday | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | ... | Dept_90 | Dept_91 | Dept_92 | Dept_93 | Dept_94 | Dept_95 | Dept_96 | Dept_97 | Dept_98 | Dept_99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
113 | 0 | 151315 | 70.43 | 3.891 | 10121.97 | NaN | 77.98 | 3750.59 | 4510.72 | 221.435611 | ... | False | False | False | False | False | False | False | False | False | False |
114 | 0 | 151315 | 69.07 | 3.891 | 6186.19 | 3288.69 | 17.07 | 1822.55 | 1063.78 | 221.510210 | ... | False | False | False | False | False | False | False | False | False | False |
115 | 0 | 151315 | 66.76 | 3.877 | 2230.80 | 612.02 | 19.75 | 275.13 | 5747.10 | 221.564074 | ... | False | False | False | False | False | False | False | False | False | False |
116 | 0 | 151315 | 67.23 | 3.814 | 3221.25 | NaN | 35.49 | 577.14 | 6222.25 | 221.617937 | ... | False | False | False | False | False | False | False | False | False | False |
117 | 0 | 151315 | 75.55 | 3.749 | 21290.13 | NaN | 69.89 | 4977.35 | 3261.04 | 221.671800 | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 225 columns
results_df = np.column_stack(
(X_df[['IsHoliday']][hold_out_idx],
np.power(10,y_df[hold_out_idx])-1,
np.power(10,y_hold_out_pred)-1)
)
results_df = pd.DataFrame(results_df, columns = ['IsHoliday','Actual_Sales','Prediction'])
results_df['Date'] = X_date_df[['Date']][hold_out_idx].reset_index(drop=True)
# store information from one hot columns
store_cols = [name for name in X_df.columns if ('Store_' in name) & ('Sale' not in name)]
results_df['Store'] = X_df[store_cols].idxmax(axis=1).str.removeprefix('Store_').astype(int)[hold_out_idx.values].reset_index(drop=True)
# dept information from one hot columns
dept_cols = [name for name in X_df.columns if ('Dept_' in name) & ('Sale' not in name)]
results_df['Dept'] = X_df[dept_cols].idxmax(axis=1).str.removeprefix('Dept_').astype(int)[hold_out_idx.values].reset_index(drop=True)
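The idxmax trick works because exactly one one-hot column per row is True, and True is the maximum; a tiny sketch on a made-up frame:
toy_onehot = pd.DataFrame({'Store_1': [True, False], 'Store_2': [False, True]})
toy_onehot.idxmax(axis=1).str.removeprefix('Store_').astype(int)  # -> 1, 2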
results_df
| | IsHoliday | Actual_Sales | Prediction | Date | Store | Dept |
|---|---|---|---|---|---|---|
0 | 0.0 | 57592.12 | 34020.843750 | 2012-04-06 | 1 | 1 |
1 | 0.0 | 34684.21 | 39317.019531 | 2012-04-13 | 1 | 1 |
2 | 0.0 | 16976.19 | 24857.117188 | 2012-04-20 | 1 | 1 |
3 | 0.0 | 16347.60 | 16679.679688 | 2012-04-27 | 1 | 1 |
4 | 0.0 | 17147.44 | 18152.654297 | 2012-05-04 | 1 | 1 |
... | ... | ... | ... | ... | ... | ... |
88787 | 0.0 | 508.37 | 543.704224 | 2012-09-28 | 45 | 98 |
88788 | 0.0 | 628.10 | 565.246155 | 2012-10-05 | 45 | 98 |
88789 | 0.0 | 1061.02 | 647.877380 | 2012-10-12 | 45 | 98 |
88790 | 0.0 | 760.01 | 833.042847 | 2012-10-19 | 45 | 98 |
88791 | 0.0 | 1076.80 | 761.906921 | 2012-10-26 | 45 | 98 |
88792 rows × 6 columns
# Mean absolute percentage error
# weekly
# sum over all store and dept
results_weekly_df = results_df.groupby(by = ['Date'])[['Actual_Sales','Prediction']].sum()
# Mean absolute percentage error
abs_per_err_df = abs(results_weekly_df.Actual_Sales.sub(results_weekly_df.Prediction, axis='index')).divide(results_weekly_df.Actual_Sales, axis='index')
# correct if zero actual orders
eq_zero = results_weekly_df.Actual_Sales==0
eq_value = results_weekly_df.Prediction.eq(results_weekly_df.Actual_Sales,axis='index')
abs_per_err_df[eq_value & eq_zero] = 0
abs_per_err_df[~eq_value & eq_zero] = 1
mean_abs_per_err = abs_per_err_df.mean()
mean_abs_per_err
# 2.36%
0.02360695609460629
The mean absolute percentage error of the XGBoost predictions, aggregated nationally per week, is 2.4%.
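The same zero-safe MAPE logic is reused several times below; as a sketch, it could be wrapped in a small helper (mape is a hypothetical name, not defined elsewhere in this notebook):
def mape(actual, pred):
    # zero-safe mean absolute percentage error
    err = (actual - pred).abs() / actual
    err[(actual == 0) & (pred == actual)] = 0  # zero actual, zero prediction: no error
    err[(actual == 0) & (pred != actual)] = 1  # zero actual, nonzero prediction: 100% error
    return err.mean()

# e.g. mape(results_weekly_df.Actual_Sales, results_weekly_df.Prediction)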
# visualise data and predictions across all stores
national_df = X_date_df[['Date']].copy().reset_index(drop=True)
# national_df['Date'] = X_date_df[['Date']].reset_index(drop=True)
# national_df = X_df[['IsHoliday']].copy()
national_df['IsHoliday'] = X_df[['IsHoliday']].reset_index(drop=True).copy()
national_df['Store'] = X_df[store_cols].idxmax(axis=1).str.removeprefix('Store_').reset_index(drop=True).astype(int)
national_df['Dept'] = X_df[dept_cols].idxmax(axis=1).str.removeprefix('Dept_').reset_index(drop=True).astype(int)
national_df['Actual_Sales'] = np.power(10, y_df.reset_index(drop=True))-1
national_df.head()
| | Date | IsHoliday | Store | Dept | Actual_Sales |
|---|---|---|---|---|---|
0 | 2010-02-05 | 0 | 1 | 1 | 24924.50 |
1 | 2010-02-12 | 1 | 1 | 1 | 46039.49 |
2 | 2010-02-19 | 0 | 1 | 1 | 41595.55 |
3 | 2010-02-26 | 0 | 1 | 1 | 19403.54 |
4 | 2010-03-05 | 0 | 1 | 1 | 21827.90 |
# model
y_model_df = xgb_model.predict(xgb.DMatrix(X_df[~hold_out_idx]))
model_df = pd.DataFrame()
model_df['Store'] = X_df[store_cols].idxmax(axis=1).str.removeprefix('Store_')[~hold_out_idx].reset_index(drop=True).astype(int)
model_df['Dept'] = X_df[dept_cols].idxmax(axis=1).str.removeprefix('Dept_')[~hold_out_idx].reset_index(drop=True).astype(int)
model_df['Model'] = np.power(10, y_model_df)-1
model_df['Date'] = X_date_df[['Date']].copy()[~hold_out_idx].reset_index(drop=True)
# add predictions sales on hold-out
national_df = national_df.merge(results_df.drop(columns=['Actual_Sales','IsHoliday']), how="left", on = ["Date","Store","Dept"])
# add model sales on non-hold-out
national_df = national_df.merge(model_df, how="left", on = ["Date","Store","Dept"])
national_df
| | Date | IsHoliday | Store | Dept | Actual_Sales | Prediction | Model |
|---|---|---|---|---|---|---|---|
0 | 2010-02-05 | 0 | 1 | 1 | 24924.50 | NaN | 19008.000000 |
1 | 2010-02-12 | 1 | 1 | 1 | 46039.49 | NaN | 37498.144531 |
2 | 2010-02-19 | 0 | 1 | 1 | 41595.55 | NaN | 37018.570312 |
3 | 2010-02-26 | 0 | 1 | 1 | 19403.54 | NaN | 21090.648438 |
4 | 2010-03-05 | 0 | 1 | 1 | 21827.90 | NaN | 20254.429688 |
... | ... | ... | ... | ... | ... | ... | ... |
421565 | 2012-09-28 | 0 | 45 | 98 | 508.37 | 543.704224 | NaN |
421566 | 2012-10-05 | 0 | 45 | 98 | 628.10 | 565.246155 | NaN |
421567 | 2012-10-12 | 0 | 45 | 98 | 1061.02 | 647.877380 | NaN |
421568 | 2012-10-19 | 0 | 45 | 98 | 760.01 | 833.042847 | NaN |
421569 | 2012-10-26 | 0 | 45 | 98 | 1076.80 | 761.906921 | NaN |
421570 rows × 7 columns
national_week_df = national_df.groupby(['Date'])[['Actual_Sales','Prediction','Model']].sum()
national_week_df = national_week_df.reset_index()
national_week_df.head()
| | Date | Actual_Sales | Prediction | Model |
|---|---|---|---|---|
0 | 2010-02-05 | 49750875.98 | 0.0 | 41117820.0 |
1 | 2010-02-12 | 48336800.10 | 0.0 | 47698420.0 |
2 | 2010-02-19 | 48277902.33 | 0.0 | 47753952.0 |
3 | 2010-02-26 | 43970440.65 | 0.0 | 43142064.0 |
4 | 2010-03-05 | 46872715.16 | 0.0 | 46803428.0 |
# hold out 2012/04 onwards
hold_out_date_idx = national_week_df.Date >= '2012-04-01'
fig = go.Figure()
fig.add_trace(go.Scatter(x = national_week_df.Date,
y = national_week_df.Actual_Sales,
name = "Actual"))
fig.add_trace(go.Scatter(x = national_week_df.Date[~hold_out_date_idx],
y = national_week_df.Model[~hold_out_date_idx],
name = "Model",
line = dict(width = 4,
dash = "dot")))
fig.add_trace(go.Scatter(x = national_week_df.Date[hold_out_date_idx],
y = national_week_df.Prediction[hold_out_date_idx],
name = "Prediction"))
fig.update_layout(
title = f"Walmart Sales - Mean Absolute Percentage Error: {mean_abs_per_err*100:.2f}%",
xaxis_title = "weeks",
yaxis_title = "sales",
yaxis_tickprefix = '$',
font = dict(
family = "Courier New, monospace",
size = 18,
color = "RebeccaPurple"
))
fig.update_xaxes(rangeslider_visible = True)
fig.show()
national_week_store_df = national_df.groupby(['Date','Store'])[['Actual_Sales','Prediction','Model']].sum()
national_week_store_df = national_week_store_df.reset_index()
# Mean absolute percentage error
# per store
# sum over all dept
results_weekly_store_df = results_df.groupby(by = ['Date','Store'])[['Actual_Sales','Prediction']].sum()
abs_per_err_df = abs(results_weekly_store_df.Actual_Sales.sub(results_weekly_store_df.Prediction, axis='index')).\
divide(results_weekly_store_df.Actual_Sales, axis='index')
# correct if zero actual orders
eq_zero = results_weekly_store_df.Actual_Sales==0
eq_value = results_weekly_store_df.Prediction.eq(results_weekly_store_df.Actual_Sales,axis='index')
abs_per_err_df[eq_value & eq_zero] = 0
abs_per_err_df[~eq_value & eq_zero] = 1
mean_abs_per_err_store = abs_per_err_df.mean()
mean_abs_per_err_store
# 4.0%
0.039974016572863175
The mean absolute percentage error of the XGBoost predictions, aggregated per week per store, is 4.0%.
# hold out 2012/04 onwards
hold_out_date_store_idx = national_week_store_df.Date >= '2012-04-01'
fig = go.Figure()
for store in national_week_store_df.Store.unique():
    store_idx = national_week_store_df.Store == store
    fig.add_trace(go.Scatter(x = national_week_store_df.Date[store_idx],
                             y = national_week_store_df.Actual_Sales[store_idx],
                             name = "Store_" + str(store) + " - Actual"))
    fig.add_trace(go.Scatter(x = national_week_store_df.Date[~hold_out_date_store_idx & store_idx],
                             y = national_week_store_df.Model[~hold_out_date_store_idx & store_idx],
                             name = "Store_" + str(store) + " - Model",
                             line = dict(width = 4, dash = "dot")))
    fig.add_trace(go.Scatter(x = national_week_store_df.Date[hold_out_date_store_idx & store_idx],
                             y = national_week_store_df.Prediction[hold_out_date_store_idx & store_idx],
                             name = "Store_" + str(store) + " - Prediction",
                             line = dict(width = 4, dash = "dot")))
fig.update_layout(
title = f"Walmart Sales Per Store - Mean Abs %age Error: {mean_abs_per_err_store*100:.2f}%",
xaxis_title = "weeks",
yaxis_title = "sales",
yaxis_tickprefix = '$',
font = dict(
family = "Courier New, monospace",
size = 18,
color = "RebeccaPurple"
))
fig.update_xaxes(rangeslider_visible = True)
fig.show()
# double-click a legend entry to isolate one line
hw_model = ExponentialSmoothing(national_week_df.Actual_Sales[~hold_out_date_idx],
trend = 'additive',
seasonal = 'additive',
dates = national_week_df.Date[~hold_out_date_idx],
seasonal_periods = 52
).fit()
C:\Users\kevin\AppData\Roaming\Python\Python311\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency W-FRI will be used.
C:\Users\kevin\AppData\Roaming\Python\Python311\site-packages\statsmodels\tsa\holtwinters\model.py:917: ConvergenceWarning: Optimization failed to converge. Check mle_retvals.
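The ValueWarning above can be avoided by giving the training series an explicit weekly (Friday) index before fitting; a sketch (ts_train and hw_model_alt are illustrative names):
ts_train = (national_week_df[~hold_out_date_idx]
            .set_index('Date')['Actual_Sales']
            .asfreq('W-FRI'))
hw_model_alt = ExponentialSmoothing(ts_train,
                                    trend = 'additive',
                                    seasonal = 'additive',
                                    seasonal_periods = 52).fit()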
hw_model_values = hw_model.fittedvalues
hw_model_values.name = 'Model'
hw_model_values.index.name = 'Date'
y_hold_out_pred_hw = hw_model.predict(
start = national_week_df.Date[hold_out_date_idx].values[0],
end = national_week_df.Date[hold_out_date_idx].values[-1])
y_hold_out_pred_hw.name = 'Prediction'
y_hold_out_pred_hw.index.name = 'Date'
national_hw_df = national_week_df.copy()
# clear Prediction and Model columns
# national_hw_df = national_hw_df.drop(columns = ['Prediction', 'Model'])
national_hw_df = national_hw_df.drop(columns = ['Prediction'])
national_hw_df['Model'] = None
# add Prediction and Model columns
national_hw_df = national_hw_df.merge(y_hold_out_pred_hw, how = "left", on = ["Date"])
# national_hw_df = national_hw_df.merge(hw_model_values, how = "left", on = ["Date"])
national_hw_df.loc[~hold_out_date_idx, 'Model'] = hw_model_values.values
# Mean absolute percentage error
# Holt-Winters method
abs_per_err_df = abs(national_hw_df.Actual_Sales.sub(national_hw_df.Prediction, axis='index')).\
divide(national_hw_df.Actual_Sales, axis='index')
# correct if zero actual orders
eq_zero = national_hw_df.Actual_Sales==0
eq_value = national_hw_df.Prediction.eq(national_hw_df.Actual_Sales,axis='index')
abs_per_err_df[eq_value & eq_zero] = 0
abs_per_err_df[~eq_value & eq_zero] = 1
mean_abs_per_err_hw = abs_per_err_df.mean()
mean_abs_per_err_hw
# 2.8%
0.027568508627809338
The mean absolute percentage error of the Holt-Winters predictions, aggregated nationally per week, is 2.8%.
fig = go.Figure()
fig.add_trace(go.Scatter(x = national_hw_df.Date,
y = national_hw_df.Actual_Sales,
name = "Actual"))
fig.add_trace(go.Scatter(x = national_hw_df.Date[~hold_out_date_idx],
y = national_hw_df.Model[~hold_out_date_idx],
name = "Model",
line = dict(width = 4,
dash = "dot")))
fig.add_trace(go.Scatter(x = national_hw_df.Date[hold_out_date_idx],
y = national_hw_df.Prediction[hold_out_date_idx],
name = "Prediction"))
fig.update_layout(
title = f"Walmart Sales - Holt Winter's - Mean Abs %age Error: {mean_abs_per_err_hw*100:.2f}%",
xaxis_title = "weeks",
yaxis_title = "sales",
yaxis_tickprefix = '$',
font = dict(
family = "Courier New, monospace",
size = 18,
color = "RebeccaPurple"
))
fig.update_xaxes(rangeslider_visible = True)
fig.show()
# XGBoost vs Holt-Winters
fig = go.Figure()
fig.add_trace(go.Scatter(x = national_week_df.Date[hold_out_date_idx],
y = national_week_df.Actual_Sales[hold_out_date_idx],
name = "Actual"))
fig.add_trace(go.Scatter(x = national_week_df.Date[hold_out_date_idx],
y = national_week_df.Prediction[hold_out_date_idx],
name = "XGBoost"))
fig.add_trace(go.Scatter(x = national_hw_df.Date[hold_out_date_idx],
y = national_hw_df.Prediction[hold_out_date_idx],
name = "Holt Winters"))
fig.update_layout(
title = f"Predictions: XGBoost (MAPE={mean_abs_per_err*100:.2f}%) vs Holt Winter's (MAPE={mean_abs_per_err_hw*100:.2f}%)",
xaxis_title = "weeks",
yaxis_title = "sales",
yaxis_tickprefix = '$',
font = dict(
family = "Courier New, monospace",
size = 18,
color = "RebeccaPurple"
))
fig.show()
national_week_store_hw_df = national_week_store_df.copy()
national_week_store_hw_df['Prediction'] = float('nan')
national_week_store_hw_df['Model'] = float('nan')
# model and predict for each store
for store in national_week_store_df.Store.unique():
    store_idx = national_week_store_hw_df.Store == store
    # fit model
    hw_store_model = ExponentialSmoothing(
        national_week_store_hw_df.Actual_Sales[~hold_out_date_store_idx & store_idx],
        trend = 'additive',
        seasonal = 'additive',
        dates = national_week_store_hw_df.Date[~hold_out_date_store_idx & store_idx],
        seasonal_periods = 52
    ).fit()
    # extract model values
    hw_store_model_values = hw_store_model.fittedvalues
    hw_store_model_values.name = 'Model'
    hw_store_model_values.index.name = 'Date'
    # predict
    y_hold_out_pred_store_hw = hw_store_model.predict(
        start = national_week_store_hw_df.Date[hold_out_date_store_idx & store_idx].values[0],
        end = national_week_store_hw_df.Date[hold_out_date_store_idx & store_idx].values[-1])
    y_hold_out_pred_store_hw.name = 'Prediction'
    y_hold_out_pred_store_hw.index.name = 'Date'
    # store fitted and predicted values
    national_week_store_hw_df.loc[hold_out_date_store_idx & store_idx, 'Prediction'] = \
        y_hold_out_pred_store_hw.values
    national_week_store_hw_df.loc[~hold_out_date_store_idx & store_idx, 'Model'] = \
        hw_store_model_values.values
C:\Users\kevin\AppData\Roaming\Python\Python311\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency W-FRI will be used.
C:\Users\kevin\AppData\Roaming\Python\Python311\site-packages\statsmodels\tsa\holtwinters\model.py:917: ConvergenceWarning: Optimization failed to converge. Check mle_retvals.
# Mean absolute percentage error
# per store
# Holt Winters
abs_per_err_df = abs(national_week_store_hw_df.Actual_Sales.sub(national_week_store_hw_df.Prediction, axis='index')).\
divide(national_week_store_hw_df.Actual_Sales, axis='index')
# correct if zero actual orders
eq_zero = national_week_store_hw_df.Actual_Sales==0
eq_value = national_week_store_hw_df.Prediction.eq(national_week_store_hw_df.Actual_Sales,axis='index')
abs_per_err_df[eq_value & eq_zero] = 0
abs_per_err_df[~eq_value & eq_zero] = 1
mean_abs_per_err_store_hw = abs_per_err_df.mean()
mean_abs_per_err_store_hw
# 4.8%
0.04752394983524948
The mean absolute percentage error of the Holt-Winters predictions, aggregated per week per store, is 4.8%.
# export model
import pickle
file_name = "xgb_walmart.pkl"
# save
pickle.dump(xgb_model, open(file_name, "wb"))
# to load the model from pickle file
# xgb_model_loaded = pickle.load(open(file_name, "rb"))
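As an alternative to pickle, XGBoost's native serialisation is more portable across xgboost versions (a sketch; the JSON file name is illustrative):
# xgb_model.save_model("xgb_walmart.json")
# xgb_model_loaded = xgb.Booster()
# xgb_model_loaded.load_model("xgb_walmart.json")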
# export results
national_df.to_csv('walmart_sales_prediction.csv')
Thank you for reading!
Dr Kevin Tsang is a Teaching Fellow/Assistant Professor at the University of Edinburgh and part of the Data-Driven Innovation Talent team, responsible for delivering master’s degrees and short courses to train and upskill students and health and social care professionals in areas relating to data and digital technology. He lectures in R, Python, SQL, and data science.
He is also a consultant in medical research, advising on the development of clinical trials and digital technology.