For the following code to run smoothly, the folder structure that the code refers to must already exist.
The main folder is called Forecast and holds all R Markdown files. It has three subfolders: Code, Data, and Output. The Code subfolder holds the scripts.
The Data subfolder holds two further subfolders: Countries and Raw Data. The Countries subfolder (optional) contains one folder per country to which we want to save data individually. The Raw Data folder contains only the raw data.
The Output subfolder contains a subfolder called Countries, which holds one folder per country. Each country folder has two additional folders: Plots and Tables.
The Plots folder stores all graphs created by the scripts.
The Tables folder stores the exported Excel/CSV files with the performance indices.
Note that the models are labeled with the prefix c1k, which stands for c -> Colombia, 1 -> club 6101, and k -> kitchen trash bags.
The prefix c1k must be changed whenever you are analyzing a different combination of item and club, so that graphs and models are saved into folders under their own name.
The easiest way to substitute is Ctrl+F: in the first box type the prefix you want to replace, such as c1k; in the second box type the prefix you want to replace it with, such as c2k; then click the All option.
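One way to reduce the manual substitution (not used in this script) is to store the prefix in a variable and build file paths with paste0, which could then be passed to ggsave() or write.csv() instead of a hard-coded path; a minimal sketch with hypothetical object names:
prefix  <- "c1k"      # c -> Colombia, 1 -> club 6101, k -> kitchen trash bags
country <- "Colombia"
paste0("Output/Countries/", country, "/Plots/simple_log_total_", prefix, ".jpg")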
This section sets up the language in which the code is run and the working directory. Two parameters, len_dec and len_inv, reflect the purchasing strategy: len_dec is the frequency (in weeks) at which buyers make purchase decisions, and len_inv is the time span from the purchase of an item to its sale.
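The setup chunk itself is not shown here; a minimal sketch of what it might contain (the locale string, path, and parameter values are hypothetical placeholders, not the ones used in the original script):
Sys.setlocale("LC_TIME", "English")  # run with English date labels; the locale string is platform-dependent
setwd("~/Forecast")                  # placeholder path to the main Forecast folder
len_dec <- 12                        # placeholder: weeks between purchase decisions
len_inv <- 6                         # placeholder: weeks from purchase to sale of an item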
Three functions are defined in this section. Firstly, the performance_index_dtds function calculates the performance of prediction models on de-trended and de-seasonalized quantity data. Secondly, the performance_index_raw function calculates the performance of prediction models on raw data. Thirdly, the feature_engineering function performs the feature engineering needed to prepare the data for the machine learning models.
# This script stores the performance_index functions
performance_index_dtds<-function(df_Actual, pred_name, pred_value){
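# df_Actual: data frame holding the "Actual" values for the validation window
# pred_name: string used to name the new prediction column and the metric objects
# pred_value: forecast on the logged, de-trended, de-seasonalized scale; it is
#             re-trended, re-seasonalized and exponentiated before scoring
# Side effect: the indices (e.g. RMSE_<pred_name>_total) are assigned to the global environment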
assign(pred_name, exp(pred_value+(retrend_log-trend_log[1])+reseasonal_log))
df_Actual[,colnames(df_Actual)==pred_name]<-NULL
df_Actual<-cbind(df_Actual, get(pred_name)%>%as.numeric())
colnames(df_Actual)[colnames(df_Actual)=="get(pred_name) %>% as.numeric()"]<-pred_name
assign(paste0("MEAN_", pred_name, "_total"),
mean(get(pred_name))/df_Actual%>%select("Actual")%>%data.matrix()%>%mean()-1,
envir = .GlobalEnv
)
assign(paste0("MEAN_", pred_name, "_target"),
mean((get(pred_name))[(len_inv+1):(len_inv+len_dec)])/mean((df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)])-1,
envir=.GlobalEnv
)
assign(paste0("RMSE_", pred_name, "_total" ),
RMSE(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("RMSE_", pred_name, "_target" ),
RMSE((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("RMSE_", pred_name, "_total" ),
RMSE(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("RMSE_", pred_name, "_target" ),
RMSE((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MAE_", pred_name, "_total" ),
MAE(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("MAE_", pred_name, "_target" ),
MAE((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MPE_", pred_name, "_total" ),
MPE(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("MPE_", pred_name, "_target" ),
MPE((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MAPE_", pred_name, "_total" ),
Metrics::mape(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("MAPE_", pred_name, "_target" ),
Metrics::mape((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MASE_", pred_name, "_total" ),
Metrics::mase(df_Actual%>%select("Actual")%>%data.matrix()%>%as.numeric(), get(pred_name)%>%as.numeric()),
envir = .GlobalEnv)
assign(paste0("MASE_", pred_name, "_target" ),
Metrics::mase((df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]%>%as.numeric(), (get(pred_name)[(len_inv+1):(len_inv+len_dec)])%>%as.numeric()),
envir = .GlobalEnv)
return(df_Actual)
}
performance_index_raw<-function(df_Actual, pred_name, pred_value){
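# Same interface and side effects as performance_index_dtds, but pred_value is already a
# forecast of the logged raw series, so only exp() is applied before the indices are computed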
assign(pred_name, exp(pred_value))
df_Actual[,colnames(df_Actual)==pred_name]<-NULL
df_Actual<-cbind(df_Actual, get(pred_name)%>%as.numeric())
colnames(df_Actual)[colnames(df_Actual)=="get(pred_name) %>% as.numeric()"]<-pred_name
assign(paste0("MEAN_", pred_name, "_total"),
mean(get(pred_name))/df_Actual%>%select("Actual")%>%data.matrix()%>%mean()-1,
envir = .GlobalEnv
)
assign(paste0("MEAN_", pred_name, "_target"),
mean((get(pred_name))[(len_inv+1):(len_inv+len_dec)])/mean((df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)])-1,
envir=.GlobalEnv
)
assign(paste0("RMSE_", pred_name, "_total" ),
RMSE(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("RMSE_", pred_name, "_target" ),
RMSE((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MAE_", pred_name, "_total" ),
MAE(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("MAE_", pred_name, "_target" ),
MAE((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MPE_", pred_name, "_total" ),
MPE(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("MPE_", pred_name, "_target" ),
MPE((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MAPE_", pred_name, "_total" ),
Metrics::mape(get(pred_name),df_Actual%>%select("Actual")%>%data.matrix()),
envir = .GlobalEnv)
assign(paste0("MAPE_", pred_name, "_target" ),
Metrics::mape((get(pred_name)[(len_inv+1):(len_inv+len_dec)]),(df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]),
envir = .GlobalEnv)
assign(paste0("MASE_", pred_name, "_total" ),
Metrics::mase(df_Actual%>%select("Actual")%>%data.matrix()%>%as.numeric(), get(pred_name)%>%as.numeric()),
envir = .GlobalEnv)
assign(paste0("MASE_", pred_name, "_target" ),
Metrics::mase((df_Actual%>%select("Actual")%>%data.matrix())[(len_inv+1):(len_inv+len_dec)]%>%as.numeric(), (get(pred_name)[(len_inv+1):(len_inv+len_dec)])%>%as.numeric()),
envir = .GlobalEnv)
return(df_Actual)
}
feature_engineering<-function(t_v, feature_list){
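# t_v: weekly data frame containing the columns named in feature_list
# feature_list: columns for which lagged values, lagged 4-week rolling means ("avg") and
#               lagged first differences ("diff") are generated; only lags of at least
#               len_dec + len_inv weeks are used, so the features stay available over the
#               forecast horizon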
function_names<-c("avg" , "diff")
function_list<-list(function(x) roll_meanr(x, n= 4), function(x) c(diff(x), NA))
weeks<-c(1, 4, 5, 8, 9, 12, 13, 17, 18, 22, 26, 52)
for (feature in feature_list){
for (lag_number in (weeks[weeks>=len_dec+len_inv])){
assign(paste0("lag", "_", lag_number, "_", feature), lag((t_v%>%select(feature))[[1]], lag_number))
t_v<-cbind(t_v, get(paste0("lag", "_", lag_number, "_", feature))%>%as.numeric())
colnames(t_v)[colnames(t_v)=="get(paste0(\"lag\", \"_\", lag_number, \"_\", feature)) %>% as.numeric()"]<-paste0("lag", "_", lag_number, "_", feature)
for (i in (1:length(function_names))){
assign(paste0(function_names[i], "_", lag_number, "_", feature), lag(function_list[[i]]((t_v%>%select(feature))[[1]]), lag_number))
t_v<-cbind(t_v, get(paste0(function_names[i], "_", lag_number, "_", feature))%>%as.numeric())
colnames(t_v)[colnames(t_v)=="get(paste0(function_names[i], \"_\", lag_number, \"_\", feature)) %>% "]<-paste0(function_names[i], "_", lag_number, "_", feature)
}
}
}
return(t_v)
}
This section transforms the formats of some variables and creates dummies for the day-of-week and month categorical variables.
c1k contains data for one item in one club.
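The month and weekday dummy columns visible in the preview below were created in an upstream preprocessing step that is not shown here; a self-contained sketch of one way such dummies can be generated (the resulting column names would differ slightly from the ones in c1k):
# Illustrative only: month and weekday dummies from a transaction date
c1k_demo <- data.frame(TransactionDate = as.Date(c("2015-01-02", "2015-01-03")))
c1k_demo$month_lab   <- lubridate::month(c1k_demo$TransactionDate, label = TRUE)
c1k_demo$weekday_lab <- lubridate::wday(c1k_demo$TransactionDate, label = TRUE)
c1k_demo <- fastDummies::dummy_cols(c1k_demo, select_columns = c("month_lab", "weekday_lab"))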
head(c1k, 10)
## X1 TransactionDate number_transactions number_members sales_usd
## 122484 740049 2015-01-02 100 97 820.56
## 116457 704433 2015-01-03 106 101 986.26
## 119636 723140 2015-01-04 22 20 189.36
## 118109 714112 2015-01-05 1 1 7.89
## 115908 701277 2015-01-07 89 85 743.85
## 117876 712776 2015-01-08 69 66 593.56
## 117378 709815 2015-01-09 81 77 690.64
## 119638 723154 2015-01-10 87 84 839.96
## 121951 736750 2015-01-11 92 90 800.71
## 121748 735595 2015-01-12 106 102 926.32
## sales_local exchange_rate quantity Apr Aug Dec Jan July June Mar
## 122484 1963416 0.000418000 104 0 0 0 1 0 0 0
## 116457 2359875 0.000418000 125 0 0 0 1 0 0 0
## 119636 453096 0.000418000 24 0 0 0 1 0 0 0
## 118109 18879 0.000418000 1 0 0 0 1 0 0 0
## 115908 1793505 0.000415000 95 0 0 0 1 0 0 0
## 117876 1453683 0.000408467 77 0 0 0 1 0 0 0
## 117378 1680231 0.000411000 89 0 0 0 1 0 0 0
## 119638 2020053 0.000416000 107 0 0 0 1 0 0 0
## 121951 1925658 0.000416000 102 0 0 0 1 0 0 0
## 121748 2227722 0.000416000 118 0 0 0 1 0 0 0
## May Nov Oct Sep Fri Sat Sun Th Tu Wed
## 122484 0 0 0 0 1 0 0 0 0 0
## 116457 0 0 0 0 0 1 0 0 0 0
## 119636 0 0 0 0 0 0 1 0 0 0
## 118109 0 0 0 0 0 0 0 0 0 0
## 115908 0 0 0 0 0 0 0 0 0 1
## 117876 0 0 0 0 0 0 0 1 0 0
## 117378 0 0 0 0 1 0 0 0 0 0
## 119638 0 0 0 0 0 1 0 0 0 0
## 121951 0 0 0 0 0 0 1 0 0 0
## 121748 0 0 0 0 0 0 0 0 0 0
# Time span
## Beginning and Ending of dataset
summary(c1k$TransactionDate)
## Min. 1st Qu. Median Mean 3rd Qu.
## "2015-01-02" "2015-12-07" "2016-11-11" "2016-11-16" "2017-10-31"
## Max.
## "2018-10-05"
## Gaps
c(min(c1k$TransactionDate):max(c1k$TransactionDate))[c(min(c1k$TransactionDate):max(c1k$TransactionDate))%in%c1k$TransactionDate==FALSE]%>%as.Date(origin="1970-1-1")
## [1] "2015-01-06" "2015-04-03" "2015-09-08" "2015-09-16" "2015-12-25"
## [6] "2016-01-01" "2016-03-25" "2016-09-06" "2016-12-25" "2017-01-01"
## [11] "2017-01-29" "2017-01-30" "2017-01-31" "2017-02-01" "2017-02-02"
## [16] "2017-04-14" "2017-06-18" "2017-06-19" "2017-06-20" "2017-06-21"
## [21] "2017-06-22" "2017-06-23" "2017-10-20" "2017-10-21" "2017-10-22"
## [26] "2017-10-23" "2017-10-24" "2017-12-25" "2018-01-01" "2018-03-30"
c1k_quant_dist<-c(c1k$quantity, rep(0, sum(c(min(c1k$TransactionDate):max(c1k$TransactionDate))%in%c1k$TransactionDate==FALSE)))
hist(c1k_quant_dist, breaks = 20)
c1k$SalePrice_local<- c1k$sales_local/c1k$quantity
c1k$SalePrice_usd<- c1k$sales_usd/c1k$quantity
# trends and cycles
## Price in local currency hiked in late 2015 and again in late 2018
ggplot(data=c1k, aes(x=TransactionDate, y=SalePrice_local))+geom_line()
## Price in USD increased gradually
ggplot(data=c1k, aes(x=TransactionDate, y=SalePrice_usd))+geom_line()
## Volumes of sales remained relatively stable
ggplot(data=c1k, aes(x=TransactionDate, y=quantity))+geom_line()
Category-level data, namely the summed sales of all items in the same category as the item under study, are also used.
category_club contains data for the same club, but with the sales of all goods in the same category added up.
head(category_club)
## # A tibble: 6 x 4
## TransactionDate Category_Sales_usd Category_Sales_local Category_quantity
## <date> <dbl> <dbl> <dbl>
## 1 2015-07-29 1039. 2967434 139
## 2 2015-06-29 1894. 4843364 200
## 3 2015-02-04 1472. 3547368 160
## 4 2018-08-18 1344. 4060388 137
## 5 2016-05-30 1682. 5144080 185
## 6 2015-11-06 1059. 2983677 99
c1k$TransactionYear<-year(c1k$TransactionDate) #Extract Transaction Year
c1k$week_number<-c1k$TransactionDate%>%date2ISOweek()%>%substr(7,8)
c1k$week_number_year<-c1k$TransactionDate%>%date2ISOweek()%>%substr(1,4)
#aggregation
c1k_week<-c1k%>%group_by(week_number_year,week_number)%>%summarise(
transaction_avrg=mean(number_transactions, na.rm = TRUE),
members_avrg=mean(number_members, na.rm = TRUE),
sales_local_avrg=mean(sales_local, na.rm = TRUE),
sales_usd_avrg=mean(sales_usd, na.rm = TRUE),
exchange_rate_avrg=mean(exchange_rate, na.rm = TRUE),
category_sales_usd_avrg=mean(Category_Sales_usd, na.rm = TRUE),
category_sales_local_avrg=mean(Category_Sales_local, na.rm = TRUE),
category_quantity_avrg=mean(Category_quantity, na.rm = TRUE),
quantity_avrg=mean(quantity, na.rm = TRUE),
salePrice_local_avrg = mean(SalePrice_local, na.rm = TRUE),
salePrice_usd_avrg = mean(SalePrice_usd, na.rm = TRUE))
c1k_week$TransactionTime<-paste0(c1k_week$week_number_year,"-W", c1k_week$week_number, "-3")%>%ISOweek2date()
c1k_week$week_number<- as.numeric(c1k_week$week_number)
c1k_week$week_number_year<-as.numeric(c1k_week$week_number_year)
head(c1k_week, 10)
## # A tibble: 10 x 14
## # Groups: week_number_year [1]
## week_number_year week_number transaction_avrg members_avrg
## <dbl> <dbl> <dbl> <dbl>
## 1 2015 1 76 72.7
## 2 2015 2 69.8 67.2
## 3 2015 3 78.4 76.7
## 4 2015 4 71 69
## 5 2015 5 66.7 64.3
## 6 2015 6 60 58
## 7 2015 7 47 46
## 8 2015 8 40.6 40.1
## 9 2015 9 51.9 50.4
## 10 2015 10 57.1 55.6
## # ... with 10 more variables: sales_local_avrg <dbl>,
## # sales_usd_avrg <dbl>, exchange_rate_avrg <dbl>,
## # category_sales_usd_avrg <dbl>, category_sales_local_avrg <dbl>,
## # category_quantity_avrg <dbl>, quantity_avrg <dbl>,
## # salePrice_local_avrg <dbl>, salePrice_usd_avrg <dbl>,
## # TransactionTime <date>
The periodogram function does not detect high-frequency seasons accurately. To work around this, one of the built-in functions needs to be modified.
Run the following line in R:
trace(spec.pgram, edit=TRUE)
Then change line 9 to:
N <- N0 <- nrow(x) * 128
Then click “Save”. The function is temporarily changed for the current R session. Judgement is still needed to determine what type of seasonality it is capturing; in this case the periods correspond to one year, half a year, and three months.
The seasons detected are only rough estimates, and we need to use knowledge about annual time series to determine the real seasons (semi-annual, annual, etc.). Therefore, the identification of seasons here is not completely automated; personal judgement is still needed.
Firstly, 52.14, the number of weeks per year, is always included.
Secondly, only periods shorter than 52.14 should be included, and their length should be adjusted to the closest whole-month length. For example, if 13.02 is reported as a strong period by the Fourier transformation, 12.53 (three months) should be the season used in the analyses.
Thirdly, no period should be a multiple of another (otherwise the Fourier regressors do not work). Therefore, the half-year period is changed from 26.07 to 25.07 weeks. We believe better ways to deal with this problem exist.
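As a possible alternative to editing spec.pgram interactively, zero-padding the series gives a similarly finer frequency grid; the sketch below is not part of this script, and the padding factor simply mirrors the factor of 128 from the manual edit above:
# Sketch: pad = 127 pads the series to 128 times its length before the FFT
spec_padded <- spec.pgram(c1k_week$quantity_avrg[1:(nrow(c1k_week)-len_inv-len_dec)],
                          pad = 127, taper = 0, plot = FALSE)
head(1/spec_padded$freq[order(-spec_padded$spec)], 10)  # top candidate periods, in weeks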
trace(spec.pgram, edit=TRUE)
## Tracing function "spec.pgram" in package "stats"
## [1] "spec.pgram"
#detecting seasonality
p <- periodogram(c1k_week$quantity_avrg[1:(nrow(c1k_week)-len_inv-len_dec)])
dd <- data.frame(freq=p$freq, spec=p$spec)
order <- dd[order(-dd$spec),]
top10 <- head(order, 10)
# convert frequency to time periods
time = 1/top10$freq
time
## [1] 25.58333 307.00000 27.90909 51.16667 12.79167 17.05556 153.50000
## [8] 21.92857 102.33333 10.23333
season1<-25.07
season2<-52.14
season3<-13.14
The ts function is used to create the time series; frequency is set to 52.14, the number of weeks in a year.
#creating time series: log
t_log<-ts(log(c1k_week$quantity_avrg)[1:(nrow(c1k_week)-len_inv-len_dec)], frequency = 52.14, start = c(c1k_week$week_number_year[1], c1k_week$week_number[1]))
plot(t_log); title("Log sales quantity, club 6106")
ggplot(c1k_week)+geom_line(aes(x=TransactionTime, y=quantity_avrg))+xlab("Transaction time")+ylab("weekly average")+ggtitle(paste("Sales quantity", club))
In the stl function, s.window and t.window are both set to 13, which simply follows the example here.
decompose_log <- stl(t_log, s.window = 13, t.window = 13)
plot(decompose_log)
adjust_log<- t_log - decompose_log$time.series[,1]
The tsclean function is used to remove outliers.
outlier_free_log<- tsclean(adjust_log)
trend_log<- decompose_log$time.series[, 2]
detrend_ts_log <- outlier_free_log-(trend_log - trend_log[1])
# adjust_log is trend and random combined, and outlier_free_log is adjust_log with outliers taken out
autoplot(adjust_log)+autolayer(outlier_free_log)
## Warning in is.na(main): is.na() applied to non-(list or vector) of type
## 'NULL'
# detrend_ts_log is random part of time series
plot(detrend_ts_log)
The validation set is the last part of the existing data set, with a length equal to the sum of len_inv and len_dec. The training set is the rest of the data set.
training_log<-ts(detrend_ts_log[1:(nrow(c1k_week)-len_inv-len_dec)], frequency = 52.14, start = c(c1k_week$week_number_year[1], c1k_week$week_number[1]))
# future trend and seasonality, which will be added back:
trend_fit_log <- auto.arima(trend_log)
trend_for_log <- forecast(trend_fit_log, len_inv+len_dec)$mean
retrend_log<-trend_for_log
reseasonal_log<-forecast(decompose_log$time.series[,1], len_inv+len_dec)$mean
##validation set
validation_log<-ts(log(c1k_week$quantity_avrg)[(nrow(c1k_week)-len_inv-len_dec+1):nrow(c1k_week)], frequency = 52.14, end = c(c1k_week$week_number_year[length(c1k_week$week_number_year)], c1k_week$week_number[length(c1k_week$week_number_year)]))
validation<-ts(c1k_week$quantity_avrg[(nrow(c1k_week)-len_inv-len_dec+1):nrow(c1k_week)], frequency = 52.14, end = c(c1k_week$week_number_year[length(c1k_week$week_number_year)], c1k_week$week_number[length(c1k_week$week_number_year)]))
theme_ts <- theme(panel.border = element_rect(fill = NA,
colour = "grey10"),
panel.background = element_blank(),
panel.grid.minor = element_line(colour = "grey85"),
panel.grid.major = element_line(colour = "grey85"),
panel.grid.major.x = element_line(colour = "grey85"),
axis.text = element_text(size = 13, face = "bold"),
axis.title = element_text(size = 15, face = "bold"),
plot.title = element_text(size = 16, face = "bold"),
strip.text = element_text(size = 16, face = "bold"),
strip.background = element_rect(colour = "black"),
legend.text = element_text(size = 15),
legend.title = element_text(size = 16, face = "bold"),
legend.background = element_rect(fill = "white"),
legend.key = element_rect(fill = "white"))
# Decomposition of log series
decomp_stl_log<- data.table(Quant = c(t_log, decompose_log$time.series[, 1], decompose_log$time.series[, 2]-decompose_log$time.series[, 2][1], detrend_ts_log),
Date = rep(c1k_week$TransactionTime[1:(nrow(c1k_week)-len_inv-len_dec)], ncol(decompose_log$time.series)+1),
Type = factor(rep(c("log quantity", colnames(decompose_log$time.series)),
each = nrow(decompose_log$time.series)),
levels = c("log quantity", colnames(decompose_log$time.series))))
ggplot(decomp_stl_log, aes(x = Date, y = Quant)) +
geom_line() +
facet_grid(Type ~ ., scales = "free_y", switch = "y") +
labs(x = "Time", y = NULL,
title = "Time Series Decomposition by STL (log quantity)") +
theme_ts
ggsave("Output/Countries/Colombia/Plots/decomp_training_log_c1k.jpg", width = 6, height = 10)
decomp_stl_training_log<- decomp_stl_log
decomp_stl_training_log$set<-"training"
decomp_stl_forecast_log<-data.table(Quant=c(rep(NA, len_dec+len_inv), reseasonal_log, retrend_log-trend_log[1], rep(NA, len_dec+len_inv)),
Date = rep(c1k_week$TransactionTime[(nrow(c1k_week)-len_dec-len_inv+1): nrow(c1k_week)], ncol(decompose_log$time.series)+1),
Type = factor(rep(c("log quantity", colnames(decompose_log$time.series)),
each = len_dec+len_inv),
levels = c("log quantity", colnames(decompose_log$time.series))))
decomp_stl_forecast_log$set<-"forecast"
decomp_stl_combined_log<-rbind(decomp_stl_training_log, decomp_stl_forecast_log)
ggplot(decomp_stl_combined_log, aes(x = Date, y = Quant, col=set)) +
geom_line() +
facet_grid(Type ~ ., scales = "free_y", switch = "y") +
labs(x = "Time", y = NULL,
title = "Time Series Decomposition by STL (log quantity)") +
theme_ts
## Warning: Removed 36 rows containing missing values (geom_path).
ggsave("Output/Countries/Colombia/Plots/decomp_combined_log_c1k.jpg", width = 8, height = 8)
## Warning: Removed 36 rows containing missing values (geom_path).
Four simple forecasting models are used in this section.
#Average Method
Average_Method<-meanf(training_log, h = len_dec+len_inv)
#Naive Method
Naive_Method<-naive(training_log, h = len_dec+len_inv)
#Seasonal Naive Method
Seasonal_Naive_Method<-snaive(training_log , h = len_dec+len_inv)
## Warning in lag.default(y, -lag): 'k' is not an integer
#Drift Method
Drift_Method<-rwf(training_log, h = len_dec+len_inv, drift = TRUE)
autoplot(training_log) +
autolayer(validation_log-(retrend_log-trend_log[1])-reseasonal_log, series="Validation")+
autolayer(Average_Method,
series="Mean", PI=FALSE) +
autolayer(Naive_Method,
series="Naïve", PI=FALSE) +
autolayer(Seasonal_Naive_Method,
series="Seasonal naïve", PI=FALSE) +
autolayer(Drift_Method,
series = "Drift", PI = FALSE)+
ggtitle("Forecasts for Random Component: simple log models") +
xlab("Year") + ylab( paste(item, "\n logged, detrended and de-seasonalized"))+
guides(colour=guide_legend(title="Forecast"))
## Warning in is.na(main): is.na() applied to non-(list or vector) of type
## 'NULL'
ggsave("Output/Countries/Colombia/Plots/simple_log_total_c1k.jpg", width = 10, height = 5)
c1k_week_quantity_models is the dataset that stores the forecast results of all models.
c1k_week_quantity_models<-as.data.frame(validation)
colnames(c1k_week_quantity_models)<-"Actual"
c1k_week_quantity_models$TransactionYear<-c1k_week$week_number_year[(nrow(c1k_week)-len_dec-len_inv+1):nrow(c1k_week)]
c1k_week_quantity_models$week_number<-c1k_week$week_number[(nrow(c1k_week)-len_dec-len_inv+1):nrow(c1k_week)]
c1k_week_quantity_models$TransactionTime<-paste0(c1k_week_quantity_models$TransactionYear,"-", c1k_week_quantity_models$week_number, "-3")%>%as.Date("%Y-%W-%w")
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "Average_Method", pred_value = Average_Method$mean)
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "Naive_Method", pred_value = Naive_Method$mean)
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "Seasonal_Naive_Method", pred_value = Seasonal_Naive_Method$mean)
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "Drift_Method", pred_value = Drift_Method$mean)
# re-trended and re-seasonalized data: plots
c1k_week_quantity_models_plot<-gather(c1k_week_quantity_models%>%select(-c("Actual", "TransactionYear", "week_number")), model, quantity, Average_Method:Drift_Method, factor_key=TRUE)
#for club 6107 run the following:
#c1k_week_quantity_models_plot<-gather(c1k_week_quantity_models%>%select(-c("Actual", "TransactionYear", "week_number")), model, quantity, factor_key=TRUE)
c1k_week_quantity_models_plot$model<-as.character(c1k_week_quantity_models_plot$model)
c1k_week_quantity_models_plot$quantity<-as.numeric(c1k_week_quantity_models_plot$quantity)
c1k_week_int_Actual<-c1k_week%>%ungroup()%>%select(c("TransactionTime", "quantity_avrg"))
c1k_week_int_Actual$model<-"Actual"
c1k_week_int_Actual$quantity<-as.numeric(c1k_week_int_Actual$quantity_avrg)
c1k_week_int_Actual<- c1k_week_int_Actual[(nrow(c1k_week)-len_dec-len_inv+1):nrow(c1k_week),]
c1k_week_int_Actual$quantity_avrg<-NULL
c1k_week_quantity_models_plot<-rbind(as.data.frame(c1k_week_int_Actual), as.data.frame(c1k_week_quantity_models_plot))
This section uses the auto.arima function.
Simple_Arima<- auto.arima(training_log)
summary(Simple_Arima)
## Series: training_log
## ARIMA(5,0,0)(1,0,0)[52] with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 sar1 mean
## -0.1825 -0.1186 -0.2620 -0.2044 -0.2047 -0.3207 4.2232
## s.e. 0.0732 0.0736 0.0713 0.0729 0.0740 0.0829 0.0019
##
## sigma^2 estimated as 0.00396: log likelihood=241.57
## AIC=-467.15 AICc=-466.3 BIC=-441.65
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set 0.0008088031 0.06168521 0.04837278 -0.00268697 1.145589
## MASE ACF1
## Training set 0.5521407 -0.03315475
fc_Simple_Arima_1<- forecast(Simple_Arima, len_dec+len_inv, bootstrap = TRUE)
#Graph the forecasted results
autoplot(fc_Simple_Arima_1) +
autolayer(log(validation)-(retrend_log-trend_log[1])-reseasonal_log, series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
#Checking accuracy
cr_Simple_Arima_1<-checkresiduals(fc_Simple_Arima_1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,0)(1,0,0)[52] with non-zero mean
## Q* = 47.693, df = 29, p-value = 0.01581
##
## Model df: 7. Total lags used: 36
print(cr_Simple_Arima_1$data.name)
## NULL
print(cr_Simple_Arima_1$p.value)
## NULL
# Bring fc_Simple_Arima_1$mean back through de-trending and de-seasonalizing
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "Simple_Arima_1", pred_value = fc_Simple_Arima_1$mean)
This section uses a grid search to tune the K parameter of the Fourier regressors.
##Finding best fit model
Arima_Fourier_AIC<-list(aicc=Inf)
for(K in seq(25)) {
fit <- auto.arima(training_log, xreg=fourier(training_log, K=K),
seasonal=FALSE)
if(fit[["aicc"]] < Arima_Fourier_AIC[["aicc"]]) {
Arima_Fourier_AIC <- fit
bestK <- K
}
}
Arima_Fourier_AIC
## Series: training_log
## Regression with ARIMA(0,0,3) errors
##
## Coefficients:
## ma1 ma2 ma3 intercept S1-52 C1-52
## -0.2337 -0.2146 -0.2469 4.2228 0.0032 0.0014
## s.e. 0.0738 0.0744 0.0662 0.0015 0.0025 0.0027
##
## sigma^2 estimated as 0.004387: log likelihood=234.68
## AIC=-455.37 AICc=-454.71 BIC=-433.05
fc_Arima_Fourier_AIC <- forecast(Arima_Fourier_AIC,xreg=fourier(training_log, K=bestK, h=len_dec+len_inv))
autoplot(fc_Arima_Fourier_AIC) + autolayer(log(validation)-(retrend_log-trend_log[1])-reseasonal_log, series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
cr_ARIMA_Fourier_AIC<-checkresiduals(fc_Arima_Fourier_AIC)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(0,0,3) errors
## Q* = 50.479, df = 30, p-value = 0.01105
##
## Model df: 6. Total lags used: 36
print(cr_ARIMA_Fourier_AIC$data.name)
## NULL
# Bring fc_Arima_Fourier_AIC$mean back through de-trending and de-seasonalizing
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "Arima_Fourier_AIC", pred_value = fc_Arima_Fourier_AIC$mean)
This ARIMA model uses Fourier terms with the two seasonal periods obtained above.
Arima_AIC <- auto.arima(training_log)
bestfit <- list(aicc=Arima_AIC$aicc, i=0, j=0, fit=Arima_AIC)
fc_ARIMA_fourier<- forecast(Arima_AIC, h = len_dec+len_inv)
autoplot(fc_ARIMA_fourier) +
autolayer(log(validation)-(retrend_log-trend_log[1])-reseasonal_log, series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
#choose the best model by AICc
for(i in 1:3) {
for (j in 1:3){
z1 <- fourier(ts(training_log, frequency= season1), K=i)
z2 <- fourier(ts(training_log, frequency= season2), K=j)
Arima_Seasons1_2 <- auto.arima(training_log, xreg=cbind(z1, z2), seasonal=F)
if(Arima_Seasons1_2$aicc < bestfit$aicc) {
bestfit <- list(aicc=Arima_Seasons1_2$aicc, i=i, j=j, Arima_Seasons1_2=Arima_Seasons1_2)
}
}
}
bestfit
## $aicc
## [1] -466.3026
##
## $i
## [1] 0
##
## $j
## [1] 0
##
## $fit
## Series: training_log
## ARIMA(5,0,0)(1,0,0)[52] with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 sar1 mean
## -0.1825 -0.1186 -0.2620 -0.2044 -0.2047 -0.3207 4.2232
## s.e. 0.0732 0.0736 0.0713 0.0729 0.0740 0.0829 0.0019
##
## sigma^2 estimated as 0.00396: log likelihood=241.57
## AIC=-467.15 AICc=-466.3 BIC=-441.65
fc_Arima_Seasons1_2 <- forecast(bestfit$fit,
xreg=cbind(
fourier(ts(training_log, frequency=season1), K=bestfit$i, h=len_dec+len_inv),
fourier(ts(training_log, frequency=season2), K=bestfit$j, h=len_dec+len_inv)))
## Warning in forecast.Arima(bestfit$fit, xreg =
## cbind(fourier(ts(training_log, : xreg not required by this model, ignoring
## the provided regressors
fc_Arima_Seasons1_2 <- forecast(bestfit$fit, h=len_dec+len_inv)
autoplot(fc_Arima_Seasons1_2) +
autolayer(validation_log-(retrend_log-trend_log[1])-reseasonal_log, series = "Logged Quantity")+
xlab("Year") + ylab("Logged Quantity Sold of Kitchen Trash Bags")+
guides(colour=guide_legend(title="Validation Set"))
cr_ARIMA_seasons1_2<-checkresiduals(fc_Arima_Seasons1_2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,0)(1,0,0)[52] with non-zero mean
## Q* = 47.693, df = 29, p-value = 0.01581
##
## Model df: 7. Total lags used: 36
print(cr_ARIMA_seasons1_2$data.name)
## NULL
print(cr_ARIMA_seasons1_2$p.value)
## NULL
# Bring fc_ARIMA$mean back through de-trending and de-seasonalizing
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "Arima_Seasons1_2", pred_value = fc_Arima_Seasons1_2$mean)
This section generates visualizations of three ARIMA models.
# before de-trending and de-seasoning
autoplot(training_log) +
autolayer(validation_log-(retrend_log-trend_log[1])-reseasonal_log, series="Validation", PI=FALSE)+
autolayer(fc_Simple_Arima_1,
series="Default Arima", PI=FALSE) +
autolayer(fc_Arima_Seasons1_2,
series="Fourier double seasons", PI=FALSE) +
autolayer(fc_Arima_Fourier_AIC,
series="Fourier single season", PI=FALSE)+
ggtitle(paste("Forecast for Log Quantity Sold of", item, "\n", country, ": Club", club)) +
xlab("Year") + ylab(paste(item,"\n detrended and de-seasonalized"))+
guides(colour=guide_legend(title="Forecast"))
## Warning in is.na(main): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: Ignoring unknown parameters: PI
ggsave("Output/Countries/Colombia/Plots/ARIMA_log_total_weekly_c1k.jpg", width = 10, height = 5)
autoplot(validation_log-(retrend_log-trend_log[1])-reseasonal_log, series="Validation")+
autolayer(fc_Simple_Arima_1,
series="Default Arima", PI=FALSE) +
autolayer(fc_Arima_Seasons1_2,
series="Fourier double seasons", PI=FALSE) +
autolayer(fc_Arima_Fourier_AIC,
series="Fourier single season", PI=FALSE)+
ggtitle(paste("Forecast for Log Quantity Sold of", item, "\n", country, ": Club", club)) +
xlab("Year") + ylab(paste(item,"\n detrended and de-seasonalized"))+
guides(colour=guide_legend(title="Forecast"))
## Warning in is.na(main): is.na() applied to non-(list or vector) of type
## 'NULL'
ggsave("Output/Countries/Colombia/Plots/ARIMA_log_total_weekly1_c1k.jpg", width = 10, height = 5)
decomp_stl<- data.table(Quant = c(t_log, as.numeric(decompose_log$time.series)),
Date = rep(c1k_week$TransactionTime[1:(nrow(c1k_week)-len_dec-len_inv)], ncol(decompose_log$time.series)+1),
Type = factor(rep(c("original data", colnames(decompose_log$time.series)),
each = nrow(decompose_log$time.series)),
levels = c("original data", colnames(decompose_log$time.series))))
decomp_ARIMA_forecast_log<-data.table(Quant=c(fc_Simple_Arima_1$mean+reseasonal_log+(retrend_log-trend_log[1]), fc_Arima_Seasons1_2$mean+reseasonal_log+(retrend_log-trend_log[1]), fc_Arima_Fourier_AIC$mean+reseasonal_log+(retrend_log-trend_log[1]), reseasonal_log, (retrend_log-trend_log[1]), fc_Simple_Arima_1$mean, fc_Arima_Seasons1_2$mean, fc_Arima_Fourier_AIC$mean),
Date = rep(c1k_week$TransactionTime[(nrow(c1k_week)-len_dec-len_inv+1): nrow(c1k_week)], ncol(decompose_log$time.series)+5),
Type = factor(rep(c(rep("log quantity", 3), colnames(decompose_log$time.series)[1:2], rep(colnames(decompose_log$time.series)[3], 3)),
each = len_dec+len_inv),
levels = c("log quantity", colnames(decompose_log$time.series))))
decomp_ARIMA_forecast_log$set<-rep(c("default arima", "Fourier double seasons", "Fourier single season", "forecast seasonal", "forecast trend", "default arima", "Fourier double seasons", "Fourier single season"), each=len_dec+len_inv)
decomp_ARIMA_combined_log<-rbind(decomp_stl_training_log, decomp_ARIMA_forecast_log)
ggplot(decomp_ARIMA_combined_log, aes(x = Date, y = Quant, col=set)) +
geom_line() +
facet_grid(Type ~ ., scales = "free_y", switch = "y") +
labs(x = "Time", y = NULL,
title = "Decomposition by STL (log quantity) and ARIMA predictions") +
theme_ts
ggsave("Output/Countries/Colombia/Plots/ARIMA_log_grid_weekly_c1k.jpg", width = 10, height = 5)
ggplot(decomp_ARIMA_combined_log[decomp_ARIMA_combined_log$Date>=(max(decomp_ARIMA_combined_log$Date, na.rm = TRUE)-119),], aes(x = Date, y = Quant, col=set)) +
geom_line() + facet_grid(Type ~ ., scales = "free_y", switch = "y") +
labs(x = "Time", y = NULL,
title = "Decomposition by STL (log quantity) and ARIMA predictions") +
theme_ts
ggsave("Output/Countries/Colombia/Plots/ARIMA_log_grid_weekly1_c1k.jpg", width = 10, height = 5)
Five TBATS models (Exponential smoothing state space models with Box-Cox transformation, ARMA errors, Trend and Seasonal components) are tried out in these sections. TBATS combines Fourier terms with an exponential smoothing state space model and a Box-Cox transformation; seasonality is allowed to change slowly over time. The first model is fed the raw data (with seasonality, trend and outliers).
fit_TBATS_Raw_1<- tbats(t_log, use.box.cox = NULL, use.trend = TRUE, use.damped.trend = NULL, seasonal.periods = 52, use.arma.errors = TRUE, biasadj = TRUE)
fc_TBATS_Raw_1<- forecast(fit_TBATS_Raw_1, h=len_dec+len_inv, bootstrap = TRUE)
autoplot(fc_TBATS_Raw_1) + autolayer(log(validation), series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
checkresiduals(fc_TBATS_Raw_1)
##
## Ljung-Box test
##
## data: Residuals from TBATS(1, {0,0}, 0.839, {<52,5>})
## Q* = 35.404, df = 19, p-value = 0.01248
##
## Model df: 17. Total lags used: 36
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "fc_TBATS_Raw_1", pred_value = fc_TBATS_Raw_1$mean)
fit_TBATS_Season1_2<- tbats(t_log, use.box.cox = NULL, use.trend = NULL, use.damped.trend = NULL, seasonal.periods = c(season2,season1), use.arma.errors = TRUE, biasadj = TRUE)
fc_TBATS_Season1_2<- forecast(fit_TBATS_Season1_2, h=len_dec+len_inv, bootstrap = TRUE)
autoplot(fc_TBATS_Season1_2) + autolayer(log(validation), series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
checkresiduals(fc_TBATS_Season1_2)
##
## Ljung-Box test
##
## data: Residuals from TBATS(1, {0,0}, -, {<25.07,2>, <52.14,5>})
## Q* = 41.218, df = 16, p-value = 0.0005157
##
## Model df: 20. Total lags used: 36
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "fc_TBATS_Season1_2", pred_value = fc_TBATS_Season1_2$mean)
fc_TBATS_Season2_3<- forecast(tbats(t_log, use.box.cox = NULL, use.trend = NULL, use.damped.trend = NULL, seasonal.periods = c(season2,season3), use.arma.errors = TRUE, biasadj = TRUE),h=len_dec+len_inv)
autoplot(fc_TBATS_Season2_3)+ autolayer(log(validation), series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
checkresiduals(fc_TBATS_Season2_3)
##
## Ljung-Box test
##
## data: Residuals from TBATS(0.993, {0,0}, -, {<13.14,4>, <52.14,5>})
## Q* = 40.359, df = 11, p-value = 3.106e-05
##
## Model df: 25. Total lags used: 36
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "fc_TBATS_Season2_3", pred_value = fc_TBATS_Season2_3$mean)
fc_TBATS_Season1_3 <- forecast(tbats(t_log, seasonal.periods=c(season1,season3)), h=len_dec+len_inv)
autoplot(fc_TBATS_Season1_3)+ autolayer(log(validation), series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
checkresiduals(fc_TBATS_Season1_3)
##
## Ljung-Box test
##
## data: Residuals from BATS(1, {0,0}, 0.8, -)
## Q* = 27.732, df = 31, p-value = 0.635
##
## Model df: 5. Total lags used: 36
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "fc_TBATS_Season1_3", pred_value = fc_TBATS_Season1_3$mean)
fit_TBATS_NoSeason<- tbats(t_log)
fc_TBATS_NoSeason<- forecast(fit_TBATS_NoSeason, h=len_dec+len_inv)
autoplot(fc_TBATS_NoSeason) + autolayer(validation_log)
autoplot(fc_TBATS_NoSeason)+ autolayer(log(validation), series = "TBATS No Season")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
ggtitle(paste("Forecast for Quantity Sold of", item, "\n", country, ": Club", club)) +
guides(colour=guide_legend(title="Validation Set"))
checkresiduals(fc_TBATS_NoSeason)
##
## Ljung-Box test
##
## data: Residuals from TBATS(1, {0,0}, -, {<52.14,5>})
## Q* = 40.259, df = 22, p-value = 0.01008
##
## Model df: 14. Total lags used: 36
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "fc_TBATS_NoSeason", pred_value = fc_TBATS_NoSeason$mean)
This section generates visualizations of TBATS models.
autoplot(t_log) +
autolayer(validation_log, series="Validation", PI=FALSE)+
autolayer(fc_TBATS_Raw_1,
series="Single Season", PI=FALSE) +
autolayer(fc_TBATS_Season1_2,
series=paste("Season", season1, "and", season2), PI=FALSE) +
autolayer(fc_TBATS_Season2_3,
series=paste("Season", season2, "and", season3), PI=FALSE) +
autolayer(fc_TBATS_Season1_3,
series=paste( "Season", season1, "and", season3), PI=FALSE) +
autolayer(fc_TBATS_NoSeason,
series="Season Unspecified", PI=FALSE) +
ggtitle(paste("Forecast for Logged Quantity Sold of", item, "\n", country, ": Club", club)) +
xlab("Year") + ylab(paste(item, "\n detrended and de-seasonalized"))+
guides(colour=guide_legend(title="Forecast"))
## Warning in is.na(main): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: Ignoring unknown parameters: PI
ggsave("Output/Countries/Colombia/Plots/tbats_log_total_weekly_c1k.jpg", width = 10, height = 5)
autoplot(validation_log, series="Validation", PI=FALSE)+
autolayer(fc_TBATS_Raw_1,
series="Single Season", PI=FALSE) +
autolayer(fc_TBATS_Season1_2,
series=paste("Season", season1, "and", season2), PI=FALSE) +
autolayer(fc_TBATS_Season2_3,
series=paste("Season", season2, "and", season3), PI=FALSE) +
autolayer(fc_TBATS_Season1_3,
series=paste("Season", season1, "and", season3), PI=FALSE) +
autolayer(fc_TBATS_NoSeason,
series="Season Unspecified", PI=FALSE) +
ggtitle(paste("Forecast for Logged Quantity Sold of", item, "\n", country, ": Club", club)) +
xlab("Year") + ylab(paste(item, "\n detrended and de-seasonalized"))+
guides(colour=guide_legend(title="Forecast"))
## Warning: Ignoring unknown parameters: PI
## Warning in is.na(main): is.na() applied to non-(list or vector) of type
## 'NULL'
ggsave("Output/Countries/Colombia/Plots/tbats_log_total_weekly1_c1k.jpg", width = 10, height = 5)
# In NNAR(p,P,k)[m], p = number of lagged inputs, P = number of seasonal lags (analogous to the seasonal part of an ARIMA(p,0,0)(P,0,0)[m] model), and k = number of nodes in the single hidden layer.
fit_NN_1<- nnetar(training_log, lambda = "auto")
fc_NN_1<- forecast(fit_NN_1, h=len_dec+len_inv)
autoplot(fc_NN_1) + autolayer(log(validation)-(retrend_log-trend_log[1])-reseasonal_log, series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
checkresiduals(fc_NN_1)
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.
c1k_week_quantity_models<-performance_index_dtds(df_Actual = c1k_week_quantity_models, pred_name = "fc_NN_1", pred_value = fc_NN_1$mean)
fit_NN_Raw<- nnetar(t_log, lambda = "auto")
fc_NN_Raw<- forecast(fit_NN_Raw, h=len_dec+len_inv)
autoplot(fc_NN_Raw) + autolayer(log(validation), series = "Logged Quantity")+
xlab("Year") + ylab(paste("Logged Quantity Sold of", item))+
guides(colour=guide_legend(title="Validation Set"))
checkresiduals(fc_NN_Raw)
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "fc_NN_Raw", pred_value = fc_NN_Raw$mean)
The combined univariate model (Combination_uni in the tables below) is the average of four models: two selected by RMSE over the whole forecast period and two selected by RMSE over the target (len_dec) period.
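The code that builds Combination_uni is not shown in this excerpt; a minimal sketch of how such an average could be assembled and scored, assuming the four selected forecast columns already exist in c1k_week_quantity_models (the column names below are placeholders, not necessarily the four models actually chosen):
# Hypothetical sketch: average four selected forecasts (already on the original scale)
selected <- c("Naive_Method", "Drift_Method", "fc_TBATS_Season2_3", "fc_NN_Raw")  # placeholders
Combination_uni_pred <- rowMeans(c1k_week_quantity_models[, selected])
c1k_week_quantity_models <- performance_index_raw(df_Actual = c1k_week_quantity_models,
                                                  pred_name = "Combination_uni",
                                                  pred_value = log(Combination_uni_pred))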
tsmodels<-c("Average_Method","Naive_Method","Seasonal_Naive_Method","Drift_Method","Simple_Arima_1","Arima_Seasons1_2", "Arima_Fourier_AIC","fc_TBATS_Raw_1","fc_TBATS_Season1_2","fc_TBATS_Season2_3","fc_TBATS_Season1_3","fc_TBATS_NoSeason","fc_NN_1","fc_NN_Raw", "Combination_uni")
indices<-c("MEAN", "RMSE", "MAE", "MPE", "MAPE", "MASE")
performance_week_total<-matrix(nrow = length(tsmodels), ncol = length(indices))
for (i in 1:length(tsmodels)){
for (j in 1:length(indices)){
performance_week_total[i, j]=get(paste0(indices[j], "_", tsmodels[i], "_total"))
}
}
performance_week_total<-as.data.frame(performance_week_total)
colnames(performance_week_total)<-indices
rownames(performance_week_total)<-tsmodels
write.csv(x = performance_week_total, file = "Output/Countries/Colombia/Tables/performance_week_total_c1k.csv")
performance_week_total
## MEAN RMSE MAE MPE MAPE
## Average_Method -0.020359307 4.833933 3.746772 -2.0764580 0.07392996
## Naive_Method 0.002463138 4.719307 3.766251 0.2474523 0.07262466
## Seasonal_Naive_Method -0.012450268 6.431938 5.161395 -1.9912521 0.09975956
## Drift_Method 0.004792085 4.748761 3.812052 0.4743401 0.07335596
## Simple_Arima_1 -0.021062445 5.609997 4.202171 -2.3537557 0.08377475
## Arima_Seasons1_2 -0.021062445 5.609997 4.202171 -2.3537557 0.08377475
## Arima_Fourier_AIC -0.022711766 4.865470 3.802633 -2.3236902 0.07524264
## fc_TBATS_Raw_1 0.033113569 5.098434 3.993996 3.0269435 0.07284353
## fc_TBATS_Season1_2 0.069408055 8.490606 6.905108 5.4652057 0.11937853
## fc_TBATS_Season2_3 0.031278442 4.999765 3.970225 2.9409067 0.07325741
## fc_TBATS_Season1_3 0.030605925 5.215859 4.336095 2.9209483 0.08138486
## fc_TBATS_NoSeason 0.035945294 5.389188 4.223400 3.2654544 0.07728402
## fc_NN_1 -0.016187900 5.777606 4.593613 -2.0072220 0.09139242
## fc_NN_Raw -0.025775581 3.765815 2.922250 -2.9640275 0.05969052
## Combination_uni 0.009642981 3.956677 3.332475 1.0339777 0.06345847
## MASE
## Average_Method 0.6607378
## Naive_Method 0.6641730
## Seasonal_Naive_Method 0.9102046
## Drift_Method 0.6722498
## Simple_Arima_1 0.7410467
## Arima_Seasons1_2 0.7410467
## Arima_Fourier_AIC 0.6705889
## fc_TBATS_Raw_1 0.7043355
## fc_TBATS_Season1_2 1.2177058
## fc_TBATS_Season2_3 0.7001434
## fc_TBATS_Season1_3 0.7646640
## fc_TBATS_NoSeason 0.7447904
## fc_NN_1 0.8100770
## fc_NN_Raw 0.5153346
## Combination_uni 0.5876771
performance_week_target<-matrix(nrow = length(tsmodels), ncol = length(indices))
for (i in 1:length(tsmodels)){
for (j in 1:length(indices)){
performance_week_target[i, j]=get(paste0(indices[j], "_", tsmodels[i], "_target"))
}
}
performance_week_target<-as.data.frame(performance_week_target)
colnames(performance_week_target)<-indices
rownames(performance_week_target)<-tsmodels
write.csv(x = performance_week_total, file = "Output/Countries/Colombia/Tables/performance_week_target_c1k.csv")
This part of the code is based on this site. The feature-engineering functions defined in the “Define functions” section are applied to variables in level form and, for some of them, in log form.
# feature engineering
c1k_week$month<-month(c1k_week$TransactionTime)
c1k_week$month<-as.factor(c1k_week$month)
c1k_week$log1p_transaction_avrg<-log1p(c1k_week$transaction_avrg) #
c1k_week$log1p_members_avrg<-log1p(c1k_week$members_avrg) #
c1k_week$log1p_sales_local_avrg<-log1p(c1k_week$sales_local_avrg) #
c1k_week$log1p_sales_usd_avrg<-log1p(c1k_week$sales_usd_avrg) #
c1k_week$log1p_category_sales_local_avrg<-log1p(c1k_week$category_sales_local_avrg) #
c1k_week$log1p_quantity_avrg<-log1p(c1k_week$quantity_avrg) #
c1k_week$log1p_category_sales_local_avrg<- log1p(c1k_week$category_sales_local_avrg)
c1k_week$log1p_category_sales_usd_avrg<-log1p(c1k_week$category_sales_usd_avrg)
c1k_week$log1p_category_quantity_avrg<- log1p(c1k_week$category_quantity_avrg)
c1k_week$log1p_salePrice_local_avrg<- log1p(c1k_week$salePrice_local_avrg)
c1k_week$log1p_salePrice_usd_avrg<- log1p(c1k_week$salePrice_usd_avrg)
t_v_c1k<-c1k_week
t_v_c1k<- t_v_c1k[order(t_v_c1k$TransactionTime),]%>%ungroup()
t_v_c1k<- feature_engineering(t_v_c1k, c("quantity_avrg", "transaction_avrg", "members_avrg", "sales_local_avrg", "exchange_rate_avrg", "sales_usd_avrg", "category_sales_local_avrg", "category_sales_usd_avrg", "category_quantity_avrg","salePrice_local_avrg","salePrice_usd_avrg", "log1p_quantity_avrg","log1p_category_sales_local_avrg","log1p_salePrice_local_avrg","log1p_salePrice_usd_avrg"))
# creating training matrix
t_v_c1k<- fastDummies::dummy_cols(t_v_c1k, select_columns = c("week_number_year", "week_number", "month"))
t_v_c1k$month<-NULL
t_v_c1k$week_number_year<-NULL
t_v_c1k$week_number<-NULL
train_c1k<-t_v_c1k[1:(nrow(t_v_c1k)-len_dec-len_inv),]
test_c1k<-t_v_c1k[(nrow(t_v_c1k)-len_dec-len_inv+1):nrow(t_v_c1k),]
c1k_week_quantity_models_train<-data.frame(Actual=train_c1k$quantity_avrg, TransactionTime=train_c1k$TransactionTime)
c1k_week_quantity_models_train<-c1k_week_quantity_models_train[rowSums(is.na(train_c1k))==0,]
trainc1k_XGB<-train_c1k%>%select(-c("transaction_avrg", "members_avrg", "sales_local_avrg", "exchange_rate_avrg", "sales_usd_avrg", "category_sales_local_avrg", "category_sales_usd_avrg", "category_quantity_avrg","salePrice_local_avrg","salePrice_usd_avrg", "log1p_quantity_avrg", "log1p_transaction_avrg", "log1p_members_avrg", "log1p_sales_local_avrg", "log1p_sales_usd_avrg", "log1p_category_sales_local_avrg","log1p_category_sales_usd_avrg", "log1p_category_quantity_avrg","log1p_salePrice_local_avrg","log1p_salePrice_usd_avrg","TransactionTime"))
testc1k_XGB<-test_c1k%>%select(-c("transaction_avrg", "members_avrg", "sales_local_avrg", "exchange_rate_avrg", "sales_usd_avrg", "category_sales_local_avrg", "category_sales_usd_avrg", "category_quantity_avrg","salePrice_local_avrg","salePrice_usd_avrg", "log1p_quantity_avrg", "log1p_transaction_avrg", "log1p_members_avrg", "log1p_sales_local_avrg", "log1p_sales_usd_avrg", "log1p_category_sales_local_avrg","log1p_category_sales_usd_avrg", "log1p_category_quantity_avrg","log1p_salePrice_local_avrg","log1p_salePrice_usd_avrg","TransactionTime"))
trainTask <- makeRegrTask(data = trainc1k_XGB, target = "quantity_avrg")
testTask <- makeRegrTask(data = testc1k_XGB, target = "quantity_avrg")
xgb_learner <- makeLearner(
"regr.xgboost",
predict.type = "response",
par.vals = list(
objective = "reg:linear",
eval_metric = "rmse",
nrounds = 200
)
)
## Warning in makeParam(id = id, type = "numeric", learner.param = TRUE, lower = lower, : NA used as a default value for learner parameter missing.
## ParamHelpers uses NA as a special value for dependent parameters.
# Create a model
xgb_model <- mlr::train(xgb_learner, task = trainTask)
xgb_params <- makeParamSet(
# The number of trees in the model (each one built sequentially)
makeIntegerParam("nrounds", lower = 100, upper = 500),
# number of splits in each tree
makeIntegerParam("max_depth", lower = 1, upper = 10),
# "shrinkage" - prevents overfitting
makeNumericParam("eta", lower = .1, upper = .5),
# L2 regularization - prevents overfitting
makeNumericParam("lambda", lower = -1, upper = 0, trafo = function(x) 10^x)
)
control <- makeTuneControlRandom(maxit = 1)
resample_desc <- makeResampleDesc("CV", iters = 10)
tuned_params <- tuneParams(
learner = xgb_learner,
task = trainTask,
resampling = resample_desc,
par.set = xgb_params,
control = control
)
## [Tune] Started tuning learner regr.xgboost for parameter set:
## Type len Def Constr Req Tunable Trafo
## nrounds integer - - 100 to 500 - TRUE -
## max_depth integer - - 1 to 10 - TRUE -
## eta numeric - - 0.1 to 0.5 - TRUE -
## lambda numeric - - -1 to 0 - TRUE Y
## With control class: TuneControlRandom
## Imputation value: Inf
## [Tune-x] 1: nrounds=332; max_depth=5; eta=0.308; lambda=0.789
## [Tune-y] 1: mse.test.mean=49.4681514; time: 0.6 min
## [Tune] Result: nrounds=332; max_depth=5; eta=0.308; lambda=0.789 : mse.test.mean=49.4681514
xgb_tuned_learner <- setHyperPars(
learner = xgb_learner,
par.vals = tuned_params$x
)
xgb_model <- mlr::train(xgb_tuned_learner, trainTask)
XGBoost_pred <- predict(xgb_model ,testTask)
XGBoost_pred_train <- predict(xgb_model ,trainTask)
c1k_week_quantity_models_train<-performance_index_raw(df_Actual = c1k_week_quantity_models_train, pred_name = "XGBoost_train", pred_value = log(XGBoost_pred_train$data$response[rowSums(is.na(trainc1k_XGB))==0]))
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "XGBoost", pred_value = log(XGBoost_pred$data$response))
This section uses very simple pre-set parameters (ntree = 1000, mtry = 3, nodesize = 5, importance = TRUE) in the randomForest function.
trainc1k_RF<-train_c1k%>%select(-c("transaction_avrg", "members_avrg", "sales_local_avrg", "exchange_rate_avrg", "sales_usd_avrg", "category_sales_local_avrg", "category_sales_usd_avrg", "category_quantity_avrg","salePrice_local_avrg","salePrice_usd_avrg", "log1p_quantity_avrg", "log1p_transaction_avrg", "log1p_members_avrg", "log1p_sales_local_avrg", "log1p_sales_usd_avrg", "log1p_category_sales_local_avrg","log1p_category_sales_usd_avrg", "log1p_category_quantity_avrg","log1p_salePrice_local_avrg","log1p_salePrice_usd_avrg","TransactionTime"))
testc1k_RF<-test_c1k%>%select(-c("transaction_avrg", "members_avrg", "sales_local_avrg", "exchange_rate_avrg", "sales_usd_avrg", "category_sales_local_avrg", "category_sales_usd_avrg", "category_quantity_avrg","salePrice_local_avrg","salePrice_usd_avrg", "log1p_quantity_avrg", "log1p_transaction_avrg", "log1p_members_avrg", "log1p_sales_local_avrg", "log1p_sales_usd_avrg", "log1p_category_sales_local_avrg","log1p_category_sales_usd_avrg", "log1p_category_quantity_avrg","log1p_salePrice_local_avrg","log1p_salePrice_usd_avrg","TransactionTime"))
RF_1 <- randomForest(quantity_avrg ~. , data = trainc1k_RF,
ntree = 1000, mtry = 3, nodesize = 5, importance = TRUE, na.action = na.omit)
varImpPlot(RF_1, main = "Variable importance")
RF1_pred<-predict(RF_1, testc1k_RF)
RF1_pred_training<- predict(RF_1, trainc1k_RF)
c1k_week_quantity_models_train<-performance_index_raw(df_Actual = c1k_week_quantity_models_train, pred_name = "RF1_train", pred_value = log(RF1_pred_training[rowSums(is.na(trainc1k_XGB))==0]))
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "RF_1", pred_value = log(RF1_pred))
Parameters of the Random Forest model are set through a grid search; the best model is built using the caret package.
#Defining the Control
trControl<- trainControl(method = "cv", number = 10, search = "grid")
metric <- "RMSE"
seed<- set.seed(156230)
rf_default<- caret::train(quantity_avrg~ .
, data = trainc1k_RF
, method = "rf", metric = "RMSE", trControl = trControl, na.action=na.exclude)
rf_default
## Random Forest
##
## 179 samples
## 249 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 111, 112, 110, 112, 112, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 6.577128 0.3300550 4.983390
## 125 6.433182 0.3113334 4.916035
## 249 6.414178 0.3104555 4.915300
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 249.
#RMSE was used to select the optimal model using the smallest value.
#mtry is the number of variables available for splitting at each tree node
#our train_c1k_RF has 250 total variables
tuneGrid<- expand.grid(.mtry = seq(1, ncol(trainc1k_RF), by=5))
rf_mtry<- caret::train(quantity_avrg ~ .,
data = trainc1k_RF,
method = "rf", metric = "RMSE", tuneGrid = tuneGrid, trControl = trControl, importance = TRUE, na.action=na.exclude)
rf_mtry
## Random Forest
##
## 179 samples
## 249 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 112, 110, 112, 111, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 1 6.853779 0.2796867 5.153571
## 6 6.450934 0.3090172 4.916242
## 11 6.345750 0.3251419 4.839104
## 16 6.372442 0.3122920 4.911708
## 21 6.356963 0.3205633 4.881006
## 26 6.366742 0.3226834 4.911689
## 31 6.362120 0.3185732 4.914679
## 36 6.365110 0.3154758 4.928504
## 41 6.365519 0.3118460 4.919150
## 46 6.405941 0.3112151 5.017704
## 51 6.373639 0.3238092 4.938800
## 56 6.374798 0.3174350 4.944653
## 61 6.399254 0.3138854 4.958984
## 66 6.357039 0.3193396 4.932255
## 71 6.394708 0.3092921 4.947357
## 76 6.392910 0.3119114 4.962904
## 81 6.387796 0.3129558 4.951709
## 86 6.387421 0.3088907 4.956551
## 91 6.374528 0.3180524 4.929859
## 96 6.355079 0.3231717 4.937847
## 101 6.402078 0.3052011 5.013015
## 106 6.397618 0.3171980 4.973375
## 111 6.389174 0.3194880 4.974468
## 116 6.478571 0.2965595 5.063439
## 121 6.410886 0.3072056 4.994746
## 126 6.415241 0.3092700 4.981548
## 131 6.414486 0.3133749 4.980930
## 136 6.465822 0.3000802 5.037734
## 141 6.442024 0.3006910 5.040200
## 146 6.439953 0.3044826 5.022418
## 151 6.390621 0.3133707 4.954877
## 156 6.376466 0.3181501 4.959407
## 161 6.490105 0.3008532 5.037873
## 166 6.420718 0.3082612 5.016268
## 171 6.422606 0.3158441 4.983817
## 176 6.422668 0.3043746 4.977810
## 181 6.436667 0.3073578 5.024373
## 186 6.465638 0.2971367 5.017007
## 191 6.416872 0.3046683 5.010734
## 196 6.431821 0.3022203 4.990068
## 201 6.433151 0.3029956 4.989908
## 206 6.455220 0.3017746 5.026499
## 211 6.441186 0.3028676 5.031627
## 216 6.411212 0.3155633 4.988758
## 221 6.439576 0.3036548 5.003008
## 226 6.469974 0.2963298 5.069123
## 231 6.466944 0.2962559 5.029585
## 236 6.464776 0.2966221 5.037645
## 241 6.439377 0.3004673 5.006935
## 246 6.437850 0.3026709 5.011037
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 11.
#RMSE was used to select the optimal model using the smallest value.
best_mtry<- rf_mtry$bestTune$mtry #store the best value for mtry
min(rf_mtry$results$RMSE)
## [1] 6.34575
store_maxnode<- list() # create a list to find the optimal max of nodes
tuneGrid<- expand.grid(.mtry= best_mtry)
for (maxnodes in c(1, 2, 3, 4, 5, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70)){
set.seed(156230)
rf_maxnode<- caret::train(quantity_avrg ~ .,
data = trainc1k_RF,
method = "rf", metric = "RMSE", tuneGrid = tuneGrid, trControl = trControl, importance = TRUE, maxnodes = maxnodes, nodesize = 4, na.action = na.exclude)
current_iteration<- toString(maxnodes)
store_maxnode[[current_iteration]]<- rf_maxnode
}
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
results_node<- resamples(store_maxnode)
results_node<-summary(results_node)
nnode_optimal<-results_node$models[results_node$statistics$RMSE%>%as.data.frame()%>%select("Mean")%>%as.matrix()%>%as.numeric()%>%which.min()]%>%as.numeric()
store_maxtrees <- list()
for (ntree in c(10, 20, 30, 40, 50, 100 , 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700)) {
set.seed(156230)
rf_maxtrees <- caret::train(quantity_avrg ~ .,
data = trainc1k_RF,
method = "rf",
metric = "RMSE",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
maxnodes = nnode_optimal,
ntree = ntree,
na.action = na.exclude)
key <- toString(ntree)
store_maxtrees[[key]] <- rf_maxtrees
}
results_tree <- resamples(store_maxtrees)
results_tree<-summary(results_tree)
ntrees_optimal<-results_tree$models[results_tree$statistics$RMSE%>%as.data.frame()%>%select("Mean")%>%as.matrix()%>%as.numeric()%>%which.min()]%>%as.numeric()
fit_rf<- caret::train(quantity_avrg ~ .,
data = trainc1k_RF,
method = "rf",
metric = "RMSE",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
maxnodes = nnode_optimal,
ntree = ntrees_optimal,
na.action = na.exclude)
fit_rf
## Random Forest
##
## 179 samples
## 249 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 112, 111, 112, 112, 112, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 6.484887 0.3266407 4.959018
##
## Tuning parameter 'mtry' was held constant at a value of 11
pred_rf2<- predict(fit_rf,test_c1k, predict.all = TRUE)
pred_rf2_train<- predict(fit_rf,train_c1k, predict.all = FALSE)
pred_rf2_train<-merge(data.frame(week=1:nrow(train_c1k)), data.frame(week=pred_rf2_train%>%names(), pred=pred_rf2_train), all.x = TRUE)
# Visualize Results
pred_rf2_dt<- data.table(Predicted_Quantity = pred_rf2, TransactionTime = test_c1k$TransactionTime)
ggplot() +
geom_line(pred_rf2_dt, mapping = aes(TransactionTime, Predicted_Quantity,color = "Predicted Quantity"))+
geom_line(test_c1k, mapping = aes(TransactionTime, quantity_avrg, color = "Actual Quantity"))+
labs(x = "Time", y = "Quantity", title = paste("Random Forest","\nQuantity Sold of", item, "\n",country, ": Club", club)) +
scale_color_manual(values = c("Predicted Quantity" = 'darkblue', "Actual Quantity" = 'red'))
varImp(fit_rf)
## rf variable importance
##
## only 20 most important variables shown (out of 249)
##
## Overall
## month_12 100.00
## lag_52_quantity_avrg 98.69
## lag_52_category_sales_local_avrg 95.07
## avg_18_salePrice_local_avrg 94.17
## lag_52_sales_local_avrg 92.65
## avg_52_sales_local_avrg 90.38
## avg_52_log1p_category_sales_local_avrg 90.02
## lag_18_log1p_salePrice_local_avrg 86.03
## lag_52_log1p_category_sales_local_avrg 85.33
## lag_52_log1p_quantity_avrg 85.24
## avg_52_log1p_quantity_avrg 85.23
## avg_18_log1p_salePrice_local_avrg 83.88
## lag_52_transaction_avrg 82.18
## avg_52_exchange_rate_avrg 79.11
## avg_52_quantity_avrg 78.84
## lag_52_members_avrg 75.75
## lag_18_salePrice_local_avrg 71.64
## avg_52_members_avrg 71.10
## avg_52_category_sales_local_avrg 67.88
## diff_18_log1p_salePrice_usd_avrg 67.16
plot(varImp(fit_rf)) # variable importance plot
c1k_week_quantity_models_train<-performance_index_raw(df_Actual = c1k_week_quantity_models_train, pred_name = "RF2_train", pred_value = log(pred_rf2_train$pred[rowSums(is.na(trainc1k_XGB))==0]))
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "RF_2", pred_value = log(pred_rf2))
The Random Forest 3 model uses the optimal parameters tuned for the Random Forest 2 model, but fits them with the randomForest() function from the randomForest package.
RF_3<- randomForest(quantity_avrg ~ .,
data = trainc1k_RF,
mtry = best_mtry,
importance = TRUE,
maxnodes = nnode_optimal,
ntree = ntrees_optimal,
na.action = na.exclude)
varImpPlot(RF_3)
pred_rf3 <- predict(RF_3, test_c1k)
pred_rf3<- data.table(Predicted_Quantity = pred_rf3, TransactionTime = test_c1k$TransactionTime)
pred_rf3_train <- predict(RF_3, train_c1k)
ggplot() +
geom_line(pred_rf3, mapping = aes(TransactionTime, Predicted_Quantity,color = "Predicted Quantity"))+
geom_line(test_c1k, mapping = aes(TransactionTime, quantity_avrg, color = "Actual Quantity"))+
labs(x = "Time", y = "Quantity", title = "Random Forest forecasts") +
scale_color_manual(values = c("Predicted Quantity" = 'darkblue', "Actual Quantity" = 'red'))
c1k_week_quantity_models_train<-performance_index_raw(df_Actual = c1k_week_quantity_models_train, pred_name = "RF3_train", pred_value = log(pred_rf3_train[rowSums(is.na(trainc1k_XGB))==0]))
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "RF_3", pred_value = log(pred_rf3$Predicted_Quantity))
#glmnet does not use the formula interface (y ~ x), so prior to modeling we need to create the feature matrix and the target vector.
# Columns dropped before building the GLMNET design matrix: contemporaneous level and log1p variables plus the time stamp
drop_cols_GLMNET<-c("transaction_avrg", "members_avrg", "sales_local_avrg", "exchange_rate_avrg", "sales_usd_avrg", "category_sales_local_avrg", "category_sales_usd_avrg", "category_quantity_avrg", "salePrice_local_avrg", "salePrice_usd_avrg", "log1p_quantity_avrg", "log1p_transaction_avrg", "log1p_members_avrg", "log1p_sales_local_avrg", "log1p_sales_usd_avrg", "log1p_category_sales_local_avrg", "log1p_category_sales_usd_avrg", "log1p_category_quantity_avrg", "log1p_salePrice_local_avrg", "log1p_salePrice_usd_avrg", "TransactionTime")
train_c1k_GLMNET<-drop_na(train_c1k%>%select(-all_of(drop_cols_GLMNET)))
test_c1k_GLMNET<-drop_na(test_c1k%>%select(-all_of(drop_cols_GLMNET)))
y_train<- data.matrix(train_c1k_GLMNET["quantity_avrg"])
x_train<-data.matrix(subset(train_c1k_GLMNET, select=-c(quantity_avrg)))
y_test<- data.matrix(test_c1k_GLMNET["quantity_avrg"])
x_test<-data.matrix(subset(test_c1k_GLMNET, select=-c(quantity_avrg)))
#Ridge is performed with alpha=0
ridge <- glmnet(x_train,y_train,alpha = 0)
plot(ridge, xvar = "lambda")
ridge$lambda %>% head()
## [1] 4198.643 4007.808 3825.647 3651.765 3485.787 3327.352
#Tuning to find the right value for lambda
ridge_cv <- cv.glmnet(x_train,y_train,alpha = 0)
plot(ridge_cv)
# As the coefficients are constrained with a log(λ) ≥ 0 penalty, the MSE rises considerably. The numbers at the top of the plot refer to the number of variables in the model. Ridge regression does not force any coefficient exactly to zero, so all features remain in the model.
#The first and second vertical dashed lines mark the λ with the minimum MSE and the largest λ within one standard error of that minimum.
#Extract the minimum and one-standard-error MSE and λ values
min(ridge_cv$cvm) #minimum MSE
## [1] 44.43904
ridge_cv$lambda.min #lambda for this minimum MSE
## [1] 46.08005
ridge_cv$cvm[ridge_cv$lambda == ridge_cv$lambda.1se] # 1 st.error of min MSE
## [1] 52.9557
ridge_cv$lambda.1se # lambda for this MSE
## [1] 542.2756
ridge_min <- glmnet(x_train,y_train, alpha = 0)
plot(ridge_min, xvar = "lambda")
abline(v = log(ridge_cv$lambda.1se), col = "red", lty = "dashed")
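To also mark the λ with the minimum MSE on this coefficient-path plot, mirroring what is done for the lasso further below, a second dashed line can be added:
abline(v = log(ridge_cv$lambda.min), col = "red", lty = "dashed") # λ with minimum CV MSE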
#Most influential features for prediction accuracy
coef(ridge_cv, s = "lambda.1se") %>%
tidy() %>%
filter(row != "(Intercept)") %>%
top_n(10, wt = abs(value)) %>%
ggplot(aes(value, reorder(row, value), color = value > 0)) +
geom_point(show.legend = FALSE) +
ggtitle("Influential variables") +
xlab("Coefficient") +
ylab(NULL)
## Warning: 'tidy.dgCMatrix' is deprecated.
## See help("Deprecated")
## Warning: 'tidy.dgTMatrix' is deprecated.
## See help("Deprecated")
min(ridge_cv$cvm)
## [1] 44.43904
#A ridge model retains all variables. Therefore, ridge is a good choice only if we believe all features should stay in the model while reducing the noise that less influential variables may create and minimizing collinearity. Ridge does not perform feature selection.
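#This claim can be checked directly on the fitted model (a quick sketch using ridge_cv): the count of coefficients shrunk exactly to zero at lambda.1se should be 0.
ridge_coefs<- coef(ridge_cv, s = "lambda.1se") # sparse coefficient matrix, intercept in row 1
sum(ridge_coefs[-1, 1] == 0) # coefficients forced exactly to zero: expected to be 0 for ridge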
#predicting
ridge_pred<- predict(ridge_cv, s=ridge_cv$lambda.1se, x_test, type = "response")
ridge_pred_train<-predict(ridge_cv, s=ridge_cv$lambda.1se, x_train, type = "response")
ridge_pred_train<-merge(data.frame(week=1:nrow(train_c1k)), data.frame(week=ridge_pred_train%>%rownames()%>%as.numeric(), pred=ridge_pred_train), all.x = TRUE)
colnames(ridge_pred_train)<-c("week", "pred")
#Graph
pred_ridge<- data.frame(Predicted_Quantity = ridge_pred, TransactionTime = test_c1k$TransactionTime)
colnames(pred_ridge)<-c("Predicted_Quantity", "TransactionTime")
ggplot() +
geom_line(pred_ridge, mapping = aes(TransactionTime, Predicted_Quantity, color = "Predicted Quantity"))+
geom_line(test_c1k, mapping = aes(TransactionTime, quantity_avrg, color = "Actual Quantity"))+
labs(x = "Time", y = "Quantity", title = "Ridge") + scale_color_manual(values = c("Predicted Quantity" = 'darkblue', "Actual Quantity" = 'red'))
c1k_week_quantity_models_train<-performance_index_raw(df_Actual = c1k_week_quantity_models_train, pred_name = "Ridge_train", pred_value = log((ridge_pred_train[rowSums(is.na(trainc1k_XGB))==0,])%>%select(pred)%>%unlist()))
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "Ridge", pred_value = log(ridge_pred))
lasso <- glmnet(x_train,y_train,alpha = 1)
plot(lasso, xvar = "lambda")
#Tuning to find the right value for lambda
lasso_cv <- cv.glmnet(x_train,y_train,alpha = 1)
plot(lasso_cv)
#extract our minimum and one standard error MSE and λ values
min(lasso_cv$cvm) #minimum MSE
## [1] 42.48352
lasso_cv$lambda.min #lambda for this minimum MSE
## [1] 1.141438
lasso_cv$cvm[lasso_cv$lambda == lasso_cv$lambda.1se] # 1 st.error of min MSE
## [1] 48.77975
lasso_cv$lambda.1se # lambda for this MSE
## [1] 2.63687
lasso_min <- glmnet(x_train,y_train, alpha = 1)
plot(lasso_min, xvar = "lambda")
abline(v = log(lasso_cv$lambda.min), col = "red", lty = "dashed")
abline(v = log(lasso_cv$lambda.1se), col = "red", lty = "dashed")
#Most influential features for prediction accuracy
coef(lasso_cv, s = "lambda.1se") %>%
tidy() %>%
filter(row != "(Intercept)") %>%
ggplot(aes(value, reorder(row, value), color = value > 0)) +
geom_point(show.legend = FALSE) +
ggtitle("Influential variables") +
xlab("Coefficient") +
ylab(NULL)
## Warning: 'tidy.dgCMatrix' is deprecated.
## See help("Deprecated")
## Warning: 'tidy.dgTMatrix' is deprecated.
## See help("Deprecated")
#Minimum RIDGE MSE
min(ridge_cv$cvm)
## [1] 44.43904
#minimum LASSO MSE
min(lasso_cv$cvm)
## [1] 42.48352
#predicting
lasso_pred<- predict(lasso, s=lasso_cv$lambda.min, x_test, type = "response")
lasso_pred_train<-predict(lasso, s=lasso_cv$lambda.min, x_train, type = "response")
results_train<-merge(data.frame(week=1:nrow(train_c1k)), data.frame(week=lasso_pred_train%>%rownames()%>%as.numeric(), pred=lasso_pred_train), all.x = TRUE)
colnames(results_train)<-c("week", "pred")
#Graph
pred_lasso<- data.table(Predicted_Quantity = lasso_pred, TransactionTime = test_c1k$TransactionTime)
colnames(pred_lasso)<-c("Predicted_Quantity", "TransactionTime")
ggplot() +
geom_line(pred_lasso, mapping = aes(TransactionTime, Predicted_Quantity, color = "Predicted Quantity"))+
geom_line(test_c1k, mapping = aes(TransactionTime, quantity_avrg, color = "Actual Quantity"))+
labs(x = "Time", y = "Quantity", title = "Lasso") + scale_color_manual(values = c("Predicted Quantity" = 'darkblue', "Actual Quantity" = 'red'))
c1k_week_quantity_models_train<-performance_index_raw(df_Actual = c1k_week_quantity_models_train, pred_name = "Lasso_train", pred_value = log(lasso_pred_train[rowSums(is.na(trainc1k_XGB))==0]))
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "Lasso", pred_value = log(lasso_pred))
This part of the code is adapted from an external tutorial site.
#Tune the λ and alpha parameters.
# maintain the same folds across all models
fold_id <- sample(1:10, size = length(y_train), replace=TRUE)
# search across a range of alphas
tuning_grid <- tibble::tibble(
alpha = seq(0, 1, by = .1),
mse_min = NA,
mse_1se = NA,
lambda_min = NA,
lambda_1se = NA
)
#Now we can iterate over each alpha value, apply a CV elastic net, and extract the minimum and one standard error MSE values and their respective λ values.
for(i in seq_along(tuning_grid$alpha)) {
# fit CV model for each alpha value
fit <- cv.glmnet(x_train, y_train, alpha = tuning_grid$alpha[i], foldid = fold_id)
# extract MSE and lambda values
tuning_grid$mse_min[i] <- fit$cvm[fit$lambda == fit$lambda.min]
tuning_grid$mse_1se[i] <- fit$cvm[fit$lambda == fit$lambda.1se]
tuning_grid$lambda_min[i] <- fit$lambda.min
tuning_grid$lambda_1se[i] <- fit$lambda.1se
}
tuning_grid
## # A tibble: 11 x 5
## alpha mse_min mse_1se lambda_min lambda_1se
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 43.3 51.6 44.0 341.
## 2 0.1 43.3 51.5 7.51 21.9
## 3 0.2 43.1 51.3 4.32 12.0
## 4 0.3 43.0 51.2 3.80 8.39
## 5 0.4 42.7 51.4 2.85 6.59
## 6 0.5 42.5 50.8 2.39 5.27
## 7 0.6 42.3 51.1 1.99 4.60
## 8 0.7 42.2 50.7 1.71 3.95
## 9 0.8 42.1 50.4 1.49 3.45
## 10 0.9 42.0 50.1 1.39 3.07
## 11 1 42.0 50.7 1.25 2.89
#plot the MSE
elastic_tuning<-tuning_grid %>%mutate(se = mse_1se - mse_min)
elastic_tuning%>%
ggplot(aes(alpha, mse_min)) +
geom_line(size = 2) +
geom_ribbon(aes(ymax = mse_min + se, ymin = mse_min - se), alpha = .25) +
ggtitle("MSE ± one standard error")
alpha_optimal<-elastic_tuning$alpha[which.min(elastic_tuning$mse_1se)] # alpha with the lowest one-standard-error MSE
#An advantage of the elastic net model is that it combines effective regularization via the ridge penalty with the feature selection characteristics of the lasso penalty.
elastic_cv<-cv.glmnet(x_train,y_train,alpha = alpha_optimal)
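#As a concrete check of the comment above (a short sketch), count how many non-zero coefficients each penalty keeps at lambda.1se; ridge, checked earlier, keeps every feature.
sum(coef(lasso_cv, s = "lambda.1se")[-1, 1] != 0) # lasso: sparse, many coefficients are exactly zero
sum(coef(elastic_cv, s = "lambda.1se")[-1, 1] != 0) # elastic net: also sparse, degree depends on alpha_optimal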
elastic_pred<- predict(elastic_cv, s=elastic_cv$lambda.1se, x_test, type = "response")
elastic_pred_train<- predict(elastic_cv, s=elastic_cv$lambda.1se, x_train, type = "response")
results_train<-merge(data.frame(week=1:nrow(train_c1k)), data.frame(week=elastic_pred_train%>%rownames()%>%as.numeric(), pred=elastic_pred_train), all.x = TRUE)
colnames(results_train)<-c("week", "pred")
#Graph
pred_elastic_cv<- data.table(Predicted_Quantity = elastic_pred, TransactionTime = test_c1k$TransactionTime)
colnames(pred_elastic_cv)<-c("Predicted_Quantity", "TransactionTime")
ggplot() +
geom_line(pred_elastic_cv, mapping = aes(TransactionTime, Predicted_Quantity, color = "Predicted Quantity"))+
geom_line(test_c1k, mapping = aes(TransactionTime, quantity_avrg, color = "Actual Quantity"))+
labs(x = "Time", y = "Quantity", title = "Elastic Net") + scale_color_manual(values = c("Predicted Quantity" = 'darkblue', "Actual Quantity" = 'red'))
c1k_week_quantity_models_train<-performance_index_raw(df_Actual = c1k_week_quantity_models_train, pred_name = "elastic_train", pred_value = log(elastic_pred_train[rowSums(is.na(trainc1k_XGB))==0]))
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "elastic", pred_value = log(elastic_pred))
# Best two ML models by RMSE over the total forecast period
tsmodels<-c("Average_Method","Naive_Method","Seasonal_Naive_Method","Drift_Method","Simple_Arima_1","Arima_Seasons1_2", "Arima_Fourier_AIC","fc_TBATS_Raw_1","fc_TBATS_Season1_2","fc_TBATS_Season2_3","fc_TBATS_Season1_3","fc_TBATS_NoSeason","fc_NN_1","fc_NN_Raw")
tsmodels_ml<-c("XGBoost", "RF_1", "RF_2","RF_3", "Ridge", "Lasso", "elastic")
best_two_total_ml<-tsmodels_ml[rank(mget(paste0("RMSE_", tsmodels_ml, "_total"))%>%as.numeric())<=2]
# Best two ML models by RMSE over the target period
best_two_target_ml<-tsmodels_ml[rank(mget(paste0("RMSE_", tsmodels_ml, "_target"))%>%as.numeric())<=2]
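# To inspect the RMSE values behind this selection, the objects created by performance_index_raw can be gathered into a named vector and sorted (a small sketch):
rmse_total_ml<- unlist(mget(paste0("RMSE_", tsmodels_ml, "_total"), envir = .GlobalEnv))
names(rmse_total_ml)<- tsmodels_ml
sort(rmse_total_ml) # the two smallest values are the models in best_two_total_ml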
# Combination_ml: average of the two best ML models by total-period RMSE and the two best by target-period RMSE
Combination_ml<-(1/4*(c1k_week_quantity_models%>%select(best_two_total_ml[1])+c1k_week_quantity_models%>%select(best_two_total_ml[2])+c1k_week_quantity_models%>%select(best_two_target_ml[1])+c1k_week_quantity_models%>%select(best_two_target_ml[2])))%>%unlist()
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "Combination_ml", pred_value = log(Combination_ml))
The final predictive model, Combination_uni_ml, is the average of four models: the two best univariate models and the two best machine learning models, each pair selected by RMSE over the whole forecast period.
# Best two univariate models by RMSE over the total forecast period
best_two_total<-tsmodels[rank(mget(paste0("RMSE_", tsmodels, "_total"))%>%as.numeric())<=2]
#
Combination_uni_ml<-(1/4*(c1k_week_quantity_models%>%select(best_two_total_ml[1])+c1k_week_quantity_models%>%select(best_two_total_ml[2])+c1k_week_quantity_models%>%select(best_two_total[1])+c1k_week_quantity_models%>%select(best_two_total[2])))%>%unlist()
c1k_week_quantity_models<-performance_index_raw(df_Actual = c1k_week_quantity_models, pred_name = "Combination_uni_ml", pred_value = log(Combination_uni_ml))
best_two_total
## [1] "Naive_Method" "fc_NN_Raw"
tsmodels<-c("Average_Method","Naive_Method","Seasonal_Naive_Method","Drift_Method","Simple_Arima_1","Arima_Seasons1_2", "Arima_Fourier_AIC","fc_TBATS_Raw_1","fc_TBATS_Season1_2","fc_TBATS_Season2_3","fc_TBATS_Season1_3","fc_TBATS_NoSeason","fc_NN_1","fc_NN_Raw", "Combination_uni")
best_two_total_ml
## [1] "RF_2" "RF_3"
tsmodels_ml<-c("XGBoost", "RF_1", "RF_2","RF_3", "Ridge", "Lasso", "elastic", "Combination_ml", "Combination_uni_ml")
indices<-c("MEAN", "RMSE", "MAE", "MPE", "MAPE", "MASE")
performance_week_total_ml<-matrix(nrow = length(tsmodels_ml), ncol = length(indices))
for (i in 1:length(tsmodels_ml)){
for (j in 1:length(indices)){
performance_week_total_ml[i, j]=get(paste0(indices[j], "_", tsmodels_ml[i], "_total"))
}
}
performance_week_total_ml<-as.data.frame(performance_week_total_ml)
colnames(performance_week_total_ml)<-indices
rownames(performance_week_total_ml)<-tsmodels_ml
write.csv(x = performance_week_total_ml, file = "Output/Countries/Colombia/Tables/performance_week_total_ml_c1k.csv")
performance_week_target_ml<-matrix(nrow = length(tsmodels_ml), ncol = length(indices))
for (i in 1:length(tsmodels_ml)){
for (j in 1:length(indices)){
performance_week_target_ml[i, j]=get(paste0(indices[j], "_", tsmodels_ml[i], "_target"))
}
}
performance_week_target_ml<-as.data.frame(performance_week_target_ml)
colnames(performance_week_target_ml)<-indices
rownames(performance_week_target_ml)<-tsmodels_ml
write.csv(x = performance_week_target_ml, file = "Output/Countries/Colombia/Tables/performance_week_target_ml_c1k.csv")
The four tables below summarize the performance of all models.
# Univariate models, total forecast period
performance_week_total
## MEAN RMSE MAE MPE MAPE
## Average_Method -0.020359307 4.833933 3.746772 -2.0764580 0.07392996
## Naive_Method 0.002463138 4.719307 3.766251 0.2474523 0.07262466
## Seasonal_Naive_Method -0.012450268 6.431938 5.161395 -1.9912521 0.09975956
## Drift_Method 0.004792085 4.748761 3.812052 0.4743401 0.07335596
## Simple_Arima_1 -0.021062445 5.609997 4.202171 -2.3537557 0.08377475
## Arima_Seasons1_2 -0.021062445 5.609997 4.202171 -2.3537557 0.08377475
## Arima_Fourier_AIC -0.022711766 4.865470 3.802633 -2.3236902 0.07524264
## fc_TBATS_Raw_1 0.033113569 5.098434 3.993996 3.0269435 0.07284353
## fc_TBATS_Season1_2 0.069408055 8.490606 6.905108 5.4652057 0.11937853
## fc_TBATS_Season2_3 0.031278442 4.999765 3.970225 2.9409067 0.07325741
## fc_TBATS_Season1_3 0.030605925 5.215859 4.336095 2.9209483 0.08138486
## fc_TBATS_NoSeason 0.035945294 5.389188 4.223400 3.2654544 0.07728402
## fc_NN_1 -0.016187900 5.777606 4.593613 -2.0072220 0.09139242
## fc_NN_Raw -0.025775581 3.765815 2.922250 -2.9640275 0.05969052
## Combination_uni 0.009642981 3.956677 3.332475 1.0339777 0.06345847
## MASE
## Average_Method 0.6607378
## Naive_Method 0.6641730
## Seasonal_Naive_Method 0.9102046
## Drift_Method 0.6722498
## Simple_Arima_1 0.7410467
## Arima_Seasons1_2 0.7410467
## Arima_Fourier_AIC 0.6705889
## fc_TBATS_Raw_1 0.7043355
## fc_TBATS_Season1_2 1.2177058
## fc_TBATS_Season2_3 0.7001434
## fc_TBATS_Season1_3 0.7646640
## fc_TBATS_NoSeason 0.7447904
## fc_NN_1 0.8100770
## fc_NN_Raw 0.5153346
## Combination_uni 0.5876771
# Machine learning models, total forecast period
performance_week_total_ml
## MEAN RMSE MAE MPE MAPE
## XGBoost 0.048510998 5.626334 4.911172 4.156918 0.09195722
## RF_1 0.019801755 4.087474 3.432258 2.015122 0.06503999
## RF_2 0.023084600 3.888526 3.338107 2.332280 0.06312015
## RF_3 0.024888692 3.935571 3.411089 2.490672 0.06443311
## Ridge 0.035370538 4.885089 4.077782 3.441112 0.07604120
## Lasso 0.030905393 4.415237 3.779878 2.947682 0.07149839
## elastic 0.041741558 4.663642 3.919572 4.059648 0.07259127
## Combination_ml 0.025941844 4.013226 3.455157 2.576297 0.06529962
## Combination_uni_ml 0.006165212 3.505992 3.014320 0.702622 0.05791517
## MASE
## XGBoost 0.8660780
## RF_1 0.6052737
## RF_2 0.5886703
## RF_3 0.6015406
## Ridge 0.7191109
## Lasso 0.6665760
## elastic 0.6912108
## Combination_ml 0.6093119
## Combination_uni_ml 0.5315709
# Univariate models, target period
performance_week_target
## MEAN RMSE MAE MPE
## Average_Method -0.05143811 2.7467953 2.7467953 -5.4227473
## Naive_Method -0.02933970 1.5667402 1.5667402 -3.0226542
## Seasonal_Naive_Method -0.03796111 2.0271232 2.0271232 -3.9459016
## Drift_Method -0.02506247 1.3383359 1.3383359 -2.5706745
## Simple_Arima_1 -0.04277011 2.2839241 2.2839241 -4.4681132
## Arima_Seasons1_2 -0.04277011 2.2839241 2.2839241 -4.4681132
## Arima_Fourier_AIC -0.05461870 2.9166388 2.9166388 -5.7774259
## fc_TBATS_Raw_1 -0.04970521 2.6542584 2.6542584 -5.2305047
## fc_TBATS_Season1_2 -0.10359853 5.5321613 5.5321613 -11.5571570
## fc_TBATS_Season2_3 0.01746351 0.9325514 0.9325514 1.7163769
## fc_TBATS_Season1_3 0.00919202 0.4908539 0.4908539 0.9108297
## fc_TBATS_NoSeason -0.03651686 1.9500004 1.9500004 -3.7900882
## fc_NN_1 0.02804840 1.4977845 1.4977845 2.7283150
## fc_NN_Raw -0.10286626 5.4930584 5.4930584 -11.4661013
## Combination_uni -0.02638761 1.4090984 1.4090984 -2.7102787
## MAPE MASE
## Average_Method 0.054227473 NA
## Naive_Method 0.030226542 NA
## Seasonal_Naive_Method 0.039459016 NA
## Drift_Method 0.025706745 NA
## Simple_Arima_1 0.044681132 NA
## Arima_Seasons1_2 0.044681132 NA
## Arima_Fourier_AIC 0.057774259 NA
## fc_TBATS_Raw_1 0.052305047 NA
## fc_TBATS_Season1_2 0.115571570 NA
## fc_TBATS_Season2_3 0.017163769 NA
## fc_TBATS_Season1_3 0.009108297 NA
## fc_TBATS_NoSeason 0.037900882 NA
## fc_NN_1 0.027283150 NA
## fc_NN_Raw 0.114661013 NA
## Combination_uni 0.027102787 NA
# Machine learning models, target period
performance_week_target_ml
## MEAN RMSE MAE MPE
## XGBoost 0.0368643629 1.968556976 1.968556976 3.55536984
## RF_1 -0.0209350274 1.117930464 1.117930464 -2.13826743
## RF_2 -0.0177011210 0.945239859 0.945239859 -1.80200969
## RF_3 -0.0057068859 0.304747705 0.304747705 -0.57396413
## Ridge 0.0094784967 0.506151725 0.506151725 0.93894984
## Lasso 0.0001297882 0.006930689 0.006930689 0.01297713
## elastic 0.0090545807 0.483514607 0.483514607 0.89733309
## Combination_ml -0.0072462761 0.386951145 0.386951145 -0.72991679
## Combination_uni_ml -0.0389034935 2.077446551 2.077446551 -4.04782383
## MAPE MASE
## XGBoost 0.0355536984 NA
## RF_1 0.0213826743 NA
## RF_2 0.0180200969 NA
## RF_3 0.0057396413 NA
## Ridge 0.0093894984 NA
## Lasso 0.0001297713 NA
## elastic 0.0089733309 NA
## Combination_ml 0.0072991679 NA
## Combination_uni_ml 0.0404782383 NA
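For a quick side-by-side ranking across both model families, the two total-period tables can be stacked and sorted by RMSE (a minimal sketch; the row names already carry the model names):
all_total<- rbind(performance_week_total, performance_week_total_ml)
all_total[order(all_total$RMSE), c("MEAN", "RMSE", "MAE", "MASE")] # all models ranked by total-period RMSE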