# Prepare the combined train/test item-outlet data and fit an XGBoost
# regression model for Item_Outlet_Sales.
#
# Expects two data.tables named `train` and `test` to exist before this
# script runs; the script itself does not load them. `test` is modified by
# reference (an Item_Outlet_Sales column is added), and both names are
# rebound to the processed tables further down.

# Dependencies ----

# Install only the packages that are missing instead of reinstalling all of
# them on every run.
required_pkgs <- c(
  "data.table", "dplyr", "ggplot2", "caret", "xgboost", "e1071", "cowplot"
)
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

library(data.table)
library(dplyr)
library(ggplot2)
library(caret)
library(xgboost)
library(e1071)
library(cowplot)

# Helpers ----

# Replace the flagged entries of `x` with the mean of the unflagged, non-NA
# entries that share the same `group` value. Flagged entries whose group has
# no usable value are left untouched.
impute_group_mean <- function(x, group, is_missing) {
  is_missing <- !is.na(is_missing) & is_missing
  usable <- !is_missing & !is.na(x)
  group_means <- vapply(split(x[usable], group[usable]), mean, numeric(1))
  fill <- unname(group_means)[match(as.character(group), names(group_means))]
  replaceable <- is_missing & !is.na(fill)
  x[replaceable] <- fill[replaceable]
  x
}

# Input check ----

# caret exports a function called `train`, so a plain exists() test would
# pass even when no data has been loaded; check the class instead.
if (!is.data.table(get0("train")) || !is.data.table(get0("test"))) {
  stop(
    "`train` and `test` must be data.tables loaded before this script runs.",
    call. = FALSE
  )
}

# Combine train and test ----

# Remember the split point now; `train` is rebound after preprocessing.
n_train <- nrow(train)
test[, Item_Outlet_Sales := NA]
combi <- rbind(train, test)

# Impute missing / placeholder values ----

# Missing weights take the mean weight of the same item.
combi[
  ,
  Item_Weight := impute_group_mean(
    Item_Weight, Item_Identifier, is.na(Item_Weight)
  )
]

# Zero visibility is treated as missing. The replacement is the mean of the
# item's positive visibilities only: averaging over the zeros being replaced
# (as a row-by-row loop would) biases the result low and makes it depend on
# row order.
combi[
  ,
  Item_Visibility := impute_group_mean(
    Item_Visibility, Item_Identifier, Item_Visibility == 0
  )
]

# Encode categorical variables ----

# Ordinal codes. Any value other than the two named levels falls into the
# final branch and is coded 2.
combi[
  ,
  Outlet_Size_num := ifelse(
    Outlet_Size == "Small", 0, ifelse(Outlet_Size == "Medium", 1, 2)
  )
]
combi[
  ,
  Outlet_Location_Type_num := ifelse(
    Outlet_Location_Type == "Tier 3", 0,
    ifelse(Outlet_Location_Type == "Tier 2", 1, 2)
  )
]
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL]

# One-hot encode everything except the identifier and the two columns that
# are dropped from the model input altogether.
ohe_cols <- setdiff(
  names(combi),
  c("Item_Identifier", "Outlet_Establishment_Year", "Item_Type")
)
ohe_input <- combi[, ohe_cols, with = FALSE]
ohe_1 <- dummyVars("~.", data = ohe_input, fullRank = TRUE)
ohe_df <- data.table(predict(ohe_1, ohe_input))
combi <- cbind(combi[, "Item_Identifier"], ohe_df)

# Skewness ----

# price_per_unit_wt is not created anywhere in this script, so it is only
# inspected when an earlier step has added it.
skew_cols <- intersect(
  c("Item_Visibility", "price_per_unit_wt"), names(combi)
)
for (skew_col in skew_cols) {
  print(skewness(combi[[skew_col]]))
}
combi[, Item_Visibility := log1p(Item_Visibility)]

# Centre and scale numeric predictors ----

num_vars <- which(vapply(combi, is.numeric, logical(1)))
num_vars_names <- names(num_vars)
# The target is left on its original scale.
scale_cols <- setdiff(num_vars_names, "Item_Outlet_Sales")
combi_numeric <- combi[, scale_cols, with = FALSE]
prep_num <- preProcess(combi_numeric, method = c("center", "scale"))
combi_numeric_norm <- predict(prep_num, combi_numeric)
combi[, (scale_cols) := NULL]
combi <- cbind(combi, combi_numeric_norm)

# Split back into train and test ----

# A logical mask stays correct when either part has zero rows, unlike
# 1:n / (n + 1):N ranges.
is_train_row <- seq_len(nrow(combi)) <= n_train
train <- combi[is_train_row]
test <- combi[!is_train_row]
test[, Item_Outlet_Sales := NULL]

# XGBoost ----

param_list <- list(
  # "reg:linear" is the deprecated name of this objective.
  objective = "reg:squarederror",
  eta = 0.01,
  gamma = 1,
  max_depth = 6,
  subsample = 0.8,
  colsample_bytree = 0.5
)

feature_cols <- setdiff(
  names(train), c("Item_Identifier", "Item_Outlet_Sales")
)
Dtrain <- xgb.DMatrix(
  data = as.matrix(train[, feature_cols, with = FALSE]),
  label = train$Item_Outlet_Sales
)
Dtest <- xgb.DMatrix(data = as.matrix(test[, feature_cols, with = FALSE]))

set.seed(112)
xgbcv <- xgb.cv(
  params = param_list,
  data = Dtrain,
  nrounds = 1000,
  nfold = 5,
  print_every_n = 10,
  early_stopping_rounds = 30,
  maximize = FALSE
)

# Train for the round count picked by cross-validation rather than a number
# copied from an earlier run. 428 (the previously hard-coded value) remains
# the fallback when the CV result exposes no best iteration.
# NOTE(review): the field is `best_iteration` in the 1.x R package docs and
# appears to live under `early_stop` in newer releases -- confirm against
# the installed xgboost version.
best_nrounds <- xgbcv$best_iteration
if (is.null(best_nrounds)) {
  best_nrounds <- xgbcv$early_stop$best_iteration
}
if (is.null(best_nrounds)) {
  best_nrounds <- 428
}

xgb_model <- xgb.train(
  data = Dtrain, params = param_list, nrounds = best_nrounds
)
print(xgb_model)