########### Training data (rankings only, no dates) ----
# read.table() accepts URL strings directly, so explicit url() connections
# are unnecessary.
data.url <- "http://www.tau.ac.il/~saharon/StatsLearn2017"
X.tr <- read.table(paste0(data.url, "/train_ratings_all.dat"))
y.tr <- read.table(paste0(data.url, "/train_y_rating.dat"))
titles <- read.table(paste0(data.url, "/movie_titles.txt"), sep = ",")

# Truncated movie titles used both as column names and kept for later use.
movies <- substr(as.character(titles[, 2]), 1, 15)
names(X.tr) <- movies

########### Divide training data into training and validation ----
set.seed(1)  # make the random split reproducible across runs
n <- nrow(X.tr)
nva <- 2000
va.id <- sample(n, nva)  # choose 2000 points for validation
trtr <- data.frame(X = X.tr[-va.id, ], y = y.tr[-va.id, ])
va <- data.frame(X = X.tr[va.id, ], y = y.tr[va.id, ])

########### Baseline RMSE: predict the training mean for every point ----
sqrt(mean((va$y - mean(trtr$y))^2))

########### Regression on all rankings (missing values coded as 0!) ----
lin.mod <- lm(y ~ ., data = trtr)

# In-sample RMSE
lin.insamp <- predict(lin.mod)
sqrt(mean((trtr$y - lin.insamp)^2))

# RMSE on validation data
lin.pred <- predict(lin.mod, newdata = va)
sqrt(mean((va$y - lin.pred)^2))
########### rankings can't be higher than 5!
# Ratings are capped at the maximum possible value of 5.
lin.pred.cap <- pmin(lin.pred, 5)
sqrt(mean((va$y - lin.pred.cap)^2))
linreg <- sqrt(mean((va$y - lin.pred.cap)^2))  # baseline RMSE for all plots below

library(rpart)

############# Gradient boosting of small regression trees ----
# Repeatedly fits a depth-limited tree to the current residuals and adds a
# shrunken correction (rate * tree prediction) to the ensemble.
# Returns per-step train and validation RMSE traces (preallocated, not grown).
boost.trees <- function(train, valid, rate = 0.01, nsteps = 2500,
                        maxdepth = 2, cp = 0.00001) {
  err <- err.tr <- numeric(nsteps)
  pred <- rep(mean(train$y), nrow(valid))     # running validation prediction
  tr.pred <- rep(mean(train$y), nrow(train))  # running training prediction
  y.now <- train$y - mean(train$y)            # current residuals
  for (i in seq_len(nsteps)) {
    tree.mod <- rpart(y.now ~ . - y, data = train,
                      maxdepth = maxdepth, cp = cp)
    pred <- pred + rate * predict(tree.mod, newdata = valid)
    tr.pred <- tr.pred + rate * predict(tree.mod)
    y.now <- train$y - tr.pred
    err[i] <- sqrt(mean((pred - valid$y)^2))
    err.tr[i] <- sqrt(mean((tr.pred - train$y)^2))
    if (i %% 10 == 0) {
      cat(i, "train:", err.tr[i], " test:", err[i], "\n")
    }
  }
  list(err = err, err.tr = err.tr)
}

# Plot the boosting RMSE traces against the linear-regression baseline.
plot.boost.err <- function(res, main.title) {
  nsteps <- length(res$err)
  plot(seq_len(nsteps), res$err, main = main.title,
       xlab = "number of steps", ylab = "RMSE",
       ylim = c(linreg - 0.03, linreg + 0.05), type = "l")
  lines(seq_len(nsteps), res$err.tr, lty = 2)
  lines(c(1, nsteps), c(linreg, linreg), lty = 3)
  legend("topright",
         legend = c("linear regression", "boost - valid", "boost - train"),
         lty = c(3, 1, 2))
}

############# Boost trees with missing = 0 ----
res.miss0 <- boost.trees(trtr, va)
#pdf ("//goliath.tau.ac.il/saharon/public_html/StatsLearn2017/boost-miss0.pdf")
plot.boost.err(res.miss0, "Boosting with alpha=0.01, maxdepth=2 and miss=0")
#dev.off()

######################## Tag missing data (coded 0) as NA for the trees ----
# rpart handles NA predictors via surrogate splits, unlike lm.
X.tr.tree <- X.tr
X.tr.tree[X.tr.tree == 0] <- NA
tr.tree <- data.frame(X = X.tr.tree[-va.id, ], y = y.tr[-va.id, 1])
va.tree <- data.frame(X = X.tr.tree[va.id, ], y = y.tr[va.id, 1])
ntrtr <- nrow(tr.tree)
ptrtr <- ncol(tr.tree) - 1

############# Boost trees with missing = NA ----
res.missNA <- boost.trees(tr.tree, va.tree)
#pdf ("//goliath.tau.ac.il/saharon/public_html/StatsLearn2017/boost-missNA.pdf")
# BUG FIX: this title previously said "miss=0", copy-pasted from the plot above.
plot.boost.err(res.missNA, "Boosting with alpha=0.01, maxdepth=2 and miss=NA")
#dev.off()

library("e1071")

######################## Kernel support vector regression ----
# Sweep the RBF kernel width (gamma) at the default cost.
# NOTE: type spelled out as "eps-regression" instead of relying on the
# partial match "ep".
err.rbf <- err.tr.rbf <- NULL
gamma.vals <- exp((-10):(-1))
for (ga in gamma.vals) {
  mod.svm <- svm(y ~ ., data = trtr, type = "eps-regression", gamma = ga)
  pr.svm <- predict(mod.svm, newdata = va)
  pr.tr.svm <- predict(mod.svm)
  err.rbf <- c(err.rbf, sqrt(mean((pr.svm - va$y)^2)))
  err.tr.rbf <- c(err.tr.rbf, sqrt(mean((pr.tr.svm - trtr$y)^2)))
  cat("RBF kernel, gamma=", ga,
      " train:", sqrt(mean((pr.tr.svm - trtr$y)^2)),
      " test:", sqrt(mean((pr.svm - va$y)^2)), "\n")
}
pdf("//goliath.tau.ac.il/saharon/public_html/StatsLearn2017/SVR-RBF.pdf")
plot(gamma.vals, err.rbf, main = "Support vector regression-RBF kernel",
     xlab = "Gamma", ylab = "RMSE",
     ylim = c(linreg - 0.03, linreg + 0.05), type = "l", log = "x")
lines(gamma.vals, err.tr.rbf, lty = 2)
lines(gamma.vals, rep(linreg, length(gamma.vals)), lty = 3)
legend("topleft",
       legend = c("linear regression", "SVR+RBF - valid", "SVR+RBF - train"),
       lty = c(3, 1, 2))
dev.off()

# Sweep the cost parameter at a fixed gamma = 0.003.
err.rbf <- err.tr.rbf <- NULL
cost.vals <- exp(((-10):(10)) / 10)
for (co in cost.vals) {
  mod.svm <- svm(y ~ ., data = trtr, type = "eps-regression",
                 gamma = 3 * 10^(-3), cost = co)
  pr.svm <- predict(mod.svm, newdata = va)
  pr.tr.svm <- predict(mod.svm)
  err.rbf <- c(err.rbf, sqrt(mean((pr.svm - va$y)^2)))
  err.tr.rbf <- c(err.tr.rbf, sqrt(mean((pr.tr.svm - trtr$y)^2)))
  cat("RBF kernel, co=", co,
      " train:", sqrt(mean((pr.tr.svm - trtr$y)^2)),
      " test:", sqrt(mean((pr.svm - va$y)^2)), "\n")
}
pdf("//goliath.tau.ac.il/saharon/public_html/StatsLearn2017/SVR-RBF-cost.pdf")
# BUG FIX: the x-axis previously used gamma.vals (length 10) against error
# vectors of length(cost.vals) (21), which errors on mismatched lengths.
plot(cost.vals, err.rbf,
     main = "Support vector regression-RBF kernel Gamma=0.003",
     xlab = "Cost", ylab = "RMSE",
     ylim = c(linreg - 0.05, linreg + 0.05), type = "l", log = "x")
lines(cost.vals, err.tr.rbf, lty = 2)
lines(cost.vals, rep(linreg, length(cost.vals)), lty = 3)
legend("topleft",
       legend = c("linear regression", "SVR+RBF - valid", "SVR+RBF - train"),
       lty = c(3, 1, 2))
dev.off()