########### Training data (rankings only, no dates): con = url("http://www.tau.ac.il/~saharon/StatsLearn2011/train_ratings_all.dat") X.tr = read.table (con) con = url("http://www.tau.ac.il/~saharon/StatsLearn2011/train_y_rating.dat") y.tr = read.table (con) con = url("http://www.tau.ac.il/~saharon/StatsLearn2011/movie_titles.txt") titles = read.table(con,sep=",") names(X.tr) = substr(as.character(titles[,2]),1,15) movies = substr(as.character(titles[,2]),1,15) ########### Divide training data into training and validation n = dim(X.tr)[1] nva=2000 va.id = sample (n,nva) # choose 2000 points for validation trtr = data.frame (X = X.tr[-va.id,],y=y.tr[-va.id,]) #va = data.frame (X = X.tr[va.id,], yda=y.da.tr[va.id,], y=y.tr[va.id,]) va = data.frame (X = X.tr[va.id,],y=y.tr[va.id,]) ########### baseline RMSE sqrt(mean((va$y-mean(trtr$y))^2)) ########### regression on all rankings (with missing = 0!!!!) lin.mod = lm (y~.,data=trtr) ########### in-sample RMSE lin.insamp = predict (lin.mod) sqrt(mean((trtr$y-lin.insamp)^2)) ########### RMSE on validation data lin.pred = predict (lin.mod, newdata=va) sqrt(mean((va$y-lin.pred)^2)) ########### rankings can't be higher than 5! lin.pred.cap = pmin (lin.pred,5) sqrt(mean((va$y-lin.pred.cap)^2)) linreg = sqrt(mean((va$y-lin.pred.cap)^2)) ######################## Tag missing data for the tree is.na(X.tr) = X.tr==0 trtr = data.frame (X = X.tr[-va.id,],y=y.tr[-va.id,]) ntrtr=dim(trtr)[1] ptrtr=dim(trtr)[2]-1 va = data.frame (X = X.tr[va.id,],y=y.tr[va.id,]) library(rpart) ######### build toy tree tree.mod= rpart (y~.,data=trtr,cp=0.02) summary(tree.mod) plot (tree.mod) text(tree.mod, all=T,use.n=T) ######### proper methodology: start big, then prune tree.mod= rpart (y~.,data=trtr,cp=0.0001) # evaluate pruning sequence tree.res = printcp(tree.mod) #1-SE rule chosen.prune = min ((1:dim(tree.res)[1]) [tree.res[,"xerror"] < min(tree.res[,"xerror"]+tree.res[,"xstd"])]) tree.prune = prune(tree.mod, cp=tree.res[chosen.prune,"CP"]) # predict validation pred= predict(tree.prune, newdata=va) cat ("chosen pruning level: cp ", tree.res[chosen.prune,"CP"], " nodes ", tree.res[chosen.prune,"nsplit"]+1, " RMSE: ", sqrt(mean ((pred-va$y)^2)), "\n") # compare to prediction from big tree pred= predict(tree.mod, newdata=va) cat("RMSE for full tree:", sqrt(mean ((pred-va$y)^2)),"\n") ######### Bagging: ####### Bagging small trees err.small=NULL pred = numeric(dim(va)[1]) for (i in 1:100){ use = sample (ntrtr,ntrtr,rep=T) tree.mod= rpart (y~.,data=trtr[use,],cp=0.01) pred = pred + predict(tree.mod, newdata=va) if (i%%10==0) cat (i, sqrt(mean ((pred/i-va$y)^2)),"\n") err.small = c(err.small, sqrt(mean ((pred/i-va$y)^2))) } ####### Bagging bigger trees err.med=NULL pred = numeric(dim(va)[1]) for (i in 1:100){ use = sample (ntrtr,ntrtr,rep=T) tree.mod= rpart (y~.,data=trtr[use,],cp=0.001) pred = pred + predict(tree.mod, newdata=va) if (i%%10==0) cat (i, sqrt(mean ((pred/i-va$y)^2)),"\n") err.med = c(err.med, sqrt(mean ((pred/i-va$y)^2)))} ##### results: plot (1:100, err.small, xlab="Bagging iterations", ylab="RMSE", ylim=c(0.75, 0.85), type="l", lty=2) lines (1:100, err.med,lty=3) lines (c(1,100), c(linreg, linreg),lty=1) legend ("topright",legend=c("Linear regression", "Bagging CP=0.01", "Bagging CP=0.001"),lty=c(1,2,3)) err.big=NULL ############ Yet bigger trees #sink("bagging.txt") ######## Bagging! pred = numeric(dim(va)[1]) for (i in 1:100){ use = sample (ntrtr,ntrtr,rep=T) tree.mod= rpart (y~.,data=trtr[use,],cp=0.0001) pred = pred + predict(tree.mod, newdata=va) #cat (i, sqrt(mean ((pred/i-va$y)^2)),"\n")} err.big = c(err.big, sqrt(mean ((pred/i-va$y)^2)))} #sink() lines (1:100, err.big,lty=4) legend ("topright",legend=c("Linear regression", "Bagging CP=0.01", "Bagging CP=0.001", "Bagging CP=0.0001"),lty=c(1,2,3,4)) ################# plot to file pdf("Bagging.pdf") plot (1:100, err.small, xlab="Bagging iterations", ylab="RMSE", ylim=c(0.75, 0.85), type="l", lty=2) lines (1:100, err.med,lty=3) lines (c(1,100), c(linreg, linreg),lty=1) lines (1:100, err.big,lty=4) legend ("topright",legend=c("Linear regression", "Bagging CP=0.01", "Bagging CP=0.001", "Bagging CP=0.0001"),lty=c(1,2,3,4)) dev.off()