# Support vector regression (RBF kernel) on the movie-ratings training data.
#
# Steps:
#   1. Download the ratings matrix, the response and the movie titles.
#   2. Hold out `nva` random rows as a validation set.
#   3. Sweep the RBF `gamma` parameter and plot train/validation RMSE.
#   4. Sweep the `cost` parameter at a fixed gamma and plot train/validation
#      RMSE.
#
# The split uses sample() without a seed, so results differ between runs;
# call set.seed() before sourcing this script if reproducibility is needed.

########### Training data (rankings only, no dates):
con <- url("http://www.tau.ac.il/~saharon/StatsLearn2022/train_ratings_all.dat")
X.tr <- read.table(con)
con <- url("http://www.tau.ac.il/~saharon/StatsLearn2022/train_y_rating.dat")
y.tr <- read.table(con)
con <- url("http://www.tau.ac.il/~saharon/StatsLearn2022/movie_titles.txt")
titles <- read.table(con, sep = ",")

# Column names are the first 15 characters of each title (second column of
# the titles file).
movies <- substr(as.character(titles[, 2]), 1, 15)
names(X.tr) <- movies

########### Divide training data into training and validation
n <- nrow(X.tr)
nva <- 2000
va.id <- sample(n, nva) # choose 2000 points for validation
trtr <- data.frame(X = X.tr[-va.id, ], y = y.tr[-va.id, 1])
ntrtr <- nrow(trtr)
ptrtr <- ncol(trtr) - 1
va <- data.frame(X = X.tr[va.id, ], y = y.tr[va.id, 1])

library("e1071")

# Root mean squared error between predictions and observed values.
rmse <- function(pred, obs) {
  sqrt(mean((pred - obs)^2))
}

# `linreg` is the baseline RMSE drawn as the dotted reference line and used to
# centre the y-axis of both plots. It is not assigned anywhere in this script,
# so reuse it when an earlier script left it in the workspace and otherwise
# compute a fallback on the same split.
# NOTE(review): the fallback assumes the baseline is the validation RMSE of an
# ordinary least-squares fit on `trtr` -- confirm against the script that
# originally defined `linreg`.
if (!exists("linreg", mode = "numeric")) {
  mod.lin <- lm(y ~ ., data = trtr)
  linreg <- rmse(predict(mod.lin, newdata = va), va$y)
}

######################## Kernel support vector regression

# --- Sweep gamma (cost left at the svm() default) ---
gamma.vals <- exp((-10):(-1))
err.rbf <- numeric(length(gamma.vals))    # validation RMSE per gamma
err.tr.rbf <- numeric(length(gamma.vals)) # training RMSE per gamma

for (i in seq_along(gamma.vals)) {
  ga <- gamma.vals[i]
  mod.svm <- svm(y ~ ., data = trtr, type = "eps-regression", gamma = ga)
  pr.svm <- predict(mod.svm, newdata = va)
  pr.tr.svm <- predict(mod.svm) # fitted values on the training rows
  err.rbf[i] <- rmse(pr.svm, va$y)
  err.tr.rbf[i] <- rmse(pr.tr.svm, trtr$y)
  cat("RBF kernel, gamma=", ga, " train:", err.tr.rbf[i],
      " test:", err.rbf[i], "\n")
}

plot(gamma.vals, err.rbf,
     main = "Support vector regression-RBF kernel",
     xlab = "Gamma", ylab = "RMSE",
     ylim = c(linreg - 0.03, linreg + 0.05), type = "l", log = "x")
lines(gamma.vals, err.tr.rbf, lty = 2)
lines(gamma.vals, rep(linreg, length(gamma.vals)), lty = 3)
legend("topleft",
       legend = c("linear regression", "SVR+RBF - valid", "SVR+RBF - train"),
       lty = c(3, 1, 2))

# --- Sweep cost at fixed gamma = 0.003 ---
cost.vals <- exp(((-10):(10)) / 10)
err.rbf <- numeric(length(cost.vals))    # validation RMSE per cost
err.tr.rbf <- numeric(length(cost.vals)) # training RMSE per cost

for (i in seq_along(cost.vals)) {
  co <- cost.vals[i]
  mod.svm <- svm(y ~ ., data = trtr, type = "eps-regression",
                 gamma = 3 * 10^(-3), cost = co)
  pr.svm <- predict(mod.svm, newdata = va)
  pr.tr.svm <- predict(mod.svm)
  err.rbf[i] <- rmse(pr.svm, va$y)
  err.tr.rbf[i] <- rmse(pr.tr.svm, trtr$y)
  cat("RBF kernel, co=", co, " train:", err.tr.rbf[i],
      " test:", err.rbf[i], "\n")
}

# The x-axis must be cost.vals here: the error vectors hold one entry per cost
# value, so plotting them against gamma.vals fails on the length mismatch.
plot(cost.vals, err.rbf,
     main = "Support vector regression-RBF kernel Gamma=0.003",
     xlab = "Cost", ylab = "RMSE",
     ylim = c(linreg - 0.05, linreg + 0.05), type = "l", log = "x")
lines(cost.vals, err.tr.rbf, lty = 2)
lines(cost.vals, rep(linreg, length(cost.vals)), lty = 3)
legend("topleft",
       legend = c("linear regression", "SVR+RBF - valid", "SVR+RBF - train"),
       lty = c(3, 1, 2))