########### Movie-rating prediction: regression vs. classification
#
# Downloads the course training data, holds out 2000 rows for validation and
# compares several ways of predicting the 1-5 rating stored in `y`:
#   * linear regression on the rating treated as a number,
#   * linear regression on 0/1 class indicators,
#   * multinomial logistic regression (nnet::multinom),
#   * linear discriminant analysis (MASS::lda).
# Every classifier yields two predictions -- the most likely class (pred1) and
# the probability-weighted "expected" score (pred2) -- both scored by RMSE on
# the validation set. Results are shown by top-level auto-printing, so run the
# script interactively or with source(..., echo = TRUE).

# Root mean squared error between observed and predicted ratings.
rmse <- function(obs, pred) {
  sqrt(mean((obs - pred)^2))
}

########### Training data (rankings only, no dates):
# read.table() opens an unopened connection itself and closes it when done.
con <- url("http://www.tau.ac.il/~saharon/StatsLearn2011/train_ratings_all.dat")
X.tr <- read.table(con)
con <- url("http://www.tau.ac.il/~saharon/StatsLearn2011/train_y_rating.dat")
y.tr <- read.table(con)
con <- url("http://www.tau.ac.il/~saharon/StatsLearn2011/train_y_date.dat")
y.da.tr <- read.table(con)
con <- url("http://www.tau.ac.il/~saharon/StatsLearn2011/movie_titles.txt")
titles <- read.table(con, sep = ",")
# Label each predictor column with the first 10 characters of its title.
names(X.tr) <- substr(as.character(titles[, 2]), 1, 10)

########### Divide training data into training and validation
n <- nrow(X.tr)
# NOTE: no seed is set, so the split (and every number printed below) differs
# between runs; call set.seed() before this line for a reproducible split.
va.id <- sample(n, 2000) # choose 2000 points for validation
# Variant that also carries the rating dates:
# trtr = data.frame (X = X.tr[-va.id,], yda=y.da.tr[-va.id,], y=y.tr[-va.id,])
# va = data.frame (X = X.tr[va.id,], yda=y.da.tr[va.id,], y=y.tr[va.id,])
# y.tr is a data frame; row-subsetting it with [rows, ] yields a plain vector
# as long as it has a single column.
trtr <- data.frame(X = X.tr[-va.id, ], y = y.tr[-va.id, ])
va <- data.frame(X = X.tr[va.id, ], y = y.tr[va.id, ])

#### reminder: linear regression analysis with grades as numbers
lin.mod <- lm(y ~ ., data = trtr)

########### RMSE on validation data
lin.pred <- predict(lin.mod, newdata = va)
rmse(va$y, lin.pred)

#### linear regression for classification
# Y is the n x 5 indicator matrix: Y[i, k] = 1 iff observation i has rating k.
Y <- matrix(0, nrow = nrow(trtr), ncol = 5)
for (cl in 1:5) {
  Y[trtr$y == cl, cl] <- 1
}
X <- as.matrix(cbind(1, X.tr[-va.id, ]))
# Least-squares coefficients from the normal equations; solving the system
# directly avoids forming the explicit inverse of t(X) %*% X.
B <- solve(crossprod(X), crossprod(X, Y))
Xva <- as.matrix(cbind(1, X.tr[va.id, ]))
preds <- Xva %*% B

# prediction as highest score
pred1 <- (1:5)[apply(preds, 1, which.max)]
summary(pred1)

# prediction as "expected" score (fitted scores normalised to sum to one)
pred2 <- as.numeric((preds %*% (1:5)) / rowSums(preds))
summary(pred2)

# similar?
cor(pred1, pred2)

# how do they do?
rmse(va$y, pred1)
rmse(va$y, pred2)

#### logistic regression analysis
library(nnet)
# multinom() has no `family` argument, so the `family = multinomial` that used
# to be passed here has been dropped.
lr.mod <- multinom(as.factor(y) ~ ., data = trtr, maxit = 300)
lr.pred <- predict(lr.mod, newdata = va)
summary(lr.pred)

# prediction as most likely class
# Go through as.character(): as.numeric() on a factor returns the internal
# level codes, which equal the ratings only if every rating occurs in trtr.
pred1 <- as.numeric(as.character(lr.pred))
summary(pred1)

# prediction as expected score
lr.post <- predict(lr.mod, newdata = va, type = "probs")
# Weight each probability column by the rating it is labelled with, rather
# than assuming the columns are exactly 1:5.
pred2 <- as.numeric(lr.post %*% as.numeric(colnames(lr.post)))
summary(pred2)

# similar?
cor(pred1, pred2)

# how do they do?
rmse(va$y, pred1)
rmse(va$y, pred2)

### running and interpreting simple logistic regression
# Binary model for "rating is 4" using only the first 14 predictor columns;
# the response column is located by name instead of a hard-coded position.
lr2.mod <- glm(y == 4 ~ ., family = binomial,
               data = trtr[, c(1:14, which(names(trtr) == "y"))])
summary(lr2.mod)

#### LDA
library(MASS)
lda.mod <- lda(as.factor(y) ~ ., data = trtr)
lda.pred <- predict(lda.mod, newdata = va)
summary(lda.pred)

# prediction as most likely class (same factor-to-number care as above)
pred1 <- as.numeric(as.character(lda.pred$class))
summary(pred1)

# prediction as expected score
lda.post <- lda.pred$posterior
pred2 <- as.numeric(lda.post %*% as.numeric(colnames(lda.post)))
summary(pred2)

# similar?
cor(pred1, pred2)

# how do they do?
rmse(va$y, pred1)
rmse(va$y, pred2)