########### Training data (rankings only, no dates): con = url("http://www.tau.ac.il/~saharon/StatsLearn2022/train_ratings_all.dat") X.tr = read.table (con) con = url("http://www.tau.ac.il/~saharon/StatsLearn2022/train_y_rating.dat") y.tr = read.table (con) con = url("http://www.tau.ac.il/~saharon/StatsLearn2022/movie_titles.txt") titles = read.table(con,sep=",") names(X.tr) = substr(as.character(titles[,2]),1,15) movies = substr(as.character(titles[,2]),1,15) ########### Divide training data into training and validation n = dim(X.tr)[1] nva=2000 va.id = sample (n,nva) # choose 2000 points for validation trtr = data.frame (X = X.tr[-va.id,],y=y.tr[-va.id,]) ntrtr=dim(trtr)[1] ptrtr=dim(trtr)[2]-1 va = data.frame (X = X.tr[va.id,],y=y.tr[va.id,]) ########## Look at PCA: X.now = as.matrix(X.tr[,1:14]) # center (no scaling!) m.now = apply (X.now, 2, mean) X.now = X.now - matrix (data=m.now, nrow=dim(X.now)[1], ncol=dim(X.now)[2], byrow=T) colnames(X.now) = movies[1:14] ## #temp=svd(X.now/sqrt(n)) # SVD (sqrt(n) to compare to normalized covariance matrix in princomp) temp1=princomp(X.now, cor=F) print (temp1$sdev) # sorted singlur values print (temp1$loadings) # loadings of top PCs #pdf ("pca.pdf") # some plots of PCA --- any patterns of "typical" users? par (mfrow=c(2,2)) plot (temp1$scores[1:100,1],temp1$scores[1:100,2],xlab="comp 1",ylab="comp 2") plot (temp1$scores[1:100,1],temp1$scores[1:100,3],xlab="comp 1",ylab="comp 3") plot (temp1$scores[1:100,2],temp1$scores[1:100,3],xlab="comp 2",ylab="comp 3") #dev.off() # Scores for Miss Congeniality from users that are high on 2nd PC table(y.tr[order (temp1$scores[,1])[1:100],1]) # Scores for Miss Congeniality from users that are low on 2nd PC table(y.tr[order (temp1$scores[,1],decreasing=T)[1:100],1]) tr.sc = data.frame(temp1$scores[-va.id,], y=y.tr[-va.id,]) va.sc = data.frame(temp1$scores[va.id,],y=y.tr[va.id,]) # Example ourtput for regression on 2 PCAs print(summary(lm(y~., data=tr.sc[,c(1:2,15)]))) ######### PCA regression on various number of PCAs for (npca in 1:14){ pca=lm(y~., data=tr.sc[,c(1:npca,15)]) va.pred = predict (pca,newdata=va.sc[,c(1:npca,15)]) cat (npca, "PCA, validation error:", sqrt(mean((va$y-va.pred)^2)),"\n")} #pdf("Ridge.pdf") # in case want output to file ########## Ridge library(MASS) lambda.vals = exp(seq(-15,10,by=0.1)) par(mfrow=c(2,3)) colnow=1 for (nuse in c(50,500,5000)){ use.id = sample (ntrtr,nuse) mods = lm.ridge(y~.,data=trtr[use.id,c(1:14,100)], lambda=lambda.vals) preds = as.matrix(va[,1:14]) %*% t(coef(mods)[,-1]) + rep(1,nva) %o% coef(mods)[,1] resids = matrix (data=va$y, nrow=dim(preds)[1], ncol=dim(preds)[2], byrow=F)-preds RSSs = apply (resids^2, 2, sum) par(mfg=c(1,colnow)) # plot on top row plot (mods$lambda, sqrt(RSSs/nva), main=paste("Ridge using ",nuse, " training data",sep=""),ylab="RMSE",xlab="lambda",log="x",ylim=c(0.8,1)) par(mfg=c(2,colnow)) # bottom row plot (apply(mods$coef^2, 2, sum), sqrt(RSSs/nva), ylab="RMSE",xlab="s",ylim=c(0.8,1)) colnow=colnow+1 } ########## Lasso using glmnet library(glmnet) #norm.vals = seq (10e-5, 1,by=0.01) lambda.vals = exp(seq(-15,10,by=0.1)) par(mfrow=c(3,3)) colnow=1 for (nuse in c(50,500,5000)){ use.id = sample (ntrtr,nuse) mods = glmnet(trtr[use.id,1:14], trtr[use.id,100], lambda=lambda.vals, family="gaussian", intercept = T, alpha=1) preds=predict(mods, newx=as.matrix(va[,1:14])) resids = apply (preds, 2, "-", va$y) RSSs = apply(resids^2, 2, sum) lambdas = mods$lambda norm.vals = apply(abs(mods$beta), 2, sum) non.zero = apply(mods$beta!=0, 2, sum) par(mfg=c(1,colnow)) plot (lambdas, sqrt(RSSs/nva), main=paste("Lasso using ",nuse, " training data",sep=""),ylab="RMSE",xlab="lambda",log="x",ylim=c(0.8,1)) par(mfg=c(2,colnow)) plot (norm.vals, sqrt(as.numeric(RSSs)/nva), ylab="RMSE",xlab="s",ylim=c(0.8,1)) par(mfg=c(3,colnow)) plot (norm.vals, non.zero, ylab="non-zero",xlab="s") colnow=colnow+1 } #dev.off() inpreds = predict (mods, newx=as.matrix(trtr[use.id,1:14])) inresids = -apply (inpreds, 2, "-", trtr$y[use.id]) coefs = mods$beta num.zero=apply(coefs==0,2,sum) bounds = apply(abs(coefs), 2, sum) prods = t(trtr[use.id,1:14])%*%inresids plot (bounds, num.zero)