#setwd("//taudisk.tau.ac.il/saharon/public_html/StatsLearn2011")
########### Training data (rankings only, no dates):
con = url("http://www.tau.ac.il/~saharon/StatsLearn2011/train_ratings_all.dat")
X.tr = read.table (con)
con = url("http://www.tau.ac.il/~saharon/StatsLearn2011/train_y_rating.dat")
y.tr = read.table (con)
con = url("http://www.tau.ac.il/~saharon/StatsLearn2011/movie_titles.txt")
titles = read.table(con,sep=",")
names(X.tr) = substr(as.character(titles[,2]),1,15)
movies = substr(as.character(titles[,2]),1,15)
########### Divide training data into training and validation
n = dim(X.tr)[1]
nva=2000
va.id = sample (n,nva) # choose 2000 points for validation
trtr = data.frame (X = X.tr[-va.id,],y=y.tr[-va.id,])
ntrtr=dim(trtr)[1]
ptrtr=dim(trtr)[2]-1
va = data.frame (X = X.tr[va.id,],y=y.tr[va.id,])
########## Look at PCA:
X.now = as.matrix(X.tr[,1:14])
# center (no scaling!)
m.now = apply (X.now, 2, mean)
X.now = X.now - matrix (data=m.now, nrow=dim(X.now)[1], ncol=dim(X.now)[2], byrow=T)
colnames(X.now) = movies[1:14]
##
#temp=svd(X.now/sqrt(n)) # SVD (sqrt(n) to compare to normalized covariance matrix in princomp)
temp1=princomp(X.now, cor=F)
print (temp1$sdev) # sorted singlur values
print (temp1$loadings) # loadings of top PCs
pdf ("pca.pdf")
# some plots of PCA --- any patterns of "typical" users?
par (mfrow=c(2,2))
plot (temp1$scores[1:100,1],temp1$scores[1:100,2],xlab="comp 1",ylab="comp 2")
plot (temp1$scores[1:100,1],temp1$scores[1:100,3],xlab="comp 1",ylab="comp 3")
plot (temp1$scores[1:100,2],temp1$scores[1:100,3],xlab="comp 2",ylab="comp 3")
dev.off()
# Scores for Miss Congeniality from users that are high on 2nd PC
table(y.tr[order (temp1$scores[,1])[1:100],1])
tr.sc = data.frame(temp1$scores[-va.id,], y=y.tr[-va.id,])
va.sc = data.frame(temp1$scores[va.id,],y=y.tr[va.id,])
# Example ourtput for regression on 2 PCAs
print(summary(lm(y~., data=tr.sc[,c(1:2,15)])))
######### PCA regression on various number of PCAs
for (npca in 1:14){
pca=lm(y~., data=tr.sc[,c(1:npca,15)])
va.pred = predict (pca,newdata=va.sc[,c(1:npca,15)])
cat (npca, "PCA, validation error:", sqrt(mean((va$y-va.pred)^2)),"\n")}
pdf("Ridge.pdf") # in case want output to file
########## Ridge
library(MASS)
lambda.vals = exp(seq(-15,10,by=0.1))
par(mfrow=c(2,3))
colnow=1
for (nuse in c(50,500,5000)){
use.id = sample (ntrtr,nuse)
mods = lm.ridge(y~.,data=trtr[use.id,c(1:14,100)], lambda=lambda.vals)
preds = as.matrix(va[,1:14]) %*% t(coef(mods)[,-1]) + rep(1,nva) %o% coef(mods)[,1]
resids = matrix (data=va$y, nrow=dim(preds)[1], ncol=dim(preds)[2], byrow=F)-preds
RSSs = apply (resids^2, 2, sum)
par(mfg=c(1,colnow)) # plot on top row
plot (mods$lambda, sqrt(RSSs/nva), main=paste("Ridge using ",nuse, " training data",sep=""),ylab="RMSE",xlab="lambda",log="x",ylim=c(0.8,1))
par(mfg=c(2,colnow)) # bottom row
plot (apply(mods$coef^2, 2, sum), sqrt(RSSs/nva), ylab="RMSE",xlab="s",ylim=c(0.8,1))
colnow=colnow+1
}
dev.off()
pdf("Lasso.pdf") # in case want output to file
########## Lasso
library(lasso2)
norm.vals = seq (10e-5, 1,by=0.01)
par(mfrow=c(2,3))
colnow=1
for (nuse in c(50,500,5000)){
use.id = sample (ntrtr,nuse)
mods = l1ce(y~.+1,data=trtr[use.id,c(1:14,100)], bound=norm.vals) # l1ce outputs a list of models
preds = sapply (mods, predict,newdata=va[,c(1:14,100)]) # example of lapply!
resids = apply (preds, 2, "-", va$y)
RSSs = apply(resids^2, 2, sum)
lambdas = as.numeric(sapply (mods, "[",5))
par(mfg=c(1,colnow))
plot (lambdas, sqrt(RSSs/nva), main=paste("Lasso using ",nuse, " training data",sep=""),ylab="RMSE",xlab="lambda",log="x",ylim=c(0.8,1))
par(mfg=c(2,colnow))
plot (norm.vals, sqrt(as.numeric(RSSs)/nva), ylab="RMSE",xlab="s",ylim=c(0.8,1))
colnow=colnow+1
}
dev.off()
inpreds = sapply (mods, predict)
inresids = -apply (inpreds, 2, "-", trtr$y[use.id])
coefs = sapply (mods, "[",1)
num.zero=apply(sapply(coefs,"==",0),2,sum)
bounds = sapply (mods, "[",4)
prods = t(trtr[use.id,1:14])%*%inresids
plot (bounds, num.zero)