########### Training data (rankings only, no dates):
# All input files live under the same course directory; read each one through
# a single helper so the base URL is written once.  read.table() opens and
# closes the url() connection itself.
data.url <- "http://www.tau.ac.il/~saharon/StatsLearn2011/"
read.course.table <- function(file, ...) {
  read.table(url(paste0(data.url, file)), ...)
}

# X.tr = read.course.table("train_ratings_all.dat", colClasses = "factor")
X.tr <- read.course.table("train_ratings_all.dat")
y.tr <- read.course.table("train_y_rating.dat")
y.da.tr <- read.course.table("train_y_date.dat")
titles <- read.course.table("movie_titles.txt", sep = ",")

# Label the rating columns with the first 10 characters of each movie title.
names(X.tr) <- substr(as.character(titles[, 2]), 1, 10)

########### Get to know our data a little:
table(y.tr)                                # What rankings does our target get?
colMeans(data.frame(X.tr[, 1:14], y.tr))   # Which movies are liked?
cor(y.tr, X.tr[, 1:14])                    # which movies correlated with Miss Congeniality?
colSums(X.tr == 0)                         # how many missing? (0 codes a missing rating)
cor(y.tr, y.da.tr)                         # changes with time?

########### Divide training data into training and validation
n <- nrow(X.tr)
nva <- 2000
ntr <- n - nva
# Choose 2000 points for validation.  The split is random; call set.seed()
# before running this script if a reproducible split is needed.
va.id <- sample(n, nva)
# trtr = data.frame (X = X.tr[-va.id,], yda=y.da.tr[-va.id,], y=y.tr[-va.id,])
trtr <- data.frame(X = X.tr[-va.id, ], y = y.tr[-va.id, ])
# va = data.frame (X = X.tr[va.id,], yda=y.da.tr[va.id,], y=y.tr[va.id,])
va <- data.frame(X = X.tr[va.id, ], y = y.tr[va.id, ])

################### KNN
va.X <- X.tr[va.id, ]
tr.X <- X.tr[-va.id, ]

# Matrix copies of the two rating tables, converted once and reused below.
va.mat <- as.matrix(va.X)
tr.mat <- as.matrix(tr.X)

# exact.match[i, j] = number of movies that validation row i and training
# row j both rated with the same value in 1..5.
exact.match <- matrix(0, nrow = nva, ncol = ntr)
for (r in 1:5) {
  exact.match <- exact.match + (va.mat == r) %*% t(tr.mat == r)
}
# both.missing[i, j] = number of movies that are 0 (missing) in both rows.
both.missing <- (va.mat == 0) %*% t(tr.mat == 0)

# Method 1: similarity = number of identical entries, missing included.
eq <- exact.match + both.missing

# Method 2: exact matches on 1..5 count fully; a movie missing in the
# validation row scores 0.2 against any training entry >= 0, plus a further
# 0.3 when it is missing in the training row as well.
# NOTE(review): if every entry of tr.X is >= 0, the 0.2 term is constant within
# each validation row and cannot change the neighbour ordering -- confirm
# whether `tr.X > 0` was intended.
eq1 <- exact.match +
  0.2 * (va.mat == 0) %*% t(tr.mat >= 0) +
  0.3 * both.missing

# Method 3: squared Euclidean distance, expanded as |a|^2 - 2 a.b + |b|^2.
va.n <- rowSums(va.X^2)
tr.n <- rowSums(tr.X^2)
eq2 <- matrix(data = va.n, nrow = nva, ncol = ntr, byrow = FALSE) -
  2 * va.mat %*% t(tr.mat) +
  matrix(data = tr.n, nrow = nva, ncol = ntr, byrow = TRUE)

# Row i of each oo* matrix lists training-row indices from nearest to
# farthest: similarities sort decreasing, distances sort increasing.
oo <- t(apply(eq, 1, order, decreasing = TRUE))
oo1 <- t(apply(eq1, 1, order, decreasing = TRUE))
oo2 <- t(apply(eq2, 1, order))

# Mean response of the k nearest training rows for every validation row.
#   ord: matrix whose row i holds training indices ordered nearest-first
#   y:   response vector of the training rows
#   k:   number of neighbours to average
# Returns a numeric vector with one prediction per row of `ord`.
knn.mean <- function(ord, y, k) {
  idx <- ord[, seq_len(k), drop = FALSE]
  # y[...] comes back in column-major order, so reshaping to nrow(idx) rows
  # puts the k neighbour responses of validation row i on row i.
  rowMeans(matrix(y[as.vector(idx)], nrow = nrow(idx)))
}

# Validation RMSE of each method for a range of neighbourhood sizes.
nk <- c(1, 5, 10, 20, 50, 100, 500)
kerr <- kerr1 <- kerr2 <- numeric(length(nk))
for (j in seq_along(nk)) {
  k <- nk[j]
  pred <- knn.mean(oo, trtr$y, k)
  pred1 <- knn.mean(oo1, trtr$y, k)
  pred2 <- knn.mean(oo2, trtr$y, k)
  err <- sqrt(mean((pred - va$y)^2))
  err1 <- sqrt(mean((pred1 - va$y)^2))
  err2 <- sqrt(mean((pred2 - va$y)^2))
  cat(k, ": err ", err, "err1", err1, "err2", err2, "\n")
  kerr[j] <- err
  kerr1[j] <- err1
  kerr2[j] <- err2
}

# RMSE against k (log-scaled x axis), one line per method.
plot(nk, kerr, type = "l", col = 1, log = "x", ylim = c(0.75, 0.95))
lines(nk, kerr1, col = 2)
lines(nk, kerr2, col = 3)
legend("topright", legend = c("Method 1", "Method 2", "Method 3"),
       lty = c(1, 1, 1), col = c(1, 2, 3))