# Prostate cancer data (Hastie et al., ESL): compare least-squares
# regression against k-NN regression on a held-out test set.

con <- url("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data")
prost <- read.csv(con, row.names = 1, sep = "\t")  # tab-separated despite .data extension
summary(prost)

plot(prost$age, prost$lcavol)  # single scatterplot
plot(prost)                    # all-vs-all scatterplot matrix

# Split on the logical 'train' indicator column shipped with the data.
prost.tr <- prost[prost$train, ]   # training observations
prost.te <- prost[!prost$train, ]  # test observations

# Least squares: lpsa on all predictors, excluding the train indicator.
prost.linreg <- lm(lpsa ~ . - train, data = prost.tr)
summary(prost.linreg)
pred.te <- predict(prost.linreg, newdata = prost.te)
mean((prost.te$lpsa - pred.te)^2)  # least-squares test MSE

#####################
# now let's do k-NN #
#####################
k.vals <- c(1, 5, 20, 40)

# Drop columns 9 and 10 (the response lpsa and the train indicator)
# so only the predictors remain.
X.te <- as.matrix(prost.te[, -c(9, 10)])
X.tr <- as.matrix(prost.tr[, -c(9, 10)])
nte <- nrow(prost.te)  # number of test observations
ntr <- nrow(prost.tr)  # number of training observations

# Squared Euclidean distances via ||a||^2 + ||b||^2 - 2 * a.b,
# computed for all test/train pairs with one matrix multiplication.
norm.te <- apply(X.te^2, 1, sum)  # squared norms of test rows
norm.tr <- apply(X.tr^2, 1, sum)  # squared norms of training rows
mat.norm.te <- matrix(norm.te, nrow = nte, ncol = ntr, byrow = FALSE)  # test norms down each column
mat.norm.tr <- matrix(norm.tr, nrow = nte, ncol = ntr, byrow = TRUE)   # training norms along each row
dist.te.tr <- mat.norm.te + mat.norm.tr - 2 * X.te %*% t(X.tr)

# Squared prediction error for every test point / k combination.
sq.err <- matrix(nrow = nte, ncol = length(k.vals))
colnames(sq.err) <- k.vals
for (i in seq_len(nte)) {
  neighbors <- order(dist.te.tr[i, ])  # training indices sorted by distance
  for (j in seq_along(k.vals)) {
    k <- k.vals[j]
    sq.err[i, j] <- (prost.te$lpsa[i] - mean(prost.tr$lpsa[neighbors[1:k]]))^2
  }
}
round(apply(sq.err, 2, summary), 3)
plot(k.vals, apply(sq.err, 2, mean), col = 2, type = "l",
     ylim = c(0, 2), ylab = "Test MSE", xlab = "No. neighbors")
###################################
# now again with standardization! #
###################################

# Scale each predictor by its TRAINING-set standard deviation (the test
# set must use the same scaling). Centering is unnecessary here: shifting
# all points by a constant does not change Euclidean distances.
sd.tr <- apply(X.tr, 2, sd)
Xs.tr <- X.tr / matrix(sd.tr, nrow = ntr, ncol = length(sd.tr), byrow = TRUE)
Xs.te <- X.te / matrix(sd.tr, nrow = nte, ncol = length(sd.tr), byrow = TRUE)

# Squared Euclidean distances via ||a||^2 + ||b||^2 - 2 * a.b.
norm.te <- apply(Xs.te^2, 1, sum)  # squared norms of scaled test rows
norm.tr <- apply(Xs.tr^2, 1, sum)  # squared norms of scaled training rows
mat.norm.te <- matrix(norm.te, nrow = nte, ncol = ntr, byrow = FALSE)  # test norms down each column
mat.norm.tr <- matrix(norm.tr, nrow = nte, ncol = ntr, byrow = TRUE)   # training norms along each row
dist.te.tr <- mat.norm.te + mat.norm.tr - 2 * Xs.te %*% t(Xs.tr)

# Squared prediction error for every test point / k combination.
sq.err <- matrix(nrow = nte, ncol = length(k.vals))
colnames(sq.err) <- k.vals
for (i in seq_len(nte)) {
  neighbors <- order(dist.te.tr[i, ])  # training indices sorted by distance
  for (j in seq_along(k.vals)) {
    k <- k.vals[j]
    sq.err[i, j] <- (prost.te$lpsa[i] - mean(prost.tr$lpsa[neighbors[1:k]]))^2
  }
}
round(apply(sq.err, 2, summary), 3)

# Overlay the standardized k-NN curve and the least-squares MSE baseline
# on the existing plot. (The original passed ylim= to abline(); that is
# not an abline argument and has no effect, so it is dropped.)
lines(k.vals, apply(sq.err, 2, mean), col = 3)
abline(mean((prost.te$lpsa - pred.te)^2), 0)  # horizontal line at LS test MSE
legend("topright", col = 1:3, lty = 1,
       legend = c("Least squares", "KNN (original)", "KNN (standardized)"))