# Prostate cancer data (Elements of Statistical Learning):
# load the data, split into train/test, fit a linear regression baseline,
# and set up the design matrices for k-NN regression.
# help(read.csv)  # example of getting help on a function

# Load the tab-separated dataset; first column holds row names.
con <- url("http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/prostate.data")
prost <- read.csv(con, row.names = 1, sep = "\t")
# Alternatively, from a local copy:
# setwd("C:/Documents and Settings/Administrator/Desktop")
# prost <- read.csv("prostate.data", row.names = 1, sep = "\t")

summary(prost)
plot(prost$age, prost$lcavol)  # standard scatter plot
plot(prost)                    # all-vs-all scatter plot matrix in the R window
# To send plots to a file instead of the screen:
# postscript("prost.ps")
# plot(prost)  # into a postscript file in the current directory
#              # (also see pdf(), jpeg(), etc.)
# dev.off()

# Split on the logical 'train' indicator column.
prost.tr <- prost[prost$train, ]   # training observations
prost.te <- prost[!prost$train, ]  # test observations

# Summarize one column directly; attach()/detach() is discouraged because
# attached columns can silently mask same-named variables in the workspace.
summary(prost.tr$age)

# Linear regression of lpsa on all predictors except the train indicator.
prost.linreg <- lm(lpsa ~ . - train, data = prost.tr)
summary(prost.linreg)
pred.te <- predict(prost.linreg, newdata = prost.te)
summary((prost.te$lpsa - pred.te)^2)  # distribution of test squared errors

# now let's do k-NN
k.vals <- c(1, 5, 20)
# Predictor matrices: drop columns 9 and 10
# (the response lpsa and the training indicator).
X.te <- as.matrix(prost.te[, -c(9, 10)])
X.tr <- as.matrix(prost.tr[, -c(9, 10)])
nte <- nrow(prost.te)  # how many test observations
ntr <- nrow(prost.tr)  # how many train observations
# Variables should be standardized for properly applying k-NN!
# k-NN regression evaluation, factored into a single function so the
# unstandardized and standardized runs share one implementation
# (the original script duplicated this whole section verbatim).
#
# For each test point: find its k nearest training points (Euclidean
# distance), predict the mean training response of those neighbors, and
# record the squared error. Returns summary statistics of the squared
# errors, one column per value of k.
#
# X.tr, y.tr : training predictor matrix and response vector
# X.te, y.te : test predictor matrix and response vector
# k.vals     : neighborhood sizes to evaluate
knn.sq.err <- function(X.tr, y.tr, X.te, y.te, k.vals) {
  nte <- nrow(X.te)
  ntr <- nrow(X.tr)
  # Squared distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b,
  # so one matrix multiplication replaces an explicit double loop.
  norm.te <- rowSums(X.te^2)
  norm.tr <- rowSums(X.tr^2)
  # each column repeats the norms of the test observations
  mat.norm.te <- matrix(data = norm.te, nrow = nte, ncol = ntr, byrow = FALSE)
  # each row repeats the norms of the TRAINING observations
  # (the original comment wrongly said "test" here)
  mat.norm.tr <- matrix(data = norm.tr, nrow = nte, ncol = ntr, byrow = TRUE)
  dist.te.tr <- mat.norm.te + mat.norm.tr - 2 * X.te %*% t(X.tr)
  # Squared prediction error for every test point and every k.
  sq.err <- matrix(nrow = nte, ncol = length(k.vals))
  for (i in seq_len(nte)) {
    neighbors <- order(dist.te.tr[i, ])  # training indices, nearest first
    for (j in seq_along(k.vals)) {
      k <- k.vals[j]
      sq.err[i, j] <- (y.te[i] - mean(y.tr[neighbors[1:k]]))^2
    }
  }
  apply(sq.err, 2, summary)
}

# k-NN on the raw (unstandardized) predictors.
knn.sq.err(X.tr, prost.tr$lpsa, X.te, prost.te$lpsa, k.vals)

# Now again with standardization: divide each column by its TRAINING-set
# standard deviation (the test set must be scaled by the same factors).
# Centering is unnecessary here — subtracting a common mean leaves all
# pairwise distances unchanged.
sd.tr <- apply(X.tr, 2, sd)
X.tr <- X.tr / matrix(data = sd.tr, nrow = ntr, ncol = length(sd.tr), byrow = TRUE)
X.te <- X.te / matrix(data = sd.tr, nrow = nte, ncol = length(sd.tr), byrow = TRUE)
knn.sq.err(X.tr, prost.tr$lpsa, X.te, prost.te$lpsa, k.vals)