# Prostate cancer data (Hastie et al., ESL): compare least-squares
# regression against k-NN regression on a held-out test set.

con <- url("https://web.stanford.edu/~hastie/ElemStatLearn/datasets/prostate.data")
prost <- read.csv(con, row.names = 1, sep = "\t")  # tab-separated despite .data extension
summary(prost)

plot(prost$age, prost$lcavol)  # single scatterplot
plot(prost)                    # all-vs-all scatterplot matrix

# Split on the logical 'train' indicator column shipped with the data.
prost.tr <- prost[prost$train, ]   # training observations
prost.te <- prost[!prost$train, ]  # test observations

# Least squares: lpsa on all predictors, excluding the train indicator.
prost.linreg <- lm(lpsa ~ . - train, data = prost.tr)
summary(prost.linreg)
pred.te <- predict(prost.linreg, newdata = prost.te)
mean((prost.te$lpsa - pred.te)^2)  # least-squares test MSE

#####################
# now let's do k-NN #
#####################
k.vals <- c(1, 5, 20, 40)

# Drop columns 9 and 10 (the response lpsa and the train indicator)
# so only the predictors remain.
X.te <- as.matrix(prost.te[, -c(9, 10)])
X.tr <- as.matrix(prost.tr[, -c(9, 10)])
nte <- nrow(prost.te)  # number of test observations
ntr <- nrow(prost.tr)  # number of training observations

# Squared Euclidean distances via ||a||^2 + ||b||^2 - 2 * a.b,
# computed for all test/train pairs with one matrix multiplication.
norm.te <- apply(X.te^2, 1, sum)  # squared norms of test rows
norm.tr <- apply(X.tr^2, 1, sum)  # squared norms of training rows
mat.norm.te <- matrix(norm.te, nrow = nte, ncol = ntr, byrow = FALSE)  # test norms down each column
mat.norm.tr <- matrix(norm.tr, nrow = nte, ncol = ntr, byrow = TRUE)   # training norms along each row
dist.te.tr <- mat.norm.te + mat.norm.tr - 2 * X.te %*% t(X.tr)

# Squared prediction error for every test point / k combination.
sq.err <- matrix(nrow = nte, ncol = length(k.vals))
colnames(sq.err) <- k.vals
for (i in seq_len(nte)) {
  neighbors <- order(dist.te.tr[i, ])  # training indices sorted by distance
  for (j in seq_along(k.vals)) {
    k <- k.vals[j]
    sq.err[i, j] <- (prost.te$lpsa[i] - mean(prost.tr$lpsa[neighbors[1:k]]))^2
  }
}
round(apply(sq.err, 2, summary), 3)
plot(k.vals, apply(sq.err, 2, mean), col = 2, type = "l",
     ylim = c(0, 2), ylab = "Test MSE", xlab = "No. neighbors")
###################################
# now again with standardization! #
###################################

# Scale each predictor by its TRAINING-set standard deviation (the test
# set must use the same scaling). Centering is unnecessary here: shifting
# all points by a constant does not change Euclidean distances.
sd.tr <- apply(X.tr, 2, sd)
Xs.tr <- X.tr / matrix(sd.tr, nrow = ntr, ncol = length(sd.tr), byrow = TRUE)
Xs.te <- X.te / matrix(sd.tr, nrow = nte, ncol = length(sd.tr), byrow = TRUE)

# Squared Euclidean distances via ||a||^2 + ||b||^2 - 2 * a.b.
norm.te <- apply(Xs.te^2, 1, sum)  # squared norms of scaled test rows
norm.tr <- apply(Xs.tr^2, 1, sum)  # squared norms of scaled training rows
mat.norm.te <- matrix(norm.te, nrow = nte, ncol = ntr, byrow = FALSE)  # test norms down each column
mat.norm.tr <- matrix(norm.tr, nrow = nte, ncol = ntr, byrow = TRUE)   # training norms along each row
dist.te.tr <- mat.norm.te + mat.norm.tr - 2 * Xs.te %*% t(Xs.tr)

# Squared prediction error for every test point / k combination.
sq.err <- matrix(nrow = nte, ncol = length(k.vals))
colnames(sq.err) <- k.vals
for (i in seq_len(nte)) {
  neighbors <- order(dist.te.tr[i, ])  # training indices sorted by distance
  for (j in seq_along(k.vals)) {
    k <- k.vals[j]
    sq.err[i, j] <- (prost.te$lpsa[i] - mean(prost.tr$lpsa[neighbors[1:k]]))^2
  }
}
round(apply(sq.err, 2, summary), 3)

# Overlay the standardized k-NN curve and the least-squares MSE baseline
# on the existing plot. (The original passed ylim= to abline(); that is
# not an abline argument and has no effect, so it is dropped.)
lines(k.vals, apply(sq.err, 2, mean), col = 3)
abline(mean((prost.te$lpsa - pred.te)^2), 0)  # horizontal line at LS test MSE
legend("topright", col = 1:3, lty = 1,
       legend = c("Least squares", "KNN (original)", "KNN (standardized)"))