# Exploratory script: load a training set, list predictors that are
# noticeably correlated with the response, fit a lasso path with `lars`,
# and run a small best-subset search with `leaps`.
#
# Layout assumed by the code below: every column of the CSV except the last
# is a predictor, and the last column is the response.

library(lars)
library(leaps)

# Read training data ----
con <- url("http://www.tau.ac.il/~saharon/BigData2021/train.csv")
train <- read.csv(con)

# Reorganize nicely ----
p <- ncol(train) - 1 # number of predictors
n <- nrow(train) # number of observations
X <- as.matrix(train[, seq_len(p), drop = FALSE])
Y <- as.numeric(train[, p + 1])

# Calculating correlations ----
# One vectorised call instead of two cor() calls per column; which() also
# skips NA correlations (e.g. constant columns) instead of failing in `if`.
cors <- cor(X, Y)[, 1]
for (j in which(cors > 0.3)) {
  cat(j, cors[j], "\n")
}

# Lasso modeling ----
mod <- lars(x = X, y = Y, type = "lasso", use.Gram = FALSE)

# Learn about the object in different ways:
names(mod)
summary(mod)
names(summary(mod))

# Which variables participate in the first 10 pieces of the path?
# which() reports the non-zero coefficient positions for whatever number of
# columns mod$beta has, and the row range is capped at the rows available.
n_steps <- min(10, nrow(mod$beta))
for (i in seq_len(n_steps)) {
  cat(i, ":", which(mod$beta[i, ] != 0), "\n")
}

# Variable selection ----
# Select up to four variables among the first 200 predictors (or all of them
# if there are fewer than 200), printing a timestamp before and after.
n_sub <- min(200, p)
print(Sys.time())
mod0 <- regsubsets(
  x = X[, seq_len(n_sub), drop = FALSE],
  y = Y,
  nvmax = 4,
  really.big = TRUE
)
print(Sys.time())

# Learn about the object:
names(mod0)
summary(mod0)
names(summary(mod0))

# Which variables did it choose? Variable 0 is the intercept, so column
# positions of the `which` matrix are shifted down by one.
chosen <- summary(mod0)$which
for (i in seq_len(nrow(chosen))) {
  cat(i, ":", which(chosen[i, ]) - 1, "\n")
}