SC <- read.csv("semiconductor.csv") x <- subset(SC, select=-FAIL) y <- SC\$FAIL==1 n <- nrow(SC) p <- ncol(x) ## full and cut models system.time(full <- glm(y ~ ., data=x, family=binomial)) 1 - full\$deviance/full\$null.deviance o <- order(-abs(cor(x,y)))[1:25] cut <- glm(y ~ ., data=x[,o], family=binomial) 1 - cut\$deviance/cut\$null.deviance ## define binomial deviance bindev <- function(x,y,p) -2*(sum(log(p[y>0])) + sum(log(1-p[y<=0]))) ## test it bindev(x,y,full\$fit) full\$deviance ## and R2 binR2 <- function(x,y,fit){ ybar <- mean(y) p <- predict(fit,x,type="response") 1 - bindev(x,y,p)/bindev(x,y,rep(ybar,length(y))) } ## test it binR2(x,y,full) ## Out of sample prediction experiment R2 <- list() nfold <- 10 # use more "folds" to get a better picture isamp <- sample.int(nrow(x)) # random order foldsize <- floor(nrow(x)/nfold) for(i in 1:nfold){ train <- isamp[-(foldsize*(i-1) + 1:foldsize)] rfull <- glm(y[train] ~., data=x[train,], family=binomial) rcut <- glm(y[train]~., data=x[train,o], family=binomial) R2\$full <- c(R2\$full, binR2(x[-train,],y[-train],rfull)) R2\$cut <- c(R2\$cut, binR2(x[-train,o],y[-train],rcut)) cat(i) } boxplot(R2, col="plum", ylab="R2", xlab="model") #### FDR #### q <- 0.1 ## grab and plot pvalues pvals <- summary(full)\$coef[-1,4] #-1 to drop the intercept hist(pvals, xlab="p-value", main="Semiconductor Signals") ## rank them j <- rank(pvals, ties.method="min") ## plot the ranks and the FDR line plot(j,pvals, log="xy", pch=20,col=8,xlab="rank") lines(1:p, q*(1:p)/p) ## find cutoff and signif, add to plot cutoff <- max(pvals[pvals < q*j/p]) signif <- pvals<=cutoff colnames(x)[signif] ## significant at q=0.1 points(j[signif],pvals[signif],col="red") #### naive stepwise #### null <- glm(y~1, data=as.data.frame(x)) system.time(fwd <- step(null, scope=formula(full), dir="forward",k=log(nrow(x)))) #### FDR #### ## lasso (glmnet does L1-L2, gamlr does L0-L1) library(gamlr) ## format the data as a 'sparse matrix' xsparse <- Matrix(as.matrix(x)) ## time for a single path fit system.time(sclasso <- gamlr(xsparse, y, family="binomial")) ## cross validation sccv <- cv.gamlr(xsparse, y, family="binomial") par(mfrow=c(1,2)) plot(sccv\$g) plot(sccv) ## concave penalty version (see help(gamlr) + help(cv.gamlr) for more) scconcave <- gamlr(xsparse, y, family="binomial",varpen=10) plot(scconcave)