### Selected R commands and output used in Chapter 4
da <- read.table("w-coilwti.txt",header=T)
yt <- diff(da[,4])
require(NTS)
m1 <- NNsetting(yt,nfore=200,lags=c(1:13))
names(m1)
X <- m1$X; y <- m1$y; predX <- m1$predX; predY <- m1$predY
m2 <- lm(y~-1+X)
yhat <- predX%*%matrix(m2$coefficients,13,1)
er1 <- yhat-predY
mean(er1^2); mean(abs(er1))
require(nnet)
m3 <- nnet(X,y,size=1,skip=T,linout=T,maxit=1000)
pm3 <- predict(m3,newdata=predX)
er2 <- pm3-predY
mean(er2^2); mean(abs(er2))
####
> amzn <- read.csv("Amzn.csv")
> gspc <- read.csv("GSPC.csv")
> dim(amzn)
[1] 2599    7
> Amzn <- diff(log(as.numeric(amzn$Adj.Close)))
> Gspc <- diff(log(as.numeric(gspc$Adj.Close)))
> damzn <- ifelse(Amzn >= 0,1,0)
> dgspc <- ifelse(Gspc >= 0,1,0)
> rtn <- cbind(Amzn,Gspc)
> drtn <- cbind(damzn,dgspc)
> zt <- cbind(drtn,rtn)
> source("NNsetting.R")
> m1 <- NNsetting(zt,locY=1,nfore=598,lags=c(1:3))
> names(m1)
[1] "X"     "y"     "predX" "predY"
> X <- m1$X; y <- m1$y; predX <- m1$predX; predY <- m1$predY
> nameX <- c("dAmzn1","dspc1","Amzn1","spc1","dAmzn2","dspc2","Amzn2","spc2","dAmzn3","dspc3","Amzn3","spc3")
> colnames(X) <- nameX
> colnames(predX) <- nameX
> require(deepnet)  ## deepnet package
> n1 <- nn.train(X,y,hidden=c(10))  ## Single hidden layer
> pn1 <- nn.predict(n1,predX)
> ts.plot(pn1)
> dpn1 <- ifelse(pn1 >= 0.5,1,0)
> table(predY,dpn1)
     dpn1
predY   0   1
    0 150 123
    1 160 165
#### The same result is obtained if the mean is used as the threshold.
> dpn1 <- ifelse(pn1 >= mean(pn1),1,0)
> table(predY,dpn1)
     dpn1
predY   0   1
    0 150 123
    1 160 165
> n2 <- nn.train(X,y,hidden=c(5,5))  ## Two hidden layers
> require(darch)  ## darch package
> d1 <- darch(X,y,layers=10)  ## Single hidden layer
> pk1 <- predict(d1,newdata=predX)
> ts.plot(pk1)
> pp1 <- ifelse(pk1 >= 0.5,1,0)
> table(predY,pp1)
     pp1
predY   0   1
    0 180  93
    1 201 124
> d3 <- darch(X,y,layers=c(12,5,5,1))  ## Two hidden layers of sizes 5 and 5.
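### Not part of the original output: a minimal sketch of summarizing the
### 2-by-2 classification tables above by an overall hit rate; the object
### name "tab" is illustrative only.
tab <- table(predY,dpn1)    # confusion table of predicted vs. realized direction
sum(diag(tab))/sum(tab)     # fraction of days whose direction is classified correctly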
####
> da <- read.table("taq-wba-cpch-feb06-2017.txt",header=T)
> dim(da)
[1] 29275     4
### create dummy variables
> c2 <- c3 <- c4 <- c5 <- c6 <- c7 <- rep(0,29275)
> nT <- nrow(da)
> idx <- c(1:nT)[da[,1]==2]
> c2[idx] <- 1
> idx <- c(1:nT)[da[,1]==3]
> c3[idx] <- 1
> idx <- c(1:nT)[da[,1]==4]
> c4[idx] <- 1
> idx <- c(1:nT)[da[,1]==5]
> c5[idx] <- 1
> idx <- c(1:nT)[da[,1]==6]
> c6[idx] <- 1
> idx <- c(1:nT)[da[,1]==7]
> c7[idx] <- 1
> Z <- cbind(da,c2,c3,c4,c5,c6,c7)
> source("NNsetting.R")
> n1 <- NNsetting(Z,nfore=2000,lags=c(1:3),include.lagY=F)
> X <- n1$X; y <- n1$y; predX <- n1$predX; predY <- n1$predY
> na <- c("pr","dur","size","c2","c3","c4","c5","c6","c7")
> na1 <- paste(na,"l1",sep=""); na2 <- paste(na,"l2",sep="")
> na3 <- paste(na,"l3",sep="")
> name <- c(na1,na2,na3)
> colnames(X) <- colnames(predX) <- name
> require(MASS)
> y <- as.factor(y)
> m1 <- polr(y~.,data=data.frame(X),method="probit")
> summary(m1)
Call:
polr(formula = y ~ ., data = data.frame(X), method = "probit")

Coefficients:
             Value Std. Error  t value
prl1   -1.901e+01    2.004434  -9.4855
durl1   1.284e-02    0.002678   4.7971
sizel1  2.194e-03    0.001322   1.6596
c2l1   -2.955e-01    0.101196  -2.9200
c3l1   -6.422e-01    0.106859  -6.0097
c4l1   -1.005e+00    0.112055  -8.9728
c5l1   -1.300e+00    0.122176 -10.6376
c6l1   -1.610e+00    0.133076 -12.0973
c7l1   -1.759e+00    0.188524  -9.3279
prl2   -9.389e+00    1.999849  -4.6951
durl2   5.934e-03    0.002753   2.1552
sizel2  2.265e-03    0.001336   1.6956
c2l2   -3.550e-01    0.102485  -3.4640
c3l2   -4.644e-01    0.108436  -4.2830
c4l2   -6.333e-01    0.113046  -5.6022
c5l2   -6.913e-01    0.123311  -5.6063
c6l2   -9.001e-01    0.133976  -6.7186
c7l2   -1.218e+00    0.189065  -6.4434
prl3   -4.302e+00    1.847475  -2.3286
durl3  -8.912e-04    0.002770  -0.3218
sizel3 -3.752e-04    0.001043  -0.3597
c2l3   -2.086e-02    0.100412  -0.2077
c3l3   -9.264e-02    0.105887  -0.8749
c4l3   -1.311e-01    0.110385  -1.1876
c5l3   -1.790e-01    0.120459  -1.4858
c6l3   -2.369e-01    0.130089  -1.8208
c7l3   -3.802e-01    0.184299  -2.0629

Intercepts:
     Value   Std. Error  t value
1|2  -4.3392    0.2111   -20.5586
2|3  -3.3606    0.2100   -16.0044
3|4  -2.8530    0.2098   -13.5971
4|5  -0.5979    0.2094    -2.8549
5|6  -0.0710    0.2093    -0.3391
6|7   0.8207    0.2091     3.9243

Residual Deviance: 54018.88
> pm1 <- predict(m1,newdata=predX,type="class")  ## prediction
> table(predY,pm1)
     pm1
predY    1    2    3    4    5    6    7
    1    0    0    0    4    0    0    0
    2    0    0    0  100    0    0    0
    3    0    0    0  180    0    0    0
    4    0    0    0 1437    0    0    0
    5    0    0    0  174    0    0    0
    6    0    0    0  100    0    0    0
    7    0    0    0    5    0    0    0
### Deep learning
> require(darch)
> y1 <- as.factor(y)
> nn1 <- darch(X,y1,layers=5)
> pnn1 <- predict(nn1,newdata=predX,type="raw")
> dim(pnn1)
[1] 2000    7
> x1 <- apply(pnn1,1,which.max)
> table(predY,x1)
     x1
predY    4    5
    1    4    0
    2   99    1
    3  171    9
    4 1420   17
    5  156   18
    6   99    1
    7    5    0
>
> nn2 <- darch(X,y1,layers=10)
> pnn2 <- predict(nn2,newdata=predX)
> x1 <- apply(pnn2,1,which.max)
> table(predY,x1)
     x1
predY    1    3    4    5
    1    3    0    1    0
    2    0    2   98    0
    3    0   47  129    4
    4    2   38 1389    8
    5    0   11  152   11
    6    0    5   95    0
    7    0    0    5    0
>
> nn3 <- darch(X,y1,layers=c(27,10,5,1))
> pnn3 <- predict(nn3,newdata=predX)
> x3 <- apply(pnn3,1,which.max)
> table(predY,x3)
     x3
predY    3    4    5
    1    0    4    0
    2    2   98    0
    3   46  123   11
    4   43 1367   27
    5    8  141   25
    6    3   95    2
    7    0    5    0
####
> da <- read.table("GDPC1.txt",header=T)
> gdp <- diff(log(da$rgdp))
> tdx <- da$year + (da$mon/12)
> tdx <- tdx[-1]
> plot(tdx,gdp,xlab='year',ylab='gdp-rate',type='l')
> length(gdp)
[1] 273
> gdp <- round(gdp*100,2)
> X <- cbind(gdp[4:273],gdp[3:272],gdp[2:271],gdp[1:270])
> colnames(X) <- c("gdp","gdp1","gdp2","gdp3")
> require(tree)
> t1 <- tree(gdp~.,data=data.frame(X))
> summary(t1)
Regression tree:
tree(formula = gdp ~ ., data = data.frame(X))
Number of terminal nodes:  11
Residual mean deviance:  0.6654 = 172.3 / 259
Distribution of residuals:
     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.
-3.27300 -0.46200 -0.03298  0.00000  0.51020  3.25700
> plot(t1)
> text(t1,pretty=0)
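### Not shown in the text: a sketch of a one-step-ahead forecast from the
### fitted regression tree, feeding the three most recent growth rates in as
### the lagged predictors; the object name "newx" is illustrative only.
newx <- data.frame(gdp1=gdp[273],gdp2=gdp[272],gdp3=gdp[271])
predict(t1,newdata=newx)   # forecast of the next quarterly growth rate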
####
> PM2.5 <- scan(file="d-Shanghai-1317.txt")
> tdx <- c(1:length(PM2.5))/365+2013
> par(mfcol=c(2,1))
> plot(tdx,PM2.5,xlab='year',ylab='PM2.5',type='l')
> acf(PM2.5,lag=800)
> m1 <- NNsetting(PM2.5,nfore=365,lags=c(1:10,365:370))
> names(m1)
[1] "X"     "y"     "predX" "predY"
> X <- m1$X; y <- m1$y; predX <- m1$predX; predY <- m1$predY
> n1 <- paste("L",1:10,sep="")
> n2 <- paste("L",365:370,sep="")
> colnames(X) <- colnames(predX) <- c(n1,n2)
> m2 <- lm(y~.,data=data.frame(X))
> summary(m2)
Call:
lm(formula = y ~ ., data = data.frame(X))

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  11.330124   2.605231   4.349 1.53e-05 ***
L1            0.487490   0.033792  14.426  < 2e-16 ***
L2           -0.046739   0.037458  -1.248  0.21246
L3           -0.062058   0.037249  -1.666  0.09607 .
L4            0.026367   0.037273   0.707  0.47951
L5            0.082690   0.037183   2.224  0.02641 *
L6            0.006062   0.037240   0.163  0.87074
L7           -0.005810   0.037323  -0.156  0.87632
L8            0.106191   0.037370   2.842  0.00459 **
L9           -0.107141   0.037513  -2.856  0.00439 **
L10           0.110018   0.033664   3.268  0.00113 **
L365         -0.019086   0.029047  -0.657  0.51131
L366          0.035823   0.032708   1.095  0.27372
L367          0.051650   0.032712   1.579  0.11472
L368         -0.011674   0.032646  -0.358  0.72074
L369          0.051792   0.032573   1.590  0.11219
L370          0.058908   0.028975   2.033  0.04236 *
---
Residual standard error: 25.01 on 859 degrees of freedom
Multiple R-squared:  0.3443,  Adjusted R-squared:  0.3321
F-statistic: 28.19 on 16 and 859 DF,  p-value: < 2.2e-16
> acf(m2$residuals,lag=400)
> Box.test(m2$residuals,lag=400,type="Ljung")

        Box-Ljung test

data:  m2$residuals
X-squared = 370.16, df = 400, p-value = 0.8552

> pm2 <- predict(m2,newdata=data.frame(predX))
> er <- predY-pm2
> mean(abs(er))
[1] 15.44751
> sqrt(mean(er^2))
[1] 20.18687
> require(tree)
> t1 <- tree(y~.,data=data.frame(X))
> par(mfcol=c(1,1))
> plot(t1)
> text(t1,pretty=0)
> pt1 <- predict(t1,newdata=data.frame(predX))
> er3 <- pt1-predY
> mean(abs(er3))
[1] 16.3767
> sqrt(mean(er3^2))
[1] 22.2729
### Compute residuals for model checking
> pp1 <- predict(t1,newdata=data.frame(X))
> resi <- y-pp1
> Box.test(resi,lag=400,type="Ljung")

        Box-Ljung test

data:  resi
X-squared = 566.59, df = 400, p-value = 7.614e-08

### Pruning tree
> cv.t1 <- cv.tree(t1)
> plot(cv.t1$size,cv.t1$dev,type="b")
> prune.t1 <- prune.tree(t1,best=4)
> plot(prune.t1)
> text(prune.t1,pretty=0)
> prun <- predict(prune.t1,newdata=data.frame(predX))
> per <- predY-prun
> mean(abs(per))
[1] 17.11805
> sqrt(mean(per^2))
[1] 22.03309
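### The out-of-sample MAE and RMSE are computed repeatedly above; a small
### helper along the following lines would tidy this up (the name "fc.err"
### is illustrative, not from the text).
fc.err <- function(e) c(MAE=mean(abs(e)), RMSE=sqrt(mean(e^2)))
fc.err(er)    # linear regression forecast errors
fc.err(er3)   # regression tree forecast errors
fc.err(per)   # pruned tree forecast errors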
####
> cy <- rep(1,876)
> idx <- c(1:876)[y > 50]
> cy[idx] <- 2
> idx <- c(1:876)[y > 100]
> cy[idx] <- 3
> idx <- c(1:876)[y > 150]
> cy[idx] <- 4
> cpY <- rep(1,365)
> jdx <- c(1:365)[predY > 50]
> cpY[jdx] <- 2
> jdx <- c(1:365)[predY > 100]
> cpY[jdx] <- 3
> jdx <- c(1:365)[predY > 150]
> cpY[jdx] <- 4
> cy <- as.factor(cy); cpY <- as.factor(cpY)
> t2 <- tree(cy~.,data=data.frame(X))
> plot(t2)
> text(t2,pretty=0)
> pt2 <- predict(t2,newdata=data.frame(predX),type="class")
> table(cpY,pt2)
   pt2
cpY   1   2   3   4
  1 207  54   0   0
  2  33  55   2   0
  3   4   9   0   0
  4   0   1   0   0
> (207+55)/365
[1] 0.7178082
####
> require(randomForest)
> dim(X)
[1] 876  16
> bag1 <- randomForest(y~.,data=data.frame(X),mtry=16,importance=TRUE)
> bag1
Call:
 randomForest(formula = y ~ ., data = data.frame(X), mtry = 16, importance = TRUE)
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 16

          Mean of squared residuals: 655.6926
                    % Var explained: 29.93
> pbag1 <- predict(bag1,newdata=data.frame(predX))
> er <- pbag1-predY
> mean(abs(er))
[1] 15.72161
> sqrt(mean(er^2))
[1] 20.36633
>
> set.seed(1)
> bag2 <- randomForest(y~.,data=data.frame(X),mtry=16,ntree=1000)
> bag2
Call:
 randomForest(formula = y ~ ., data = data.frame(X), mtry = 16, ntree = 1000)
               Type of random forest: regression
                     Number of trees: 1000
No. of variables tried at each split: 16

          Mean of squared residuals: 649.4435
                    % Var explained: 30.6
> pbag2 <- predict(bag2,newdata=data.frame(predX))
> er2 <- pbag2-predY
> mean(abs(er2))
[1] 15.688
> sqrt(mean(er2^2))
[1] 20.32263
> names(bag2)
 [1] "call"            "type"            "predicted"       "mse"
 [5] "rsq"             "oob.times"       "importance"      "importanceSD"
 [9] "localImportance" "proximity"       "ntree"           "mtry"
[13] "forest"          "coefs"           "y"               "test"
[17] "inbag"           "terms"
> resi <- y - bag2$predicted
> ts.plot(resi)
> acf(resi,lag=400)
> Box.test(resi,lag=400,type="Ljung")

        Box-Ljung test

data:  resi
X-squared = 397.21, df = 400, p-value = 0.53

####
> da <- read.table("d-Shanghai-1317.txt")
> m1 <- NNsetting(da[,1],nfore=365,lags=c(1:10,365:370))
> X <- m1$X; y <- m1$y; predX <- m1$predX; predY <- m1$predY
> n1 <- paste("L",1:10,sep="")
> n2 <- paste("L",365:370,sep="")
> colnames(X) <- colnames(predX) <- c(n1,n2)
> require(randomForest)
> rf2 <- randomForest(y~.,data=data.frame(X),mtry=4,ntree=1000)
> rf2
Call:
 randomForest(formula = y ~ ., data = data.frame(X), mtry = 4, ntree = 1000)
               Type of random forest: regression
                     Number of trees: 1000
No. of variables tried at each split: 4

          Mean of squared residuals: 638.2131
                    % Var explained: 31.8
> prf2 <- predict(rf2,newdata=data.frame(predX))
> er2 <- prf2-predY
> mean(abs(er2))
[1] 15.64032
> sqrt(mean(er2^2))
[1] 19.97458
######## add more predictors
> m2 <- NNsetting(da[,1],nfore=365,lags=c(1:15,365:374))
> X <- m2$X; y <- m2$y; predX <- m2$predX; predY <- m2$predY
> n1 <- paste("L",1:15,sep="")
> n2 <- paste("L",365:374,sep="")
> colnames(X) <- colnames(predX) <- c(n1,n2)
> dim(X)
[1] 872  25
> rf4 <- randomForest(y~.,data=data.frame(X),mtry=5)
> rf4
Call:
 randomForest(formula = y ~ ., data = data.frame(X), mtry = 5)
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 5

          Mean of squared residuals: 634.4673
                    % Var explained: 32.42
> prf4 <- predict(rf4,newdata=data.frame(predX))
> er4 <- prf4-predY
> mean(abs(er4))
[1] 15.68707
> sqrt(mean(er4^2))
[1] 19.93318
####
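### Not in the original output: since bag1 was fit with importance=TRUE, the
### variable importance measures can be inspected with functions from the
### randomForest package, for example
importance(bag1)   # %IncMSE and IncNodePurity for each lagged predictor
varImpPlot(bag1)   # dot charts of the two importance measures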