###############################
#
# 41000: R scripts
#
###############################

###############################
#
# Section 1: Probability
#
###############################

#############################
# Maris
#############################

# Roger Maris home run record
maris <- c(14,28,16,39,61,33,23,26,8,13)
marismean <- mean(maris)
marisstd <- sqrt(var(maris))

# simulate 1000 sample means of size 10
xbar <- rep(0,1000)
for(i in 1:1000){
  aux <- rnorm(10,marismean,marisstd)
  xbar[i] <- mean(aux)
}

hist(maris,prob=T,col=4)
aux <- seq(0,70,by=0.01)
lines(aux,dnorm(aux,marismean,marisstd),col=2,lwd=2)
abline(v=61,col=2,lwd=2,lty=2)
# to save the plot: pdf(file="maris-2.pdf",width=5,height=3.5), redraw, then dev.off()
pnorm(61,marismean,marisstd)

##################################
# Binomial Distribution
##################################

# simulate 10000 observations of N(10,3^2)
N = 10000
x = rnorm(N,10,3)
# calculate the proportion less than 5
p = sum(x < 5)/N
# now generate N observations from Binom(120,p)
y = rbinom(N,120,p)
# plot a histogram
hist(y,freq=FALSE,col="skyblue",main="Probability of number of defective boards")
abline(v=10)
# calculate the sample proportion with more than 10 defective boards
sum(y > 10)/N
# compare with the theoretical number
1-pbinom(10,120,p)

##################################
# Poisson Distribution
##################################

# simulate 10000 observations
N = 10000
A = rpois(N,2.2)
B = rpois(N,1.6)
# plot barplots
op = par(mfrow=c(1,2))
barplot(table(A)/N,main="Probability of Arsenal scores",col="green")
barplot(table(B)/N,main="Probability of Liverpool scores",col="red")
par(op)
# plot a histogram of A-B
hist(A-B)
# calculate the sample proportion of 0-0
sum(A==0 & B==0)/N
# compare with the theoretical number
dpois(0,2.2)*dpois(0,1.6)
# the sample probability that A > B
sum(A > B)/N
# compare to the theoretical number:
# the Poisson difference follows a Skellam distribution
library("VGAM")
# generate 10000 observations from the Skellam distribution
d = rskellam(N,2.2,1.6)
sum(d>0)/N

##################################
# Binomial, Poisson and Normal
##################################

par(mfrow=c(1,2))
z1 = rbinom(1000,25,0.5)
mean(z1)
sd(z1)
z2 = rnorm(1000,12.5,2.5)
hist(z1,col=4,prob=T)
hist(z2,col=5,prob=T)

# Probabilities
dbinom(5,10,0.5)
dbinom(5,10,0.2)
z1 = rbinom(100,10,0.2)
hist(z1,col=39)
sum(z1==5)/100
dpois(3,1)

############################
# Poisson
############################

# Comparing two Poisson distributions
z1 <- rpois(100,5)
z2 <- rpois(100,2)
par(mfrow=c(1,2))
hist(z1, col=5, main="", xlab="Poisson")
hist(z2, add=TRUE, col=6)
legend("topright", legend=c("lambda=5", "lambda=2"), fill=c(5,6), bty="n")
boxplot(list(z2,z1), col=c(6,5), horizontal=TRUE, ylab="Distn", xlab="Poisson")

# P(X>Y) and P(X=Y)
z1 <- rpois(1000,0.6)
z2 <- rpois(1000,1.4)
# Win, Draw, Loss
sum(z1>z2)
sum(z1==z2)
sum(z1<z2)

#####################################
# Testing Proportions
#####################################

# null hypothesis: p >= 0.95
prop.test(45,75,p=0.95,alternative="less",conf.level=0.99)
# if you want to use the confidence interval of p_hat
# we need the two-sided test
prop.test(45,75,0.95,alternative="two.sided",conf.level=0.99)

#####################################
# Pfizer: Testing Proportions
#####################################

# x is the observed sample, n is the sample size
# our null hypothesis is p_1 - p_2 <= 0, i.e. p_1 <= p_2
prop.test(x=c(11,77),n=c(1.5*10^6,6*10^6),alternative="greater",conf.level=0.95)
# if you want to use the confidence interval of p_hat
# we need the two-sided test
prop.test(x=c(11,77),n=c(1.5*10^6,6*10^6),alternative="two.sided",conf.level=0.95)
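# Cross-check: the same one-sided comparison by hand via the two-proportion
# z-statistic (a sketch; prop.test uses a chi-square test with continuity
# correction, so the p-value will differ slightly)
x <- c(11,77); n <- c(1.5*10^6,6*10^6)
phat <- x/n                # the two sample proportions
pbar <- sum(x)/sum(n)      # pooled proportion under H0: p_1 = p_2
z <- (phat[1]-phat[2])/sqrt(pbar*(1-pbar)*sum(1/n))
1-pnorm(z)                 # one-sided p-value for H1: p_1 > p_2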
####################################
# Argon: Difference in means
####################################

# Lord Rayleigh data
a <- c(2.31017,2.30986,2.31010,2.31001,2.31024,2.31010,2.31028)
b <- c(2.30143,2.29890,2.29816,2.30182,2.29869,2.29940,2.29849,2.29889)
# two-sample t-test assuming equal variances
t.test(a,b,var.equal=T)

####################################
# A/B Test
####################################

site1 = c(.40, 500) # A: conversion rate, sample size
site2 = c(.30, 550) # B: conversion rate, sample size

# 1.28 is the z-value for an 80% interval
abtestfunc <- function(ad1, ad2){
  sterror1 = sqrt( ad1[1] * (1-ad1[1]) / ad1[2] )
  sterror2 = sqrt( ad2[1] * (1-ad2[1]) / ad2[2] )
  minmax1 = c((ad1[1] - 1.28*sterror1) * 100, (ad1[1] + 1.28*sterror1) * 100)
  minmax2 = c((ad2[1] - 1.28*sterror2) * 100, (ad2[1] + 1.28*sterror2) * 100)
  print(round(minmax1,2))
  print(round(minmax2,2))
}
abtestfunc(site1, site2)

####################
## Superbowl
####################

# import the Superbowl data
mydata1 <- read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/superbowl1.txt",header=T)
# look at the data
head(mydata1)
tail(mydata1)
mydata1
# attach so R recognizes each variable
attach(mydata1)
# see the distribution of outcome and spread through histograms
hist(Outcome)
hist(Spread)
# we can also calculate the mean and standard deviation
mean(Outcome); sd(Outcome)
mean(Spread); sd(Spread)
# plot Spread vs Outcome
plot(Spread,Outcome)
# add a 45 degree line
abline(1,1)
# correlation
cor(Spread,Outcome)
# compare boxplots
boxplot(Spread,Outcome,horizontal=T,names=c("spread","outcome"),col=c("red","yellow"),main="Superbowl")

# Covariance, correlation, alpha, beta
x <- Spread
y <- Outcome
mean(x); mean(y)
sd(x); sd(y)
cov(x,y)
cor(x,y)
beta <- cor(x,y)*sd(y)/sd(x)
alpha <- mean(y)-beta*mean(x)
beta
alpha
# regression check
model <- lm(y~x)
coef(model)

######################
## Kentucky Derby
######################

mydata2 <- read.csv("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/Kentucky_Derby_2014.csv",header=T)
# or import the data from the web using RStudio
# mydata2 = Kentucky_Derby_2014
# attach the dataset
attach(mydata2)
head(mydata2)
tail(mydata2)
# plot a histogram of speedmph
hist(speedmph,col="blue")
# finer bins
hist(speedmph,breaks=10,col="red")
hist(timeinsec,breaks=10,col="purple")
# find the left-tail observation
k1 <- which(speedmph == min(speedmph))
mydata2[k1,]
# find the best horse
k2 <- which(speedmph == max(speedmph))
mydata2[k2,]
# working directory
# setwd("~/....")

##################################
# Berkshire Hathaway 1990-2017
##################################

library(fImport)
Y = yahooSeries("BRK-A", from = "1990-01-01")
# take a look at the data
head(Y)
plot(Y[,6],type="l",col=20,main="BRKA Share Price",ylab="Price",xlab="Time")
# calculate returns
y = rev(Y$`BRK-A.Adj.Close`)
n <- length(y)
rets <- y[-1]/y[-n]-1
summary(rets)
# two plots in one
# par(mfrow=c(1,2))
plot(rets,pch=20,col=24)
abline(0,0)
# summarise returns
mean(rets)
sd(rets)
skewness(rets)
hist(rets,breaks=50,freq=FALSE,main="Berkshire Returns",col="red")
lines(seq(-1,1,0.001),dnorm(seq(-1,1,0.001),mean(rets),sd(rets)),lwd=2.5,col="green")
legend("topright",c("Actual Distribution","Normal Distribution"),
       lty=c(1,1),lwd=c(2.5,2.5),col=c("red","green"))
# to save the plot, click "Export" and "save as image"
# ts.plot(y,main="BRK-A price time series",ylab="price",xlab="day")
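# A normal Q-Q plot gives the same actual-vs-normal comparison in one view
# (a sketch using base R; fat tails show up as deviations in the corners)
qqnorm(rets,pch=20,col=4,main="Berkshire Returns: Normal Q-Q")
qqline(rets,col=2,lwd=2)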
##################################
# Apple and Google
##################################

# compare the log-returns of AAPL and GOOG
library(fImport)
Y = yahooSeries("AAPL", from = "2004-04-29")
y = rev(Y$AAPL.Adj.Close)
plot(y,type="l",col=20)
min(y)
max(y)
# calculate log-returns
n <- length(y)
lrets <- log(y[-1]/y[-n])
summary(lrets)
plot(lrets,pch=20,col=34)
abline(0,0)
skewness(lrets)
kurtosis(lrets,method=c("moment"))

Y = yahooSeries("GOOG", from = "2004-04-29")
y = rev(Y$GOOG.Adj.Close)
plot(y,type="l",col=20)
min(y)
max(y)
# calculate log-returns
n <- length(y)
lrets <- log(y[-1]/y[-n])
summary(lrets)
plot(lrets,pch=20,col=24)
abline(0,0)
skewness(lrets)
kurtosis(lrets,method=c("moment"))

##################################
#
# Section 3: Modeling
#
##################################

##################################
# Keynes Data
##################################

keynes = read.table("keynes.txt",header=T)
attach(keynes)
# plot the data
plot(Year,Keynes,pch=20,col=34)
plot(Market, Keynes, xlab="Market Return", ylab="Keynes Excess Return",col=20)
# correlation matrix
cor(cbind(Keynes,Market))
# fit the least-squares line
model = lm(Keynes~Market)
abline(model)
title("Keynes vs Market")
# extract the model coefficients
coef(model)
summary(model)
# 4-in-1 residual diagnostics
layout(matrix(c(1,2,3,4),2,2))
plot(model,pch=20)
# calculate excess returns
Keynes = Keynes - Rate
Market = Market - Rate
# correlation matrix
cor(cbind(Keynes,Market))
modelnew = lm(Keynes~Market)
summary(modelnew)

##################################
# Buffett Data
##################################

buffett = read.table("buffett.txt",header=T)
attach(buffett)
# plot the data
plot(Market, Buffett, xlab="Market Return", ylab="Buffett Return",pch=20)
legend(x="topleft",legend=c("Buffett","Market"),pch=15,col=c(2,4),bty="n")
# correlation matrix
cor(cbind(Buffett,Market))
# fit the least-squares line and superimpose it on the plot
model = lm(Buffett~Market)
abline(model)
title("Buffett vs Market")
# extract the model coefficients
coef(model)
summary(model)
# improvement in fit
sd(model$residuals)
sd(Buffett)
# remove datapoint 10
buffett_10 = buffett[-10,]
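# buffett_10 above drops datapoint 10 but the model is never refit;
# a minimal sketch of the before/after comparison (assuming the same columns)
model_10 = lm(Buffett~Market,data=buffett_10)
coef(model)    # full sample
coef(model_10) # datapoint 10 removed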
###################################################
# Boston Housing Data
#
# Cutpoints from the tree are added to the plot
###################################################

library(tree)
library(MASS)
data(Boston)
attach(Boston)

#--------------------------------------------------
# fit a tree to the Boston data using lstat only;
# first grow a big tree using a small value of mindev
temp = tree(medv~lstat,data=Boston,mindev=.0001)
cat('first big tree size: \n')
print(length(unique(temp$where)))
# then prune it down to one with 7 leaves
boston.tree = prune.tree(temp,best=7)
cat('pruned tree size: \n')
print(length(unique(boston.tree$where)))

#--------------------------------------------------
# plot the tree and the fits
par(mfrow=c(1,2))
# plot the tree
plot(boston.tree,type="uniform")
text(boston.tree,col="blue",label=c("yval"),cex=.8)
# plot the data with the fit
boston.fit = predict(boston.tree)  # get training fitted values
plot(lstat,medv,cex=.5,pch=16)     # plot data
oo = order(lstat)
lines(lstat[oo],boston.fit[oo],col='red',lwd=3)  # step function fit
cvals = c(9.725,4.65,3.325,5.495,16.085,19.9)    # cutpoints from the tree
for(i in 1:length(cvals)) abline(v=cvals[i],col='magenta',lty=2)  # cutpoints
rm(list=ls())

################################
# Sales versus price
################################

# first enter the data into R
sales = c(420, 380, 350, 400, 440, 380, 450, 420)
price = c(5.5, 6.0, 6.5, 6.0, 5.0, 6.5, 4.5, 5.0)
# plot the data
plot(price, sales, xlab="Price ($100)", ylab="Sales (Thousands of units)")
# correlation matrix
cor(cbind(sales,price))
# fit the least-squares line and superimpose it on the plot
model = lm(sales~price)
abline(model)
# extract the model coefficients
coef(model)
# improvement in fit
sd(model$residuals)
sd(sales)
# correlation matrix in logs
# (different from the correlation matrix above)
cor(log(cbind(sales,price)))
# model in logs
modelnew = lm(log(sales)~log(price))
summary(modelnew)

# new data
midcity <- read.table("House.txt",header=TRUE)
attach(midcity)
plot(SqFt,Price,pch=20,cex=1.25,main='Square feet vs. Sales price',
     ylab='Sales price in USD',xlab='Square feet',bty='n')
mx <- mean(SqFt)
my <- mean(Price)
sdx <- sd(SqFt)
sdy <- sd(Price)
rho <- cor(Price,SqFt)
b <- rho*sdy/sdx
a <- my - mx*b
abline(a,b,col='purple',lwd=3)

######################################
# Transformation Example
######################################

# simulate data: ln-linear
x = rnorm(50,1,1)
y = exp(2*x + rnorm(50,sd=0.4))
layout(matrix(c(1,2),1,2))
plot(x,y,col="purple",pch=20)
model1 = lm(y~x)
abline(model1,col=2,lwd=2)
plot(model1)
plot(x,log(y),col=2,pch=20)
model2 = lm(log(y)~x)
abline(model2,col=3,lwd=3)
plot(model2)
coef(model2)

# simulate data: ln-ln
x = runif(50,0,4)
y = 2*x^(1/3 + rnorm(50,sd=0.1))
layout(matrix(c(1,2),1,2))
plot(x,y,col="purple",pch=20)
model1 = lm(y~x)
abline(model1,col=2,lwd=2)
plot(log(x),log(y),col=2,pch=20)
model2 = lm(log(y)~log(x))
abline(model2,col=3,lwd=3)

####################################
# Mammals Example
####################################

mammals = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/mammals.txt",header=T)
attach(mammals)
# correlation matrix
cor(cbind(Brain,Body))
# fit the model
model = lm(Brain~Body)
par(mfrow=c(1,2))
plot(Brain,Body,pch=19,col=4,main="Fitted Model")
abline(lsfit(Brain,Body),lwd=2,col=2)
plot(Brain,model$res,pch=19,col=2,ylab="residuals",main="Residual Plot")
# correlation matrix in logs
cor(log(cbind(Brain,Body)))
lnmodel = lm(log(Body)~log(Brain))
# graphics: residual analysis
par(mfrow=c(1,2))
plot(log(Brain),log(Body),pch=19,col=4,main="Fitted Model")
abline(lsfit(log(Brain),log(Body)),lwd=2,col=2)
plot(log(Brain),lnmodel$res,pch=19,col=2,ylab="residuals",main="Residual Plot")
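# a sketch to see which mammal sits farthest from the log-log fit
# (largest absolute residual; assumes one row per mammal)
mammals[which.max(abs(lnmodel$res)),]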
##################################
# Electricity Data
##################################

# load the data and rename the variables
electricity = read.csv("Electricity.csv")
dimnames(electricity)[[2]] = c("Customer","Usage","Demand")
electricity
attach(electricity)
# plot the data
plot(Usage,Demand, main="Electricity Demand",pch=20,col="purple")
# correlation matrix
cor(cbind(Demand,Usage))
# fit a linear model
electricity.lm = lm(Demand ~ Usage)
anova(electricity.lm)
# plot the residuals
plot(electricity.lm$fitted,electricity.lm$residuals,
     xlab="fitted",ylab="residuals",main="residual plot",pch=20,col="purple")
abline(0,0)
# transform the response to stabilize the variance
rootDemand = sqrt(Demand)
# correlation matrix
cor(cbind(rootDemand,Usage))
electricity.lm = lm(rootDemand ~ Usage)
plot(electricity.lm$fitted,electricity.lm$residuals,
     xlab="fitted",ylab="residuals",main="residual plot with transformed response",pch=20,col=59)
abline(0,0)
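# the square-root transform can be checked against a Box-Cox profile;
# a sketch using MASS::boxcox (a lambda near 0.5 supports the square root)
library(MASS)
boxcox(lm(Demand ~ Usage), lambda=seq(-1,2,0.1))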
##################################
# Anscombe Residual Diagnostics
##################################

ansc <- read.csv("anscombe.csv", header=TRUE)
attach(ansc)

pdf('anscombe-R-001.pdf')
par(mfrow=c(2,2))
plot(ansc$x1, ansc$y1, main='Set 1',col=34,pch=20)
# correlation matrix
cor(cbind(y1,x1))
set1.fit <- lm(y1 ~ x1, data=ansc)
abline(set1.fit)
plot(ansc$x2, ansc$y2, main='Set 2',col=20,pch=20)
# correlation matrix
cor(cbind(y2,x2))
set2.fit <- lm(y2 ~ x2, data=ansc)
abline(set2.fit)
# sidebysideplot <- grid.arrange(plot1,plot2,col=2)
dev.off()

pdf('anscombe-R-002.pdf')
layout(matrix(1:4,2,2))
plot(ansc$x1, ansc$y1, main='Set 1',col=34,pch=20)
cor(cbind(y1,x1))
set1.fit <- lm(y1 ~ x1, data=ansc)
abline(set1.fit)
plot(ansc$x2, ansc$y2, main='Set 2',col=24,pch=20)
cor(cbind(y2,x2))
set2.fit <- lm(y2 ~ x2, data=ansc)
abline(set2.fit)
plot(ansc$x3, ansc$y3, main='Set 3',col=20,pch=20)
cor(cbind(y3,x3))
set3.fit <- lm(y3 ~ x3, data=ansc)
abline(set3.fit)
plot(ansc$x4, ansc$y4, main='Set 4',col=3,pch=20)
cor(cbind(y4,x4))
set4.fit <- lm(y4 ~ x4, data=ansc)
abline(set4.fit)
dev.off()  # close the pdf device so later plots show on screen

# fit the regression lines and get the results
summary(set1.fit)
summary(set2.fit)
summary(set3.fit)
summary(set4.fit)

# look at residual plots
layout(matrix(1:4,2,2))
plot(ansc$x1,residuals(set1.fit),main='Set 1 Residuals',col=34,pch=20)
plot(ansc$x2,residuals(set2.fit),main='Set 2 Residuals',col=24,pch=20)
plot(ansc$x3,residuals(set3.fit),main='Set 3 Residuals',col=20,pch=20)
plot(ansc$x4,residuals(set4.fit),main='Set 4 Residuals',col=3,pch=20)

################################
# Smoking Data
################################

Data = read.csv("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/smoking.csv",header=T)
attach(Data)
# correlation matrix
cor(cbind(Cancer,Consumption))
# scatter plot
plot(Consumption,Cancer,pch=20,col=34,main="Cancer vs Smoking consumption")
text(Consumption,Cancer,labels=Country)
# fit the least-squares line
model = lm(Cancer~Consumption)
abline(model,col=2,lty=2,lwd=2)
cooks.distance(model)
# remove the U.S. as an outlier
model2 = lm(Cancer[-7]~Consumption[-7])
abline(model2,col=3,lty=3,lwd=3)
legend("topleft", c("Full sample","Removed U.S."), col=2:3, lty=2:3, lwd=2:3, bty="n")
# extract the model coefficients
coef(model)
coef(model2)
# 4-in-1 residual diagnostics
layout(matrix(c(1,2,3,4),2,2))
plot(model,pch=20)
layout(matrix(c(1,2,3,4),2,2))
plot(model2,pch=20)
# delete the US and re-run
Datanew = Data[-11,]
lm(Cancer~Consumption,data=Datanew)
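# cooks.distance(model) above just prints the raw values; a sketch that
# plots them so influential countries stand out
plot(cooks.distance(model),type="h",ylab="Cook's distance")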
##############################
#
# Section 4: Multiple Regression
#
##############################

##################################
# Bordeaux Wine Data
##################################

bordeaux = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/bordeaux.txt",header=T)
attach(bordeaux)
# plot the data
plot(year,price,type="l",pch=20,col=4,main="Bordeaux Wine Prices")
# correlation matrix
cor(cbind(price,sum,har,sep,win,age))
# build a multiple regression model
model = lm(price~sum+har+sep+win+age)
summary(model)
coef(model)
# diagnostics
plot(rstudent(model))
# graphics: 2x2 plot
layout(matrix(c(1,2,3,4),2,2))
plot(model)
# prediction: import bordeauxp.txt or enter the data directly in R
bordeauxp = data.frame(read.table("bordeauxp.txt",header=T))
attach(bordeauxp)
predict(model,bordeauxp,interval="confidence")
# change the significance level to 99%
predict(model,bordeauxp,interval="confidence",level=0.99)
# fitted value plot
plot(price)
lines(predict(model))

# detailed graphics and predict.lm()
mydata1 = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/bordeaux.txt",header=T)
mydata2 = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/bordeauxp.txt",header=T)
# summary statistics
summary(mydata1)
# a multiple regression in logs
model = lm(log(price)~log(sum)+log(har)+log(sep)+log(win)+log(age),data=mydata1)
# log-log regression results
summary(model)
# residual diagnostics
plot(model)
# inputs: regression model and independent variables
pred = predict(model,mydata2,interval="prediction")
pred
# out-of-sample prediction line
plot(mydata2[,1],pred[,1],type="l",xlab="year",ylab="log_price",ylim=c(0,8),
     col=2, lty=2, lwd=2,main="out-of-sample prediction line")
lines(mydata2[,1],pred[,2],col=3, lty=3, lwd=3)
lines(mydata2[,1],pred[,3],col=4, lty=4, lwd=4)
legend("topleft", c("fit","lwr","upr"),col=2:4, lty=2:4, lwd=2:4, bty="n")
# exponentiate back to the predicted price
price_hat = exp(pred[,1])
plot(mydata2[,1],price_hat,type="l",xlab="year",ylab="predicted price",ylim=c(0,50),
     col=2, lty=2, lwd=2,main="out-of-sample prediction line")

##################################
# Election 2016 Data
##################################

mydata = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/election2016.txt",header=T)
attach(mydata)
head(mydata)
names(mydata)
# the equation to predict the 2016 presidential election
model1 = lm(VP~G+P+Z, na.action=na.omit, data=mydata)
summary(model1)
# the equation to predict the 2016 House election
model2 = lm(VC~G+P+Z, na.action=na.omit, data=mydata)
summary(model2)

##################################
# Homeless Data
##################################

homeless = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/homeless.txt",header=T)
attach(homeless)
names(homeless)
# correlation matrix
cor(cbind(rate,pov,unemp,housing,pop,temp,vac,rentc))
# build a model
model = lm(rate~pov+unemp+housing+pop+temp+vac+rentc)
# diagnostics
plot(model)
# transformation and prediction commands
plot(model)
coef(model)
summary(model)
rstudent(model)
cooks.distance(model)

##################################
# Newfood Data
##################################

# import newfood.csv
newfood = read.csv("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/newfood.csv",header=T)
attach(newfood)
names(newfood)
# correlation matrix
cor(cbind(sales,price,adv,locat,income,svol))
# build the model
model = lm(sales~price+adv+locat+income+svol)
summary(model)
# check diagnostics
layout(matrix(c(1,2,3,4),2,2))
plot(model,pch=20,col="blue")

# Transformation: log-log model
# don't transform the dummy variables adv and locat
log_sales = log(sales)
log_price = log(price)
log_income = log(income)
log_svol = log(svol)
# correlation matrix
cor(cbind(log_sales,log_price,adv,log_income,log_svol))
# build the model
modelnew = lm(log_sales~log_price+adv+log_income+log_svol)
summary(modelnew)
# graphics: 2x2 plot
layout(matrix(c(1,2,3,4),2,2))
plot(modelnew,pch=20,col="blue")
# 99% confidence interval at new data
newdata1 = data.frame(log_price=log(30),adv=1,log_income=log(8),log_svol=log(34))
predict.lm(modelnew,newdata1,se.fit=T,interval="confidence",level=0.99)
# predict.lm at median(log_price), adv=1, median(log_income), median(log_svol)
newdata2 = data.frame(log_price=median(log_price),adv=1,
                      log_income=median(log_income),log_svol=median(log_svol))
predict.lm(modelnew,newdata2,se.fit=T,interval="prediction")
# prediction lines for the log-log model
layout(matrix(c(1,2,3,4),2,2))
termplot(modelnew,ylabs="log_sales",terms="log_price",se=T)
termplot(modelnew,ylabs="log_sales",terms="adv",se=T)
termplot(modelnew,ylabs="log_sales",terms="log_income",se=T)
termplot(modelnew,ylabs="log_sales",terms="log_svol",se=T)

##############################
#
# Section 5: Time Series
#
##############################

##############################
# Nasdaq-DJI
##############################

Data = read.table("nasdaq-djia.txt",header=T)
attach(Data)
# correlation matrix
cor(cbind(nasdaq,djia))
model = lm(nasdaq~djia)
plot(djia,nasdaq,cex=0.8,pch=19,col=4)
abline(model$coef[1], model$coef[2],lwd=2,col=2)
acf(model$res,cex=2,ci.col=2,col=4,lwd=3,main="ACF of Residuals")
plot(model$res,cex=0.8,pch=19,col=2)
lines(model$res,col=4,lwd=2)

##############################
# SP500
##############################

Data = read.table("SP500-dailyreturns.txt",header=T)
attach(Data)
L1 = length(Returns)
sp500 = Returns[1:(L1-1)]   # returns
sp5001 = Returns[2:L1]      # returns shifted by one day
T = 1:L1
# correlation matrix
cor(cbind(sp500,sp5001))
model = lm(sp500~sp5001)
plot(sp500,sp5001,cex=0.8,pch=19,col=4)
abline(model$coef[1], model$coef[2],lwd=2,col=2)
acf(model$res,cex=2,ci.col=2,col=4,lwd=3,main="ACF of Returns")
plot(model$res,cex=1.5,pch=19,col=2)
lines(model$res,col=4,lwd=2)

#############################
# Volatility VIX
#############################

Data = read.csv("vix2009.csv",header=T)
attach(Data)
pdf(file="vix-acf.pdf",width=5,height=4)
acf(Close,cex=2,ci.col=2,col=4,lwd=3,main="ACF of VIX")
dev.off()  # close the pdf device so later plots show on screen
L1 = length(Close)
vix2009 = Close[1:(L1-1)]
vix20091 = Close[2:L1]
vixreturn <- vix20091/vix2009
lvix <- log(vixreturn)
plot(lvix,pch=19,col=10)
acf(lvix,cex=2,ci.col=2,col=4,lwd=3,main="ACF of VIX-returns")
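# a formal companion to the ACF plot: the Ljung-Box test for serial
# correlation in the VIX log-returns (a sketch using base R's Box.test)
Box.test(lvix,lag=10,type="Ljung-Box")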
#########################
# House Example
#########################

# setwd("C:/Users/ngp/Documents/41000")
House = read.table("housedata.txt",header=T)
attach(House)
plot(Size,Price,pch=19)
# correlation matrix
cor(cbind(Price,Size))
Housemodel = lm(Price~Size,data=House)
summary(Housemodel)
names(Housemodel)
names(summary(Housemodel))
confint(Housemodel)
confint(Housemodel,level=0.99)
Xfuture <- data.frame(Size=seq(0,8,by=0.01))
Future1 = predict(Housemodel, Xfuture, interval="prediction",se.fit=T)
Future2 = predict(Housemodel, Xfuture, interval="prediction",se.fit=T,level=0.99)
plot(Size,Price,xlim=c(0,8),ylim=range(Future1$fit),pch=19,cex.lab=1.3)
abline(lsfit(Size,Price),lwd=2,col=2)
lines(Xfuture$Size,Future1$fit[,2],col=4,lwd=2,lty=2)
lines(Xfuture$Size,Future1$fit[,3],col=4,lwd=2,lty=2)
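# Future2 (the 99% prediction bands) is computed above but never drawn;
# a sketch overlaying it on the same plot
lines(Xfuture$Size,Future2$fit[,2],col=3,lwd=2,lty=3)
lines(Xfuture$Size,Future2$fit[,3],col=3,lwd=2,lty=3)
legend("topleft",c("95% bands","99% bands"),col=c(4,3),lty=c(2,3),lwd=2,bty="n")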
##########################
# South Sea Bubble
##########################

# setwd("C:/Users/ngp/Documents/41000solns/sea bubble example")
mydata <- read.csv("sea_bubble.csv",header=T)
head(mydata)
names <- ls(mydata)
ts1 <- ts(mydata[,2], start=c(1719, 232), end=c(1720, 365), frequency=365)
ts2 <- ts(mydata[,3], start=c(1719, 232), end=c(1720, 365), frequency=365)
ts3 <- ts(mydata[,4], start=c(1719, 232), end=c(1720, 365), frequency=365)
ts4 <- ts(mydata[,5], start=c(1719, 232), end=c(1720, 365), frequency=365)
plot(ts1,main="South Sea Bubble",ylab="Prices",ylim=c(0,1000), col=1, lty=1, lwd=1)
lines(ts2,col=2, lty=2, lwd=2)
lines(ts3,col=3, lty=3, lwd=3)
lines(ts4,col=4, lty=4, lwd=4)
legend("topleft", names[c(1,5,4,6)], col=1:4, lty=1:4, lwd=1:4, bty="n")
# fit a random walk model for the South Sea Company
model = ar(ts4,order.max=1,na.action=na.exclude)
model
# the AR coefficient is close to 1, so this is close to a random walk

##########################
# Kentucky Derby
##########################

# setwd("C:/Users/ngp/Documents/41000solns")
mydata = read.csv("Kentucky_Derby_2014.csv",header=T)
head(mydata)
tail(mydata)
attach(mydata)
ts1 <- ts(speedmph, start=1875, end=2014, frequency=1)
plot(ts1,xlab="year",ylab="speed mph",main="Kentucky Derby",col=2, lty=2, lwd=2)
# remove the time trend, then fit an ARMA model
model = lm(speedmph~year_num)
speed_tt = model$coef[1] + model$res
ts2 <- ts(speed_tt, start=1875, end=2014, frequency=1)
plot(ts2,xlab="year",ylab="speed mph",main="time trend removed",col=2, lty=2, lwd=2)
# find the outliers
out = which(abs(model$res)>1.645)
# label the outliers
plot(ts2,xlab="year",ylab="speed mph",main="time trend removed",col=2, lty=2, lwd=2)
text(x=out+1874,y=ts2[out],labels=winner[out])
# remove the outliers
ts3 = ts2[-out]
# the acf and pacf plots suggest an AR model might not be appropriate,
# but we can consider an MA model
# acf(ts3)
# pacf(ts3)
model3 = arima(ts3,c(0,0,1))
# MA(1) prediction + the time trend
ts4 = ts3-model3$res + c(1:140)[-out]*model$coef[2]
plot(ts4,main="MA(1) predictions, outliers removed",
     xlab="year",ylab="speed mph",col=2, lty=2, lwd=2)
lines(ts1[-out],col=3, lty=3, lwd=3)

##########################
# Random Walk
##########################

library(fImport)
Y = yahooSeries("SQM", from = "2008-01-01")
y = rev(Y$SQM.Adj.Close)
plot(y,type="l",col=20)
N = length(y)
mu = y[1]
sigma = sd(diff(y))
# simulate a random walk with the same starting value and volatility
x = rep(mu, N)
for(i in 2:N) {
  x[i] = x[i-1] + rnorm(1,0,sigma)
}
mylim = c(min(c(x,y)), max(c(x,y)))
par(mfrow=c(1,2))
plot(y,type='l', bty='n', xlab='Trading days from January 1, 2008',
     ylab=expression(X[t]), main="Simulated random walk....", ylim=mylim)
plot(x,type='l', bty='n', xlab='Trading days from January 1, 2008',
     ylab=expression(X[t]), main="... or actual stock price?", ylim=mylim)

############################################
#
# MNIST
#
###########################################

library(keras)

mnist <- dataset_mnist()
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

# one-hot encode the labels
y_train <- to_categorical(y_train, 10)
y_test <- to_categorical(y_test, 10)

# flatten the 28x28 images into vectors of length 784
dim(x_train) <- c(nrow(x_train), 784)
dim(x_test) <- c(nrow(x_test), 784)
# rescale to [0,1]
x_train <- x_train / 255
x_test <- x_test / 255

model <- keras_model_sequential()
model %>%
  layer_dense(units = 256, activation = 'relu', input_shape = c(784)) %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 10, activation = 'softmax')
summary(model)

model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(),
  metrics = c('accuracy')
)

history <- model %>% fit(
  x_train, y_train,
  epochs = 30, batch_size = 128,
  validation_split = 0.2
)
plot(history)

model %>% predict_classes(x_test)

######### simulation example: regression #########

N = 10000
K = 50
d1 = 10
d2 = 10
# simulate data from a two-hidden-layer ReLU network
X = matrix(rnorm(N*K),N,K)
Z1 = pmax(X,0) %*% matrix(rpois(K*d1,lambda=1)/d1,K,d1) + rep(rnorm(d1),each=N)
Z2 = pmax(Z1,0) %*% matrix(rpois(d1*d2,lambda=1)/d2,d1,d2) + rep(rnorm(d2),each=N)
Y = pmax(Z2,0) %*% matrix(rpois(d2*1,lambda=1),d2,1) + 1

# 90/10 train/test split
train.idx = sample(1:N,floor(N*0.9),replace=FALSE)
test.idx = setdiff(1:N,train.idx)
x_train = X[train.idx,]
y_train = Y[train.idx]
x_test = X[test.idx,]
y_test = Y[test.idx]

model <- keras_model_sequential()
model %>%
  layer_dense(units = 2*d1, activation = 'relu', input_shape = c(K)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = d2, activation = 'relu') %>%
  layer_dense(units = 1)

model %>% compile(
  optimizer = optimizer_rmsprop(),
  loss = 'mse',
  metrics = c('mae')
)

history <- model %>% fit(
  x_train, y_train,
  epochs = 100, batch_size = N*0.1
)

model %>% evaluate(x_test, y_test)
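# beyond the test-set loss, a quick visual check of fit quality;
# a sketch plotting predicted against actual values on the test set
y_hat = model %>% predict(x_test)
plot(y_test,y_hat,pch=20,col=4,xlab="actual",ylab="predicted",
     main="Test set: predicted vs actual")
abline(0,1,col=2,lwd=2)  # 45-degree line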