###############################
#
# 41000: R scripts
#
###############################

###############################
#
# Section 1: Probability
#
###############################

#############################
# Maris
#############################

# Roger Maris home run record
maris <- c(14,28,16,39,61,33,23,26,8,13)
marismean <- mean(maris)
marisstd <- sqrt(var(maris))

# simulate 1000 sample means of size 10
xbar <- rep(0,1000)
for(i in 1:1000){
  aux <- rnorm(10,marismean,marisstd)
  xbar[i] <- mean(aux)
}

hist(maris,prob=T,col=4)
aux <- seq(0,70,by=0.01)
lines(aux,dnorm(aux,marismean,marisstd),col=2,lwd=2)
abline(v=61,col=2,lwd=2,lty=2)
# to save the plot: pdf(file="maris-2.pdf",width=5,height=3.5), redraw, then dev.off()
pnorm(61,marismean,marisstd)

##################################
# Binomial Distribution
##################################

# simulate 10000 observations of N(10,3^2)
N = 10000
x = rnorm(N,10,3)
# calculate the proportion less than 5
p = sum(x < 5)/N
# now generate N observations from Binom(120,p)
y = rbinom(N,120,p)
# plot a histogram
hist(y,freq=FALSE,col="skyblue",main="Probability of number of defective boards")
abline(v=10)
# calculate the sample proportion with more than 10 defective boards
sum(y > 10)/N
# compare with the theoretical number
1-pbinom(10,120,p)

##################################
# Poisson Distribution
##################################

# simulate 10000 observations
N = 10000
A = rpois(N,2.2)
B = rpois(N,1.6)
# plot barplots
op = par(mfrow=c(1,2))
barplot(table(A)/N,main="Probability of Arsenal scores",col="green")
barplot(table(B)/N,main="Probability of Liverpool scores",col="red")
par(op)
# plot a histogram of A-B
hist(A-B)
# calculate the sample proportion of 0-0
sum(A==0 & B==0)/N
# compare with the theoretical number
dpois(0,2.2)*dpois(0,1.6)
# the sample probability that A > B
sum(A > B)/N
# compare to the theoretical number:
# the Poisson difference follows a Skellam distribution
library("VGAM")
# generate 10000 observations from the Skellam distribution
d = rskellam(N,2.2,1.6)
sum(d>0)/N

##################################
# Binomial, Poisson and Normal
##################################

par(mfrow=c(1,2))
z1 = rbinom(1000,25,0.5)
mean(z1)
sd(z1)
z2 = rnorm(1000,12.5,2.5)
hist(z1,col=4,prob=T)
hist(z2,col=5,prob=T)

# Probabilities
dbinom(5,10,0.5)
dbinom(5,10,0.2)
z1 = rbinom(100,10,0.2)
hist(z1,col=39)
sum(z1==5)/100
dpois(3,1)

############################
# Poisson
############################

# Comparing two Poisson distributions
z1 <- rpois(100,5)
z2 <- rpois(100,2)
par(mfrow=c(1,2))
hist(z1, col=5, main="", xlab="Poisson")
hist(z2, add=TRUE, col=6)
legend("topright", legend=c("lambda=5", "lambda=2"), fill=c(5,6), bty="n")
boxplot(list(z2,z1), col=c(6,5), horizontal=TRUE, ylab="Distn", xlab="Poisson")

# P(X>Y) and P(X=Y)
z1 <- rpois(1000,0.6)
z2 <- rpois(1000,1.4)
# Win, Draw, Loss
sum(z1>z2)
sum(z1==z2)
sum(z1<z2)

#####################################
# Testing Proportions
#####################################

# null hypothesis: p >= 0.95
prop.test(45,75,p=0.95,alternative="less",conf.level=0.99)
# if you want to use the confidence interval of p_hat
# we need the two-sided test
prop.test(45,75,0.95,alternative="two.sided",conf.level=0.99)

#####################################
# Pfizer: Testing Proportions
#####################################

# x is the observed sample, n is the sample size
# our null hypothesis is p_1 - p_2 <= 0, i.e. p_1 <= p_2
prop.test(x=c(11,77),n=c(1.5*10^6,6*10^6),alternative="greater",conf.level=0.95)
# if you want to use the confidence interval of p_hat
# we need the two-sided test
prop.test(x=c(11,77),n=c(1.5*10^6,6*10^6),alternative="two.sided",conf.level=0.95)
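# Cross-check: the same one-sided comparison by hand via the two-proportion
# z-statistic (a sketch; prop.test uses a chi-square test with continuity
# correction, so the p-value will differ slightly)
x <- c(11,77); n <- c(1.5*10^6,6*10^6)
phat <- x/n                # the two sample proportions
pbar <- sum(x)/sum(n)      # pooled proportion under H0: p_1 = p_2
z <- (phat[1]-phat[2])/sqrt(pbar*(1-pbar)*sum(1/n))
1-pnorm(z)                 # one-sided p-value for H1: p_1 > p_2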
####################################
# Argon: Difference in means
####################################

# Lord Rayleigh data
a <- c(2.31017,2.30986,2.31010,2.31001,2.31024,2.31010,2.31028)
b <- c(2.30143,2.29890,2.29816,2.30182,2.29869,2.29940,2.29849,2.29889)
# two-sample t-test assuming equal variances
t.test(a,b,var.equal=T)

####################################
# A/B Test
####################################

site1 = c(.40, 500) # A: conversion rate, sample size
site2 = c(.30, 550) # B: conversion rate, sample size

# 1.28 is the z-value for an 80% interval
abtestfunc <- function(ad1, ad2){
  sterror1 = sqrt( ad1[1] * (1-ad1[1]) / ad1[2] )
  sterror2 = sqrt( ad2[1] * (1-ad2[1]) / ad2[2] )
  minmax1 = c((ad1[1] - 1.28*sterror1) * 100, (ad1[1] + 1.28*sterror1) * 100)
  minmax2 = c((ad2[1] - 1.28*sterror2) * 100, (ad2[1] + 1.28*sterror2) * 100)
  print(round(minmax1,2))
  print(round(minmax2,2))
}
abtestfunc(site1, site2)

####################
## Superbowl
####################

# import the Superbowl data
mydata1 <- read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/superbowl1.txt",header=T)
# look at the data
head(mydata1)
tail(mydata1)
mydata1
# attach so R recognizes each variable
attach(mydata1)
# see the distribution of outcome and spread through histograms
hist(Outcome)
hist(Spread)
# we can also calculate the mean and standard deviation
mean(Outcome); sd(Outcome)
mean(Spread); sd(Spread)
# plot Spread vs Outcome
plot(Spread,Outcome)
# add a 45 degree line
abline(1,1)
# correlation
cor(Spread,Outcome)
# compare boxplots
boxplot(Spread,Outcome,horizontal=T,names=c("spread","outcome"),col=c("red","yellow"),main="Superbowl")

# Covariance, correlation, alpha, beta
x <- Spread
y <- Outcome
mean(x); mean(y)
sd(x); sd(y)
cov(x,y)
cor(x,y)
beta <- cor(x,y)*sd(y)/sd(x)
alpha <- mean(y)-beta*mean(x)
beta
alpha
# regression check
model <- lm(y~x)
coef(model)

######################
## Kentucky Derby
######################

mydata2 <- read.csv("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/Kentucky_Derby_2014.csv",header=T)
# or import the data from the web using RStudio
# mydata2 = Kentucky_Derby_2014
# attach the dataset
attach(mydata2)
head(mydata2)
tail(mydata2)
# plot a histogram of speedmph
hist(speedmph,col="blue")
# finer bins
hist(speedmph,breaks=10,col="red")
hist(timeinsec,breaks=10,col="purple")
# find the left-tail observation
k1 <- which(speedmph == min(speedmph))
mydata2[k1,]
# find the best horse
k2 <- which(speedmph == max(speedmph))
mydata2[k2,]
# working directory
# setwd("~/....")

##################################
# Berkshire Hathaway 1990-2017
##################################

library(fImport)
Y = yahooSeries("BRK-A", from = "1990-01-01")
# take a look at the data
head(Y)
plot(Y[,6],type="l",col=20,main="BRKA Share Price",ylab="Price",xlab="Time")
# calculate returns
y = rev(Y$`BRK-A.Adj.Close`)
n <- length(y)
rets <- y[-1]/y[-n]-1
summary(rets)
# two plots in one
# par(mfrow=c(1,2))
plot(rets,pch=20,col=24)
abline(0,0)
# summarise returns
mean(rets)
sd(rets)
skewness(rets)
hist(rets,breaks=50,freq=FALSE,main="Berkshire Returns",col="red")
lines(seq(-1,1,0.001),dnorm(seq(-1,1,0.001),mean(rets),sd(rets)),lwd=2.5,col="green")
legend("topright",c("Actual Distribution","Normal Distribution"),
       lty=c(1,1),lwd=c(2.5,2.5),col=c("red","green"))
# to save the plot, click "Export" and "save as image"
# ts.plot(y,main="BRK-A price time series",ylab="price",xlab="day")
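# A normal Q-Q plot gives the same actual-vs-normal comparison in one view
# (a sketch using base R; fat tails show up as deviations in the corners)
qqnorm(rets,pch=20,col=4,main="Berkshire Returns: Normal Q-Q")
qqline(rets,col=2,lwd=2)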
##################################
# Apple and Google
##################################

# compare the log-returns of AAPL and GOOG
library(fImport)
Y = yahooSeries("AAPL", from = "2004-04-29")
y = rev(Y$AAPL.Adj.Close)
plot(y,type="l",col=20)
min(y)
max(y)
# calculate log-returns
n <- length(y)
lrets <- log(y[-1]/y[-n])
summary(lrets)
plot(lrets,pch=20,col=34)
abline(0,0)
skewness(lrets)
kurtosis(lrets,method=c("moment"))

Y = yahooSeries("GOOG", from = "2004-04-29")
y = rev(Y$GOOG.Adj.Close)
plot(y,type="l",col=20)
min(y)
max(y)
# calculate log-returns
n <- length(y)
lrets <- log(y[-1]/y[-n])
summary(lrets)
plot(lrets,pch=20,col=24)
abline(0,0)
skewness(lrets)
kurtosis(lrets,method=c("moment"))

##################################
#
# Section 3: Modeling
#
##################################

##################################
# Keynes Data
##################################

keynes = read.table("keynes.txt",header=T)
attach(keynes)
# plot the data
plot(Year,Keynes,pch=20,col=34)
plot(Market, Keynes, xlab="Market Return", ylab="Keynes Excess Return",col=20)
# correlation matrix
cor(cbind(Keynes,Market))
# fit the least-squares line
model = lm(Keynes~Market)
abline(model)
title("Keynes vs Market")
# extract the model coefficients
coef(model)
summary(model)
# 4-in-1 residual diagnostics
layout(matrix(c(1,2,3,4),2,2))
plot(model,pch=20)
# calculate excess returns
Keynes = Keynes - Rate
Market = Market - Rate
# correlation matrix
cor(cbind(Keynes,Market))
modelnew = lm(Keynes~Market)
summary(modelnew)

##################################
# Buffett Data
##################################

buffett = read.table("buffett.txt",header=T)
attach(buffett)
# plot the data
plot(Market, Buffett, xlab="Market Return", ylab="Buffett Return",pch=20)
legend(x="topleft",legend=c("Buffett","Market"),pch=15,col=c(2,4),bty="n")
# correlation matrix
cor(cbind(Buffett,Market))
# fit the least-squares line and superimpose it on the plot
model = lm(Buffett~Market)
abline(model)
title("Buffett vs Market")
# extract the model coefficients
coef(model)
summary(model)
# improvement in fit
sd(model$residuals)
sd(Buffett)
# remove datapoint 10
buffett_10 = buffett[-10,]
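# buffett_10 above drops datapoint 10 but the model is never refit;
# a minimal sketch of the before/after comparison (assuming the same columns)
model_10 = lm(Buffett~Market,data=buffett_10)
coef(model)    # full sample
coef(model_10) # datapoint 10 removed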
###################################################
# Boston Housing Data
#
# Cutpoints from the tree are added to the plot
###################################################

library(tree)
library(MASS)
data(Boston)
attach(Boston)

#--------------------------------------------------
# fit a tree to the Boston data using lstat only;
# first grow a big tree using a small value of mindev
temp = tree(medv~lstat,data=Boston,mindev=.0001)
cat('first big tree size: \n')
print(length(unique(temp$where)))
# then prune it down to one with 7 leaves
boston.tree = prune.tree(temp,best=7)
cat('pruned tree size: \n')
print(length(unique(boston.tree$where)))

#--------------------------------------------------
# plot the tree and the fits
par(mfrow=c(1,2))
# plot the tree
plot(boston.tree,type="uniform")
text(boston.tree,col="blue",label=c("yval"),cex=.8)
# plot the data with the fit
boston.fit = predict(boston.tree)  # get training fitted values
plot(lstat,medv,cex=.5,pch=16)     # plot data
oo = order(lstat)
lines(lstat[oo],boston.fit[oo],col='red',lwd=3)  # step function fit
cvals = c(9.725,4.65,3.325,5.495,16.085,19.9)    # cutpoints from the tree
for(i in 1:length(cvals)) abline(v=cvals[i],col='magenta',lty=2)  # cutpoints
rm(list=ls())

################################
# Sales versus price
################################

# first enter the data into R
sales = c(420, 380, 350, 400, 440, 380, 450, 420)
price = c(5.5, 6.0, 6.5, 6.0, 5.0, 6.5, 4.5, 5.0)
# plot the data
plot(price, sales, xlab="Price ($100)", ylab="Sales (Thousands of units)")
# correlation matrix
cor(cbind(sales,price))
# fit the least-squares line and superimpose it on the plot
model = lm(sales~price)
abline(model)
# extract the model coefficients
coef(model)
# improvement in fit
sd(model$residuals)
sd(sales)
# correlation matrix in logs
# (different from the correlation matrix above)
cor(log(cbind(sales,price)))
# model in logs
modelnew = lm(log(sales)~log(price))
summary(modelnew)

# new data
midcity <- read.table("House.txt",header=TRUE)
attach(midcity)
plot(SqFt,Price,pch=20,cex=1.25,main='Square feet vs. Sales price',
     ylab='Sales price in USD',xlab='Square feet',bty='n')
mx <- mean(SqFt)
my <- mean(Price)
sdx <- sd(SqFt)
sdy <- sd(Price)
rho <- cor(Price,SqFt)
b <- rho*sdy/sdx
a <- my - mx*b
abline(a,b,col='purple',lwd=3)

######################################
# Transformation Example
######################################

# simulate data: ln-linear
x = rnorm(50,1,1)
y = exp(2*x + rnorm(50,sd=0.4))
layout(matrix(c(1,2),1,2))
plot(x,y,col="purple",pch=20)
model1 = lm(y~x)
abline(model1,col=2,lwd=2)
plot(model1)
plot(x,log(y),col=2,pch=20)
model2 = lm(log(y)~x)
abline(model2,col=3,lwd=3)
plot(model2)
coef(model2)

# simulate data: ln-ln
x = runif(50,0,4)
y = 2*x^(1/3 + rnorm(50,sd=0.1))
layout(matrix(c(1,2),1,2))
plot(x,y,col="purple",pch=20)
model1 = lm(y~x)
abline(model1,col=2,lwd=2)
plot(log(x),log(y),col=2,pch=20)
model2 = lm(log(y)~log(x))
abline(model2,col=3,lwd=3)

####################################
# Mammals Example
####################################

mammals = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/mammals.txt",header=T)
attach(mammals)
# correlation matrix
cor(cbind(Brain,Body))
# fit the model
model = lm(Brain~Body)
par(mfrow=c(1,2))
plot(Brain,Body,pch=19,col=4,main="Fitted Model")
abline(lsfit(Brain,Body),lwd=2,col=2)
plot(Brain,model$res,pch=19,col=2,ylab="residuals",main="Residual Plot")
# correlation matrix in logs
cor(log(cbind(Brain,Body)))
lnmodel = lm(log(Body)~log(Brain))
# graphics: residual analysis
par(mfrow=c(1,2))
plot(log(Brain),log(Body),pch=19,col=4,main="Fitted Model")
abline(lsfit(log(Brain),log(Body)),lwd=2,col=2)
plot(log(Brain),lnmodel$res,pch=19,col=2,ylab="residuals",main="Residual Plot")
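# a sketch to see which mammal sits farthest from the log-log fit
# (largest absolute residual; assumes one row per mammal)
mammals[which.max(abs(lnmodel$res)),]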
##################################
# Electricity Data
##################################

# load the data and rename the variables
electricity = read.csv("Electricity.csv")
dimnames(electricity)[[2]] = c("Customer","Usage","Demand")
electricity
attach(electricity)
# plot the data
plot(Usage,Demand, main="Electricity Demand",pch=20,col="purple")
# correlation matrix
cor(cbind(Demand,Usage))
# fit a linear model
electricity.lm = lm(Demand ~ Usage)
anova(electricity.lm)
# plot the residuals
plot(electricity.lm$fitted,electricity.lm$residuals,
     xlab="fitted",ylab="residuals",main="residual plot",pch=20,col="purple")
abline(0,0)
# transform the response to stabilize the variance
rootDemand = sqrt(Demand)
# correlation matrix
cor(cbind(rootDemand,Usage))
electricity.lm = lm(rootDemand ~ Usage)
plot(electricity.lm$fitted,electricity.lm$residuals,
     xlab="fitted",ylab="residuals",main="residual plot with transformed response",pch=20,col=59)
abline(0,0)
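# the square-root transform can be checked against a Box-Cox profile;
# a sketch using MASS::boxcox (a lambda near 0.5 supports the square root)
library(MASS)
boxcox(lm(Demand ~ Usage), lambda=seq(-1,2,0.1))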
##################################
# Anscombe Residual Diagnostics
##################################

ansc <- read.csv("anscombe.csv", header=TRUE)
attach(ansc)

pdf('anscombe-R-001.pdf')
par(mfrow=c(2,2))
plot(ansc$x1, ansc$y1, main='Set 1',col=34,pch=20)
# correlation matrix
cor(cbind(y1,x1))
set1.fit <- lm(y1 ~ x1, data=ansc)
abline(set1.fit)
plot(ansc$x2, ansc$y2, main='Set 2',col=20,pch=20)
# correlation matrix
cor(cbind(y2,x2))
set2.fit <- lm(y2 ~ x2, data=ansc)
abline(set2.fit)
# sidebysideplot <- grid.arrange(plot1,plot2,col=2)
dev.off()

pdf('anscombe-R-002.pdf')
layout(matrix(1:4,2,2))
plot(ansc$x1, ansc$y1, main='Set 1',col=34,pch=20)
cor(cbind(y1,x1))
set1.fit <- lm(y1 ~ x1, data=ansc)
abline(set1.fit)
plot(ansc$x2, ansc$y2, main='Set 2',col=24,pch=20)
cor(cbind(y2,x2))
set2.fit <- lm(y2 ~ x2, data=ansc)
abline(set2.fit)
plot(ansc$x3, ansc$y3, main='Set 3',col=20,pch=20)
cor(cbind(y3,x3))
set3.fit <- lm(y3 ~ x3, data=ansc)
abline(set3.fit)
plot(ansc$x4, ansc$y4, main='Set 4',col=3,pch=20)
cor(cbind(y4,x4))
set4.fit <- lm(y4 ~ x4, data=ansc)
abline(set4.fit)
dev.off()  # close the pdf device so later plots show on screen

# fit the regression lines and get the results
summary(set1.fit)
summary(set2.fit)
summary(set3.fit)
summary(set4.fit)

# look at residual plots
layout(matrix(1:4,2,2))
plot(ansc$x1,residuals(set1.fit),main='Set 1 Residuals',col=34,pch=20)
plot(ansc$x2,residuals(set2.fit),main='Set 2 Residuals',col=24,pch=20)
plot(ansc$x3,residuals(set3.fit),main='Set 3 Residuals',col=20,pch=20)
plot(ansc$x4,residuals(set4.fit),main='Set 4 Residuals',col=3,pch=20)

################################
# Smoking Data
################################

Data = read.csv("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/smoking.csv",header=T)
attach(Data)
# correlation matrix
cor(cbind(Cancer,Consumption))
# scatter plot
plot(Consumption,Cancer,pch=20,col=34,main="Cancer vs Smoking consumption")
text(Consumption,Cancer,labels=Country)
# fit the least-squares line
model = lm(Cancer~Consumption)
abline(model,col=2,lty=2,lwd=2)
cooks.distance(model)
# remove the U.S. as an outlier
model2 = lm(Cancer[-7]~Consumption[-7])
abline(model2,col=3,lty=3,lwd=3)
legend("topleft", c("Full sample","Removed U.S."), col=2:3, lty=2:3, lwd=2:3, bty="n")
# extract the model coefficients
coef(model)
coef(model2)
# 4-in-1 residual diagnostics
layout(matrix(c(1,2,3,4),2,2))
plot(model,pch=20)
layout(matrix(c(1,2,3,4),2,2))
plot(model2,pch=20)
# delete the US and re-run
Datanew = Data[-11,]
lm(Cancer~Consumption,data=Datanew)
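# cooks.distance(model) above just prints the raw values; a sketch that
# plots them so influential countries stand out
plot(cooks.distance(model),type="h",ylab="Cook's distance")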
##############################
#
# Section 4: Multiple Regression
#
##############################

##################################
# Bordeaux Wine Data
##################################

bordeaux = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/bordeaux.txt",header=T)
attach(bordeaux)
# plot the data
plot(year,price,type="l",pch=20,col=4,main="Bordeaux Wine Prices")
# correlation matrix
cor(cbind(price,sum,har,sep,win,age))
# build a multiple regression model
model = lm(price~sum+har+sep+win+age)
summary(model)
coef(model)
# diagnostics
plot(rstudent(model))
# graphics: 2x2 plot
layout(matrix(c(1,2,3,4),2,2))
plot(model)
# prediction: import bordeauxp.txt or enter the data directly in R
bordeauxp = data.frame(read.table("bordeauxp.txt",header=T))
attach(bordeauxp)
predict(model,bordeauxp,interval="confidence")
# change the significance level to 99%
predict(model,bordeauxp,interval="confidence",level=0.99)
# fitted value plot
plot(price)
lines(predict(model))

# detailed graphics and predict.lm()
mydata1 = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/bordeaux.txt",header=T)
mydata2 = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/bordeauxp.txt",header=T)
# summary statistics
summary(mydata1)
# a multiple regression in logs
model = lm(log(price)~log(sum)+log(har)+log(sep)+log(win)+log(age),data=mydata1)
# log-log regression results
summary(model)
# residual diagnostics
plot(model)
# inputs: regression model and independent variables
pred = predict(model,mydata2,interval="prediction")
pred
# out-of-sample prediction line
plot(mydata2[,1],pred[,1],type="l",xlab="year",ylab="log_price",ylim=c(0,8),
     col=2, lty=2, lwd=2,main="out-of-sample prediction line")
lines(mydata2[,1],pred[,2],col=3, lty=3, lwd=3)
lines(mydata2[,1],pred[,3],col=4, lty=4, lwd=4)
legend("topleft", c("fit","lwr","upr"),col=2:4, lty=2:4, lwd=2:4, bty="n")
# exponentiate back to the predicted price
price_hat = exp(pred[,1])
plot(mydata2[,1],price_hat,type="l",xlab="year",ylab="predicted price",ylim=c(0,50),
     col=2, lty=2, lwd=2,main="out-of-sample prediction line")

##################################
# Election 2016 Data
##################################

mydata = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/election2016.txt",header=T)
attach(mydata)
head(mydata)
names(mydata)
# the equation to predict the 2016 presidential election
model1 = lm(VP~G+P+Z, na.action=na.omit, data=mydata)
summary(model1)
# the equation to predict the 2016 House election
model2 = lm(VC~G+P+Z, na.action=na.omit, data=mydata)
summary(model2)

##################################
# Homeless Data
##################################

homeless = read.table("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/homeless.txt",header=T)
attach(homeless)
names(homeless)
# correlation matrix
cor(cbind(rate,pov,unemp,housing,pop,temp,vac,rentc))
# build a model
model = lm(rate~pov+unemp+housing+pop+temp+vac+rentc)
# diagnostics
plot(model)
# transformation and prediction commands
plot(model)
coef(model)
summary(model)
rstudent(model)
cooks.distance(model)

##################################
# Newfood Data
##################################

# import newfood.csv
newfood = read.csv("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/newfood.csv",header=T)
attach(newfood)
names(newfood)
# correlation matrix
cor(cbind(sales,price,adv,locat,income,svol))
# build the model
model = lm(sales~price+adv+locat+income+svol)
summary(model)
# check diagnostics
layout(matrix(c(1,2,3,4),2,2))
plot(model,pch=20,col="blue")

# Transformation: log-log model
# don't transform the dummy variables adv and locat
log_sales = log(sales)
log_price = log(price)
log_income = log(income)
log_svol = log(svol)
# correlation matrix
cor(cbind(log_sales,log_price,adv,log_income,log_svol))
# build the model
modelnew = lm(log_sales~log_price+adv+log_income+log_svol)
summary(modelnew)
# graphics: 2x2 plot
layout(matrix(c(1,2,3,4),2,2))
plot(modelnew,pch=20,col="blue")
# 99% confidence interval at new data
newdata1 = data.frame(log_price=log(30),adv=1,log_income=log(8),log_svol=log(34))
predict.lm(modelnew,newdata1,se.fit=T,interval="confidence",level=0.99)
# predict.lm at median(log_price), adv=1, median(log_income), median(log_svol)
newdata2 = data.frame(log_price=median(log_price),adv=1,
                      log_income=median(log_income),log_svol=median(log_svol))
predict.lm(modelnew,newdata2,se.fit=T,interval="prediction")
# prediction lines for the log-log model
layout(matrix(c(1,2,3,4),2,2))
termplot(modelnew,ylabs="log_sales",terms="log_price",se=T)
termplot(modelnew,ylabs="log_sales",terms="adv",se=T)
termplot(modelnew,ylabs="log_sales",terms="log_income",se=T)
termplot(modelnew,ylabs="log_sales",terms="log_svol",se=T)

##############################
#
# Section 5: Time Series
#
##############################

##############################
# Nasdaq-DJI
##############################

Data = read.table("nasdaq-djia.txt",header=T)
attach(Data)
# correlation matrix
cor(cbind(nasdaq,djia))
model = lm(nasdaq~djia)
plot(djia,nasdaq,cex=0.8,pch=19,col=4)
abline(model$coef[1], model$coef[2],lwd=2,col=2)
acf(model$res,cex=2,ci.col=2,col=4,lwd=3,main="ACF of Residuals")
plot(model$res,cex=0.8,pch=19,col=2)
lines(model$res,col=4,lwd=2)

##############################
# SP500
##############################

Data = read.table("SP500-dailyreturns.txt",header=T)
attach(Data)
L1 = length(Returns)
sp500 = Returns[1:(L1-1)]   # returns
sp5001 = Returns[2:L1]      # returns shifted by one day
T = 1:L1
# correlation matrix
cor(cbind(sp500,sp5001))
model = lm(sp500~sp5001)
plot(sp500,sp5001,cex=0.8,pch=19,col=4)
abline(model$coef[1], model$coef[2],lwd=2,col=2)
acf(model$res,cex=2,ci.col=2,col=4,lwd=3,main="ACF of Returns")
plot(model$res,cex=1.5,pch=19,col=2)
lines(model$res,col=4,lwd=2)

#############################
# Volatility VIX
#############################

Data = read.csv("vix2009.csv",header=T)
attach(Data)
pdf(file="vix-acf.pdf",width=5,height=4)
acf(Close,cex=2,ci.col=2,col=4,lwd=3,main="ACF of VIX")
dev.off()  # close the pdf device so later plots show on screen
L1 = length(Close)
vix2009 = Close[1:(L1-1)]
vix20091 = Close[2:L1]
vixreturn <- vix20091/vix2009
lvix <- log(vixreturn)
plot(lvix,pch=19,col=10)
acf(lvix,cex=2,ci.col=2,col=4,lwd=3,main="ACF of VIX-returns")
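# a formal companion to the ACF plot: the Ljung-Box test for serial
# correlation in the VIX log-returns (a sketch using base R's Box.test)
Box.test(lvix,lag=10,type="Ljung-Box")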
#########################
# House Example
#########################

# setwd("C:/Users/ngp/Documents/41000")
House = read.table("housedata.txt",header=T)
attach(House)
plot(Size,Price,pch=19)
# correlation matrix
cor(cbind(Price,Size))
Housemodel = lm(Price~Size,data=House)
summary(Housemodel)
names(Housemodel)
names(summary(Housemodel))
confint(Housemodel)
confint(Housemodel,level=0.99)
Xfuture <- data.frame(Size=seq(0,8,by=0.01))
Future1 = predict(Housemodel, Xfuture, interval="prediction",se.fit=T)
Future2 = predict(Housemodel, Xfuture, interval="prediction",se.fit=T,level=0.99)
plot(Size,Price,xlim=c(0,8),ylim=range(Future1$fit),pch=19,cex.lab=1.3)
abline(lsfit(Size,Price),lwd=2,col=2)
lines(Xfuture$Size,Future1$fit[,2],col=4,lwd=2,lty=2)
lines(Xfuture$Size,Future1$fit[,3],col=4,lwd=2,lty=2)
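# Future2 (the 99% prediction bands) is computed above but never drawn;
# a sketch overlaying it on the same plot
lines(Xfuture$Size,Future2$fit[,2],col=3,lwd=2,lty=3)
lines(Xfuture$Size,Future2$fit[,3],col=3,lwd=2,lty=3)
legend("topleft",c("95% bands","99% bands"),col=c(4,3),lty=c(2,3),lwd=2,bty="n")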
##########################
# South Sea Bubble
##########################

# setwd("C:/Users/ngp/Documents/41000solns/sea bubble example")
mydata <- read.csv("sea_bubble.csv",header=T)
head(mydata)
names <- ls(mydata)
ts1 <- ts(mydata[,2], start=c(1719, 232), end=c(1720, 365), frequency=365)
ts2 <- ts(mydata[,3], start=c(1719, 232), end=c(1720, 365), frequency=365)
ts3 <- ts(mydata[,4], start=c(1719, 232), end=c(1720, 365), frequency=365)
ts4 <- ts(mydata[,5], start=c(1719, 232), end=c(1720, 365), frequency=365)
plot(ts1,main="South Sea Bubble",ylab="Prices",ylim=c(0,1000), col=1, lty=1, lwd=1)
lines(ts2,col=2, lty=2, lwd=2)
lines(ts3,col=3, lty=3, lwd=3)
lines(ts4,col=4, lty=4, lwd=4)
legend("topleft", names[c(1,5,4,6)], col=1:4, lty=1:4, lwd=1:4, bty="n")
# fit a random walk model for the South Sea Company
model = ar(ts4,order.max=1,na.action=na.exclude)
model
# the AR coefficient is close to 1, so this is close to a random walk

##########################
# Kentucky Derby
##########################

# setwd("C:/Users/ngp/Documents/41000solns")
mydata = read.csv("Kentucky_Derby_2014.csv",header=T)
head(mydata)
tail(mydata)
attach(mydata)
ts1 <- ts(speedmph, start=1875, end=2014, frequency=1)
plot(ts1,xlab="year",ylab="speed mph",main="Kentucky Derby",col=2, lty=2, lwd=2)
# remove the time trend, then fit an ARMA model
model = lm(speedmph~year_num)
speed_tt = model$coef[1] + model$res
ts2 <- ts(speed_tt, start=1875, end=2014, frequency=1)
plot(ts2,xlab="year",ylab="speed mph",main="time trend removed",col=2, lty=2, lwd=2)
# find the outliers
out = which(abs(model$res)>1.645)
# label the outliers
plot(ts2,xlab="year",ylab="speed mph",main="time trend removed",col=2, lty=2, lwd=2)
text(x=out+1874,y=ts2[out],labels=winner[out])
# remove the outliers
ts3 = ts2[-out]
# the acf and pacf plots suggest an AR model might not be appropriate,
# but we can consider an MA model
# acf(ts3)
# pacf(ts3)
model3 = arima(ts3,c(0,0,1))
# MA(1) prediction + the time trend
ts4 = ts3-model3$res + c(1:140)[-out]*model$coef[2]
plot(ts4,main="MA(1) predictions, outliers removed",
     xlab="year",ylab="speed mph",col=2, lty=2, lwd=2)
lines(ts1[-out],col=3, lty=3, lwd=3)

##########################
# Random Walk
##########################

library(fImport)
Y = yahooSeries("SQM", from = "2008-01-01")
y = rev(Y$SQM.Adj.Close)
plot(y,type="l",col=20)
N = length(y)
mu = y[1]
sigma = sd(diff(y))
# simulate a random walk with the same starting value and volatility
x = rep(mu, N)
for(i in 2:N) {
  x[i] = x[i-1] + rnorm(1,0,sigma)
}
mylim = c(min(c(x,y)), max(c(x,y)))
par(mfrow=c(1,2))
plot(y,type='l', bty='n', xlab='Trading days from January 1, 2008',
     ylab=expression(X[t]), main="Simulated random walk....", ylim=mylim)
plot(x,type='l', bty='n', xlab='Trading days from January 1, 2008',
     ylab=expression(X[t]), main="... or actual stock price?", ylim=mylim)

############################################
#
# MNIST
#
###########################################

library(keras)

mnist <- dataset_mnist()
x_train <- mnist$train$x
y_train <- mnist$train$y
x_test <- mnist$test$x
y_test <- mnist$test$y

# one-hot encode the labels
y_train <- to_categorical(y_train, 10)
y_test <- to_categorical(y_test, 10)

# flatten the 28x28 images into vectors of length 784
dim(x_train) <- c(nrow(x_train), 784)
dim(x_test) <- c(nrow(x_test), 784)
# rescale to [0,1]
x_train <- x_train / 255
x_test <- x_test / 255

model <- keras_model_sequential()
model %>%
  layer_dense(units = 256, activation = 'relu', input_shape = c(784)) %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 10, activation = 'softmax')
summary(model)

model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(),
  metrics = c('accuracy')
)

history <- model %>% fit(
  x_train, y_train,
  epochs = 30, batch_size = 128,
  validation_split = 0.2
)
plot(history)

model %>% predict_classes(x_test)

######### simulation example: regression #########

N = 10000
K = 50
d1 = 10
d2 = 10
# simulate data from a two-hidden-layer ReLU network
X = matrix(rnorm(N*K),N,K)
Z1 = pmax(X,0) %*% matrix(rpois(K*d1,lambda=1)/d1,K,d1) + rep(rnorm(d1),each=N)
Z2 = pmax(Z1,0) %*% matrix(rpois(d1*d2,lambda=1)/d2,d1,d2) + rep(rnorm(d2),each=N)
Y = pmax(Z2,0) %*% matrix(rpois(d2*1,lambda=1),d2,1) + 1

# 90/10 train/test split
train.idx = sample(1:N,floor(N*0.9),replace=FALSE)
test.idx = setdiff(1:N,train.idx)
x_train = X[train.idx,]
y_train = Y[train.idx]
x_test = X[test.idx,]
y_test = Y[test.idx]

model <- keras_model_sequential()
model %>%
  layer_dense(units = 2*d1, activation = 'relu', input_shape = c(K)) %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = d2, activation = 'relu') %>%
  layer_dense(units = 1)

model %>% compile(
  optimizer = optimizer_rmsprop(),
  loss = 'mse',
  metrics = c('mae')
)

history <- model %>% fit(
  x_train, y_train,
  epochs = 100, batch_size = N*0.1
)

model %>% evaluate(x_test, y_test)
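# beyond the test-set loss, a quick visual check of fit quality;
# a sketch plotting predicted against actual values on the test set
y_hat = model %>% predict(x_test)
plot(y_test,y_hat,pch=20,col=4,xlab="actual",ylab="predicted",
     main="Test set: predicted vs actual")
abline(0,1,col=2,lwd=2)  # 45-degree line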