#---------------------------------
# Derby Data
#---------------------------------

mydata = read.csv("http://faculty.chicagobooth.edu/nicholas.polson/teaching/41000/Kentucky_Derby_2018.csv",header=T)
head(mydata)
tail(mydata)
attach(mydata)


ts1 <- ts(speedmph, start=1875, end=2016, frequency=1) 
plot(ts1,xlab="year",ylab="speed mph",main="Kentucky Derby",col=2, lty=2, lwd=2)

# we can see obvious time trend in the time series plot
# here is a simple way to extract the time trend
# after removing the time trend, we can talk about the ARMA model
model = lm(speedmph~year_num)
speed_tt = model$coef[1] + model$res
ts2 <- ts(speed_tt, start=1875, end=2016, frequency=1) 
plot(ts2,xlab="year",ylab="speed mph",main="time trend removed",col=2, lty=2, lwd=2)

# to find the outlier
out = which(abs(model$res)>1.645)

# label those outliers
plot(ts2,xlab="year",ylab="speed mph",main="time trend removed",col=2, lty=2, lwd=2)
text(x=out+1874,y=ts2[out],labels=winner[out])

# remove the outliers
ts3 = ts2[-out]
winner2 = winner[-out]
years = c(1:142)[-out]+1874

# plot the acf and pacf plots
# an AR model might not be appropriate
# but we can consider an MA model 
# acf(ts3)
# pacf(ts3)

model3 = arima(ts3,c(0,0,1))

out2 = which(abs(model3$res)>1)
winner2[out2]

# MA(1) prediction + the time trend
ts4 = ts3-model3$res + c(1:142)[-out]*model$coef[2]

plot(years,ts4,type="l",ylim=c(32.5,37.5),main="MA(1) predictions, outliers removed",
     xlab="year",ylab="speed mph",col=2, lty=2, lwd=2)
lines(years,ts1[-out],type="l",col=3, lty=3, lwd=3)
text(x=years[out2],y=ts1[-out][out2],labels=winner2[out2])