###################################################################### # Problem 2 # # Stereo System # ###################################################################### Sales = c(420, 380, 350, 400, 440, 380, 450, 420) Price = c(5.5, 6.0, 6.5, 6.0, 5.0, 6.5, 4.5, 5.0) reg = lm(Sales ~ Price) plot(Price, Sales) abline(reg$coef[1], reg$coef[2], col="red") title("Salves vs. Price") confint(reg) ###################################################################### # Problem 6 # # Shock Absorber (SLR) # ###################################################################### # Read the data. Make sure you are in the right directory. shock = read.csv("shock.csv") # So that we don't have to write shock$whatever every time. attach(shock) # If you do not want to attach shock, you can use shock$variablename. # Plot the data. plot(reboundb, rebounda, xlab="Before", ylab="After", main="Shock") # Do the regression. reg = lm(rebounda ~ reboundb) # Take a look at the summary. summary(reg) # To get the confidence interval for beta_0 and beta_1. confint(reg) # Get our prediction when BEFORE = 550. B = 550 # A = b_0 + b_1 B A = reg$coef[1] + reg$coef[2] * B # Now create our plug-in predictive interval. Left = A - 2 * 7.67 Right = A + 2 * 7.67 # Detach shock to free up the namespace. detach(shock) ###################################################################### # Problem 8 # # Heights (SLR) # ###################################################################### # Read the file. MBA = read.csv("MBA-hgt.csv", header=TRUE) # So that we don't have to write MBA$whatever every time. attach(MBA) # Plot. par(mfrow=c(1,2)) # Put both on same plot. plot(MHGT, SHGT) # Plot student vs. mother. plot(FHGT, SHGT) # Plot student vs. fater. # Get the correlation. MCOR = cor(MHGT, SHGT) FCOR = cor(FHGT, SHGT) # Get the means. MAVE = mean(MHGT) FAVE = mean(FHGT) SAVE = mean(SHGT) # Do the regressions. mreg = lm(SHGT ~ MHGT) freg = lm(SHGT ~ FHGT) # To get the least squares lines and the residual standard error use # summary. summary(mreg) summary(freg) # Calculate the sum of squares by hand. # Sum of squared residuals. MSSE = sum(mreg$res^2) FSSE = sum(freg$res^2) # Total sum of squares - same for each. SST = sum((SHGT - mean(SHGT))^2) # Sum of squares for the regression. MSSR = SST - MSSE FSSR = SST - FSSE # The R^2 MR2 = MSSR / SST FR2 = FSSR / SST # The residual for my own height. MRES = 72 - (32.8 + 0.574 * 67) FRES = 72 - (47.6 + 0.315 * 74) # Detach MBA to free up the namespace. detach(MBA) ###################################################################### # Problem 9 # # Market Model Example # ###################################################################### # Load the data. mkt = read.csv("mktmodel.csv") # See the names of the data. names(mkt) # Run the regressions. reg1 = lm(GE ~ SP500, data=mkt) reg2 = lm(JPM ~ SP500, data=mkt) reg3 = lm(XON ~ SP500, data=mkt) # Check the confidence intervals confint(reg1) confint(reg2) confint(reg3) # Check the 99% confidence interval. confint(reg1, level=0.99) ###################################################################### # Problem 10 # # Sample Size and SLR # ###################################################################### N = 70 # Make some X values and record the min and max. X = rnorm(N, 0, sqrt(2)) xint = c(min(X), max(X)) # The random sample Y. ep = rnorm(N, 0, 2) Y = 3.0 + 1.0 * X + ep yint = c(min(Y), max(Y)) ## Y2 = 3.0 + 1.0 * X + rnorm(N, 0, 4) ## yint2 = c(min(Y2), max(Y2)) # Scatter plot with true regression line. par(mfrow=c(1,2)) plot (X, Y, col=2, ylim=yint, main="ep ~ N(0, 4)") abline(3.0, 1.0) ## plot(X, Y2, col=3, ylim=yint2, ylab="Y", main="ep ~ N(0,16)") ## abline(3.0, 1.0) # Split the sample into different sizes... # 10 data points. Y10 = Y[ 1:10] X10 = X[ 1:10] # 20 data points. Y20 = Y[11:30] X20 = X[11:30] # 40 data points. Y40 = Y[31:70] X40 = X[31:70] # Three different regressions. reg10 = lm(Y10 ~ X10) reg20 = lm(Y20 ~ X20) reg40 = lm(Y40 ~ X40) # Plot the results of the regressions... # To plot all three regressions at the same time. par(mfrow=c(1,3)) the.title = "Regression with 10 data points." plot (X10, Y10, xlim=xint, ylim=yint, main=the.title, col=2) abline(reg10, col=2) abline(3.0, 1.0) the.title = "Regression with 20 data points." plot (X20, Y20, xlim=xint, ylim=yint, main=the.title, col=2) abline(reg20, col=2) abline(3.0, 1.0) the.title = "Regression with 40 data points." plot (X40, Y40, xlim=xint, ylim=yint, main=the.title, col=2) abline(reg40, col=2) abline(3.0, 1.0) # Reset the plotting device to display one plot at a time. par(mfrow=c(1,1)) ###################################################################### # Problem 11 # # T-Statistics # ###################################################################### # The number of data points. n = 50 # The dependent variable X. X = rnorm(n, 0.0, 2.0) # You can pick anything here!!! beta0 = 0.2 beta1 = 0.4 sigma = 1.0 # Number of times we repeat the trial. M = 1000 # Where we store the results of each trial. t.b0 = rep(0, M) t.b1 = rep(0, M) # Simulate. for(j in 1:M){ # Create some data. Y = rep(0, n) for(i in 1:n){ # Y = beta.0 + beta.1 * X + epsilon, epsilon ~ N(0, sigma^2). Y[i] = beta0 + beta1 * X[i] + rnorm(1, 0, sigma) } # The linear regression. reg = lm(Y ~ X) # The output of the regression. b0 = reg$coef[1] b1 = reg$coef[2] # Residual (or regression) standard error. (Different names, same thing.) s = sqrt( sum(reg$res^2) / reg$df.res ) # The standard error for b0. s.b0 = s * sqrt( 1/n + mean(X)^2 / ( (n-1) * var(X) ) ) # The standard error for b1. s.b1 = s / (sqrt(n-1) * sd(X)) # t.b0 is supposed to be approximately N(0, 1). t.b0[j] = (b0 - beta0) / s.b0 # t.b1 is supposed to be approximately N(0, 1). t.b1[j] = (b1 - beta1) / s.b1 } # This will help us draw a N(0, 1) density. xgrid = seq(-4, 4, 0.05) ygrid = dnorm(xgrid, 0, 1) # We want to plot both histograms side by side. par(mfrow=c(1,2)) # The histogram of z.b0 with a N(0, 1) density. hist(t.b0, prob=TRUE, ylim=c(0, 0.45), breaks=20) lines(xgrid, ygrid, col="red") # The histogram of z.b1 with a N(0, 1) density. hist(t.b1, prob=TRUE, ylim=c(0, 0.45), breaks=20) lines(xgrid, ygrid, col="red") # Reset the plotting device to display one plot at a time. par(mfrow=c(1,1)) ###################################################################### # Problem 13 # # Profit, Risk, RD (MLR) # ###################################################################### ## CHANGECHANGE # Read the file. profits = read.csv("Profits.csv") attach(profits) # Plot Profit against each independent variable. par(mfrow=c(1,2)) plot(RD , PROFIT, col=2, main="Profit vs. RD") plot(RISK, PROFIT, col=3, main="Profit vs. Risk") # Run the regression... reg = lm(PROFIT ~ RD + RISK) # Residual standard error is 14.34, so the 95% plug-in prediction interval is cond.mean = reg$coef[1] + reg$coef[2] * 76 + reg$coef[3] * 7 lower = cond.mean - 2 * 14.34 upper = cond.mean + 2 * 14.34 # Let R tell us what the 95% prediction interval is pred.int = predict(reg, data.frame(RD=76, RISK=7), interval="prediction", level=0.95) # Run the regression on just risk. reg.risk = lm(PROFIT ~ RISK) # Residual standard error is 106.1, so the 95% plug-in prediction interal is cond.mean = reg.risk$coef[1] + reg.risk$coef[2] * 7 lower.risk = cond.mean - 1 * 106.1 upper.risk = cond.mean + 1 * 106.1 # Detach profits to free up the namespace. detach(profits) # Reset the plotting device to display one plot at a time. par(mfrow=c(1,1)) ###################################################################### # Problem 14 # # Zagat (MLR) # ###################################################################### # Read file zagat = read.csv("zagat.csv") attach(zagat) # Plot price against everything else. par(mfrow=c(1,3)) plot(food, price) plot(service, price) plot(decor, price) # Run the regression on all of the explanatory variables. reg.all = lm(price ~ food + service + decor) # Let R calculate the prediction interval. pred.int.all = predict(reg.all, data.frame(food=18, service=14, decor=16), interval="prediction", level=0.95) # Run the regression using JUST food. reg.food = lm(price ~ food) # Plot food vs. service. par(mfrow=c(1,1)) plot(service, food, main="food vs. service") # Let R calculate the prediction interval. pred.int.food = predict(reg.food, data.frame(food=18), interval="prediction", level=0.95) # The prediction interval for part (e). pred.int.all2 = predict(reg.all, data.frame(food=20, service=3, decor=17), interval="prediction", level=0.95) # Detach zagat to free up the namespace. detach(zagat) # Reset the plotting device to display one plot at a time. par(mfrow=c(1,1)) ###################################################################### # Problem 16 # # Baseball (Dummy Variables) # ###################################################################### # Read the csv file. baseball = read.csv("RunsPerGame.csv", header=TRUE) # What are the variables available. names(baseball) # Try running the regression WITHOUT the following line. What is different? baseball$League = factor(baseball$League, levels=c("National", "American")) # Run the regression. reg = lm(R.G ~ League + OBP, data=baseball) # Check out the summary. summary(reg) # Check the confidence intervals. confint(reg, level=0.99) ###################################################################### # Problem 17 # # Corn Yields (Dummy Variables) # ###################################################################### # I made a custom data set for this problem. See end of file. corn = read.csv("corn.csv", header=TRUE) # Regression for (b) regb = lm(Y ~ F1 + F2 + F3, data=corn) # Regression for (c) regc = lm(Y ~ ANY, data=corn) # Extra regression, for (d) regd = lm(Y ~ F2 + F3 + CG, data=corn) ###################################################################### # Problem 18 # # Mid-City Housing Data (MLR, Dummy Variables) # ###################################################################### house = read.csv("MidCity.csv", header=TRUE) attach(house) n = dim(house)[1] N2 = rep(0,n) # create dummy variable for Nbhd =2 N2[Nbhd==2] = 1 N3 = rep(0,n) # create dummy variable for Nbhd =2 N3[Nbhd==3] = 1 # Run the regression. reg4a = lm(Price ~ Brick + N2 + N3 + Offers + SqFt + Bedrooms + Bathrooms) # Check the confidence interval. confint(reg4a) # Run the regression. reg4b = lm(Price ~ Brick + N2 + N3 + Offers + SqFt + Bedrooms + Bathrooms + Brick*N3) # Check the confidence interval. confint(reg4b) ###################################################################### ## CUSTOM DATA ## ###################################################################### ###################################################################### # corn.csv # ###################################################################### # You need to copy and paste this into a text file and then remove the ##. # You can do that by finding "##" and replacing "". ## "Y","F1","F2","F3","CG","ANY" ## 31,1,0,0,0,1 ## 34,1,0,0,0,1 ## 34,1,0,0,0,1 ## 34,1,0,0,0,1 ## 43,1,0,0,0,1 ## 35,1,0,0,0,1 ## 38,1,0,0,0,1 ## 36,1,0,0,0,1 ## 36,1,0,0,0,1 ## 45,1,0,0,0,1 ## 27,0,1,0,0,1 ## 27,0,1,0,0,1 ## 25,0,1,0,0,1 ## 34,0,1,0,0,1 ## 21,0,1,0,0,1 ## 36,0,1,0,0,1 ## 34,0,1,0,0,1 ## 30,0,1,0,0,1 ## 32,0,1,0,0,1 ## 33,0,1,0,0,1 ## 36,0,0,1,0,1 ## 37,0,0,1,0,1 ## 37,0,0,1,0,1 ## 34,0,0,1,0,1 ## 37,0,0,1,0,1 ## 28,0,0,1,0,1 ## 33,0,0,1,0,1 ## 29,0,0,1,0,1 ## 36,0,0,1,0,1 ## 42,0,0,1,0,1 ## 33,0,0,0,1,0 ## 27,0,0,0,1,0 ## 35,0,0,0,1,0 ## 25,0,0,0,1,0 ## 29,0,0,0,1,0 ## 20,0,0,0,1,0 ## 25,0,0,0,1,0 ## 40,0,0,0,1,0 ## 35,0,0,0,1,0 ## 29,0,0,0,1,0