# Introductory walkthrough of data-frame handling, plotting and linear
# regression in R, using the Boston housing data shipped with MASS.

# Load data ----
# The Boston data set lives in package MASS.
library(MASS)
data(Boston)

# Inspect the data ----
# Structure (type of each column) of Boston
str(Boston)
# Per-column summary statistics
summary(Boston)
# Look at the first few rows
head(Boston)
# Number of rows and columns
dim(Boston)
# Column names of Boston
colnames(Boston)

# Subsetting ----
# Pick `age`, which is also the 7th column; the two forms below are
# equivalent.
Boston$age
Boston[, 7]
# identical() gives a single TRUE/FALSE instead of one logical per row.
identical(Boston$age, Boston[, 7])

# Rows with age < 10.
# NOTE: per ?Boston, `age` is the percentage of owner-occupied units built
# before 1940 (not the age of a house in years), and each row is an area
# rather than a single house.
Boston[Boston$age < 10, ]
# Another way to write the same filter
subset(Boston, age < 10)

# Summary statistics ----
# Mean and variance of medv (median home value, see ?Boston)
mean(Boston$medv)
var(Boston$medv)

# Correlation and covariance between age and medv.
# The correlation function in package stats is cor(); corr() does not exist
# in base R.
cor(Boston$age, Boston$medv)
cov(Boston$age, Boston$medv)

# Adding and removing columns ----
# Add a new column of standard-normal noise, one value per row.
# (No seed is set, so the values differ from run to run.)
N <- nrow(Boston)
randomvec <- rnorm(N)
Boston$randomvec <- randomvec
head(Boston)
# It's there!
summary(Boston)
# Changed my mind: assigning NULL deletes the column.
Boston$randomvec <- NULL

# Subset rows 5, 7, 9, 11
index <- c(5, 7, 9, 11)
Boston[index, ]
# Subset rows 5, 7, 9, 11 and columns 2, 5
index2 <- c(2, 5)
Boston[index, index2]

# Plotting ----
# plot() on a whole data frame draws a scatterplot matrix of every pair of
# columns.
plot(Boston)
# Too many panels; plot medv against age only.
plot(Boston$age, Boston$medv)

# Simple linear regression ----
# Fit medv on age
fit <- lm(medv ~ age, data = Boston)
# Summarize the regression
summary(fit)
# Draw the fitted line on the current plot:
#   col controls the color, lwd controls the line width.
abline(fit, col = "red", lwd = 5)

# Predict medv at age = 53. The column name in newdata must match the
# predictor name used in the formula.
newdata <- data.frame(age = 53)
pred <- predict(fit, newdata)
pred
# Mark the predicted point on the plot
points(53, pred, pch = "x", col = "blue", cex = 5)

# Multiple regression ----
# Regress medv on every other column (`.` means "all remaining columns").
fit2 <- lm(medv ~ ., data = Boston)
summary(fit2)