# Load the Boston housing data set, which ships with the MASS package.
library(MASS)
data(Boston)
# Inspect the structure of Boston (column types and a preview of values).
str(Boston)
# Per-column summary statistics.
summary(Boston)
# Take a look at the first several rows, then at the dimensions.
head(Boston)
dim(Boston)
# Column names of Boston.
colnames(Boston)
# Subsetting: pick `age`, which is also the 7th column.
# The two expressions below return the same vector.
Boston$age
Boston[, 7]
# Confirm the equivalence with a single TRUE/FALSE. identical() avoids both
# element-wise `==` on doubles and printing one logical per row.
identical(Boston$age, Boston[, 7])
# Filter on `age`. NOTE: per the MASS documentation, `age` is the proportion
# of owner-occupied units built before 1940 (not the age of a house in years),
# and each row describes an area rather than a single house. This keeps the
# rows whose `age` value is below 10.
Boston[Boston$age < 10, ]
# Another way to express the same filter.
subset(Boston, age < 10)
# Mean and variance of `medv`, the median value of owner-occupied homes.
mean(Boston$medv)
var(Boston$medv)
# Correlation and covariance between `age` and `medv`.
# The correlation function is cor(); base R and stats have no corr(), so the
# original call stopped the script with "could not find function".
cor(Boston$age, Boston$medv)
cov(Boston$age, Boston$medv)
# Adding a column: draw one standard-normal value per row and attach it
# to the data frame under the name `randomvec`.
N <- nrow(Boston)
randomvec <- rnorm(n = N)
Boston$randomvec <- randomvec
head(Boston) # the new column is now part of the data frame
summary(Boston)
# Removing it again: assigning NULL to a column drops that column.
Boston$randomvec <- NULL
# Selecting rows 5, 7, 9 and 11 (all columns).
index <- c(5, 7, 9, 11)
Boston[index, ]
# Selecting the same rows, restricted to columns 2 and 5.
index2 <- c(2, 5)
Boston[index, index2]
# Scatterplot matrix of every pair of columns.
plot(Boston)
# That is a lot of panels; focus on medv against age instead.
plot(Boston$age, Boston$medv)
# Simple linear regression of medv on age.
fit <- lm(formula = medv ~ age, data = Boston)
# Coefficient table, residual summary and fit statistics.
summary(fit)
# Overlay the fitted line on the scatterplot.
# `col` sets the line colour, `lwd` sets the line width.
abline(fit, col = "red", lwd = 5)
# Predict medv for a new observation with age = 53.
newdata <- data.frame(age = 53)
pred <- predict(fit, newdata = newdata)
pred
# Mark the predicted point on the existing plot.
points(x = 53, y = pred, pch = "x", col = "blue", cex = 5)
# Regression of medv on all remaining columns (`.` means "everything else").
fit2 <- lm(formula = medv ~ ., data = Boston)
summary(fit2)