Newer
Older
library(caret)
library(ggplot2)
data = read.csv("communities.csv")
# ----1.----
feature_column_names <- setdiff(names(data), "ViolentCrimesPerPop")
scaler <- preProcess(data[, feature_column_names])
data_scaled[, feature_column_names] <- predict(scaler, data[, feature_column_names])
X <- data_scaled[, feature_column_names]
y <- data_scaled$ViolentCrimesPerPop
covariance_matrix <- cov(X)
eigen_decomposition <- eigen(covariance_matrix)
eigen_values <- eigen_decomposition$values
variance <- eigen_values / sum(eigen_values)
which(cumsum(variance) >= 0.95)[1]
variance[1]
variance[2]
# ----2.----
principle_components <- prcomp(X)
U <- principle_components$rotation
principle_components$x[, 1]
plot(
U[, 1],
main = "PC1",
col = "blue",
ylab = "Magnitude",
pch = 19
)
ordered_abs <- order(abs(U[, 1]), decreasing = TRUE)
names(U[, 1])[ordered_abs[1:5]]
pc_scores <- data.frame(
PC1 = principle_components$x[, 1],
PC2 = principle_components$x[, 2],
ViolentCrimesPerPop = data_scaled$ViolentCrimesPerPop
)
ggplot(pc_scores, aes(x = PC1, y = PC2, color = ViolentCrimesPerPop)) +
geom_point() +
labs(title = "PC Scores", x = "PC1", y = "PC2") +
scale_color_gradient(low = "blue", high = "red") +
theme_minimal()
# ----3.----
n = dim(data)[1]
set.seed(12345)
id = sample(1:n, floor(n * 0.5))
train = data[id, ]
test = data[-id, ]
test_scaled <- predict(scaler3, test)
X_train <- train_scaled[, feature_column_names]
X_test <- test_scaled[, feature_column_names]
y_test <- test_scaled$ViolentCrimesPerPop
model <- lm(ViolentCrimesPerPop ~ . - 1, data = train_scaled)
summary(model)
MSE <- function(y, y_hat) {
return(mean((y - y_hat) ^ 2))
}
y_hat_train <- predict(model, X_train)
y_hat_test <- predict(model, X_test)
MSE(y_train, y_hat_train)
MSE(y_test, y_hat_test)
# ----4.----
train_errors <- c()
test_errors <- c()
cost_function <- function(theta) {
train_error <- MSE(y_train, as.matrix(X_train) %*% theta)
test_error <- MSE(y_test, as.matrix(X_test) %*% theta)
train_errors <<- c(train_errors, res$value)
test_errors <<- c(test_errors, test_error)
res <- optim(
par = theta,
fn = cost_function,
method = "BFGS",
control = list(trace = 1, maxit = 200),
)
plot(
train_errors,
type = "l",
col = "blue",
ylab = "Error",
xlab = "Iteration",
main = "Training and Test Errors over Iterations",
ylim = c(0.25, 0.75),
xlim = c(501, 10000),
points(test_errors, type = "l", col = "red", )
legend = c("Training Error", "Test Error", "Optimal Test Error"),
lty = c(1, 1, 0),
col = c("blue", "red", "green"),
pch = c(NA, NA, 19),
optimal_iteration <- which.min(test_errors)
optimal_iteration
train_errors[optimal_iteration]
test_errors[optimal_iteration]
points(optimal_iteration,
test_errors[optimal_iteration],