Skip to content
Snippets Groups Projects
assignment3.R 3.01 KiB
Newer Older
  • Learn to ignore specific revisions
  • Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    library(caret)
    library(ggplot2)
    
    data = read.csv("communities.csv")
    
    # ----1.----
    
    
    feature_column_names <- setdiff(names(data), "ViolentCrimesPerPop")
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    
    
    scaler <- preProcess(data[, feature_column_names])
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    
    data_scaled <- data
    
    data_scaled[, feature_column_names] <- predict(scaler, data[, feature_column_names])
    
    X <- data_scaled[, feature_column_names]
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    y <- data_scaled$ViolentCrimesPerPop
    
    covariance_matrix <- cov(X)
    
    eigen_decomposition <- eigen(covariance_matrix)
    
    eigen_values <- eigen_decomposition$values
    
    variance <- eigen_values / sum(eigen_values)
    
    which(cumsum(variance) >= 0.95)[1]
    
    variance[1]
    variance[2]
    
    # ----2.----
    
    principle_components <- prcomp(X)
    
    U <- principle_components$rotation
    
    principle_components$x[, 1]
    
    
    plot(
      U[, 1],
      main = "PC1",
      col = "blue",
      ylab = "Magnitude",
      pch = 19
    )
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    
    ordered_abs <- order(abs(U[, 1]), decreasing = TRUE)
    
    names(U[, 1])[ordered_abs[1:5]]
    
    pc_scores <- data.frame(
      PC1 = principle_components$x[, 1],
      PC2 = principle_components$x[, 2],
      ViolentCrimesPerPop = data_scaled$ViolentCrimesPerPop
    )
    
    ggplot(pc_scores, aes(x = PC1, y = PC2, color = ViolentCrimesPerPop)) +
      geom_point() +
      labs(title = "PC Scores", x = "PC1", y = "PC2") +
      scale_color_gradient(low = "blue", high = "red") +
      theme_minimal()
    
    # ----3.----
    
    n = dim(data)[1]
    set.seed(12345)
    id = sample(1:n, floor(n * 0.5))
    train = data[id, ]
    test = data[-id, ]
    
    
    scaler3 <- preProcess(train)
    
    train_scaled <- predict(scaler3, train)
    
    test_scaled <- predict(scaler3, test)
    
    X_train <- train_scaled[, feature_column_names]
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    y_train <- train_scaled$ViolentCrimesPerPop
    
    
    X_test <- test_scaled[, feature_column_names]
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    y_test <- test_scaled$ViolentCrimesPerPop
    
    model <- lm(ViolentCrimesPerPop ~ . - 1, data = train_scaled)
    
    summary(model)
    
    MSE <- function(y, y_hat) {
      return(mean((y - y_hat) ^ 2))
    }
    
    y_hat_train <- predict(model, X_train)
    y_hat_test <- predict(model, X_test)
    
    MSE(y_train, y_hat_train)
    MSE(y_test, y_hat_test)
    
    # ----4.----
    
    train_errors <- c()
    test_errors <- c()
    
    cost_function <- function(theta) {
      train_error <- MSE(y_train, as.matrix(X_train) %*% theta)
      test_error <- MSE(y_test, as.matrix(X_test) %*% theta)
      
    
      train_errors <<- c(train_errors, res$value)
    
      test_errors <<- c(test_errors, test_error)
    
      return(train_error)
    
    theta <- rep(0, ncol(X_train))
    
    res <- optim(
      par = theta,
      fn = cost_function,
      method = "BFGS",
      control = list(trace = 1, maxit = 200),
    )
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    
    plot(
      train_errors,
      type = "l",
      col = "blue",
      ylab = "Error",
      xlab = "Iteration",
      main = "Training and Test Errors over Iterations",
    
      ylim = c(0.25, 0.75),
      xlim = c(501, 10000),
    
    points(test_errors, type = "l", col = "red", )
    
    Felix Ramnelöv's avatar
    Felix Ramnelöv committed
    
    legend(
      "topright",
    
      legend = c("Training Error", "Test Error", "Optimal Test Error"),
      lty = c(1, 1, 0),
      col = c("blue", "red", "green"),
      pch = c(NA, NA, 19),
    
    optimal_iteration <- which.min(test_errors)
    optimal_iteration
    
    train_errors[optimal_iteration]
    test_errors[optimal_iteration]
    
    points(optimal_iteration,
           test_errors[optimal_iteration],
    
           col = "green",
           pch = 19)