Lab 2: Started with assignment 1, 2 (messy)

79aadc06 · Felix Ramnelöv · 67177c39 · 79aadc06 · 79aadc06 · 79aadc06
Commit 79aadc06 authored 4 months ago by Felix Ramnelöv
--- a/lab2/assignment1.R
+++ b/lab2/assignment1.R
+library(glmnet)
+
+# Load the data set used throughout this assignment.
+data <- read.csv("tecator.csv")
+
+# Split data: a reproducible 50/50 partition into training and test rows.
+n <- nrow(data)
+set.seed(12345)
+# seq_len() stays empty for n == 0, whereas 1:n would yield c(1, 0).
+id <- sample(seq_len(n), floor(n * 0.5))
+train <- data[id, ]
+test <- data[-id, ]
+
+# ----1.----
+
+# Predictors are taken from columns 2 to 101, the response from column 102.
+X_train <- train[, 2:101]
+X_test <- test[, 2:101]
+y_train <- train[, 102]
+y_test <- test[, 102]
+
+channels <- colnames(X_train)
+
+# Build "Fat~ <col> + <col> + ..." from the predictor column names.
+rhs <- paste(channels, collapse = " + ")
+formula <- as.formula(paste("Fat~", rhs))
+
+# Ordinary least squares on the training partition.
+model <- lm(formula, data = train)
+
+# Mean squared error between observed values `y` and predictions `y_hat`.
+mse <- function(y, y_hat) {
+  residuals <- y_hat - y
+  mean(residuals^2)
+}
+
+# Predict with the fitted linear model on both partitions.
+train_pred <- predict(model, newdata = train)
+test_pred <- predict(model, newdata = test)
+
+# Report the mean squared error against the observed Fat values.
+train_mse <- mse(train$Fat, train_pred)
+test_mse <- mse(test$Fat, test_pred)
+print(paste("MSE on the training data:", train_mse))
+print(paste("MSE on the test data:", test_mse))
+
+
+#----3.----
+
+# Lasso (alpha = 1) fitted along glmnet's lambda sequence.
+fit <- glmnet(as.matrix(X_train), y_train, alpha = 1)
+
+# Coefficient paths against log(lambda).
+plot(fit, xvar = "lambda", label = TRUE)
+
+# One column per lambda; the first row (intercept) is dropped.
+coef_matrix <- as.matrix(coef(fit))[-1, ]
+lambda_values <- fit$lambda
+
+# Count the non-zero coefficients for every lambda.
+num_non_zero <- colSums(coef_matrix != 0)
+
+# Lambdas for which exactly three coefficients are non-zero.
+lambda_with_3_features <- lambda_values[num_non_zero == 3]
+print(lambda_with_3_features)
+
+
+#----4.----
+
+# Ridge (alpha = 0) fitted along glmnet's lambda sequence.
+fit <- glmnet(as.matrix(X_train), y_train, alpha = 0)
+
+# Coefficient paths against log(lambda).
+plot(fit, xvar = "lambda", label = TRUE)
+
+# One column per lambda; the first row (intercept) is dropped.
+coef_matrix <- as.matrix(coef(fit))[-1, ]
+lambda_values <- fit$lambda
+
+# Count the non-zero coefficients for every lambda.
+num_non_zero <- colSums(coef_matrix != 0)
+
+# Same three-feature query as in the previous section, now on the ridge fit.
+lambda_with_3_features <- lambda_values[num_non_zero == 3]
+print(lambda_with_3_features)
+
+#----5.----
+
+# Cross-validated lasso (alpha = 1) on the training data.
+fit <- cv.glmnet(as.matrix(X_train), y_train, alpha = 1)
+
+# Plot the cross-validation curve. `xvar` and `label` are arguments of the
+# plot method for glmnet path objects, not for cv.glmnet objects, so they are
+# not passed here.
+plot(fit)
+
+# Lambda with the lowest cross-validated error.
+fit$lambda.min
+
+# Coefficients at that lambda.
+coef(fit, s = "lambda.min")
+
+# NOTE(review): predictions are made on the training predictors and compared
+# with y_train; confirm whether X_test / y_test were intended here.
+y_hat <- predict(fit, newx = as.matrix(X_train), s = "lambda.min")
+
+plot(y_train, y_hat)
--- a/lab2/assignment2.R
+++ b/lab2/assignment2.R
+library(tree)
+
+# ----1.----
+
+# Read the semicolon-separated data and turn every character column into a
+# factor; all other columns are kept as they are.
+data <- read.csv2("bank-full.csv")
+data <- data.frame(lapply(data, function(x) {
+  if (is.character(x)) {
+    factor(x)
+  } else {
+    x
+  }
+}))
+
+# Reproducible 40/30/30 split into training, validation and test rows.
+n <- nrow(data)
+set.seed(12345)
+# seq_len() stays empty for n == 0, whereas 1:n would yield c(1, 0).
+id <- sample(seq_len(n), floor(n * 0.4))
+train <- data[id, ]
+
+# Rows not used for training are divided between validation and test.
+id1 <- setdiff(seq_len(n), id)
+set.seed(12345)
+id2 <- sample(id1, floor(n * 0.3))
+valid <- data[id2, ]
+
+id3 <- setdiff(id1, id2)
+test <- data[id3, ]
+
+# ----2.----
+
+# Default settings
+
+# Tree for y on all remaining columns of the training data, fitted with the
+# default control settings of tree().
+fit2a <- tree(y ~ ., data = train)
+
+summary(fit2a)
+
+# Smallest allowed node size equal to 7000
+
+# Same model, but with minsize = 7000 passed through tree.control().
+fit2b <- tree(y ~ ., data = train, control = tree.control(nrow(train), minsize = 7000))
+
+summary(fit2b)
+
+# Minimum deviance to 0.0005
+
+# Same model, but with mindev = 0.0005 passed through tree.control().
+# fit2c is the tree that is pruned in the later sections.
+fit2c <- tree(y ~ .,
+              data = train,
+              control = tree.control(nrow(train), mindev = 0.0005))
+
+summary(fit2c)
+
+# ----3.----
+
+# Deviance on the training and validation data for fit2c pruned to 2..50
+# leaves. Element i holds the score for a tree with i leaves; element 1 is
+# never filled and stays 0.
+trainScore <- rep(0, 50)
+validScore <- rep(0, 50)
+leaf_counts <- 2:50
+
+for (i in leaf_counts) {
+  prunedTree <- prune.tree(fit2c, best = i)
+  pred <- predict(prunedTree, newdata = valid, type = "tree")
+  trainScore[i] <- deviance(prunedTree)
+  validScore[i] <- deviance(pred)
+}
+
+y_range <- range(c(validScore[leaf_counts], trainScore[leaf_counts]))
+
+# Training deviance in red, validation deviance in blue, log-scaled y axis.
+plot(
+  leaf_counts,
+  trainScore[leaf_counts],
+  type = "b",
+  col = "red",
+  ylim = y_range,
+  log = "y",
+  xlab = "Number of leaves",
+  ylab = "Deviance"
+)
+points(leaf_counts, validScore[leaf_counts], type = "b", col = "blue")
+
+# which.min() returns a position within leaf_counts (which starts at 2), so
+# it is mapped back to the actual number of leaves before pruning.
+optimal_leaves <- leaf_counts[which.min(validScore[leaf_counts])]
+optimalTree <- prune.tree(fit2c, best = optimal_leaves)
+
+summary(optimalTree)
+
+# ----4.----
+
+# Class predictions of the selected tree on the test data.
+y_hat <- predict(optimalTree, newdata = test, type = "class")
+
+# Rows are the observed labels, columns the predicted labels.
+confusion <- table(test$y, y_hat)
+print(confusion)
+
+# Cell counts, with the second row/column treated as the positive class.
+TN <- confusion[1, 1]
+FP <- confusion[1, 2]
+FN <- confusion[2, 1]
+TP <- confusion[2, 2]
+
+# Observed positives and negatives.
+P <- TP + FN
+N <- TN + FP
+
+accuracy <- (TP + TN) / (P + N)
+recall <- TP / (TP + FN)
+precision <- TP / (TP + FP)
+
+F1 <- (2 * precision * recall) / (precision + recall)
+
+print(accuracy)
+print(F1)
+
+# ----5.----
+
+# 2 x 2 loss matrix, filled column by column: entry [2, 1] is 1 and entry
+# [1, 2] is 5.
+loss_matrix <- matrix(c(0, 1, 5, 0), ncol = 2)
+
+# NOTE(review): fit5 is not used below; the pruning is applied to fit2c.
+fit5 <- tree(y ~ ., data = train)
+
+# NOTE(review): confirm that `loss` affects prune.tree() with its default
+# method; the tree documentation describes it for method = "misclass".
+pruned_tree5 <- prune.tree(fit2c, loss = loss_matrix, best = 9)
+
+y_hat <- predict(pruned_tree5, newdata = test, type = "class")
+
+# Rows are the observed labels, columns the predicted labels.
+confusion <- table(test$y, y_hat)
+print(confusion)
+
+# Cell counts, with the second row/column treated as the positive class.
+TN <- confusion[1, 1]
+FP <- confusion[1, 2]
+FN <- confusion[2, 1]
+TP <- confusion[2, 2]
+
+# Observed positives and negatives.
+P <- TP + FN
+N <- TN + FP
+
+accuracy <- (TP + TN) / (P + N)
+recall <- TP / (TP + FN)
+precision <- TP / (TP + FP)
+
+F1 <- (2 * precision * recall) / (precision + recall)
+
+print(accuracy)
+print(F1)
+
+# ----6.----
+
+# Metric helpers. Each one cross-tabulates the observed labels `x` (rows)
+# against the predictions `x_pred` (columns) and reads cells by position,
+# so the second row/column is treated as the positive class.
+
+# True positives: cell [2, 2].
+TP <- function(x, x_pred) {
+  counts <- table(x, x_pred)
+  counts[2, 2]
+}
+
+# True negatives: cell [1, 1].
+TN <- function(x, x_pred) {
+  counts <- table(x, x_pred)
+  counts[1, 1]
+}
+
+# False positives: cell [1, 2].
+FP <- function(x, x_pred) {
+  counts <- table(x, x_pred)
+  counts[1, 2]
+}
+
+# False negatives: cell [2, 1].
+FN <- function(x, x_pred) {
+  counts <- table(x, x_pred)
+  counts[2, 1]
+}
+
+# Number of observed positives.
+P <- function(x, x_pred) {
+  TP(x, x_pred) + FN(x, x_pred)
+}
+
+# Number of observed negatives.
+N <- function(x, x_pred) {
+  TN(x, x_pred) + FP(x, x_pred)
+}
+
+# True positive rate: TP / P.
+TPR <- function(x, x_pred) {
+  TP(x, x_pred) / P(x, x_pred)
+}
+
+# False positive rate: FP / N.
+FPR <- function(x, x_pred) {
+  FP(x, x_pred) / N(x, x_pred)
+}
+
+# Precision: TP / (TP + FP).
+precision <- function(x, x_pred) {
+  true_pos <- TP(x, x_pred)
+  true_pos / (true_pos + FP(x, x_pred))
+}
+
+# Recall: TP / (TP + FN).
+recall <- function(x, x_pred) {
+  true_pos <- TP(x, x_pred)
+  true_pos / (true_pos + FN(x, x_pred))
+}
+
+# Logistic regression on all predictors, fitted on the training data.
+logistic_model <- glm(y ~ ., family = binomial, data = train)
+
+# Classification thresholds. The grid is stored under its own name so that
+# the built-in constant `pi` is not masked and `thresholds` is defined before
+# it is used.
+thresholds <- seq(0.05, 0.95, by = 0.05)
+n_thresholds <- length(thresholds)
+
+tree_tpr <- numeric(n_thresholds)
+tree_fpr <- numeric(n_thresholds)
+
+tree_precision <- numeric(n_thresholds)
+tree_recall <- numeric(n_thresholds)
+
+logistic_tpr <- numeric(n_thresholds)
+logistic_fpr <- numeric(n_thresholds)
+
+logistic_precision <- numeric(n_thresholds)
+logistic_recall <- numeric(n_thresholds)
+
+# The predicted probabilities do not depend on the threshold, so they are
+# computed once. Column 2 of the tree output is used as the positive-class
+# probability.
+logistic_pred <- predict(logistic_model, test, type = "response")
+tree_pred <- predict(optimalTree, test, type = "vector")[, 2]
+
+# Turn probabilities into 0/1 labels. Both levels are declared explicitly so
+# that table() always yields a 2 x 2 matrix, even when every observation
+# falls on the same side of the threshold.
+classify <- function(prob, threshold) {
+  factor(as.integer(prob > threshold), levels = c(0, 1))
+}
+
+for (i in seq_along(thresholds)) {
+  logistic_class <- classify(logistic_pred, thresholds[i])
+  tree_class <- classify(tree_pred, thresholds[i])
+
+  tree_tpr[i] <- TPR(test$y, tree_class)
+  tree_fpr[i] <- FPR(test$y, tree_class)
+
+  tree_precision[i] <- precision(test$y, tree_class)
+  tree_recall[i] <- recall(test$y, tree_class)
+
+  logistic_tpr[i] <- TPR(test$y, logistic_class)
+  logistic_fpr[i] <- FPR(test$y, logistic_class)
+
+  logistic_precision[i] <- precision(test$y, logistic_class)
+  logistic_recall[i] <- recall(test$y, logistic_class)
+}
+
+
+
+# ROC curves: logistic regression in red, tree in blue.
+# The dangling empty arguments of the original calls are removed so nothing
+# empty is forwarded through `...`.
+plot(
+  logistic_fpr,
+  logistic_tpr,
+  type = "l",
+  ylim = c(0, 1),
+  xlim = c(0, 1),
+  col = "red",
+  xlab = "FPR",
+  ylab = "TPR"
+)
+
+lines(tree_fpr, tree_tpr, col = "blue")
+
+
+# Precision-recall curves: logistic regression in red, tree in blue.
+plot(
+  logistic_recall,
+  logistic_precision,
+  type = "l",
+  ylim = c(0, 1),
+  xlim = c(0, 1),
+  col = "red",
+  xlab = "Recall",
+  ylab = "Precision"
+)
+
+lines(tree_recall, tree_precision, col = "blue")
--- a/lab2/lab-notes.md
+++ b/lab2/lab-notes.md
@@ -2,8 +2,47 @@

 ## Assignment 1

+1. Underlying model:
+
+   $$\mathbf{y}=\mathbf{X \theta + \epsilon}$$
+
+   Very poor predictions on the test data and heavy overfitting: there are too many features relative to the number of observations.
+
+   - $\text{MSE}_{\text{train}} = 0.00570911701403287$
+   - $\text{MSE}_{\text{test}} = 722.429419254052$
+
+2. The cost function:
+
+   $$\frac{1}{N}\sum_{i=1}^{n}(y_i - \theta_0-\theta_1x_{1i}-...-\theta_px_{pi})^2 + \lambda \sum_{j=1}^{p}|\theta_j|$$
+
+3. lambdas where we have 3 features: 0.8530452 0.7772630 0.7082131
+
+4. No coefficients become 0
+
+5. Optimal lambda = 0.05744535, 8 variables
+
+   CV-score gets lower with lower lambda
+
+   The optimal lambda is not statistically significantly better than log(lambda) = -4.
+
+   Quite good, we can see a trend.
+
 ## Assignment 2

+2. Values in Rstudio
+
+3. Small trees underfit the training data; however, they can still score well on the validation data, because limiting the number of leaves acts as regularization and the model generalizes well.
+
+   High bias -> underfitting, high variance -> overfitting. We find the best number of leaves by balancing bias and variance, i.e. by minimising the deviance.
+
+4. F1 better since very imbalanced
+
+5. More likely to choose no, needs to be 5 times more sure of yes than no to select yes
+
+6. Tree a little bit better, precision recall would be better since imbalanced classes.
+
+   Since we have a big imbalance, FPR is not even 50% for pi = 0.05.
+
 ## Assignment 3

 ## Assignment 4