diff --git a/lab2/assignment1.R b/lab2/assignment1.R
new file mode 100644
index 0000000000000000000000000000000000000000..ea564c33280ec6f09d0bc3b9bb929783ab84aef4
--- /dev/null
+++ b/lab2/assignment1.R
@@ -0,0 +1,80 @@
+library(glmnet)
+
+data = read.csv("tecator.csv")
+
+# Split data 50/50 into training and test sets
+n = dim(data)[1]
+set.seed(12345)
+id = sample(1:n, floor(n * 0.5))
+train = data[id, ]
+test = data[-id, ]
+
+# ----1.----
+
+y_train <- train[, 102]   # Fat
+X_train <- train[, 2:101] # the 100 absorbance channels
+y_test <- test[, 102]
+X_test <- test[, 2:101]
+
+channels <- colnames(X_train)
+
+formula <- as.formula(paste("Fat~", paste(channels, collapse = " + ")))
+
+model <- lm(formula, data = train)
+
+mse <- function(y, y_hat)
+  mean((y_hat - y) ^ 2)
+
+train_pred <- predict(model, train)
+test_pred <- predict(model, test)
+
+print(paste("MSE on the training data:", mse(train$Fat, train_pred)))
+print(paste("MSE on the test data:", mse(test$Fat, test_pred)))
+
+
+# ----3.----
+
+fit <- glmnet(as.matrix(X_train), y_train, alpha = 1) # LASSO
+
+plot(fit, xvar = "lambda", label = TRUE)
+
+coef_matrix <- as.matrix(coef(fit))[-1, ] # ignore intercept
+
+lambda_values <- fit$lambda
+
+num_non_zero <- apply(coef_matrix != 0, 2, sum)
+
+lambda_with_3_features <- lambda_values[num_non_zero == 3]
+
+print(lambda_with_3_features)
+
+
+# ----4.----
+
+fit <- glmnet(as.matrix(X_train), y_train, alpha = 0) # Ridge
+
+plot(fit, xvar = "lambda", label = TRUE)
+
+coef_matrix <- as.matrix(coef(fit))[-1, ] # ignore intercept
+
+lambda_values <- fit$lambda
+
+num_non_zero <- apply(coef_matrix != 0, 2, sum)
+
+lambda_with_3_features <- lambda_values[num_non_zero == 3]
+
+print(lambda_with_3_features)
+
+# ----5.----
+
+fit <- cv.glmnet(as.matrix(X_train), y_train, alpha = 1) # cross-validated LASSO
+
+plot(fit) # plot.cv.glmnet already uses log(lambda) on the x-axis
+
+fit$lambda.min
+
+coef(fit, s = "lambda.min")
+
+y_hat = predict(fit, newx = as.matrix(X_train), s = "lambda.min")
+
+plot(y_train, y_hat)
diff --git a/lab2/assignment2.R b/lab2/assignment2.R
new file mode 100644
index 0000000000000000000000000000000000000000..a9e55d93db0b35922920b22942af946f9fdecb05
--- /dev/null
+++ b/lab2/assignment2.R
@@ -0,0 +1,238 @@
+library(tree)
+
+# ----1.----
+
+data = read.csv2("bank-full.csv")
+# Convert all character columns to factors so tree() can handle them
+data <- data.frame(lapply(data, function(x)
+  if (is.character(x))
+    factor(x)
+  else
+    x))
+
+# Split data 40/30/30 into training, validation and test sets
+n = dim(data)[1]
+set.seed(12345)
+id = sample(1:n, floor(n * 0.4))
+train = data[id, ]
+id1 = setdiff(1:n, id)
+set.seed(12345)
+id2 = sample(id1, floor(n * 0.3))
+valid = data[id2, ]
+id3 = setdiff(id1, id2)
+test = data[id3, ]
+
+# ----2.----
+
+# Default settings
+
+fit2a <- tree(y ~ ., data = train)
+
+summary(fit2a)
+
+# Smallest allowed node size equal to 7000
+
+fit2b <- tree(y ~ .,
+              data = train,
+              control = tree.control(nrow(train), minsize = 7000))
+
+summary(fit2b)
+
+# Minimum deviance to 0.0005
+
+fit2c <- tree(y ~ .,
+              data = train,
+              control = tree.control(nrow(train), mindev = 0.0005))
+
+summary(fit2c)
+
+# ----3.----
+
+# Training and validation deviance for pruned trees with 2 to 50 leaves
+trainScore <- rep(0, 50)
+validScore <- rep(0, 50)
+
+for (i in 2:50) {
+  prunedTree = prune.tree(fit2c, best = i)
+  pred <- predict(prunedTree, newdata = valid, type = "tree")
+  trainScore[i] <- deviance(prunedTree)
+  validScore[i] <- deviance(pred)
+}
+
+y_range <- range(c(validScore[2:50], trainScore[2:50]))
+
+plot(
+  2:50,
+  trainScore[2:50],
+  type = "b",
+  col = "red",
+  xlab = "Number of leaves",
+  ylab = "Deviance",
+  ylim = c(min(y_range), max(y_range)),
+  log = "y"
+)
+points(2:50, validScore[2:50], type = "b", col = "blue")
+
+# Scores start at 2 leaves, so add 1 to recover the actual leaf count
+optimal_leaves <- which.min(validScore[2:50]) + 1
+optimalTree <- prune.tree(fit2c, best = optimal_leaves)
+
+summary(optimalTree)
+
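+# Optional cross-check (a sketch, not part of the assignment): cv.tree() from
+# the tree package runs K-fold cross-validation over the same pruning sequence
+# and reports deviance per tree size, which can be compared with the
+# train/validation curves above. Assumes the default K = 10 folds.
+cv_check <- cv.tree(fit2c, FUN = prune.tree)
+plot(cv_check$size, cv_check$dev, type = "b",
+     xlab = "Number of leaves", ylab = "CV deviance")
+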
+# ----4.----
+
+y_hat <- predict(optimalTree, newdata = test, type = "class")
+
+confusion <- table(test$y, y_hat) # rows = truth, columns = prediction
+
+print(confusion)
+
+TP <- confusion[2, 2]
+TN <- confusion[1, 1]
+FP <- confusion[1, 2]
+FN <- confusion[2, 1]
+
+P <- TP + FN
+N <- TN + FP
+
+accuracy <- (TP + TN) / (P + N)
+
+precision <- TP / (TP + FP)
+recall <- TP / (TP + FN)
+
+F1 = 2 * precision * recall / (precision + recall)
+
+print(accuracy)
+
+print(F1)
+
+# ----5.----
+
+loss_matrix = matrix(c(0, 1, 5, 0), ncol = 2)
+
+fit5 <- tree(y ~ ., data = train)
+
+pruned_tree5 = prune.tree(fit2c, loss = loss_matrix, best = 9)
+
+y_hat = predict(pruned_tree5, newdata = test, type = "class")
+
+confusion <- table(test$y, y_hat)
+
+print(confusion)
+
+
+TP <- confusion[2, 2]
+TN <- confusion[1, 1]
+FP <- confusion[1, 2]
+FN <- confusion[2, 1]
+
+P <- TP + FN
+N <- TN + FP
+
+accuracy <- (TP + TN) / (P + N)
+
+precision <- TP / (TP + FP)
+recall <- TP / (TP + FN)
+
+F1 = 2 * precision * recall / (precision + recall)
+
+print(accuracy)
+
+print(F1)
+
+# ----6.----
+
+# Confusion-matrix helpers. Predictions are coerced to a factor with both
+# levels (0, 1) so the table always has two columns, even at thresholds where
+# one class is never predicted.
+TP <- function(x, x_pred) {
+  return(table(x, factor(x_pred, levels = c(0, 1)))[2, 2])
+}
+
+TN <- function(x, x_pred) {
+  return(table(x, factor(x_pred, levels = c(0, 1)))[1, 1])
+}
+
+FP <- function(x, x_pred) {
+  return(table(x, factor(x_pred, levels = c(0, 1)))[1, 2])
+}
+
+FN <- function(x, x_pred) {
+  return(table(x, factor(x_pred, levels = c(0, 1)))[2, 1])
+}
+
+
+P <- function(x, x_pred) {
+  return(TP(x, x_pred) + FN(x, x_pred))
+}
+
+N <- function(x, x_pred) {
+  return(TN(x, x_pred) + FP(x, x_pred))
+}
+
+TPR <- function(x, x_pred) {
+  return(TP(x, x_pred) / P(x, x_pred))
+}
+
+FPR <- function(x, x_pred) {
+  return(FP(x, x_pred) / N(x, x_pred))
+}
+
+
+precision <- function(x, x_pred) {
+  TP(x, x_pred) / (TP(x, x_pred) + FP(x, x_pred))
+}
+
+recall <- function(x, x_pred) {
+  TP(x, x_pred) / (TP(x, x_pred) + FN(x, x_pred))
+}
+
+logistic_model <- glm(y ~ ., family = binomial, data = train)
+
+thresholds <- seq(0.05, 0.95, by = 0.05) # the threshold (pi) grid
+
+tree_tpr <- numeric(length(thresholds))
+tree_fpr <- numeric(length(thresholds))
+
+tree_precision <- numeric(length(thresholds))
+tree_recall <- numeric(length(thresholds))
+
+logistic_tpr <- numeric(length(thresholds))
+logistic_fpr <- numeric(length(thresholds))
+
+logistic_precision <- numeric(length(thresholds))
+logistic_recall <- numeric(length(thresholds))
+
+
+# ROC and precision-recall points for both models over the threshold grid
+for (i in 1:length(thresholds)) {
+  logistic_pred <- predict(logistic_model, test, type = "response")
+  logistic_class <- ifelse(logistic_pred > thresholds[i], 1, 0)
+  tree_pred <- predict(optimalTree, test, type = "vector")[, 2]
+  tree_class <- ifelse(tree_pred > thresholds[i], 1, 0)
+
+  tree_tpr[i] <- TPR(test$y, tree_class)
+  tree_fpr[i] <- FPR(test$y, tree_class)
+
+  tree_precision[i] <- precision(test$y, tree_class)
+  tree_recall[i] <- recall(test$y, tree_class)
+
+  logistic_tpr[i] <- TPR(test$y, logistic_class)
+  logistic_fpr[i] <- FPR(test$y, logistic_class)
+
+  logistic_precision[i] <- precision(test$y, logistic_class)
+  logistic_recall[i] <- recall(test$y, logistic_class)
+}
+
+
+# ROC curves
+plot(
+  logistic_fpr,
+  logistic_tpr,
+  type = "l",
+  ylim = c(0, 1),
+  xlim = c(0, 1),
+  col = "red",
+  xlab = "FPR",
+  ylab = "TPR"
+)
+
+lines(tree_fpr, tree_tpr, col = "blue")
+
+
+# Precision-recall curves
+plot(
+  logistic_recall,
+  logistic_precision,
+  type = "l",
+  ylim = c(0, 1),
+  xlim = c(0, 1),
+  col = "red",
+  xlab = "Recall",
+  ylab = "Precision"
+)
+
+lines(tree_recall, tree_precision, col = "blue")
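+
+# Rough comparison of the two ROC curves (a sketch, not required by the
+# assignment): approximate the area under each curve with the trapezoidal
+# rule over the (FPR, TPR) points computed above. The thresholds only cover
+# 0.05-0.95, so these are partial-AUC approximations.
+approx_auc <- function(fpr, tpr) {
+  ord <- order(fpr)
+  sum(diff(fpr[ord]) * (head(tpr[ord], -1) + tail(tpr[ord], -1)) / 2)
+}
+print(approx_auc(logistic_fpr, logistic_tpr))
+print(approx_auc(tree_fpr, tree_tpr))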
diff --git a/lab2/lab-notes.md b/lab2/lab-notes.md
index a82fe420a7eda108adb67e70c1398cc72f76d4a5..a01a0532b12e6e76166fe38ed9a84aaa5ce596cb 100644
--- a/lab2/lab-notes.md
+++ b/lab2/lab-notes.md
@@ -2,8 +2,47 @@
 
 ## Assignment 1
 
+1. Underlying model:
+
+   $$\mathbf{y} = \mathbf{X}\boldsymbol{\theta} + \boldsymbol{\epsilon}$$
+
+   Very poor test predictions because of severe overfitting: there are about as many features (100 channels) as training observations.
+
+   - $\text{MSE}_{\text{train}} = 0.00570911701403287$
+   - $\text{MSE}_{\text{test}} = 722.429419254052$
+
+2. The cost function to minimize (LASSO):
+
+   $$\frac{1}{n}\sum_{i=1}^{n}(y_i - \theta_0 - \theta_1 x_{1i} - \dots - \theta_p x_{pi})^2 + \lambda \sum_{j=1}^{p}|\theta_j|$$
+
+3. Values of $\lambda$ for which exactly 3 features have non-zero coefficients: 0.8530452, 0.7772630, 0.7082131.
+
+4. With ridge regression no coefficients become exactly 0; the penalty shrinks them towards 0 but does not remove them.
+
+5. Optimal $\lambda = 0.05744535$, which selects 8 variables.
+
+   The CV score decreases as $\lambda$ decreases.
+
+   The optimal $\lambda$ is not statistically significantly better than the model at $\log \lambda = -4$.
+
+   The predictions are quite good: the scatter plot of $\hat{y}$ against $y$ shows a clear trend.
+
 ## Assignment 2
 
+2. See the summary() output in RStudio for the tree sizes and misclassification rates of the three settings.
+
+3. Small trees underfit, but limiting the tree size also acts as regularization, so the validation deviance can still be good and the model generalizes well.
+
+   High bias -> underfitting, high variance -> overfitting. The best number of leaves balances bias and variance and minimizes the validation deviance.
+
+4. F1 is the better measure here since the classes are very imbalanced; accuracy is dominated by the majority class "no".
+
+5. The loss matrix makes the model more likely to choose "no": it only predicts "yes" when $p(\text{yes} \mid x) > 5\, p(\text{no} \mid x)$, i.e. it needs to be five times more sure of "yes" than of "no".
+
+6. The tree is a little bit better. A precision-recall curve would be the better choice here since the classes are imbalanced.
+
+   Because of the big class imbalance, the FPR does not even reach 50% for $\pi = 0.05$.
+
 ## Assignment 3
 
 ## Assignment 4