Skip to content
Snippets Groups Projects
Commit 79aadc06 authored by Felix Ramnelöv's avatar Felix Ramnelöv
Browse files

Lab 2: Started with assignment 1, 2 (messy)

parent 67177c39
No related branches found
No related tags found
No related merge requests found
library(glmnet)
# Load the Tecator data and split the rows 50/50 into training and test sets.
data <- read.csv("tecator.csv")
n <- nrow(data)
set.seed(12345)
id <- sample(seq_len(n), floor(n * 0.5))
train <- data[id, ]
test <- data[-id, ]

# ----1.----
# Columns 2-101 are used as predictors and column 102 as the response.
X_train <- train[, 2:101]
y_train <- train[, 102]
X_test <- test[, 2:101]
y_test <- test[, 102]

# Ordinary least squares: regress Fat on every predictor column.
channels <- colnames(X_train)
formula <- as.formula(
  paste("Fat~", paste(channels, collapse = " + "))
)
model <- lm(formula, data = train)
# Mean squared error between observed values `y` and predictions `y_hat`.
# Both arguments are numeric vectors; returns a single number.
mse <- function(y, y_hat) {
  squared_error <- (y - y_hat)^2
  mean(squared_error)
}
# Predict with the linear model on both partitions and report the MSE of each.
train_pred <- predict(model, newdata = train)
test_pred <- predict(model, newdata = test)
print(paste(
  "MSE on the training data:",
  mse(y = train$Fat, y_hat = train_pred)
))
print(paste(
  "MSE on the test data:",
  mse(y = test$Fat, y_hat = test_pred)
))
#----3.----
# LASSO path (alpha = 1) on the training data.
fit <- glmnet(as.matrix(X_train), y_train, alpha = 1)
plot(fit, xvar = "lambda", label = TRUE)

# One column of coefficients per lambda; the first row (intercept) is dropped.
coef_matrix <- as.matrix(coef(fit))[-1, ]
lambda_values <- fit$lambda

# Count the non-zero coefficients for each lambda and keep the lambdas that
# leave exactly three of them.
num_non_zero <- colSums(coef_matrix != 0)
lambda_with_3_features <- lambda_values[num_non_zero == 3]
print(lambda_with_3_features)
#----4.----
# Ridge path (alpha = 0) on the training data, analysed the same way as the
# LASSO path: count non-zero coefficients per lambda.
fit <- glmnet(as.matrix(X_train), y_train, alpha = 0)
plot(fit, xvar = "lambda", label = TRUE)

# One column of coefficients per lambda; the first row (intercept) is dropped.
coef_matrix <- as.matrix(coef(fit))[-1, ]
lambda_values <- fit$lambda

# Lambdas that leave exactly three non-zero coefficients (may be empty).
num_non_zero <- colSums(coef_matrix != 0)
lambda_with_3_features <- lambda_values[num_non_zero == 3]
print(lambda_with_3_features)
#----5.----
# Choose the LASSO penalty by cross-validation on the training data.
fit <- cv.glmnet(as.matrix(X_train), y_train, alpha = 1)

# plot() on a cv.glmnet object draws the cross-validation curve. `xvar` and
# `label` are arguments of plot.glmnet only, so they are not passed here.
plot(fit)

# Print explicitly so the values also show up when the script is source()d.
print(fit$lambda.min)
print(coef(fit, s = "lambda.min"))

# Judge the selected model on the held-out test set rather than on the data it
# was fitted to; X_test / y_test were prepared for this and otherwise unused.
y_hat <- predict(fit, newx = as.matrix(X_test), s = "lambda.min")
plot(y_test, y_hat)
library(tree)
# ----1.----
# Read the bank data and convert every character column to a factor.
data <- read.csv2("bank-full.csv")
data <- data.frame(lapply(data, function(column) {
  if (is.character(column)) {
    factor(column)
  } else {
    column
  }
}))

# Partition the rows: 40 % training, 30 % validation, remainder test.
n <- nrow(data)
set.seed(12345)
id <- sample(seq_len(n), floor(n * 0.4))
train <- data[id, ]

# Validation rows are drawn from the rows not used for training.
id1 <- setdiff(seq_len(n), id)
set.seed(12345)
id2 <- sample(id1, floor(n * 0.3))
valid <- data[id2, ]

# Everything left over becomes the test set.
id3 <- setdiff(id1, id2)
test <- data[id3, ]
# ----2.----
# (a) Decision tree grown with the default control settings.
fit2a <- tree(y ~ ., data = train)
summary(fit2a)

# (b) Smallest allowed node size raised to 7000.
fit2b <- tree(
  y ~ .,
  data = train,
  control = tree.control(nobs = nrow(train), minsize = 7000)
)
summary(fit2b)

# (c) Minimum deviance lowered to 0.0005.
fit2c <- tree(
  y ~ .,
  data = train,
  control = tree.control(nobs = nrow(train), mindev = 0.0005)
)
summary(fit2c)
# ----3.----
# Prune fit2c to 2..50 leaves and record the deviance of each pruned tree on
# the training data and on the validation data. The score vectors are indexed
# by the number of leaves, so slot 1 is never filled.
leaf_counts <- 2:50
trainScore <- rep(0, 50)
validScore <- rep(0, 50)
for (i in leaf_counts) {
  prunedTree <- prune.tree(fit2c, best = i)
  # type = "tree" returns a tree object evaluated on `valid`, so deviance()
  # gives the validation deviance.
  pred <- predict(prunedTree, newdata = valid, type = "tree")
  trainScore[i] <- deviance(prunedTree)
  validScore[i] <- deviance(pred)
}

# Plot both curves on a shared log-scaled y axis.
y_range <- range(c(validScore[leaf_counts], trainScore[leaf_counts]))
plot(
  leaf_counts,
  trainScore[leaf_counts],
  type = "b",
  col = "red",
  ylim = y_range,
  log = "y",
  xlab = "Number of leaves",
  ylab = "Deviance"
)
points(leaf_counts, validScore[leaf_counts], type = "b", col = "blue")
legend(
  "topright",
  legend = c("Training", "Validation"),
  col = c("red", "blue"),
  lty = 1,
  pch = 1
)

# which.min() returns a position within the 2..50 subset, so it has to be
# mapped back to the leaf count. Using the raw position would select a tree
# with one leaf fewer than the actual minimiser.
optimal_leaves <- leaf_counts[which.min(validScore[leaf_counts])]
optimalTree <- prune.tree(fit2c, best = optimal_leaves)
summary(optimalTree)
# ----4.----
# Classify the test data with the selected tree and tabulate observed classes
# (rows) against predicted classes (columns).
y_hat <- predict(optimalTree, newdata = test, type = "class")
confusion <- table(test$y, y_hat)
print(confusion)

# Row/column 2 of the table is treated as the positive class.
TN <- confusion[1, 1]
FP <- confusion[1, 2]
FN <- confusion[2, 1]
TP <- confusion[2, 2]

# Totals of observed positives and observed negatives.
P <- TP + FN
N <- TN + FP

accuracy <- (TP + TN) / (P + N)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
F1 <- 2 * precision * recall / (precision + recall)
print(accuracy)
print(F1)
# ----5.----
# Loss matrix used below with rows = true class and columns = predicted class,
# both in the level order of test$y. Entry [1, 2] is 5 and entry [2, 1] is 1.
loss_matrix <- matrix(c(0, 1, 5, 0), ncol = 2)

# NOTE(review): the tree size (best = 9) is hard-coded; confirm it is the
# intended model for this comparison.
pruned_tree5 <- prune.tree(fit2c, loss = loss_matrix, best = 9)

# predict(type = "class") always returns the most probable class, so a loss
# passed to prune.tree() never changes the classification. Apply the loss
# explicitly instead: for each observation, compute the expected loss of every
# possible prediction and choose the cheapest one.
class_levels <- levels(test$y)
class_probs <- predict(pruned_tree5, newdata = test, type = "vector")
# expected_loss[i, j] = sum_k P(class k | x_i) * loss_matrix[k, j]
expected_loss <- class_probs %*% loss_matrix
y_hat <- factor(
  class_levels[apply(expected_loss, 1, which.min)],
  levels = class_levels
)

# y_hat carries both levels, so the table is 2 x 2 even if one class is never
# predicted.
confusion <- table(test$y, y_hat)
print(confusion)

# Row/column 2 of the table is treated as the positive class.
TP <- confusion[2, 2]
TN <- confusion[1, 1]
FP <- confusion[1, 2]
FN <- confusion[2, 1]
P <- TP + FN
N <- TN + FP
accuracy <- (TP + TN) / (P + N)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
F1 <- 2 * precision * recall / (precision + recall)
print(accuracy)
print(F1)
# ----6.----
# True positives: count of cases where both the observed value `x` and the
# prediction `x_pred` fall in their second category.
# Assumes table(x, x_pred) is 2 x 2, i.e. both vectors take two distinct values.
TP <- function(x, x_pred) {
  confusion <- table(x, x_pred)
  confusion[2, 2]
}
# True negatives: count of cases where both the observed value `x` and the
# prediction `x_pred` fall in their first category.
# Assumes table(x, x_pred) is 2 x 2, i.e. both vectors take two distinct values.
TN <- function(x, x_pred) {
  confusion <- table(x, x_pred)
  confusion[1, 1]
}
# False positives: count of cases where the observed value `x` is in its first
# category but the prediction `x_pred` is in its second category.
# Assumes table(x, x_pred) is 2 x 2, i.e. both vectors take two distinct values.
FP <- function(x, x_pred) {
  confusion <- table(x, x_pred)
  confusion[1, 2]
}
# False negatives: count of cases where the observed value `x` is in its second
# category but the prediction `x_pred` is in its first category.
# Assumes table(x, x_pred) is 2 x 2, i.e. both vectors take two distinct values.
FN <- function(x, x_pred) {
  confusion <- table(x, x_pred)
  confusion[2, 1]
}
# Number of observed positives: true positives plus false negatives.
P <- function(x, x_pred) {
  true_pos <- TP(x, x_pred)
  false_neg <- FN(x, x_pred)
  true_pos + false_neg
}
# Number of observed negatives: true negatives plus false positives.
N <- function(x, x_pred) {
  true_neg <- TN(x, x_pred)
  false_pos <- FP(x, x_pred)
  true_neg + false_pos
}
# True positive rate: true positives divided by all observed positives.
TPR <- function(x, x_pred) {
  TP(x, x_pred) / P(x, x_pred)
}
# False positive rate: false positives divided by all observed negatives.
FPR <- function(x, x_pred) {
  FP(x, x_pred) / N(x, x_pred)
}
# Precision: true positives divided by all predicted positives.
precision <- function(x, x_pred) {
  true_pos <- TP(x, x_pred)
  false_pos <- FP(x, x_pred)
  true_pos / (true_pos + false_pos)
}
# Recall: true positives divided by all observed positives.
recall <- function(x, x_pred) {
  true_pos <- TP(x, x_pred)
  false_neg <- FN(x, x_pred)
  true_pos / (true_pos + false_neg)
}
# Logistic regression fitted on the training data, used as a comparison model
# for the pruned tree.
logistic_model <- glm(y ~ ., family = binomial, data = train)

# Classification thresholds. Stored as `thresholds`, the name every later line
# uses, instead of overwriting base R's `pi` constant.
thresholds <- seq(0.05, 0.95, by = 0.05)

# One slot per threshold for every metric of both models.
tree_tpr <- numeric(length(thresholds))
tree_fpr <- numeric(length(thresholds))
tree_precision <- numeric(length(thresholds))
tree_recall <- numeric(length(thresholds))
logistic_tpr <- numeric(length(thresholds))
logistic_fpr <- numeric(length(thresholds))
logistic_precision <- numeric(length(thresholds))
logistic_recall <- numeric(length(thresholds))

# The predicted probabilities do not depend on the threshold, so they are
# computed once outside the loop. Column 2 of the tree's class-probability
# matrix is used as the probability of the positive class.
logistic_pred <- predict(logistic_model, test, type = "response")
tree_pred <- predict(optimalTree, test, type = "vector")[, 2]

for (i in seq_along(thresholds)) {
  # Keep both levels in the factor so table() inside the metric helpers is
  # always 2 x 2. Without this, a threshold that puts every prediction on the
  # same side makes the [2, 2] lookup fail with "subscript out of bounds".
  logistic_class <- factor(
    logistic_pred > thresholds[i],
    levels = c(FALSE, TRUE)
  )
  tree_class <- factor(
    tree_pred > thresholds[i],
    levels = c(FALSE, TRUE)
  )

  tree_tpr[i] <- TPR(test$y, tree_class)
  tree_fpr[i] <- FPR(test$y, tree_class)
  tree_precision[i] <- precision(test$y, tree_class)
  tree_recall[i] <- recall(test$y, tree_class)

  logistic_tpr[i] <- TPR(test$y, logistic_class)
  logistic_fpr[i] <- FPR(test$y, logistic_class)
  logistic_precision[i] <- precision(test$y, logistic_class)
  logistic_recall[i] <- recall(test$y, logistic_class)
}
# ROC curves (FPR on x, TPR on y): logistic regression in red, tree in blue.
plot(logistic_fpr, logistic_tpr,
  type = "l", col = "red",
  xlim = c(0, 1), ylim = c(0, 1)
)
lines(tree_fpr, tree_tpr, col = "blue")
# Precision-recall curves (recall on x, precision on y): logistic regression
# in red, tree in blue.
plot(logistic_recall, logistic_precision,
  type = "l", col = "red",
  xlim = c(0, 1), ylim = c(0, 1)
)
lines(tree_recall, tree_precision, col = "blue")
......@@ -2,8 +2,47 @@
## Assignment 1
1. Underlying model:
$$\mathbf{y}=\mathbf{X \theta + \epsilon}$$
Very bad predictions on the test data; the model overfits heavily. There are too many features relative to the number of observations.
- $\text{MSE}_{\text{train}} = 0.00570911701403287$
- $\text{MSE}_{\text{test}} = 722.429419254052$
2. The cost function:
$$\frac{1}{n}\sum_{i=1}^{n}(y_i - \theta_0-\theta_1x_{1i}-\dots-\theta_px_{pi})^2 + \lambda \sum_{j=1}^{p}|\theta_j|$$
3. lambdas where we have 3 features: 0.8530452 0.7772630 0.7082131
4. No coefficients become 0
5. Optimal lambda = 0.05744535, 8 variables
CV-score gets lower with lower lambda
The optimal lambda is not statistically significantly better than log(lambda) = -4.
Quite good, we can see a trend.
## Assignment 2
2. Values in Rstudio
3. Small trees underfit; however, we can still get good validation performance, since limiting the tree size works as regularization and the model generalizes well.
High bias -> underfitting, high variance -> overfitting. We find the best number of leaves by balancing bias and variance, i.e. by minimising the validation deviance.
4. F1 better since very imbalanced
5. More likely to choose no, needs to be 5 times more sure of yes than no to select yes
6. The tree is a little bit better. The precision-recall curve is the better choice here since the classes are imbalanced.
Since we have a big imbalance, FPR is not even 50% for pi = 0.05.
## Assignment 3
## Assignment 4
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment