diff --git a/lab1/assignment3.R b/lab1/assignment3.R index dcaaf6387dd143a9ed05dba9ba7d888315a1b6d5..e31f39e97a28374a0cb85f22da4b0f02997e5215 100644 --- a/lab1/assignment3.R +++ b/lab1/assignment3.R @@ -3,16 +3,126 @@ data = read.csv("pima-indians-diabetes.csv", header = FALSE) # ----1.---- plot( - main="Plasma Glucose Concentration vs Age", + main = "Plasma Glucose Concentration vs Age", data$V8, data$V2, xlab = "Age", ylab = "Plasma Glucose Concentration", - col = ifelse(data$V9 == 1, - "red", - "blue") + col = ifelse(data$V9 == 1, "red", "blue") ) -legend("bottomright", legend = c("Diabetes", "No Diabetes"), - fill = c("red", "blue")) +legend( + "bottomright", + legend = c("Diabetes", "No Diabetes"), + fill = c("red", "blue") +) + +# ----2.---- + +model <- glm(V9 ~ V8 + V2, family = binomial, data = data) + +predict_reg <- predict(model, data, type = "response") + +r = 0.5 + +predict_reg <- ifelse(predict_reg > r, 1, 0) + +# Missclassification rate +missclass = function(X, Xfit) { + n = length(X) + return(1 - sum(diag(table(X, Xfit))) / n) +} + +missclass_rate = missclass(data$V9, predict_reg) + +print(missclass_rate) + +plot( + main = "Plasma Glucose Concentration vs Age", + data$V8, + data$V2, + xlab = "Age", + ylab = "Plasma Glucose Concentration", + col = ifelse(predict_reg == 1, "red", "blue") +) +legend( + "bottomright", + legend = c("Predicted Diabetes", "No Predicted Diabetes"), + fill = c("red", "blue") +) + +# ----3.---- + +theta <- coefficients(model) + +boundary <- function(x1, theta, r) { + logit_r <- log(r / (1 - r)) # Calculate the log-odds corresponding to the threshold r + # Solve for Plasma Glucose given Age, using the logistic regression coefficients + return((-theta[1] - log((1 - r) / r) - theta[2] * x1) / theta[3]) +} + +age_seq <- seq(min(data$V8), max(data$V8), length.out = 100) + +glucose_boundary <- sapply(age_seq, boundary, theta = theta, r = r) + +lines(age_seq, + glucose_boundary, + col = "black", + lwd = 2) + +# ----5.---- + +data$z1 <- data$V8 ^ 4 +data$z2 <- data$V8 ^ 3 * data$V2 +data$z3 <- data$V8 ^ 2 * data$V2 ^ 2 +data$z4 <- data$V8 * data$V2 ^ 3 +data$z5 <- data$V2 ^ 4 + +model2 <- glm(V9 ~ V8 + V2 + z1 + z2 + z3 + z4 + z5, family = binomial, data = data) + +predict_reg2 <- predict(model2, data, type = "response") +predict_reg2 <- ifelse(predict_reg2 > r, 1, 0) + +theta2 <- coefficients(model2) + +plot( + main = "Plasma Glucose Concentration vs Age", + data$V8, + data$V2, + xlab = "Age", + ylab = "Plasma Glucose Concentration", + col = ifelse(predict_reg2 == 1, "red", "blue") +) +legend( + "bottomright", + legend = c("Predicted Diabetes", "No Predicted Diabetes"), + fill = c("red", "blue") +) + +missclass_rate2 = missclass(data$V9, predict_reg2) + +print(summary(model2)) +print(missclass_rate2) + + +boundary2 <- function(x1, theta, r) { + # Calculate the log-odds corresponding to the threshold r + logit_r <- log(r / (1 - r)) + + # Basis function transformations + z1 <- x1 ^ 4 + z2 <- x1 ^ 3 * x2 + z3 <- x1 ^ 2 * x2 ^ 2 + z4 <- x1 * x2 ^ 3 + z5 <- x2 ^ 4 + + + return() +} + + +glucose_boundary2 <- sapply(age_seq, boundary, theta = theta2, r = r) -# ----2.---- \ No newline at end of file +lines(age_seq, + glucose_boundary2, + col = "black", + lwd = 2) diff --git a/lab1/lab-notes.md b/lab1/lab-notes.md index a87b347f9337092325c78a9a99e1cb5c74d897cb..7bd9565bf7115bbd880df4c680b05f0db54d97a1 100644 --- a/lab1/lab-notes.md +++ b/lab1/lab-notes.md @@ -130,6 +130,34 @@ Confusion matrix o misclassification error e framtana. ## Assignment 3 +1. Not easy to classify. High variance. A lot of overlapping. + +2. $$p(y = 1 \mid \mathbf{x}^*) = g(\mathbf{x}^*, \boldsymbol{\theta}) = \frac{1}{1 + e^{-\boldsymbol{\theta}^\top \mathbf{x}^*}}$$ + + Transform into decision: + + $$ + \hat{y} = + \begin{cases} + 1 & \text{if } p(y = 1 \mid \mathbf{x}^*) > t \\ + 0 & \text{otherwise} + \end{cases} + $$ + + Normally, \( t = 0.5 \). + + Wrong 1/4 of times, not very good. + +3. a. theta0 + theta1* C1 + theta2*C2 = 0 + + b. + + Does not capture data distibution very well. + +4. Lower r gives lower risk of missing patient with diabetes. + +5. Model becomes more complex with lower missclassification rate. + ## Assignment 4 - _Why can it be important to consider various probability thresholds in the