Lab 1: Added notes for Assignment 3 (not revised)

5e9d99cb · Felix Ramnelöv · 995d312c · 5e9d99cb · 5e9d99cb
Commit 5e9d99cb authored 4 months ago by Felix Ramnelöv
--- a/lab1/assignment3.R
+++ b/lab1/assignment3.R
@@ -3,16 +3,126 @@ data = read.csv("pima-indians-diabetes.csv", header = FALSE)
 # ----1.----

 plot(
-  main="Plasma Glucose Concentration vs Age",
+  main = "Plasma Glucose Concentration vs Age",
  data$V8,
  data$V2,
  xlab = "Age",
  ylab = "Plasma Glucose Concentration",
-  col = ifelse(data$V9 == 1,
-  "red",
-  "blue")
+  col = ifelse(data$V9 == 1, "red", "blue")
 )
-legend("bottomright", legend = c("Diabetes", "No Diabetes"), 
-       fill = c("red", "blue"))
+legend(
+  "bottomright",
+  legend = c("Diabetes", "No Diabetes"),
+  fill = c("red", "blue")
+)
+
+# ----2.----
+
+model <- glm(V9 ~ V8 + V2, family = binomial, data = data)
+
+predict_reg <- predict(model, data, type = "response")
+
+r = 0.5
+
+predict_reg <- ifelse(predict_reg > r, 1, 0)
+
+# Missclassification rate
+missclass = function(X, Xfit) {
+  n = length(X)
+  return(1 - sum(diag(table(X, Xfit))) / n)
+}
+
+missclass_rate = missclass(data$V9, predict_reg)
+
+print(missclass_rate)
+
+plot(
+  main = "Plasma Glucose Concentration vs Age",
+  data$V8,
+  data$V2,
+  xlab = "Age",
+  ylab = "Plasma Glucose Concentration",
+  col = ifelse(predict_reg == 1, "red", "blue")
+)
+legend(
+  "bottomright",
+  legend = c("Predicted Diabetes", "No Predicted Diabetes"),
+  fill = c("red", "blue")
+)
+
+# ----3.----
+
+theta <- coefficients(model)
+
+boundary <- function(x1, theta, r) {
+  logit_r <- log(r / (1 - r))  # Calculate the log-odds corresponding to the threshold r
+  # Solve for Plasma Glucose given Age, using the logistic regression coefficients
+  return((-theta[1] - log((1 - r) / r) - theta[2] * x1) / theta[3])
+}
+
+age_seq <- seq(min(data$V8), max(data$V8), length.out = 100)
+
+glucose_boundary <- sapply(age_seq, boundary, theta = theta, r = r)
+
+lines(age_seq,
+      glucose_boundary,
+      col = "black",
+      lwd = 2)
+
+# ----5.----
+
+data$z1 <- data$V8 ^ 4
+data$z2 <- data$V8 ^ 3 * data$V2
+data$z3 <- data$V8 ^ 2 * data$V2 ^ 2
+data$z4 <- data$V8 * data$V2 ^ 3
+data$z5 <- data$V2 ^ 4
+
+model2 <- glm(V9 ~ V8 + V2 + z1 + z2 + z3 + z4 + z5, family = binomial, data = data)
+
+predict_reg2 <- predict(model2, data, type = "response")
+predict_reg2 <- ifelse(predict_reg2 > r, 1, 0)
+
+theta2 <- coefficients(model2)
+
+plot(
+  main = "Plasma Glucose Concentration vs Age",
+  data$V8,
+  data$V2,
+  xlab = "Age",
+  ylab = "Plasma Glucose Concentration",
+  col = ifelse(predict_reg2 == 1, "red", "blue")
+)
+legend(
+  "bottomright",
+  legend = c("Predicted Diabetes", "No Predicted Diabetes"),
+  fill = c("red", "blue")
+)
+
+missclass_rate2 = missclass(data$V9, predict_reg2)
+
+print(summary(model2))
+print(missclass_rate2)
+
+
+boundary2 <- function(x1, theta, r) {
+  # Calculate the log-odds corresponding to the threshold r
+  logit_r <- log(r / (1 - r))
+  
+  # Basis function transformations
+  z1 <- x1 ^ 4
+  z2 <- x1 ^ 3 * x2
+  z3 <- x1 ^ 2 * x2 ^ 2
+  z4 <- x1 * x2 ^ 3
+  z5 <- x2 ^ 4
+  
+  
+  return()
+}
+
+
+glucose_boundary2 <- sapply(age_seq, boundary, theta = theta2, r = r)

-# ----2.----
\ No newline at end of file
+lines(age_seq,
+      glucose_boundary2,
+      col = "black",
+      lwd = 2)
--- a/lab1/lab-notes.md
+++ b/lab1/lab-notes.md
@@ -130,6 +130,34 @@ Confusion matrix o misclassification error e framtana.

 ## Assignment 3

+1. Not easy to classify. High variance. A lot of overlapping.
+
+2. $$p(y = 1 \mid \mathbf{x}^*) = g(\mathbf{x}^*, \boldsymbol{\theta}) = \frac{1}{1 + e^{-\boldsymbol{\theta}^\top \mathbf{x}^*}}$$
+
+   Transform into decision:
+
+   $$
+   \hat{y} =
+   \begin{cases}
+   1 & \text{if } p(y = 1 \mid \mathbf{x}^*) > t \\
+   0 & \text{otherwise}
+   \end{cases}
+   $$
+
+   Normally, \( t = 0.5 \).
+
+   Wrong 1/4 of times, not very good.
+
+3. a. theta0 + theta1* C1 + theta2*C2 = 0
+
+   b.
+
+   Does not capture data distibution very well.
+
+4. Lower r gives lower risk of missing patient with diabetes.
+
+5. Model becomes more complex with lower missclassification rate.
+
 ## Assignment 4

 - _Why can it be important to consider various probability thresholds in the