Reproducing Shree's code - v01

code
draft
renan
shree
Putting together Shree's messy code
Code
# options(repos = c(CRAN = "https://cloud.r-project.org"))

# Packages attached for the analysis in this document.
packages <- c(
  "dplyr", "car", "ResourceSelection", "caret", "pROC", "logistf", "Hmisc",
  "rcompanion", "ggplot2", "summarytools", "tidyverse", "knitr", "ggpubr"
)
# install.packages(packages)

# Fail early with a single message that lists every missing package,
# instead of stopping at the first library() call that cannot be satisfied.
missing_packages <- packages[
  !vapply(packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  stop(
    "Missing packages: ", paste(missing_packages, collapse = ", "),
    ". Install them with install.packages().",
    call. = FALSE
  )
}

# Load Libraries
# invisible() keeps the list returned by lapply() out of the rendered output.
invisible(lapply(packages, library, character.only = TRUE))

# Set seed for reproducibility
set.seed(123)

Loading Dataset:

Code
# Locate the root of the Git repository that contains `start`.
#
# Walks upward from `start`, one directory at a time, and returns the first
# directory that holds a `.git` entry.
#
# Args:
#   start: Directory to start searching from; defaults to the working
#          directory. It must exist (normalizePath is called with
#          mustWork = TRUE).
#
# Returns:
#   The normalised path (forward slashes) of the repository root.
#
# Raises:
#   An error when no ancestor of `start`, up to and including the filesystem
#   root, contains a `.git` entry.
find_git_root <- function(start = getwd()) {
  path <- normalizePath(start, winslash = "/", mustWork = TRUE)
  repeat {
    # `.git` is a directory in a regular clone but a plain file in worktrees
    # and submodules, so test for existence rather than for a directory.
    if (file.exists(file.path(path, ".git"))) {
      return(path)
    }
    parent <- dirname(path)
    # dirname() of the filesystem root is the root itself: nothing left to try.
    if (identical(parent, path)) {
      break
    }
    path <- parent
  }
  stop("No .git directory found — are you inside a Git repository?")
}

# Resolve the dataset folder relative to the repository root
repo_root <- find_git_root()
datasets_path <- file.path(repo_root, "datasets")

# Read the Kaggle stroke dataset (healthcare-dataset-stroke-data)
steve_dataset_path <- file.path(
  datasets_path,
  "kaggle-healthcare-dataset-stroke-data",
  "healthcare-dataset-stroke-data.csv"
)
stroke1 <- read_csv(file = steve_dataset_path, show_col_types = FALSE)

Handling Dataset Features

Code
# Recode the raw columns into numeric codes.
#
# Each categorical column is mapped through an explicit lookup table. Labels
# that are not listed in a table ("Unknown" smoking status, the "children"
# work type, any gender label other than Male/Female) and missing values
# become NA, so no as.numeric() coercion warnings are raised and no
# whole-data-frame string comparison is needed.

# Map the labels in `x` onto the numeric codes in the named vector `codes`;
# labels absent from `codes` (and NA) yield NA.
recode_labels <- function(x, codes) {
  unname(codes[as.character(x)])
}

# "N/A" marks a missing BMI value; blank it out before the numeric conversion.
stroke1$bmi[stroke1$bmi %in% "N/A"] <- NA
stroke1$bmi <- round(as.numeric(stroke1$bmi), 2)

stroke1$gender <- recode_labels(stroke1$gender, c(Male = 1, Female = 2))
stroke1$ever_married <- recode_labels(stroke1$ever_married, c(Yes = 1, No = 2))
stroke1$work_type <- recode_labels(
  stroke1$work_type,
  c(Govt_job = 1, Private = 2, `Self-employed` = 3, Never_worked = 4)
)
stroke1$Residence_type <- recode_labels(
  stroke1$Residence_type,
  c(Urban = 1, Rural = 2)
)
stroke1$smoking_status <- recode_labels(
  stroke1$smoking_status,
  c(`never smoked` = 1, `formerly smoked` = 2, smokes = 3)
)

# Columns that only need to be stored as numeric
for (column in c("avg_glucose_level", "heart_disease", "hypertension", "stroke")) {
  stroke1[[column]] <- as.numeric(stroke1[[column]])
}
stroke1$age <- round(as.numeric(stroke1$age), 2)

# Drop the patient identifier; it is not a predictor
stroke1 <- stroke1[, setdiff(names(stroke1), "id")]

Removing NAs and cleaning Dataset

Code
# Treat the outcome as categorical
stroke1[["stroke"]] <- as.factor(stroke1[["stroke"]])

# Keep complete rows only
stroke1_clean <- na.omit(stroke1)

# Working copies of the cleaned data under the names used further down
strokeclean <- stroke1_clean
fourassume <- stroke1_clean

Code that Shree used

Load data

# stroke <- read.csv("stroke.csv") 
# strokeclean

Elimination of non-predictive identifiers, such as patient ID

Code –

```{r}
# Drop the patient identifier. any_of() tolerates a missing column, which
# matters here because `id` has already been removed from the cleaned data.
stroke <- strokeclean %>%
  select(-any_of("id"))
```

Converting categorical variables into factors

Code –

# Turn every categorical predictor into a factor and label the outcome
stroke <- strokeclean %>%
  mutate(
    across(
      c(
        gender, ever_married, work_type, Residence_type,
        smoking_status, hypertension, heart_disease
      ),
      factor
    ),
    stroke = factor(stroke, levels = c(0, 1), labels = c("No", "Yes"))
  )

Managing uncommon categories (like the gender group “Other”)

Code –

# Fold the rare "Other" level into "Male" at the level rather than the value:
# assigning the string "Male" into a factor that has no such level raises
# "invalid factor level, NA generated". Renaming the level is a no-op when
# "Other" is absent and merges the two levels when "Male" already exists.
levels(stroke$gender)[levels(stroke$gender) == "Other"] <- "Male"
stroke$gender <- droplevels(stroke$gender)

Using median imputation to impute missing BMI values

Code-

# Fill missing BMI values with the median of the observed ones
stroke$bmi <- replace(
  stroke$bmi,
  is.na(stroke$bmi),
  median(stroke$bmi, na.rm = TRUE)
)

Convert BMI to numeric

# Mark the "N/A" placeholder as missing, then store BMI as numeric
stroke$bmi <- replace(stroke$bmi, stroke$bmi == "N/A", NA)
stroke$bmi <- as.numeric(stroke$bmi)

# Impute whatever is still missing with the median of the observed values
median_bmi <- median(stroke$bmi, na.rm = TRUE)
stroke$bmi <- replace(stroke$bmi, is.na(stroke$bmi), median_bmi)

summary(stroke$bmi)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  11.50   25.40   29.20   30.44   34.20   92.00 

Train/Test division:

# 70/30 split of the rows, partitioned on the outcome column
set.seed(123)
index <- createDataPartition(y = stroke$stroke, p = 0.7, list = FALSE)
train_data <- stroke[index, ]
test_data <- stroke[-index, ]

# Class proportions in the training set
prop.table(table(train_data[["stroke"]]))

        No        Yes 
0.94638298 0.05361702 
# Class proportions in the test set
proportions(table(test_data[["stroke"]]))

        No        Yes 
0.94637537 0.05362463 

1 Logistic Regression:

```{r}
# NOTE(review): `model_formula` and `ctrl` are used by train() below but are
# not defined anywhere earlier in this document. The guarded defaults here
# only apply when no definition exists; the formula (all remaining columns as
# predictors) and the 5-fold cross-validation are assumptions - confirm
# against the original analysis.
if (!exists("model_formula")) {
  model_formula <- stroke ~ .
}
if (!exists("ctrl")) {
  # metric = "ROC" needs class probabilities and the two-class summary.
  ctrl <- trainControl(
    method = "cv",
    number = 5,
    classProbs = TRUE,
    summaryFunction = twoClassSummary
  )
}

# Logistic regression fitted through caret
set.seed(123)
fit_glm <- train(
  model_formula,
  data = train_data,
  method = "glm",
  family = "binomial",
  trControl = ctrl,
  metric = "ROC"
)
fit_glm
```
```{r}
# Variable importance for the fitted logistic-regression model
caret::varImp(object = fit_glm)
```

Evaluate model code:

```{r}
# Evaluate a fitted classifier on held-out data.
#
# The whole definition lives in a single chunk so that it parses on its own.
#
# Args:
#   model:          Fitted model whose predict() method returns class labels
#                   by default and a table of class probabilities for
#                   type = "prob".
#   test_data:      Data frame holding the predictors and the outcome column.
#   positive_class: Outcome level treated as the event of interest.
#   outcome:        Name of the outcome column in `test_data`.
#
# Returns:
#   A list with `cm` (confusion matrix), `auc` (area under the ROC curve)
#   and `roc_obj` (the ROC object itself).
#
# Raises:
#   An error when the outcome is not a two-level factor or does not contain
#   `positive_class`.
evaluate_model <- function(model, test_data, positive_class = "Yes",
                           outcome = "stroke") {
  truth <- test_data[[outcome]]
  stopifnot(
    is.factor(truth),
    nlevels(truth) == 2L,
    positive_class %in% levels(truth)
  )
  # The remaining level is the control group for the ROC curve.
  negative_class <- setdiff(levels(truth), positive_class)

  # Class predictions
  pred_class <- predict(model, newdata = test_data)

  # Probabilities for the positive class
  pred_prob <- predict(model, newdata = test_data, type = "prob")[, positive_class]

  # Confusion matrix
  cm <- confusionMatrix(pred_class, truth, positive = positive_class)

  # ROC & AUC
  roc_obj <- roc(
    response  = truth,
    predictor = pred_prob,
    levels    = c(negative_class, positive_class), # control first, then case
    # Fix the direction (controls score lower than cases) so that a model
    # performing worse than chance is not silently flipped.
    direction = "<"
  )

  list(
    cm      = cm,
    auc     = auc(roc_obj),
    roc_obj = roc_obj
  )
}
```
```{r}
# Score the logistic-regression model on the held-out data
res_glm <- evaluate_model(model = fit_glm, test_data = test_data)
res_glm[["cm"]]
res_glm[["auc"]]
```

2 Decision tree-

```{r}
set.seed(123)
fit_rpart = train(
model_formula,
... (191 lines left)
```

References