Wrangling

Loading the needed packages and the raw data.

#load required packages
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
here() starts at C:/Users/Katie/Documents/2022-2023/MADA/katiewells-MADA-portfolio
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.3     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.3
✔ modeldata    1.1.0     ✔ workflowsets 1.0.0
✔ parsnip      1.0.4     ✔ yardstick    1.1.0
✔ recipes      1.0.5     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Search for functions across packages at https://www.tidymodels.org/find/
# Read the raw data set (SympAct_Any_Pos.Rda); here() builds the path
# relative to the project root so the script does not depend on the
# current working directory.
flu <- here("fluanalysis", "data", "SympAct_Any_Pos.Rda") %>%
  readRDS()

Removing all variables that have Score, Total, FluA, FluB, Dxname, or Activity in their name; also removing Unique.Visit and any observations with missing (NA) values.

# Remove unnecessary variables in a single dplyr pipeline:
#  - every column whose name contains one of the listed substrings
#  - the Unique.Visit column
#  - then every row that still has an NA in any remaining column
# Columns are dropped before drop_na() so that missing values in discarded
# columns do not cause rows to be removed.
flu <- flu %>%
  select(
    -contains(c("Score", "Total", "FluA", "FluB", "Dxname", "Activity")),
    -Unique.Visit
  ) %>%
  drop_na()

Saving the cleaned data in an RDS file.

# Write the cleaned data frame to fluanalysis/data/flu2.rds
saveRDS(
  object = flu,
  file = here("fluanalysis", "data", "flu2.rds")
)

Preprocessing for Module 11

# Reload the cleaned data saved as flu2.rds
## Feature/Variable Removal (Weakness, Cough (2x), Myalgia - Yes/No)
# The Yes/No columns are dropped; the graded Weakness, CoughIntensity and
# Myalgia columns are kept and used further below.
flu2 <- here("fluanalysis", "data", "flu2.rds") %>%
  readRDS() %>%
  select(-WeaknessYN, -CoughYN, -CoughYN2, -MyalgiaYN)

## Recipe Creation (categorical predictors)
# Predictor-only recipe (no outcome on the left-hand side of the formula)
categorical_recipe <- recipe(
  ~ SwollenLymphNodes + ChestCongestion + ChillsSweats + NasalCongestion +
    Sneeze + Fatigue + SubjectiveFever + Headache + RunnyNose + AbPain +
    ChestPain + Diarrhea + EyePn + Insomnia + ItchyEye + Nausea + EarPn +
    Pharyngitis + Breathless + ToothPn + Vomit + Wheeze,
  data = flu2
)
# Add a dummy-coding step for every predictor, then estimate it on flu2
categorical_dummies <- prep(
  step_dummy(categorical_recipe, all_predictors()),
  training = flu2
)
# new_data = NULL returns the processed version of the training data
categorical_dummies_data <- bake(categorical_dummies, new_data = NULL)
## Recipe Creation (ordinal predictors)
# Intended severity order, lowest to highest
ord_levels <- c("None", "Mild", "Moderate", "Severe")
ord_vars <- c("Weakness", "CoughIntensity", "Myalgia")
# Fail loudly if a column holds a label outside ord_levels: with explicit
# levels, ordered() would otherwise turn such values into NA without warning.
stopifnot(
  all(unlist(lapply(flu2[ord_vars], as.character)) %in% c(ord_levels, NA))
)
# Pass the levels explicitly. ordered() without `levels` keeps whatever order
# the input already has (alphabetical for character data, i.e.
# Mild < Moderate < None < Severe), which would make the scores below wrong.
flu2 <- flu2 %>%
  mutate(across(all_of(ord_vars), ~ ordered(.x, levels = ord_levels)))
ord_recipe <- recipe(~ Weakness + CoughIntensity + Myalgia, data = flu2)
# Replace each ordered factor by its numeric score, estimated on flu2
ord_dummies <- ord_recipe %>%
  step_ordinalscore(all_predictors()) %>%
  prep(training = flu2)
# new_data = NULL returns the processed version of the training data
ord_dummies_data <- bake(ord_dummies, new_data = NULL)

## Low ("near-zero") variance predictors
# Cross-tabulate the two Yes/No predictors to see how rarely "Yes" occurs
xtabs(formula = ~ Vision + Hearing, data = flu2)
      Hearing
Vision  No Yes
   No  684  27
   Yes  16   3
### Fewer than 50 observations have Vision == "Yes" (19) or
### Hearing == "Yes" (30) in the table above, so both predictors are dropped
flu2 <- select(flu2, -Vision, -Hearing)

# Save cleaned set
# NOTE(review): this overwrites the flu2.rds file that the preprocessing
# section reads at its start; re-running that section on the overwritten
# file would presumably fail because the removed columns are already gone.
# Confirm this is intended.
saveRDS(
  object = flu2,
  file = here("fluanalysis", "data", "flu2.rds")
)