Wrangling
Loading the needed packages and the raw data.
# Load required packages
library(tidyverse)
library(here) # here() builds the project-relative file paths used below
library(tidymodels) # attaches recipes: recipe(), step_*(), prep(), bake()
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.0 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.1 ✔ tibble 3.1.8
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
here() starts at C:/Users/Katie/Documents/2022-2023/MADA/katiewells-MADA-portfolio
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom 1.0.3 ✔ rsample 1.1.1
✔ dials 1.1.0 ✔ tune 1.0.1
✔ infer 1.0.4 ✔ workflows 1.1.3
✔ modeldata 1.1.0 ✔ workflowsets 1.0.0
✔ parsnip 1.0.4 ✔ yardstick 1.1.0
✔ recipes 1.0.5
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Search for functions across packages at https://www.tidymodels.org/find/
# Read the raw data file from the project's fluanalysis/data folder
flu <- here("fluanalysis", "data", "SympAct_Any_Pos.Rda") %>%
  readRDS()
Remove all variables whose names contain Score, Total, FluA, FluB, Dxname, or Activity; also remove Unique.Visit and any observations with missing (NA) values.
# Drop unneeded columns: every column whose name contains one of the listed
# fragments, plus Unique.Visit; then keep only rows without missing values.
flu <- flu %>%
  select(
    -contains(c("Score", "Total", "FluA", "FluB", "Dxname", "Activity"))
  ) %>%
  select(-Unique.Visit) %>%
  drop_na()
Saving the cleaned data in an RDS file.
# Write the cleaned data to fluanalysis/data/flu2.rds
flu %>%
  saveRDS(file = here("fluanalysis", "data", "flu2.rds"))
Preprocessing for Module 11
# Read the cleaned data back in, then remove the Yes/No versions of
# weakness, cough (two columns) and myalgia.
flu2 <- here("fluanalysis", "data", "flu2.rds") %>%
  readRDS() %>%
  select(-WeaknessYN, -CoughYN, -CoughYN2, -MyalgiaYN)
## Recipe creation: dummy-code the listed symptom predictors
# Predictor-only recipe (no outcome on the left-hand side of the formula).
categorical_recipe <- recipe(
  ~ SwollenLymphNodes + ChestCongestion + ChillsSweats + NasalCongestion +
    Sneeze + Fatigue + SubjectiveFever + Headache + RunnyNose + AbPain +
    ChestPain + Diarrhea + EyePn + Insomnia + ItchyEye + Nausea + EarPn +
    Pharyngitis + Breathless + ToothPn + Vomit + Wheeze,
  data = flu2
)

# Add the dummy-variable step for every predictor and estimate it on flu2.
categorical_dummies <- prep(
  step_dummy(categorical_recipe, all_predictors()),
  training = flu2
)

# new_data = NULL returns the processed version of the training data.
categorical_dummies_data <- bake(categorical_dummies, new_data = NULL)
## Recipe creation: ordinal predictors (Weakness, CoughIntensity, Myalgia)
# Intended severity order, lowest to highest.
ord_levels <- c("None", "Mild", "Moderate", "Severe")

# Convert the three severity columns to ordered factors with explicit levels.
# Passing `levels` guarantees the None < Mild < Moderate < Severe ordering
# regardless of how the columns were stored, and keeps all four levels even
# if one of them does not occur in the data, so the integer scores produced
# below mean the same thing for every column.
# NOTE(review): any value not listed in ord_levels would become NA here --
# confirm the columns only contain these four labels.
flu2 <- flu2 %>%
  mutate(
    Weakness = ordered(Weakness, levels = ord_levels),
    CoughIntensity = ordered(CoughIntensity, levels = ord_levels),
    Myalgia = ordered(Myalgia, levels = ord_levels)
  )

# Predictor-only recipe over the three ordered factors.
ord_recipe <- recipe(~ Weakness + CoughIntensity + Myalgia, data = flu2)

# Replace each ordered factor with its numeric score and estimate on flu2.
ord_dummies <- ord_recipe %>%
  step_ordinalscore(all_predictors()) %>%
  prep(training = flu2)

# new_data = NULL returns the processed version of the training data.
ord_dummies_data <- bake(ord_dummies, new_data = NULL)
## Low ("near-zero") variance predictors
# Cross-tabulate Vision against Hearing to see how rarely each is reported.
xtabs(formula = ~ Vision + Hearing, data = flu2)
Hearing
Vision No Yes
No 684 27
Yes 16 3
### Vision and Hearing each have fewer than 50 "Yes" observations (see the
### table above), so both columns are removed.
flu2 <- flu2 %>%
  select(-Vision, -Hearing)

# Save the processed set. This writes to the same flu2.rds path that was read
# at the start of the preprocessing section, replacing that file.
flu2 %>%
  saveRDS(file = here("fluanalysis", "data", "flu2.rds"))