voter_data <- read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/non-voters/nonvoters_data.csv')
voter_clean <- voter_data |>
select(-RespId, -weight, -Q1) |>
mutate(
educ = factor(educ, levels = c("High school or less", "Some college", "College")),
income_cat = factor(income_cat, levels = c("Less than $40k", "$40-75k ",
"$75-125k", "$125k or more")),
voter_category = factor(voter_category, levels = c("rarely/never", "sporadic", "always"))
) |>
filter(Q22 != 5 | is.na(Q22)) |>
mutate(Q22 = as_factor(Q22),
Q22 = if_else(is.na(Q22), "Not Asked", Q22),
across(Q28_1:Q28_8, ~if_else(.x == -1, 0, .x)),
across(Q28_1:Q28_8, ~ as_factor(.x)),
across(Q28_1:Q28_8, ~if_else(is.na(.x) , "Not Asked", .x)),
across(Q29_1:Q29_10, ~if_else(.x == -1, 0, .x)),
across(Q29_1:Q29_10, ~ as_factor(.x)),
across(Q29_1:Q29_8, ~if_else(is.na(.x) , "Not Asked", .x)),
Party_ID = as_factor(case_when(
Q31 == 1 ~ "Strong Republican",
Q31 == 2 ~ "Republican",
Q32 == 1 ~ "Strong Democrat",
Q32 == 2 ~ "Democrat",
Q33 == 1 ~ "Lean Republican",
Q33 == 2 ~ "Lean Democrat",
TRUE ~ "Other"
)),
Party_ID = factor(Party_ID, levels =c("Strong Republican", "Republican", "Lean Republican",
"Other", "Lean Democrat", "Democrat", "Strong Democrat")),
across(!ppage, ~as_factor(if_else(.x == -1, NA, .x))))
mr_recipe <- recipe(voter_category ~ . , data = voter_train) |>
step_zv(all_predictors()) |>
step_integer(educ, income_cat, Party_ID, Q2_2:Q4_6, Q6, Q8_1:Q9_4, Q14:Q17_4,
Q25:Q26) |>
step_impute_median(all_numeric_predictors()) |>
step_impute_mode(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors(), one_hot = FALSE) |>
step_normalize(all_numeric_predictors())
.pred_class | .pred_rarely/never | .pred_sporadic | .pred_always |
---|---|---|---|
sporadic | 0.0317402 | 0.5122327 | 0.4560271 |
always | 0.0463060 | 0.4464935 | 0.5072005 |
sporadic | 0.0553279 | 0.6529393 | 0.2917328 |
always | 0.0388506 | 0.4351109 | 0.5260385 |
always | 0.0361359 | 0.4164662 | 0.5473979 |
always | 0.0991499 | 0.4310622 | 0.4697879 |
voter_metrics <- metric_set(accuracy, precision, recall)
mr_fit |> augment(new_data = voter_test) |>
voter_metrics(truth = voter_category, estimate = .pred_class, estimator = "macro") |>
kable()
.metric | .estimator | .estimate |
---|---|---|
accuracy | multiclass | 0.6099656 |
precision | macro | 0.6375886 |
recall | macro | 0.5976485 |