library(tidymodels)
library(tidyverse)
library(tictoc) # Rechenzeit messen, optional
# data(penguins, package = "palmerpenguins")
<- "https://vincentarelbundock.github.io/Rdatasets/csv/modeldata/penguins.csv"
d_path <- read_csv(d_path) d
tidymodels-penguins05
ds1
tidymodels
prediction
yacsda
statlearning
num
Aufgabe
Berechnen Sie ein kNN-Modell mit tidymodels und zwar anhand des penguins
Datensatzes.
Modellgleichung: body_mass_g ~ bill_length_mm, data = d_train
.
Gesucht ist R-Quadrat als Maß für die Modellgüte im TEST-Sample.
Hinweise:
- Fixieren Sie die Zufallszahlen auf den Startwert 42.
- Nutzen Sie eine v=5,r=2 CV.
- Tunen Sie \(K\), setzen Sie den Tuning-Wertebereich auf 1 bis 5.
- Entfernen Sie fehlende Werte in den Variablen.
- Verzichten Sie auf weitere Schritte der Vorverarbeitung.
Lösung
Setup:
Datensatz auf NAs prüfen:
<-
d2 %>%
d drop_na()
Datensatz aufteilen:
set.seed(42)
<- initial_split(d2)
d_split <- training(d_split)
d_train <- testing(d_split) d_test
Workflow:
<-
rec1 recipe(body_mass_g ~ bill_length_mm, data = d_train) %>%
step_naomit(all_numeric())
<-
knn_model nearest_neighbor(
mode = "regression",
neighbors = tune()
)
<-
wflow workflow() %>%
add_recipe(rec1) %>%
add_model(knn_model)
wflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()
── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step
• step_naomit()
── Model ───────────────────────────────────────────────────────────────────────
K-Nearest Neighbor Model Specification (regression)
Main Arguments:
neighbors = tune()
Computational engine: kknn
Backen:
<- prep(rec1) %>% bake(new_data = NULL)
d_baked %>% head() d_baked
# A tibble: 6 × 2
bill_length_mm body_mass_g
<dbl> <dbl>
1 34.5 2900
2 52.2 3450
3 45.4 4800
4 42.1 4000
5 50 5350
6 41.5 4000
Auf NA prüfen:
sum(is.na(d_baked))
[1] 0
CV:
set.seed(42)
<- vfold_cv(d_train, v = 5, repeats = 2)
folds folds
# 5-fold cross-validation repeated 2 times
# A tibble: 10 × 3
splits id id2
<list> <chr> <chr>
1 <split [199/50]> Repeat1 Fold1
2 <split [199/50]> Repeat1 Fold2
3 <split [199/50]> Repeat1 Fold3
4 <split [199/50]> Repeat1 Fold4
5 <split [200/49]> Repeat1 Fold5
6 <split [199/50]> Repeat2 Fold1
7 <split [199/50]> Repeat2 Fold2
8 <split [199/50]> Repeat2 Fold3
9 <split [199/50]> Repeat2 Fold4
10 <split [200/49]> Repeat2 Fold5
Tunen:
<-
d_resamples tune_grid(
wflow,resamples = folds,
control = control_grid(save_workflow = TRUE),
grid = grid_regular(
neighbors(range = c(1, 5))
)
)
d_resamples
# Tuning results
# 5-fold cross-validation repeated 2 times
# A tibble: 10 × 5
splits id id2 .metrics .notes
<list> <chr> <chr> <list> <list>
1 <split [199/50]> Repeat1 Fold1 <tibble [6 × 5]> <tibble [0 × 3]>
2 <split [199/50]> Repeat1 Fold2 <tibble [6 × 5]> <tibble [0 × 3]>
3 <split [199/50]> Repeat1 Fold3 <tibble [6 × 5]> <tibble [0 × 3]>
4 <split [199/50]> Repeat1 Fold4 <tibble [6 × 5]> <tibble [0 × 3]>
5 <split [200/49]> Repeat1 Fold5 <tibble [6 × 5]> <tibble [0 × 3]>
6 <split [199/50]> Repeat2 Fold1 <tibble [6 × 5]> <tibble [0 × 3]>
7 <split [199/50]> Repeat2 Fold2 <tibble [6 × 5]> <tibble [0 × 3]>
8 <split [199/50]> Repeat2 Fold3 <tibble [6 × 5]> <tibble [0 × 3]>
9 <split [199/50]> Repeat2 Fold4 <tibble [6 × 5]> <tibble [0 × 3]>
10 <split [200/49]> Repeat2 Fold5 <tibble [6 × 5]> <tibble [0 × 3]>
Bester Kandidat:
show_best(d_resamples)
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 3 × 7
neighbors .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 rmse standard 733. 10 19.3 Preprocessor1_Model3
2 3 rmse standard 777. 10 23.8 Preprocessor1_Model2
3 1 rmse standard 945. 10 28.0 Preprocessor1_Model1
<- fit_best(d_resamples)
fitbest fitbest
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: nearest_neighbor()
── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step
• step_naomit()
── Model ───────────────────────────────────────────────────────────────────────
Call:
kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(5L, data, 5))
Type of response variable: continuous
minimal mean absolute error: 497.0257
Minimal mean squared error: 407926.4
Best kernel: optimal
Best k: 5
Last Fit:
<- last_fit(fitbest, d_split)
fit_last fit_last
# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [249/84]> train/test split <tibble> <tibble> <tibble> <workflow>
Modellgüte im Test-Sample:
%>% collect_metrics() fit_last
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 654. Preprocessor1_Model1
2 rsq standard 0.294 Preprocessor1_Model1
R-Quadrat:
<- collect_metrics(fit_last)[[".estimate"]][2]
sol sol
[1] 0.2935091
Categories:
- ds1
- tidymodels
- prediction
- yacsda
- statlearning
- num