library(tidymodels)
library(tidyverse)
library(tictoc)
bike04
statlearning
tidymodels
num
Aufgabe
Kann man die Anzahl gerade verliehener Fahrräder eines entsprechenden Anbieters anhand der Temperatur vorhersagen?
In dieser Übung untersuchen wir diese Frage.
Sie können die Daten von der Webseite der UCI herunterladen.
Wir beziehen uns auf den Datensatz day
.
Berechnen Sie einen Entscheidungsbaum mit der Anzahl der aktuell vermieteten Räder als AV und der aktuellen Temperatur als UV!
Tunen Sie alle Paramter; lassen Sie sich 20 Tuningparameter vorschlagen.
Geben Sie den MSE an!
Lösung
<- read.csv("/Users/sebastiansaueruser/datasets/Bike-Sharing-Dataset/day.csv") d
glimpse(d)
Rows: 731
Columns: 16
$ instant <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ dteday <chr> "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04", "20…
$ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ yr <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mnth <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ holiday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ weekday <int> 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4,…
$ workingday <int> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,…
$ weathersit <int> 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2,…
$ temp <dbl> 0.3441670, 0.3634780, 0.1963640, 0.2000000, 0.2269570, 0.20…
$ atemp <dbl> 0.3636250, 0.3537390, 0.1894050, 0.2121220, 0.2292700, 0.23…
$ hum <dbl> 0.805833, 0.696087, 0.437273, 0.590435, 0.436957, 0.518261,…
$ windspeed <dbl> 0.1604460, 0.2485390, 0.2483090, 0.1602960, 0.1869000, 0.08…
$ casual <int> 331, 131, 120, 108, 82, 88, 148, 68, 54, 41, 43, 25, 38, 54…
$ registered <int> 654, 670, 1229, 1454, 1518, 1518, 1362, 891, 768, 1280, 122…
$ cnt <int> 985, 801, 1349, 1562, 1600, 1606, 1510, 959, 822, 1321, 126…
Data split
set.seed(42)
<- initial_split(d, strata = cnt)
d_split
<- training(d_split)
d_train <- testing(d_split) d_test
Define recipe
<-
rec1 recipe(cnt ~ temp, data = d)
Define model
<-
m1 decision_tree(cost_complexity = tune(),
tree_depth = tune(),
min_n = tune(),
mode = "regression")
Tuning grid
<-
grid grid_latin_hypercube(cost_complexity(),
tree_depth(),
min_n(),
size = 20)
grid
# A tibble: 20 × 3
cost_complexity tree_depth min_n
<dbl> <int> <int>
1 1.09e- 7 8 13
2 9.98e- 9 14 32
3 1.72e- 5 8 38
4 6.73e- 5 11 9
5 5.01e- 6 13 20
6 1.60e- 2 5 18
7 4.08e- 9 12 4
8 3.49e- 3 2 8
9 3.72e-10 9 27
10 3.14e- 7 11 21
11 3.92e- 2 3 30
12 8.08e- 5 6 26
13 1.04e- 6 14 33
14 1.17e-10 1 36
15 9.35e-10 4 16
16 3.05e- 4 7 15
17 1.80e- 6 6 23
18 8.38e- 4 3 5
19 8.01e- 3 13 11
20 3.46e- 8 10 39
Alternativ:
<-
grid grid_latin_hypercube(extract_parameter_set_dials(m1), size = 50)
grid
# A tibble: 50 × 3
cost_complexity tree_depth min_n
<dbl> <int> <int>
1 0.000390 6 21
2 0.0000000863 8 15
3 0.000576 12 37
4 0.0000000469 2 31
5 0.0000000283 5 19
6 0.00000000207 4 5
7 0.000000614 2 23
8 0.00000000952 14 13
9 0.00000413 11 7
10 0.0000472 7 12
# ℹ 40 more rows
Define Resamples
<- vfold_cv(d_train) rsmpl
Workflow
<-
wf1 workflow() %>%
add_model(m1) %>%
add_recipe(rec1)
Fit
tic()
<- tune_grid(
fit1 object = wf1,
resamples = rsmpl)
toc()
6.762 sec elapsed
fit1
# Tuning results
# 10-fold cross-validation
# A tibble: 10 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [492/55]> Fold01 <tibble [20 × 7]> <tibble [0 × 3]>
2 <split [492/55]> Fold02 <tibble [20 × 7]> <tibble [0 × 3]>
3 <split [492/55]> Fold03 <tibble [20 × 7]> <tibble [0 × 3]>
4 <split [492/55]> Fold04 <tibble [20 × 7]> <tibble [0 × 3]>
5 <split [492/55]> Fold05 <tibble [20 × 7]> <tibble [0 × 3]>
6 <split [492/55]> Fold06 <tibble [20 × 7]> <tibble [0 × 3]>
7 <split [492/55]> Fold07 <tibble [20 × 7]> <tibble [0 × 3]>
8 <split [493/54]> Fold08 <tibble [20 × 7]> <tibble [0 × 3]>
9 <split [493/54]> Fold09 <tibble [20 × 7]> <tibble [0 × 3]>
10 <split [493/54]> Fold10 <tibble [20 × 7]> <tibble [0 × 3]>
Bester Kandidat
show_best(fit1)
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 5 × 9
cost_complexity tree_depth min_n .metric .estimator mean n std_err
<dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>
1 3.92e- 4 2 18 rmse standard 1443. 10 29.9
2 1.46e- 2 11 38 rmse standard 1453. 10 33.5
3 1.23e- 2 14 10 rmse standard 1458. 10 32.5
4 1.17e- 9 3 29 rmse standard 1459. 10 29.2
5 4.46e-10 5 36 rmse standard 1460. 10 29.9
# ℹ 1 more variable: .config <chr>
<-
wf1_best %>%
wf1 finalize_workflow(parameters = select_best(fit1))
Warning: No value of `metric` was given; metric 'rmse' will be used.
Last Fit
<- last_fit(wf1_best, d_split) fit_testsample
Model performance (metrics) in test set
%>% collect_metrics() fit_testsample
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 1399. Preprocessor1_Model1
2 rsq standard 0.497 Preprocessor1_Model1
<- fit_testsample %>% collect_metrics() %>% pluck(3, 1)
MSE MSE
[1] 1398.675
Solution: 1398.6748691
Categories:
- statlearning
- tidymodels
- num