bike03

statlearning
tidymodels
num
Published

May 17, 2023

Aufgabe

Kann man die Anzahl gerade verliehener Fahrräder eines entsprechenden Anbieters anhand der Temperatur vorhersagen?

In dieser Übung untersuchen wir diese Frage.

Sie können die Daten von der Webseite der UCI herunterladen.

Wir beziehen uns auf den Datensatz day.

Berechnen Sie einen Entscheidungsbaum mit der Anzahl der aktuell vermieteten Räder als AV und der aktuellen Temperatur als UV!

Tunen Sie den Cp-Parameter des Baumes; lassen Sie sich 20 Tuningparameter vorschlagen.

Geben Sie den MSE an!

Hinweise











Lösung

library(tidymodels)
library(tidyverse)
library(tictoc)
d <- read.csv("/Users/sebastiansaueruser/datasets/Bike-Sharing-Dataset/day.csv")
glimpse(d)
Rows: 731
Columns: 16
$ instant    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ dteday     <chr> "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04", "20…
$ season     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ yr         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mnth       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ holiday    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ weekday    <int> 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4,…
$ workingday <int> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,…
$ weathersit <int> 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2,…
$ temp       <dbl> 0.3441670, 0.3634780, 0.1963640, 0.2000000, 0.2269570, 0.20…
$ atemp      <dbl> 0.3636250, 0.3537390, 0.1894050, 0.2121220, 0.2292700, 0.23…
$ hum        <dbl> 0.805833, 0.696087, 0.437273, 0.590435, 0.436957, 0.518261,…
$ windspeed  <dbl> 0.1604460, 0.2485390, 0.2483090, 0.1602960, 0.1869000, 0.08…
$ casual     <int> 331, 131, 120, 108, 82, 88, 148, 68, 54, 41, 43, 25, 38, 54…
$ registered <int> 654, 670, 1229, 1454, 1518, 1518, 1362, 891, 768, 1280, 122…
$ cnt        <int> 985, 801, 1349, 1562, 1600, 1606, 1510, 959, 822, 1321, 126…

Data split

set.seed(42)
d_split <- initial_split(d, strata = cnt)

d_train <- training(d_split)
d_test <- testing(d_split)

Define recipe

rec1 <- 
  recipe(cnt ~ temp, data = d)

Define model

m1 <-
  decision_tree(cost_complexity = tune(),
                mode = "regression")

Tuning grid

grid <-
  grid_regular(cost_complexity(), levels = 20)
grid
# A tibble: 20 × 1
   cost_complexity
             <dbl>
 1        1   e-10
 2        2.98e-10
 3        8.86e-10
 4        2.64e- 9
 5        7.85e- 9
 6        2.34e- 8
 7        6.95e- 8
 8        2.07e- 7
 9        6.16e- 7
10        1.83e- 6
11        5.46e- 6
12        1.62e- 5
13        4.83e- 5
14        1.44e- 4
15        4.28e- 4
16        1.27e- 3
17        3.79e- 3
18        1.13e- 2
19        3.36e- 2
20        1   e- 1

Alternativ:

grid <-
  grid_regular(extract_parameter_set_dials(m1), levels = 20)
grid
# A tibble: 20 × 1
   cost_complexity
             <dbl>
 1        1   e-10
 2        2.98e-10
 3        8.86e-10
 4        2.64e- 9
 5        7.85e- 9
 6        2.34e- 8
 7        6.95e- 8
 8        2.07e- 7
 9        6.16e- 7
10        1.83e- 6
11        5.46e- 6
12        1.62e- 5
13        4.83e- 5
14        1.44e- 4
15        4.28e- 4
16        1.27e- 3
17        3.79e- 3
18        1.13e- 2
19        3.36e- 2
20        1   e- 1

Define Resamples

rsmpl <- vfold_cv(d_train)

Workflow

wf1 <-
  workflow() %>% 
  add_model(m1) %>% 
  add_recipe(rec1) 

Fit

tic()
fit1 <- tune_grid(
  object = wf1, 
  resamples = rsmpl)
toc()
5.821 sec elapsed
fit1
# Tuning results
# 10-fold cross-validation 
# A tibble: 10 × 4
   splits           id     .metrics          .notes          
   <list>           <chr>  <list>            <list>          
 1 <split [492/55]> Fold01 <tibble [20 × 5]> <tibble [0 × 3]>
 2 <split [492/55]> Fold02 <tibble [20 × 5]> <tibble [0 × 3]>
 3 <split [492/55]> Fold03 <tibble [20 × 5]> <tibble [0 × 3]>
 4 <split [492/55]> Fold04 <tibble [20 × 5]> <tibble [0 × 3]>
 5 <split [492/55]> Fold05 <tibble [20 × 5]> <tibble [0 × 3]>
 6 <split [492/55]> Fold06 <tibble [20 × 5]> <tibble [0 × 3]>
 7 <split [492/55]> Fold07 <tibble [20 × 5]> <tibble [0 × 3]>
 8 <split [493/54]> Fold08 <tibble [20 × 5]> <tibble [0 × 3]>
 9 <split [493/54]> Fold09 <tibble [20 × 5]> <tibble [0 × 3]>
10 <split [493/54]> Fold10 <tibble [20 × 5]> <tibble [0 × 3]>

Bester Kandidat

show_best(fit1)
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 5 × 7
  cost_complexity .metric .estimator  mean     n std_err .config              
            <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1      0.0208     rmse    standard   1478.    10    34.7 Preprocessor1_Model09
2      0.00220    rmse    standard   1538.    10    36.4 Preprocessor1_Model01
3      0.000306   rmse    standard   1556.    10    40.3 Preprocessor1_Model07
4      0.00000175 rmse    standard   1558.    10    39.7 Preprocessor1_Model02
5      0.0000194  rmse    standard   1558.    10    39.7 Preprocessor1_Model03
wf1_best <-
  wf1 %>% 
  finalize_workflow(parameters = select_best(fit1))
Warning: No value of `metric` was given; metric 'rmse' will be used.

Last Fit

fit_testsample <- last_fit(wf1_best, d_split)

Model performance (metrics) in test set

fit_testsample %>% collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard    1430.    Preprocessor1_Model1
2 rsq     standard       0.473 Preprocessor1_Model1
MSE <- fit_testsample %>% collect_metrics() %>% pluck(3, 1)
MSE
[1] 1430.304

Solution: 1430.3042213


Categories:

  • statlearning
  • tidymodels
  • num