bike04

statlearning
tidymodels
num
Published May 17, 2023

Exercise

Can the number of bikes currently rented out by a bike-sharing provider be predicted from the temperature?

In this exercise, we investigate this question.

You can download the data from the UCI website.

We use the day dataset (day.csv).
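
If you prefer to script the download, something along these lines should work (the URL is assumed to be the dataset's location in the UCI archive at the time of writing; adjust if it has moved):

# Download and unpack the dataset; URL and file names are assumptions
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
download.file(url, destfile = "Bike-Sharing-Dataset.zip")
unzip("Bike-Sharing-Dataset.zip", files = "day.csv")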

Fit a decision tree with the number of currently rented bikes as the outcome variable (DV) and the current temperature as the predictor (IV)!

Tune all parameters; have 20 candidate parameter combinations proposed.

Report the MSE!

Hints

Solution

library(tidymodels)
library(tidyverse)
library(tictoc)  # simple timing via tic()/toc()

# Adjust the path to wherever you stored day.csv:
d <- read.csv("/Users/sebastiansaueruser/datasets/Bike-Sharing-Dataset/day.csv")
glimpse(d)
Rows: 731
Columns: 16
$ instant    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ dteday     <chr> "2011-01-01", "2011-01-02", "2011-01-03", "2011-01-04", "20…
$ season     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ yr         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mnth       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ holiday    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,…
$ weekday    <int> 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4,…
$ workingday <int> 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,…
$ weathersit <int> 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2,…
$ temp       <dbl> 0.3441670, 0.3634780, 0.1963640, 0.2000000, 0.2269570, 0.20…
$ atemp      <dbl> 0.3636250, 0.3537390, 0.1894050, 0.2121220, 0.2292700, 0.23…
$ hum        <dbl> 0.805833, 0.696087, 0.437273, 0.590435, 0.436957, 0.518261,…
$ windspeed  <dbl> 0.1604460, 0.2485390, 0.2483090, 0.1602960, 0.1869000, 0.08…
$ casual     <int> 331, 131, 120, 108, 82, 88, 148, 68, 54, 41, 43, 25, 38, 54…
$ registered <int> 654, 670, 1229, 1454, 1518, 1518, 1362, 891, 768, 1280, 122…
$ cnt        <int> 985, 801, 1349, 1562, 1600, 1606, 1510, 959, 822, 1321, 126…

Data split

set.seed(42)
d_split <- initial_split(d, strata = cnt)

d_train <- training(d_split)
d_test <- testing(d_split)
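
Note that with a numeric outcome, strata = cnt stratifies over binned quartiles of cnt. A quick sanity check of the split sizes (by default, initial_split() reserves about a quarter of the rows for testing):

# Roughly 3/4 of the rows for training, 1/4 for testing
nrow(d_train)
nrow(d_test)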

Define recipe

rec1 <- 
  recipe(cnt ~ temp, data = d_train)  # define the recipe on the training data
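
To check what the recipe will hand to the model, you can prep and bake it (output not shown):

# Inspect the preprocessed training data
rec1 %>% prep() %>% bake(new_data = NULL) %>% glimpse()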

Define model

m1 <-
  decision_tree(cost_complexity = tune(),
                tree_depth = tune(),
                min_n = tune(),
                mode = "regression")

Tuning grid

grid <-
  grid_latin_hypercube(cost_complexity(), 
               tree_depth(),
               min_n(),
               size = 20)
grid
# A tibble: 20 × 3
   cost_complexity tree_depth min_n
             <dbl>      <int> <int>
 1        1.09e- 7          8    13
 2        9.98e- 9         14    32
 3        1.72e- 5          8    38
 4        6.73e- 5         11     9
 5        5.01e- 6         13    20
 6        1.60e- 2          5    18
 7        4.08e- 9         12     4
 8        3.49e- 3          2     8
 9        3.72e-10          9    27
10        3.14e- 7         11    21
11        3.92e- 2          3    30
12        8.08e- 5          6    26
13        1.04e- 6         14    33
14        1.17e-10          1    36
15        9.35e-10          4    16
16        3.05e- 4          7    15
17        1.80e- 6          6    23
18        8.38e- 4          3     5
19        8.01e- 3         13    11
20        3.46e- 8         10    39

Alternatively:

grid <-
  grid_latin_hypercube(extract_parameter_set_dials(m1), size = 50)
grid
# A tibble: 50 × 3
   cost_complexity tree_depth min_n
             <dbl>      <int> <int>
 1   0.000390               6    21
 2   0.0000000863           8    15
 3   0.000576              12    37
 4   0.0000000469           2    31
 5   0.0000000283           5    19
 6   0.00000000207          4     5
 7   0.000000614            2    23
 8   0.00000000952         14    13
 9   0.00000413            11     7
10   0.0000472              7    12
# ℹ 40 more rows
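
A space-filling Latin hypercube is not the only option; a regular grid is a common alternative (3 levels per parameter here, i.e. 27 candidates):

# Regular grid as an alternative to the Latin hypercube
grid_regular(cost_complexity(), tree_depth(), min_n(), levels = 3)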

Define resamples

rsmpl <- vfold_cv(d_train)

Workflow

wf1 <-
  workflow() %>% 
  add_model(m1) %>% 
  add_recipe(rec1) 

Fit

tic()
# NB: no `grid` argument is passed here, so tune_grid() falls back to its default
# of 10 candidates (hence 20 metric rows per fold below: 10 candidates x 2 metrics).
# To tune over one of the grids defined above, pass it via `grid = grid`.
fit1 <- tune_grid(
  object = wf1, 
  resamples = rsmpl)
toc()
6.762 sec elapsed
fit1
# Tuning results
# 10-fold cross-validation 
# A tibble: 10 × 4
   splits           id     .metrics          .notes          
   <list>           <chr>  <list>            <list>          
 1 <split [492/55]> Fold01 <tibble [20 × 7]> <tibble [0 × 3]>
 2 <split [492/55]> Fold02 <tibble [20 × 7]> <tibble [0 × 3]>
 3 <split [492/55]> Fold03 <tibble [20 × 7]> <tibble [0 × 3]>
 4 <split [492/55]> Fold04 <tibble [20 × 7]> <tibble [0 × 3]>
 5 <split [492/55]> Fold05 <tibble [20 × 7]> <tibble [0 × 3]>
 6 <split [492/55]> Fold06 <tibble [20 × 7]> <tibble [0 × 3]>
 7 <split [492/55]> Fold07 <tibble [20 × 7]> <tibble [0 × 3]>
 8 <split [493/54]> Fold08 <tibble [20 × 7]> <tibble [0 × 3]>
 9 <split [493/54]> Fold09 <tibble [20 × 7]> <tibble [0 × 3]>
10 <split [493/54]> Fold10 <tibble [20 × 7]> <tibble [0 × 3]>
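
Before picking a winner, it can help to inspect all candidates, as a table or graphically (output not shown):

# One row per candidate and metric, averaged over the folds
collect_metrics(fit1)
# Performance as a function of the tuning parameters
autoplot(fit1)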

Best candidate

show_best(fit1, metric = "rmse")
# A tibble: 5 × 9
  cost_complexity tree_depth min_n .metric .estimator  mean     n std_err
            <dbl>      <int> <int> <chr>   <chr>      <dbl> <int>   <dbl>
1        3.92e- 4          2    18 rmse    standard   1443.    10    29.9
2        1.46e- 2         11    38 rmse    standard   1453.    10    33.5
3        1.23e- 2         14    10 rmse    standard   1458.    10    32.5
4        1.17e- 9          3    29 rmse    standard   1459.    10    29.2
5        4.46e-10          5    36 rmse    standard   1460.    10    29.9
# ℹ 1 more variable: .config <chr>
wf1_best <-
  wf1 %>% 
  finalize_workflow(parameters = select_best(fit1, metric = "rmse"))

Last Fit

fit_testsample <- last_fit(wf1_best, d_split)
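
If you are curious what the final tree looks like, the fitted rpart object can be pulled out of the last-fit result; this sketch assumes the rpart.plot package is installed:

# Extract and plot the fitted rpart tree
library(rpart.plot)
fit_testsample %>% 
  extract_fit_engine() %>% 
  rpart.plot(roundint = FALSE)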

Model performance (metrics) in test set

fit_testsample %>% collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard    1399.    Preprocessor1_Model1
2 rsq     standard       0.497 Preprocessor1_Model1
MSE <- fit_testsample %>% collect_metrics() %>% pluck(3, 1)  # first .estimate, i.e. the test-set RMSE
MSE
[1] 1398.675

Note that this value is, strictly speaking, the RMSE, not the MSE; the MSE is its square (RMSE² ≈ 1,956,291).

Solution: 1398.6748691
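
Since the exercise asks for the MSE, here is one way to compute it directly from the test-set predictions (using the columns returned by collect_predictions()):

preds <- collect_predictions(fit_testsample)
# MSE = mean squared difference between truth and prediction (equals RMSE^2)
mse <- mean((preds$cnt - preds$.pred)^2)
mse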

