在 tidymodels 中获取有关 C5 模型的更多信息
Getting more information about C5 model in tidymodels
这是一个使用 palmerpenguins 数据集的简单建模工作流程:
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
#> Warning: package 'parsnip' was built under R version 4.1.3
library(rules)
#>
#> Attaching package: 'rules'
#> The following object is masked from 'package:dials':
#>
#> max_rules
library(palmerpenguins)
#> Warning: package 'palmerpenguins' was built under R version 4.1.3
# Fix the RNG seed so the data split and the resampling folds are reproducible.
set.seed(2022)

# Hold out a test set, then build 3-fold cross-validation on the training part.
penguins_split <- initial_split(penguins)
penguins_training <- training(penguins_split)
penguins_testing <- testing(penguins_split)
folds <- vfold_cv(penguins_training, v = 3)

# Recipe without any steps: model species using every other column.
simple_rec <- recipe(species ~ ., data = penguins_training)

# Rule-based C5.0 model specification.
C5_model <- set_engine(C5_rules(), engine = "C5.0")

# Bundle the preprocessor and the model specification into one workflow.
penguins_wf <- workflow() %>%
  add_recipe(simple_rec) %>%
  add_model(C5_model)

# Fit the workflow on each fold, keeping the out-of-fold predictions and
# logging progress.
penguins_no_tuning <- fit_resamples(
  penguins_wf,
  resamples = folds,
  control = control_resamples(save_pred = TRUE, verbose = TRUE)
)
#> Warning: package 'C50' was built under R version 4.1.3
#> i Fold1: preprocessor 1/1
#> v Fold1: preprocessor 1/1
#> i Fold1: preprocessor 1/1, model 1/1
#> v Fold1: preprocessor 1/1, model 1/1
#> i Fold1: preprocessor 1/1, model 1/1 (predictions)
#> i Fold2: preprocessor 1/1
#> v Fold2: preprocessor 1/1
#> i Fold2: preprocessor 1/1, model 1/1
#> v Fold2: preprocessor 1/1, model 1/1
#> i Fold2: preprocessor 1/1, model 1/1 (predictions)
#> i Fold3: preprocessor 1/1
#> v Fold3: preprocessor 1/1
#> i Fold3: preprocessor 1/1, model 1/1
#> v Fold3: preprocessor 1/1, model 1/1
#> i Fold3: preprocessor 1/1, model 1/1 (predictions)
# Resampling performance, summarised across the three folds.
collect_metrics(penguins_no_tuning)
#> # A tibble: 2 x 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy multiclass 0.977 3 0.0134 Preprocessor1_Model1
#> 2 roc_auc hand_till 0.985 3 0.00976 Preprocessor1_Model1
# Final fit: train the workflow on the training portion of the split and
# compute the metrics on the held-out testing portion.
penguins_final_fit <- last_fit(penguins_wf, split = penguins_split)
collect_metrics(penguins_final_fit)
#> # A tibble: 2 x 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 accuracy multiclass 0.942 Preprocessor1_Model1
#> 2 roc_auc hand_till 0.990 Preprocessor1_Model1
# Display the rules learned by the underlying C5.0 engine fit.
summary(extract_fit_engine(penguins_final_fit))
#>
#> Call:
#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control
#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,
#> 1), earlyStopping = FALSE))
#>
#>
#> C5.0 [Release 2.07 GPL Edition] Thu Mar 17 17:42:59 2022
#> -------------------------------
#>
#> Class specified by attribute `outcome'
#>
#> Read 258 cases (8 attributes) from undefined.data
#>
#> Rules:
#>
#> Rule 1: (73, lift 2.3)
#> island in {Biscoe, Torgersen}
#> flipper_length_mm <= 206
#> -> class Adelie [0.987]
#>
#> Rule 2: (98/18, lift 1.8)
#> island in {Dream, Torgersen}
#> bill_length_mm <= 46.5
#> -> class Adelie [0.810]
#>
#> Rule 3: (53, lift 4.7)
#> island = Dream
#> bill_length_mm > 42.2
#> -> class Chinstrap [0.982]
#>
#> Rule 4: (90, lift 2.8)
#> island = Biscoe
#> flipper_length_mm > 206
#> -> class Gentoo [0.989]
#>
#> Default class: Gentoo
#>
#>
#> Evaluation on training data (258 cases):
#>
#> Rules
#> ----------------
#> No Errors
#>
#> 4 1( 0.4%) <<
#>
#>
#> (a) (b) (c) <-classified as
#> ---- ---- ----
#> 113 (a): class Adelie
#> 1 53 (b): class Chinstrap
#> 91 (c): class Gentoo
#>
#>
#> Attribute usage:
#>
#> 99.61% island
#> 63.18% flipper_length_mm
#> 51.94% bill_length_mm
#>
#>
#> Time: 0.0 secs
# Model information? Printing the trained workflow lists the preprocessor
# and the model specification.
extract_workflow(penguins_final_fit)
#> == Workflow [trained] ==========================================================
#> Preprocessor: Recipe
#> Model: C5_rules()
#>
#> -- Preprocessor ----------------------------------------------------------------
#> 0 Recipe Steps
#>
#> -- Model -----------------------------------------------------------------------
#> C5.0 Model Specification ()
# Summarising the trained workflow only describes its list structure.
summary(extract_workflow(penguins_final_fit))
#> Length Class Mode
#> pre 2 stage_pre list
#> fit 2 stage_fit list
#> post 1 stage_post list
#> trained 1 -none- logical
由 reprex package (v2.0.1)
创建于 2022-03-17
我有三个问题:
- 显示规则时,输出中写的是 "Evaluation on training data",但
penguins_final_fit
是通过 last_fit() 在测试数据上拟合的。
如何让模型输出在测试数据上的评估结果?
- 如何从 C5 模型中获取更多信息,例如修剪前树有多深?
extract_fit_engine()
和 extract_workflow()
不提供该信息。
- C5.0 的辅助参数(见此处的列表)——可以调整这些参数吗?如果可以,应该在哪里添加这些参数?我看了一下
?C50::C5.0Control
,但仍然不明白如何在 tidymodels 框架中实现这些。
当您使用 last_fit() 时,
模型是在训练数据上拟合、在测试数据上评估的。如果查看 last_fit() 的输出,
其中的指标和预测来自测试数据,而拟合好的工作流是用训练数据训练的。您可以阅读更多关于使用测试集的内容。
您发现了我们在 parsnip 扩展包中处理引擎特定参数调优方式上的一个 bug。我知道这给您带来了不便,但感谢您的报告!
这是一个使用 palmerpenguins 数据集的简单建模工作流程:
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
#> Warning: package 'parsnip' was built under R version 4.1.3
library(rules)
#>
#> Attaching package: 'rules'
#> The following object is masked from 'package:dials':
#>
#> max_rules
library(palmerpenguins)
#> Warning: package 'palmerpenguins' was built under R version 4.1.3
# Fix the RNG seed so the data split and the resampling folds are reproducible.
set.seed(2022)

# Hold out a test set, then build 3-fold cross-validation on the training part.
penguins_split <- initial_split(penguins)
penguins_training <- training(penguins_split)
penguins_testing <- testing(penguins_split)
folds <- vfold_cv(penguins_training, v = 3)

# Recipe without any steps: model species using every other column.
simple_rec <- recipe(species ~ ., data = penguins_training)

# Rule-based C5.0 model specification.
C5_model <- set_engine(C5_rules(), engine = "C5.0")

# Bundle the preprocessor and the model specification into one workflow.
penguins_wf <- workflow() %>%
  add_recipe(simple_rec) %>%
  add_model(C5_model)

# Fit the workflow on each fold, keeping the out-of-fold predictions and
# logging progress.
penguins_no_tuning <- fit_resamples(
  penguins_wf,
  resamples = folds,
  control = control_resamples(save_pred = TRUE, verbose = TRUE)
)
#> Warning: package 'C50' was built under R version 4.1.3
#> i Fold1: preprocessor 1/1
#> v Fold1: preprocessor 1/1
#> i Fold1: preprocessor 1/1, model 1/1
#> v Fold1: preprocessor 1/1, model 1/1
#> i Fold1: preprocessor 1/1, model 1/1 (predictions)
#> i Fold2: preprocessor 1/1
#> v Fold2: preprocessor 1/1
#> i Fold2: preprocessor 1/1, model 1/1
#> v Fold2: preprocessor 1/1, model 1/1
#> i Fold2: preprocessor 1/1, model 1/1 (predictions)
#> i Fold3: preprocessor 1/1
#> v Fold3: preprocessor 1/1
#> i Fold3: preprocessor 1/1, model 1/1
#> v Fold3: preprocessor 1/1, model 1/1
#> i Fold3: preprocessor 1/1, model 1/1 (predictions)
# Resampling performance, summarised across the three folds.
collect_metrics(penguins_no_tuning)
#> # A tibble: 2 x 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 accuracy multiclass 0.977 3 0.0134 Preprocessor1_Model1
#> 2 roc_auc hand_till 0.985 3 0.00976 Preprocessor1_Model1
# Final fit: train the workflow on the training portion of the split and
# compute the metrics on the held-out testing portion.
penguins_final_fit <- last_fit(penguins_wf, split = penguins_split)
collect_metrics(penguins_final_fit)
#> # A tibble: 2 x 4
#> .metric .estimator .estimate .config
#> <chr> <chr> <dbl> <chr>
#> 1 accuracy multiclass 0.942 Preprocessor1_Model1
#> 2 roc_auc hand_till 0.990 Preprocessor1_Model1
# Display the rules learned by the underlying C5.0 engine fit.
summary(extract_fit_engine(penguins_final_fit))
#>
#> Call:
#> C5.0.default(x = x, y = y, trials = trials, rules = TRUE, control
#> = C50::C5.0Control(minCases = minCases, seed = sample.int(10^5,
#> 1), earlyStopping = FALSE))
#>
#>
#> C5.0 [Release 2.07 GPL Edition] Thu Mar 17 17:42:59 2022
#> -------------------------------
#>
#> Class specified by attribute `outcome'
#>
#> Read 258 cases (8 attributes) from undefined.data
#>
#> Rules:
#>
#> Rule 1: (73, lift 2.3)
#> island in {Biscoe, Torgersen}
#> flipper_length_mm <= 206
#> -> class Adelie [0.987]
#>
#> Rule 2: (98/18, lift 1.8)
#> island in {Dream, Torgersen}
#> bill_length_mm <= 46.5
#> -> class Adelie [0.810]
#>
#> Rule 3: (53, lift 4.7)
#> island = Dream
#> bill_length_mm > 42.2
#> -> class Chinstrap [0.982]
#>
#> Rule 4: (90, lift 2.8)
#> island = Biscoe
#> flipper_length_mm > 206
#> -> class Gentoo [0.989]
#>
#> Default class: Gentoo
#>
#>
#> Evaluation on training data (258 cases):
#>
#> Rules
#> ----------------
#> No Errors
#>
#> 4 1( 0.4%) <<
#>
#>
#> (a) (b) (c) <-classified as
#> ---- ---- ----
#> 113 (a): class Adelie
#> 1 53 (b): class Chinstrap
#> 91 (c): class Gentoo
#>
#>
#> Attribute usage:
#>
#> 99.61% island
#> 63.18% flipper_length_mm
#> 51.94% bill_length_mm
#>
#>
#> Time: 0.0 secs
# Model information? Printing the trained workflow lists the preprocessor
# and the model specification.
extract_workflow(penguins_final_fit)
#> == Workflow [trained] ==========================================================
#> Preprocessor: Recipe
#> Model: C5_rules()
#>
#> -- Preprocessor ----------------------------------------------------------------
#> 0 Recipe Steps
#>
#> -- Model -----------------------------------------------------------------------
#> C5.0 Model Specification ()
# Summarising the trained workflow only describes its list structure.
summary(extract_workflow(penguins_final_fit))
#> Length Class Mode
#> pre 2 stage_pre list
#> fit 2 stage_fit list
#> post 1 stage_post list
#> trained 1 -none- logical
由 reprex package (v2.0.1)
创建于 2022-03-17
我有三个问题:
- 显示规则时,输出中写的是 "Evaluation on training data",但 penguins_final_fit 是通过 last_fit() 在测试数据上拟合的。如何让模型输出在测试数据上的评估结果?
- 如何从 C5 模型中获取更多信息,例如修剪前树有多深?extract_fit_engine() 和 extract_workflow() 不提供该信息。
- C5.0 的辅助参数(见此处的列表)——可以调整这些参数吗?如果可以,应该在哪里添加这些参数?我看了一下 ?C50::C5.0Control,但仍然不明白如何在 tidymodels 框架中实现这些。
当您使用 last_fit() 时,模型是在训练数据上拟合、在测试数据上评估的。如果查看 last_fit() 的输出,其中的指标和预测来自测试数据,而拟合好的工作流是用训练数据训练的。您可以阅读更多关于使用测试集的内容。
您发现了我们在 parsnip 扩展包中处理引擎特定参数调优方式上的一个 bug。我知道这给您带来了不便,但感谢您的报告!