Nonparametric estimator for ordered non-numeric outcomes. The estimator modifies a standard random forest splitting criterion to build a collection of forests, each estimating the conditional probability of a single class.
Y: Outcome vector.
X: Covariate matrix (no intercept).
honesty: Whether to grow honest forests.
honesty.fraction: Fraction of the sample set aside for honest estimation. Ignored if honesty = FALSE.
inference: Whether to extract weights and compute standard errors. Weight extraction considerably slows down the routine. honesty = TRUE is required for valid inference.
alpha: Controls the balance of each split. Each split leaves at least a fraction alpha of observations in the parent node on each side of the split.
n.trees: Number of trees.
mtry: Number of covariates to possibly split at in each node. Default is the square root of the number of covariates (rounded up).
min.node.size: Minimal node size.
max.depth: Maximal tree depth. A value of 0 corresponds to unlimited depth, 1 to "stumps" (one split per tree).
replace: If TRUE, trees are grown on bootstrap subsamples; otherwise, trees are grown on random subsamples drawn without replacement.
sample.fraction: Fraction of observations to sample.
n.threads: Number of threads. Zero corresponds to the number of CPUs available.
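For illustration, a call that sets these tuning parameters explicitly might look as follows (argument names as listed above; the values are examples, not recommendations):

## Illustrative call spelling out the tuning parameters.
forests <- ocf(Y, X,
               honesty = TRUE, honesty.fraction = 0.5,
               inference = TRUE, alpha = 0.2,
               n.trees = 2000, mtry = 3, min.node.size = 5,
               max.depth = 0, replace = FALSE,
               sample.fraction = 0.5, n.threads = 0)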
Object of class ocf.
Di Francesco, R. (2023). Ordered Correlation Forest. arXiv preprint arXiv:2309.08755.
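To fix ideas before the worked example, here is a minimal sketch of the one-forest-per-class construction described above, using off-the-shelf regression forests from the ranger package as a stand-in for the modified splitting criterion. It illustrates the structure of the estimator, not the actual ocf implementation:

## Illustration only: one forest per class, each targeting P(Y = k | X).
## Given an outcome vector Y and a covariate matrix X, fit a forest on the
## class indicator for each class; ocf replaces the splitting criterion.
library(ranger)
per_class_forests <- lapply(sort(unique(Y)), function(k) {
  ranger(y = as.numeric(Y == k), x = X)
})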
## Generate synthetic data.
set.seed(1986)
data <- generate_ordered_data(100)
sample <- data$sample
Y <- sample$Y
X <- sample[, -1]
## Training-test split.
train_idx <- sample(seq_len(length(Y)), floor(length(Y) * 0.5))
Y_tr <- Y[train_idx]
X_tr <- X[train_idx, ]
Y_test <- Y[-train_idx]
X_test <- X[-train_idx, ]
## Fit ocf on training sample.
forests <- ocf(Y_tr, X_tr)
## Standard S3 methods such as print, summary, and predict are supported.
print(forests)
#> Call:
#> ocf(Y_tr, X_tr)
#>
#> Data info:
#> Full sample size: 50
#> N. covariates: 6
#> Classes: 1 2 3
#>
#> Relative variable importance:
#> x1 x2 x3 x4 x5 x6
#> 0.353 0.059 0.266 0.092 0.206 0.024
#>
#> Tuning parameters:
#> N. trees: 2000
#> mtry: 3
#> min.node.size 5
#> Subsampling scheme: No replacement
#> Honesty: FALSE
#> Honest fraction: 0
summary(forests)
#> Call:
#> ocf(Y_tr, X_tr)
#>
#> Data info:
#> Full sample size: 50
#> N. covariates: 6
#> Classes: 1 2 3
#>
#> Relative variable importance:
#> x1 x2 x3 x4 x5 x6
#> 0.353 0.059 0.266 0.092 0.206 0.024
#>
#> Tuning parameters:
#> N. trees: 2000
#> mtry: 3
#> min.node.size 5
#> Subsampling scheme: No replacement
#> Honesty: FALSE
#> Honest fraction: 0
predictions <- predict(forests, X_test)
head(predictions$probabilities)
#> P(Y=1) P(Y=2) P(Y=3)
#> [1,] 0.4224274 0.4548215 0.12275111
#> [2,] 0.4786262 0.4133636 0.10801015
#> [3,] 0.1446138 0.4064470 0.44893918
#> [4,] 0.6215123 0.3249310 0.05355674
#> [5,] 0.4359897 0.3503095 0.21370084
#> [6,] 0.6224514 0.3216924 0.05585619
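As the rows above suggest, the estimated probabilities are normalized across classes, so each row sums to one; a quick sanity check:

## Each row of the probability matrix sums to one.
head(rowSums(predictions$probabilities))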
table(Y_test, predictions$classification)
#>
#> Y_test 1 2 3
#> 1 11 4 1
#> 2 7 4 8
#> 3 3 1 11
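The diagonal of this table counts the correctly classified units, so the test accuracy here is (11 + 4 + 11) / 50 = 0.52. It can also be computed directly:

## Overall classification accuracy on the test sample.
mean(predictions$classification == Y_test)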
## Compute standard errors. This requires honest forests.
honest_forests <- ocf(Y_tr, X_tr, honesty = TRUE, inference = TRUE)
head(honest_forests$predictions$standard.errors)
#> P(Y=1) P(Y=2) P(Y=3)
#> [1,] 0.15222731 0.11070564 0.1296048
#> [2,] 0.25687762 0.12911868 0.0855497
#> [3,] 0.09584680 0.08474158 0.1777998
#> [4,] 0.08506775 0.10642352 0.1795780
#> [5,] 0.07534241 0.07390018 0.1754728
#> [6,] 0.08414576 0.11421756 0.1380961
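These standard errors can be combined with the honest point estimates into normal-based confidence intervals. A sketch, assuming the fitted object stores the point estimates in predictions$probabilities (mirroring the structure returned by predict()):

## Approximate 95% confidence intervals for the class probabilities.
p_hat <- honest_forests$predictions$probabilities  # assumed field
se <- honest_forests$predictions$standard.errors
ci_lower <- p_hat - 1.96 * se
ci_upper <- p_hat + 1.96 * se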
## Marginal effects.
me <- marginal_effects(forests, eval = "atmean")
print(me)
#> ocf marginal effects results
#>
#> Data info:
#> Number of classes: 3
#> Sample size: 50
#>
#> Tuning parameters:
#> Evaluation: atmean
#> Bandwidth: 0.1
#> Number of trees: 2000
#> Honest forests: FALSE
#> Honesty fraction: 0
#>
#> Marginal Effects:
#> P'(Y=1) P'(Y=2) P'(Y=3)
#> x1 -0.100 -0.311 0.411
#> x2 -0.078 0.065 0.013
#> x3 -0.048 -0.013 0.060
#> x4 -0.024 -0.175 0.198
#> x5 0.033 0.049 -0.082
#> x6 -0.006 0.010 -0.004
print(me, latex = TRUE)
#> \begingroup
#> \setlength{\tabcolsep}{8pt}
#> \renewcommand{\arraystretch}{1.1}
#> \begin{table}[H]
#> \centering
#> \begin{adjustbox}{width = 0.75\textwidth}
#> \begin{tabular}{@{\extracolsep{5pt}}l c c c}
#> \\[-1.8ex]\hline
#> \hline \\[-1.8ex]
#> & Class 1 & Class 2 & Class 3 \\
#> \addlinespace[2pt]
#> \hline \\[-1.8ex]
#>
#> \texttt{x1} & -0.1 & -0.311 & 0.411 \\
#> \texttt{x2} & -0.078 & 0.065 & 0.013 \\
#> \texttt{x3} & -0.048 & -0.013 & 0.06 \\
#> \texttt{x4} & -0.024 & -0.175 & 0.198 \\
#> \texttt{x5} & 0.033 & 0.049 & -0.082 \\
#> \texttt{x6} & -0.006 & 0.01 & -0.004 \\
#>
#> \addlinespace[3pt]
#> \\[-1.8ex]\hline
#> \hline \\[-1.8ex]
#> \end{tabular}
#> \end{adjustbox}
#> \caption{Marginal effects.}
#> \label{table:ocf.marginal.effects}
#> \end{table}
#> \endgroup
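The evaluation point need not be the covariate mean. Assuming the eval argument also accepts "atmedian" and "mean" (an assumption about the API, not shown above), marginal effects at other evaluation points are obtained analogously:

## Marginal effects at the median of the covariates (eval option assumed).
me_median <- marginal_effects(forests, eval = "atmedian")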
## Compute standard errors. This requires honest forests.
honest_me <- marginal_effects(honest_forests, eval = "atmean", inference = TRUE)
honest_me$standard.errors
#> P'(Y=1) P'(Y=2) P'(Y=3)
#> x1 0.020994263 0.01995347 0.002127984
#> x2 0.023534434 0.04554165 0.012388880
#> x3 0.046477600 0.04247743 0.000000000
#> x4 0.011836598 0.04421252 0.051962839
#> x5 0.023905929 0.02908331 0.027563358
#> x6 0.009801362 0.02172657 0.012120712
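Dividing the point estimates by these standard errors yields t-statistics for quick significance checks (the field name marginal.effects is assumed; standard.errors is shown above). Note the zero standard error for x3 in the third class, which would make the corresponding ratio infinite:

## t-statistics for the honest marginal effects.
honest_me$marginal.effects / honest_me$standard.errors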
print(honest_me, latex = TRUE)
#> \begingroup
#> \setlength{\tabcolsep}{8pt}
#> \renewcommand{\arraystretch}{1.1}
#> \begin{table}[H]
#> \centering
#> \begin{adjustbox}{width = 0.75\textwidth}
#> \begin{tabular}{@{\extracolsep{5pt}}l c c c}
#> \\[-1.8ex]\hline
#> \hline \\[-1.8ex]
#> & Class 1 & Class 2 & Class 3 \\
#> \addlinespace[2pt]
#> \hline \\[-1.8ex]
#>
#> \texttt{x1} & -0.017 & 0 & 0.017 \\
#> & (0.021) & (0.02) & (0.002) \\
#> \texttt{x2} & -0.036 & 0.013 & 0.023 \\
#> & (0.024) & (0.046) & (0.012) \\
#> \texttt{x3} & -0.011 & 0.009 & 0.002 \\
#> & (0.046) & (0.042) & (0) \\
#> \texttt{x4} & -0.014 & -0.053 & 0.067 \\
#> & (0.012) & (0.044) & (0.052) \\
#> \texttt{x5} & -0.006 & 0.026 & -0.02 \\
#> & (0.024) & (0.029) & (0.028) \\
#> \texttt{x6} & -0.002 & -0.003 & 0.005 \\
#> & (0.01) & (0.022) & (0.012) \\
#>
#> \addlinespace[3pt]
#> \\[-1.8ex]\hline
#> \hline \\[-1.8ex]
#> \end{tabular}
#> \end{adjustbox}
#> \caption{Marginal effects.}
#> \label{table:ocf.marginal.effects}
#> \end{table}
#> \endgroup