Nonparametric estimator for ordered non-numeric outcomes. The estimator modifies a standard random forest splitting criterion to build a collection of forests, each estimating the conditional probability of a single class.
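
The per-class construction can be sketched as follows: one forest per class estimates a cumulative probability P(Y <= m | X), and adjacent cumulative probabilities are differenced to recover P(Y = m | X). The sketch below illustrates this decomposition only, using ordinary regression forests from the ranger package (not a dependency of ocf) in place of ocf's modified splitting criterion.

## Illustration only: ordinary regression forests (via 'ranger') stand in
## for ocf's modified splitting criterion. Assumes Y is coded as ordered
## integers (e.g., 1, 2, 3).
library(ranger)

sketch_class_probs <- function(Y, X, newdata, n.trees = 500) {
  classes <- sort(unique(Y))
  M <- length(classes)
  ## One forest per class m < M estimates P(Y <= m | X).
  cum_probs <- sapply(classes[-M], function(m) {
    fit <- ranger(y = as.numeric(Y <= m), x = X, num.trees = n.trees)
    predict(fit, newdata)$predictions
  })
  cum_probs <- cbind(cum_probs, 1)  # P(Y <= largest class) = 1.
  ## Difference adjacent cumulative probabilities to get P(Y = m | X).
  rest <- cum_probs[, -1, drop = FALSE] - cum_probs[, -M, drop = FALSE]
  probs <- cbind(cum_probs[, 1], rest)
  colnames(probs) <- paste0("P(Y=", classes, ")")
  probs
}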

ocf(
  Y = NULL,
  X = NULL,
  honesty = FALSE,
  honesty.fraction = 0.5,
  inference = FALSE,
  alpha = 0,
  n.trees = 2000,
  mtry = ceiling(sqrt(ncol(X))),
  min.node.size = 5,
  max.depth = 0,
  replace = FALSE,
  sample.fraction = ifelse(replace, 1, 0.5),
  n.threads = 1
)

Arguments

Y

Outcome vector.

X

Covariate matrix (no intercept).

honesty

Whether to grow honest forests.

honesty.fraction

Fraction of honest sample. Ignored if honesty = FALSE.

inference

Whether to extract weights and compute standard errors. Extracting the weights considerably slows down the routine. honesty = TRUE is required for valid inference.

alpha

Controls the balance of each split. Each split leaves at least a fraction alpha of the parent node's observations on each side of the split.

n.trees

Number of trees.

mtry

Number of covariates to possibly split at in each node. Default is the square root of the number of covariates.

min.node.size

Minimal node size.

max.depth

Maximal tree depth. A value of 0 corresponds to unlimited depth, 1 to "stumps" (one split per tree).

replace

If TRUE, grow trees on bootstrap subsamples. Otherwise, trees are grown on random subsamples drawn without replacement.

sample.fraction

Fraction of observations to sample.

n.threads

Number of threads. Zero corresponds to the number of CPUs available.

Value

Object of class ocf.

References

  • Di Francesco, R. (2023). Ordered Correlation Forest. arXiv preprint arXiv:2309.08755.

Author

Riccardo Di Francesco

Examples

## Generate synthetic data.
set.seed(1986)

data <- generate_ordered_data(100)
sample <- data$sample
Y <- sample$Y
X <- sample[, -1]

## Training-test split.
train_idx <- sample(seq_along(Y), floor(length(Y) * 0.5))

Y_tr <- Y[train_idx]
X_tr <- X[train_idx, ]

Y_test <- Y[-train_idx]
X_test <- X[-train_idx, ]

## Fit ocf on training sample.
forests <- ocf(Y_tr, X_tr)

## Standard S3 generic methods are supported.
print(forests)
#> Call: 
#> ocf(Y_tr, X_tr) 
#> 
#> Data info: 
#> Full sample size:   50 
#> N. covariates:      6 
#> Classes:            1 2 3 
#> 
#> Relative variable importance: 
#>    x1    x2    x3    x4    x5    x6 
#> 0.353 0.059 0.266 0.092 0.206 0.024 
#> 
#> Tuning parameters: 
#> N. trees:           2000 
#> mtry:               3 
#> min.node.size:      5 
#> Subsampling scheme: No replacement 
#> Honesty:            FALSE 
#> Honest fraction:    0
summary(forests)
#> Call: 
#> ocf(Y_tr, X_tr) 
#> 
#> Data info: 
#> Full sample size:   50 
#> N. covariates:      6 
#> Classes:            1 2 3 
#> 
#> Relative variable importance: 
#>    x1    x2    x3    x4    x5    x6 
#> 0.353 0.059 0.266 0.092 0.206 0.024 
#> 
#> Tuning parameters: 
#> N. trees:           2000 
#> mtry:               3 
#> min.node.size:      5 
#> Subsampling scheme: No replacement 
#> Honesty:            FALSE 
#> Honest fraction:    0
predictions <- predict(forests, X_test)
head(predictions$probabilities)
#>         P(Y=1)    P(Y=2)     P(Y=3)
#> [1,] 0.4224274 0.4548215 0.12275111
#> [2,] 0.4786262 0.4133636 0.10801015
#> [3,] 0.1446138 0.4064470 0.44893918
#> [4,] 0.6215123 0.3249310 0.05355674
#> [5,] 0.4359897 0.3503095 0.21370084
#> [6,] 0.6224514 0.3216924 0.05585619
table(Y_test, predictions$classification)
#>       
#> Y_test  1  2  3
#>      1 11  4  1
#>      2  7  4  8
#>      3  3  1 11
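
## As a quick summary of the confusion matrix above, the overall
## test-sample accuracy can be computed directly (output omitted here).
mean(predictions$classification == Y_test)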

## Compute standard errors for the predicted probabilities. This requires honest forests.
honest_forests <- ocf(Y_tr, X_tr, honesty = TRUE, inference = TRUE)
head(honest_forests$predictions$standard.errors)
#>          P(Y=1)     P(Y=2)    P(Y=3)
#> [1,] 0.15222731 0.11070564 0.1296048
#> [2,] 0.25687762 0.12911868 0.0855497
#> [3,] 0.09584680 0.08474158 0.1777998
#> [4,] 0.08506775 0.10642352 0.1795780
#> [5,] 0.07534241 0.07390018 0.1754728
#> [6,] 0.08414576 0.11421756 0.1380961
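
## Normal-approximation 95% confidence intervals for the class
## probabilities. Assumes the honest point estimates are stored alongside
## the standard errors in honest_forests$predictions$probabilities.
p_hat <- honest_forests$predictions$probabilities
se_hat <- honest_forests$predictions$standard.errors
head(p_hat - 1.96 * se_hat)  # lower bounds
head(p_hat + 1.96 * se_hat)  # upper bounds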

## Marginal effects.
me <- marginal_effects(forests, eval = "atmean")
print(me)
#> ocf marginal effects results 
#> 
#> Data info: 
#> Number of classes:    3 
#> Sample size:          50 
#> 
#> Tuning parameters: 
#> Evaluation:           atmean 
#> Bandwidth:            0.1 
#> Number of trees:      2000 
#> Honest forests:       FALSE 
#> Honesty fraction:     0 
#> 
#> Marginal Effects: 
#>    P'(Y=1) P'(Y=2) P'(Y=3)
#> x1  -0.100  -0.311   0.411
#> x2  -0.078   0.065   0.013
#> x3  -0.048  -0.013   0.060
#> x4  -0.024  -0.175   0.198
#> x5   0.033   0.049  -0.082
#> x6  -0.006   0.010  -0.004
print(me, latex = TRUE)
#> \begingroup
#>     \setlength{\tabcolsep}{8pt}
#>     \renewcommand{\arraystretch}{1.1}
#>     \begin{table}[H]
#>         \centering
#>         \begin{adjustbox}{width = 0.75\textwidth}
#>         \begin{tabular}{@{\extracolsep{5pt}}l c c c}
#>         \\[-1.8ex]\hline
#>         \hline \\[-1.8ex]
#>         & Class 1 & Class 2 & Class 3 \\
#>         \addlinespace[2pt]
#>         \hline \\[-1.8ex] 
#> 
#>         \texttt{x1} & -0.1 & -0.311 & 0.411 \\ 
#>         \texttt{x2} & -0.078 & 0.065 & 0.013 \\ 
#>         \texttt{x3} & -0.048 & -0.013 & 0.06 \\ 
#>         \texttt{x4} & -0.024 & -0.175 & 0.198 \\ 
#>         \texttt{x5} & 0.033 & 0.049 & -0.082 \\ 
#>         \texttt{x6} & -0.006 & 0.01 & -0.004 \\ 
#> 
#>         \addlinespace[3pt]
#>         \\[-1.8ex]\hline
#>         \hline \\[-1.8ex]
#>         \end{tabular}
#>         \end{adjustbox}
#>         \caption{Marginal effects.}
#>         \label{table:ocf.marginal.effects}
#>     \end{table}
#> \endgroup
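
## Marginal effects can also be averaged over the sample instead of being
## evaluated at the covariate means (assumes eval = "mean" is supported
## alongside "atmean").
me_mean <- marginal_effects(forests, eval = "mean")
print(me_mean)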

## Compute standard errors for the marginal effects. This also requires honest forests.
honest_me <- marginal_effects(honest_forests, eval = "atmean", inference = TRUE)
honest_me$standard.errors
#>        P'(Y=1)    P'(Y=2)     P'(Y=3)
#> x1 0.020994263 0.01995347 0.002127984
#> x2 0.023534434 0.04554165 0.012388880
#> x3 0.046477600 0.04247743 0.000000000
#> x4 0.011836598 0.04421252 0.051962839
#> x5 0.023905929 0.02908331 0.027563358
#> x6 0.009801362 0.02172657 0.012120712
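
## Normal-approximation z-statistics and two-sided p-values. Assumes the
## point estimates are stored in honest_me$marginal.effects; the zero
## standard error for x3 / P'(Y=3) yields an infinite statistic.
z_stats <- honest_me$marginal.effects / honest_me$standard.errors
2 * pnorm(-abs(z_stats))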
print(honest_me, latex = TRUE)
#> \begingroup
#>     \setlength{\tabcolsep}{8pt}
#>     \renewcommand{\arraystretch}{1.1}
#>     \begin{table}[H]
#>         \centering
#>         \begin{adjustbox}{width = 0.75\textwidth}
#>         \begin{tabular}{@{\extracolsep{5pt}}l c c c}
#>         \\[-1.8ex]\hline
#>         \hline \\[-1.8ex]
#>         & Class 1 & Class 2 & Class 3 \\
#>         \addlinespace[2pt]
#>         \hline \\[-1.8ex] 
#> 
#>         \texttt{x1} & -0.017 & 0 & 0.017 \\ 
#>                      & (0.021) & (0.02) & (0.002) \\ 
#>         \texttt{x2} & -0.036 & 0.013 & 0.023 \\ 
#>                      & (0.024) & (0.046) & (0.012) \\ 
#>         \texttt{x3} & -0.011 & 0.009 & 0.002 \\ 
#>                      & (0.046) & (0.042) & (0) \\ 
#>         \texttt{x4} & -0.014 & -0.053 & 0.067 \\ 
#>                      & (0.012) & (0.044) & (0.052) \\ 
#>         \texttt{x5} & -0.006 & 0.026 & -0.02 \\ 
#>                      & (0.024) & (0.029) & (0.028) \\ 
#>         \texttt{x6} & -0.002 & -0.003 & 0.005 \\ 
#>                      & (0.01) & (0.022) & (0.012) \\ 
#> 
#>         \addlinespace[3pt]
#>         \\[-1.8ex]\hline
#>         \hline \\[-1.8ex]
#>         \end{tabular}
#>         \end{adjustbox}
#>         \caption{Marginal effects.}
#>         \label{table:ocf.marginal.effects}
#>     \end{table}
#> \endgroup