This file is indexed.

/usr/lib/R/site-library/recipes/doc/Custom_Steps.R is in r-cran-recipes 0.1.0-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
## ----ex_setup, include=FALSE---------------------------------------------
## Set default knitr chunk options: `message = FALSE`, `collapse = TRUE`,
## and "#>" as the prefix for printed output.
knitr::opts_chunk$set(
  message = FALSE,
  digits = 3,
  collapse = TRUE,
  comment = "#>"
  )
## Set R's global `digits` option to 3.
options(digits = 3)

## ----step_list-----------------------------------------------------------
## Attach recipes, collect the names of all visible objects that start with
## "step_", and print those that do not end in "new".
library(recipes)
steps <- apropos("^step_")
steps[!grepl("new$", steps)]

## ----initial-------------------------------------------------------------
## Load the `biomass` data set and print its structure.
data(biomass)
str(biomass)

## Split the rows into training and test sets by the value of the `dataset`
## column.
biomass_tr <- biomass[biomass$dataset == "Training",]
biomass_te <- biomass[biomass$dataset == "Testing",]

## ----carbon_dist---------------------------------------------------------
## Attach ggplot2 and make `theme_bw()` the default theme for later plots.
library(ggplot2)
theme_set(theme_bw())
## Histogram of `carbon` in the training set (bin width 5) with a vertical
## line (line type 2) at the `carbon` value of the first test-set row.
ggplot(biomass_tr, aes(x = carbon)) + 
  geom_histogram(binwidth = 5, col = "blue", fill = "blue", alpha = .5) + 
  geom_vline(xintercept = biomass_te$carbon[1], lty = 2)

## ----initial_def---------------------------------------------------------
## User-facing function that appends a percentile step to a recipe.
##
## Arguments:
##   recipe   - the recipe object the new step is added to.
##   ...      - variable selectors; captured unevaluated and stored in the
##              step as `terms`.
##   role, trained, ref_dist, approx, options
##            - handed unchanged to `step_percentile_new()`.
##
## Returns the result of `add_step()` applied to `recipe` and the new step.
## Stops with an error when no variable selector is supplied.
step_percentile <- function(recipe, ..., role = NA,
                            trained = FALSE, ref_dist = NULL,
                            approx = FALSE,
                            options = list(probs = (0:100)/100, names = TRUE)) {
  ## Capture the selectors as quosures with `rlang::quos()`; they are not
  ## evaluated here.
  selectors <- rlang::quos(...)
  if (length(selectors) == 0) {
    stop("Please supply at least one variable specification. See ?selections.")
  }

  ## Build the step object first, then attach it to the recipe.
  new_step <- step_percentile_new(
    terms = selectors,
    trained = trained,
    role = role,
    ref_dist = ref_dist,
    approx = approx,
    options = options
  )
  add_step(recipe, new_step)
}

## ----initialize----------------------------------------------------------
## Constructor for the percentile step object.
##
## Every argument is forwarded by name to `step()` together with
## `subclass = "percentile"`; the value of that call is returned.
##
##   terms    - captured variable selectors.
##   role     - role value stored in the step.
##   trained  - flag stored in the step (FALSE by default).
##   ref_dist - reference distribution; NULL until it is filled in by the
##              prep method.
##   approx   - whether the approximate (quantile grid) method is used.
##   options  - list of arguments for `quantile()`.
step_percentile_new <- function(terms = NULL, role = NA, trained = FALSE,
                                ref_dist = NULL, approx = NULL, options = NULL) {
  step(subclass = "percentile",
       terms = terms, role = role, trained = trained,
       ref_dist = ref_dist, approx = approx, options = options)
}

## ----prep_1, eval = FALSE------------------------------------------------
#  prep.step_percentile <- function(x, training, info = NULL, ...) {
#    col_names <- terms_select(terms = x$terms, info = info)
#  }

## ----prep_2--------------------------------------------------------------
## Call `quantile()` on `x` with the extra arguments held in the list `args`.
##
##   x    - the data passed to `quantile()` as its `x` argument.
##   args - list of further `quantile()` arguments (e.g. `probs`, `names`);
##          an `x` entry already present in the list is overwritten.
##
## Returns whatever `quantile()` returns for those arguments.
get_pctl <- function(x, args) {
  ## Insert the data into the argument list so that a single `do.call()`
  ## supplies everything by name.
  args$x <- x
  do.call(what = "quantile", args = args)
}

## prep method for the percentile step: stores the reference distribution
## taken from the training set in the step.
##
##   x        - the step object; `x$terms`, `x$options` and `x$approx` are
##              read and `x$ref_dist` is filled in.
##   training - the training data; the selected columns are taken from it.
##   info     - passed on to `terms_select()`.
##   ...      - not used.
##
## Returns the updated step. When `x$approx` is FALSE, `ref_dist` holds the
## selected training columns themselves; otherwise it is a list with one
## `quantile()` result per selected column.
## Stops when `options$names` is explicitly FALSE.
prep.step_percentile <- function(x, training, info = NULL, ...) {
  col_names <- terms_select(terms = x$terms, info = info)
  ## You can add error trapping for non-numeric data here and so on.

  ## The quantile names (e.g. "10%") are parsed later, so they must not be
  ## switched off. `isTRUE()` keeps the check from failing when `names` is
  ## absent from the options (the comparison then has length zero); in that
  ## case `quantile()` falls back to its own default.
  if (isTRUE(x$options$names == FALSE)) {
    stop("`names` should be set to TRUE")
  }

  ## `drop = FALSE` keeps a single selected column as a one-column data
  ## frame, so the column name survives for plain data frames as well.
  ref_data <- training[, col_names, drop = FALSE]

  if (!x$approx) {
    x$ref_dist <- ref_data
  } else {
    x$ref_dist <- lapply(ref_data, get_pctl, args = x$options)
  }
  ## Always return the updated step
  x
}

## ----bake----------------------------------------------------------------
## Two helper functions
## Proportion of the reference values in `ref` that are less than or equal
## to `x`.
pctl_by_mean <- function(x, ref) {
  at_or_below <- ref <= x
  mean(at_or_below)
}

## Approximate percentiles of `x` by linear interpolation on a quantile grid.
##
##   x   - new values: either a one-column data frame / tibble (or a
##         one-element named list) or a plain numeric vector.
##   ref - named numeric vector of reference quantiles whose names carry the
##         percentages (e.g. "10%"), as produced by `quantile(names = TRUE)`.
##
## Returns a numeric vector of proportions, one per value of `x`. Values
## outside the range of `ref` get the percentile at the nearest end of the
## grid instead of NA.
pctl_by_approx <- function(x, ref) {
  ## go from 1 column tibble to vector; plain vectors are used as they are
  if (is.list(x)) {
    x <- getElement(x, names(x))
  }
  ## get the percentiles values from the names (e.g. "10%")
  p_grid <- as.numeric(gsub("%$", "", names(ref)))
  ## `rule = 2` makes `approx()` return the value at the closest grid end for
  ## inputs outside the reference range; the default (`rule = 1`) gives NA.
  approx(x = ref, y = p_grid, xout = x, rule = 2)$y / 100
}

## bake method for the percentile step: replaces each affected column of
## `newdata` with its percentile relative to the stored reference
## distribution.
##
##   object  - the trained step; `object$ref_dist` and `object$approx` are
##             read.
##   newdata - data frame or tibble containing the affected columns.
##   ...     - not used.
##
## Returns `newdata` as a tibble with the affected columns overwritten.
bake.step_percentile <- function(object, newdata, ...) {
  ## For illustration (and not speed), we will loop through the affected variables
  ## and do the computations
  vars <- names(object$ref_dist)

  for (i in vars) {
    if (!object$approx) {
      ## One percentile per value, computed against the full reference
      ## column. Working on the extracted column vectors means this does not
      ## rely on `newdata` being a tibble.
      newdata[[i]] <- vapply(
        newdata[[i]],
        pctl_by_mean,
        numeric(1),
        ref = object$ref_dist[[i]],
        USE.NAMES = FALSE
      )
    } else {
      ## `drop = FALSE` hands the helper a one-column data frame for plain
      ## data frames and tibbles alike.
      newdata[[i]] <- pctl_by_approx(
        newdata[, i, drop = FALSE],
        object$ref_dist[[i]]
      )
    }
  }
  ## Always convert to tibbles on the way out. The namespace-qualified call
  ## fails loudly if tibble is unavailable, unlike `require()`, which only
  ## returns FALSE.
  tibble::as_tibble(newdata)
}

## ----example-------------------------------------------------------------
## Define a recipe with HHV as the outcome and all remaining columns as
## predictors, using the training data without its first two columns.
rec_obj <- recipe(HHV ~ ., data = biomass_tr[, -(1:2)])
## Add the custom percentile step for all predictors, using the
## approximate (quantile grid) method.
rec_obj <- rec_obj %>%
  step_percentile(all_predictors(), approx = TRUE) 

## Prepare the recipe with the training set.
rec_obj <- prep(rec_obj, training = biomass_tr)

## Apply the prepared recipe to the test set and print the result.
percentiles <- bake(rec_obj, biomass_te)
percentiles

## ----cdf_plot, echo = FALSE----------------------------------------------
## Probability grid stored in the options of the recipe's first step.
grid_pct <- rec_obj$steps[[1]]$options$probs
## Stack two sets of points: the training-set quantiles of `carbon` on the
## grid (paired with the grid itself) and the test-set `carbon` values
## (paired with their baked percentiles). `dataset` labels each row.
plot_data <- data.frame(
  carbon = c(
    quantile(biomass_tr$carbon, probs = grid_pct), 
    biomass_te$carbon
  ),
  percentile = c(grid_pct, percentiles$carbon),
  dataset = rep(
    c("Training", "Testing"), 
    c(length(grid_pct), nrow(percentiles))
  )
)

## Scatter plot of percentile against carbon, coloured by data set, with the
## legend placed at the top.
ggplot(plot_data, 
       aes(x = carbon, y = percentile, col = dataset)) + 
  geom_point(alpha = .4, cex = 2) + 
  theme(legend.position = "top")