/usr/lib/R/site-library/recipes/doc/Simple_Example.html is in r-cran-recipes 0.1.0-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | <!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Basic Recipes</title>
<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
div.sourceCode { overflow-x: auto; }
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
code > span.dt { color: #902000; } /* DataType */
code > span.dv { color: #40a070; } /* DecVal */
code > span.bn { color: #40a070; } /* BaseN */
code > span.fl { color: #40a070; } /* Float */
code > span.ch { color: #4070a0; } /* Char */
code > span.st { color: #4070a0; } /* String */
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
code > span.ot { color: #007020; } /* Other */
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
code > span.fu { color: #06287e; } /* Function */
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code > span.cn { color: #880000; } /* Constant */
code > span.sc { color: #4070a0; } /* SpecialChar */
code > span.vs { color: #4070a0; } /* VerbatimString */
code > span.ss { color: #bb6688; } /* SpecialString */
code > span.im { } /* Import */
code > span.va { color: #19177c; } /* Variable */
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code > span.op { color: #666666; } /* Operator */
code > span.bu { } /* BuiltIn */
code > span.ex { } /* Extension */
code > span.pp { color: #bc7a00; } /* Preprocessor */
code > span.at { color: #7d9029; } /* Attribute */
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>
<link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20800px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%2020px%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20both%3B%0Amargin%3A%200%200%2010px%200%3B%0Apadding%3A%204px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Aborder%2Dradius%3A%205px%3B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Afont%2Dsize%3A%2013px%3B%0Aline%2Dheight%3A%201%2E3%3B%0A%7D%0A%23TOC%20%2Etoctitle%20%7B%0Afont%2Dweight%3A%20bold%3B%0Afont%2Dsize%3A%2015px%3B%0Amargin%2Dleft%3A%205px%3B%0A%7D%0A%23TOC%20ul%20%7B%0Apadding%2Dleft%3A%2040px%3B%0Amargin%2Dleft%3A%20%2D1%2E5em%3B%0Amargin%2Dtop%3A%205px%3B%0Amargin%2Dbottom%3A%205px%3B%0A%7D%0A%23TOC%20ul%20ul%20%7B%0Amargin%2Dleft%3A%20%2D2em%3B%0A%7D%0A%23TOC%20li%20%7B%0Aline%2Dheight%3A%2016px%3B%0A%7D%0Atable%3Anot%28%5Bclass%5D%29%20%7B%0Amargin%3A%20auto%3B%0Amin%2Dwidth%3A%2040%25%3B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dcolor%3A%20%23DDDDDD%3B%0Aborder%2Dstyle%3A%20outset%3B%0Aborder%2Dcollapse%3A%20collapse%3B%0A%7D%0Atable%5Bsummary%3D%22R%20argblock%22%5D%20%7B%0Awidth%3A%20100%25%3B%0Aborder%3A%20none%3B%0A%7D%0Atable%3Anot%28%5Bclass%5D%29%20th%20%7B%0Aborder%2Dwidth%3A%202px%3B%0Apadding%3A%205px%3B%0Aborder%2Dstyle%3A%20inset%3B%0A%7D%0Atable%3Anot%28%5Bclass%5D%29%20td%20%7B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dstyle%3A%20inset%3B%0Aline%2Dheight%3A%2018px%3B%0Apadding%3A%205px%205px%3B%0A%7D%0Atable%3Anot%28%5Bclass%5D%29%2C%20table%3Anot%28%5Bclass%5D%29%20th%2C%20table%3Anot%28%5Bclass%5D%29%20td%20%7B%0Aborder%2Dleft%2Dstyle%3A%20none%3B%0Aborder%2Dright%2Dstyle%3A%20none%3B%0A%7D%0Atable%3Anot%28%5Bclass%5D%29%20tr%2Eodd%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Ap%20
%7B%0Amargin%3A%200%2E5em%200%3B%0A%7D%0Ablockquote%20%7B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Apadding%3A%2013px%3B%0Apadding%2Dbottom%3A%201px%3B%0A%7D%0Ahr%20%7B%0Aborder%2Dstyle%3A%20solid%3B%0Aborder%3A%20none%3B%0Aborder%2Dtop%3A%201px%20solid%20%23777%3B%0Amargin%3A%2028px%200%3B%0A%7D%0Adl%20%7B%0Amargin%2Dleft%3A%200%3B%0A%7D%0Adl%20dd%20%7B%0Amargin%2Dbottom%3A%2013px%3B%0Amargin%2Dleft%3A%2013px%3B%0A%7D%0Adl%20dt%20%7B%0Afont%2Dweight%3A%20bold%3B%0A%7D%0Aul%20%7B%0Amargin%2Dtop%3A%200%3B%0A%7D%0Aul%20li%20%7B%0Alist%2Dstyle%3A%20circle%20outside%3B%0A%7D%0Aul%20ul%20%7B%0Amargin%2Dbottom%3A%200%3B%0A%7D%0Apre%2C%20code%20%7B%0Abackground%2Dcolor%3A%20%23f5f5f5%3B%0Aborder%2Dradius%3A%203px%3B%0Acolor%3A%20%23333%3B%0A%7D%0Apre%20%7B%0Aoverflow%2Dx%3A%20auto%3B%0Aborder%2Dradius%3A%203px%3B%0Amargin%3A%205px%200%2010px%200%3B%0Apadding%3A%2010px%3B%0A%7D%0Apre%3Anot%28%5Bclass%5D%29%20%7B%0Abackground%2Dcolor%3A%20white%3B%0Aborder%3A%20%23f5f5f5%201px%20solid%3B%0A%7D%0Apre%3Anot%28%5Bclass%5D%29%20code%20%7B%0Acolor%3A%20%23444%3B%0Abackground%2Dcolor%3A%20white%3B%0A%7D%0Acode%20%7B%0Afont%2Dfamily%3A%20monospace%3B%0Afont%2Dsize%3A%2090%25%3B%0A%7D%0Ap%20%3E%20code%2C%20li%20%3E%20code%20%7B%0Apadding%3A%202px%204px%3B%0Acolor%3A%20%23d14%3B%0Aborder%3A%201px%20solid%20%23e1e1e8%3B%0Awhite%2Dspace%3A%20inherit%3B%0A%7D%0Adiv%2Efigure%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0Atable%20%3E%20caption%2C%20div%2Efigure%20p%2Ecaption%20%7B%0Afont%2Dstyle%3A%20italic%3B%0A%7D%0Atable%20%3E%20caption%20span%2C%20div%2Efigure%20p%2Ecaption%20span%20%7B%0Afont%2Dstyle%3A%20normal%3B%0Afont%2Dweight%3A%20bold%3B%0A%7D%0Ap%20%7B%0Amargin%3A%200%200%2010px%3B%0A%7D%0Atable%3Anot%28%5Bclass%5D%29%20%7B%0Amargin%3A%20auto%20auto%2010px%20auto%3B%0A%7D%0Aimg%3Anot%28%5Bclass%5D%29%20%7B%0Abackground%2Dcolor%3A%20%23FFFFFF%3B%0Apadding%3A%202px%3B%0Aborder%2Dradius%3A%203px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Amargin%3A%200%205px%3B%0Amax%2Dwidth%3A%201
00%25%3B%0A%7D%0Ah1%20%7B%0Amargin%2Dtop%3A%200%3B%0Afont%2Dsize%3A%2035px%3B%0Aline%2Dheight%3A%2040px%3B%0A%7D%0Ah2%20%7B%0Aborder%2Dbottom%3A%204px%20solid%20%23f5f5f5%3B%0Apadding%2Dtop%3A%2010px%3B%0Apadding%2Dbottom%3A%202px%3B%0Afont%2Dsize%3A%20145%25%3B%0A%7D%0Ah3%20%7B%0Aborder%2Dbottom%3A%202px%20solid%20%23f5f5f5%3B%0Apadding%2Dtop%3A%2010px%3B%0Afont%2Dsize%3A%20120%25%3B%0A%7D%0Ah4%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23f5f5f5%3B%0Amargin%2Dleft%3A%208px%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Ah5%2C%20h6%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23ccc%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Aa%20%7B%0Acolor%3A%20%230033dd%3B%0Atext%2Ddecoration%3A%20none%3B%0A%7D%0Aa%3Ahover%20%7B%0Acolor%3A%20%236666ff%3B%20%7D%0Aa%3Avisited%20%7B%0Acolor%3A%20%23800080%3B%20%7D%0Aa%3Avisited%3Ahover%20%7B%0Acolor%3A%20%23BB00BB%3B%20%7D%0Aa%5Bhref%5E%3D%22http%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0Aa%5Bhref%5E%3D%22https%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0Adiv%2Er%2Dhelp%2Dpage%20%7B%0Abackground%2Dcolor%3A%20%23f9f9f9%3B%0Aborder%2Dbottom%3A%20%23ddd%201px%20solid%3B%0Amargin%2Dbottom%3A%2010px%3B%0Apadding%3A%2010px%3B%0A%7D%0Adiv%2Er%2Dhelp%2Dpage%3Ahover%20%7B%0Abackground%2Dcolor%3A%20%23f4f4f4%3B%0A%7D%0A%0Acode%20%3E%20span%2Ekw%20%7B%20color%3A%20%23555%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Edt%20%7B%20color%3A%20%23902000%3B%20%7D%20%0Acode%20%3E%20span%2Edv%20%7B%20color%3A%20%2340a070%3B%20%7D%20%0Acode%20%3E%20span%2Ebn%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Efl%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Ech%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Est%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Eco%20%7B%20color%3A%20%23888888%3B%20font%2Dstyle%3A%20italic%3B%20%7D%20%0Acode%20%3E%20span%2Eot%20%7B%20color%3A%20%23007020%3B%20%7D%20%0Acode%20%3E%20span%2Eal%20%7B%20color%3A%20%23ff0000%3B%20font%2Dweight%3A%2
0bold%3B%20%7D%20%0Acode%20%3E%20span%2Efu%20%7B%20color%3A%20%23900%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Eer%20%7B%20color%3A%20%23a61717%3B%20background%2Dcolor%3A%20%23e3d2d2%3B%20%7D%20%0A" rel="stylesheet" type="text/css" />
</head>
<body>
<h1 class="title toc-ignore">Basic Recipes</h1>
<div id="TOC">
<ul>
<li><a href="#an-example">An Example</a></li>
<li><a href="#an-initial-recipe">An Initial Recipe</a></li>
<li><a href="#preprocessing-steps">Preprocessing Steps</a></li>
<li><a href="#adding-steps">Adding Steps</a></li>
</ul>
</div>
<p>This document demonstrates some basic uses of recipes. First, some definitions are required:</p>
<ul>
<li><strong>variables</strong> are the original (raw) data columns in a data frame or tibble. For example, in a traditional formula <code>Y ~ A + B + A:B</code>, the variables are <code>A</code>, <code>B</code>, and <code>Y</code>.</li>
<li><strong>roles</strong> define how variables will be used in the model. Examples are: <code>predictor</code> (independent variables), <code>response</code>, and <code>case weight</code>. This is meant to be open-ended and extensible.</li>
<li><strong>terms</strong> are columns in a design matrix such as <code>A</code>, <code>B</code>, and <code>A:B</code>. These can also be other derived entities that are grouped, such as a set of principal components or a set of columns that define a basis function for a variable. These are synonymous with features in machine learning. Variables that have <code>predictor</code> roles would automatically be main effect terms.</li>
</ul>
<div id="an-example" class="section level2">
<h2>An Example</h2>
<p>The cell segmentation data will be used. It has 58 predictor columns, a factor variable <code>Class</code> (the outcome), and two extra labelling columns. Each of the predictors has a suffix for the optical channel (<code>"Ch1"</code>-<code>"Ch4"</code>). We will first separate the data into a training and test set then remove unimportant variables:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(recipes)
<span class="kw">library</span>(caret)
<span class="kw">data</span>(segmentationData)
seg_train <-<span class="st"> </span>segmentationData <span class="op">%>%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">filter</span>(Case <span class="op">==</span><span class="st"> "Train"</span>) <span class="op">%>%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>Case, <span class="op">-</span>Cell)
seg_test <-<span class="st"> </span>segmentationData <span class="op">%>%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">filter</span>(Case <span class="op">==</span><span class="st"> "Test"</span>) <span class="op">%>%</span><span class="st"> </span>
<span class="st"> </span><span class="kw">select</span>(<span class="op">-</span>Case, <span class="op">-</span>Cell)</code></pre></div>
<p>The idea is that the preprocessing operations will all be created using the training set and then these steps will be applied to both the training and test set.</p>
</div>
<div id="an-initial-recipe" class="section level2">
<h2>An Initial Recipe</h2>
<p>For a first recipe, let’s plan on centering and scaling the predictors. First, we will create a recipe from the original data and then specify the processing steps.</p>
<p>Recipes can be created manually by sequentially adding roles to variables in a data set.</p>
<p>If the analysis only required <strong>outcomes</strong> and <strong>predictors</strong>, the easiest way to create the initial recipe is to use the standard formula method:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">rec_obj <-<span class="st"> </span><span class="kw">recipe</span>(Class <span class="op">~</span><span class="st"> </span>., <span class="dt">data =</span> seg_train)
rec_obj
<span class="co">#> Data Recipe</span>
<span class="co">#> </span>
<span class="co">#> Inputs:</span>
<span class="co">#> </span>
<span class="co">#> role #variables</span>
<span class="co">#> outcome 1</span>
<span class="co">#> predictor 58</span></code></pre></div>
<p>The data contained in the <code>data</code> argument need not be the training set; this data is only used to catalog the names of the variables and their types (e.g. numeric, etc.).</p>
<p>(Note that the formula method here is used to declare the variables and their roles and nothing else. If you use inline functions (e.g. <code>log</code>) it will complain. These types of operations can be added later.)</p>
</div>
<div id="preprocessing-steps" class="section level2">
<h2>Preprocessing Steps</h2>
<p>From here, preprocessing steps can be added sequentially in one of two ways:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">rec_obj <-<span class="st"> </span><span class="kw">step_name</span>(rec_obj, arguments) ## or
rec_obj <-<span class="st"> </span>rec_obj <span class="op">%>%</span><span class="st"> </span><span class="kw">step_name</span>(arguments)</code></pre></div>
<p><code>step_center</code> and the other functions will always return updated recipes.</p>
<p>One other important facet of the code is the method for specifying which variables should be used in different steps. The manual page <code>?selections</code> has more details but <a href="https://cran.r-project.org/package=dplyr"><code>dplyr</code></a>-like selector functions can be used:</p>
<ul>
<li>use basic variable names (e.g. <code>x1, x2</code>),</li>
<li><a href="https://cran.r-project.org/package=dplyr"><code>dplyr</code></a> functions for selecting variables: <code>contains</code>, <code>ends_with</code>, <code>everything</code>, <code>matches</code>, <code>num_range</code>, and <code>starts_with</code>,</li>
<li>functions that subset on the role of the variables that have been specified so far: <code>all_outcomes</code>, <code>all_predictors</code>, <code>has_role</code>, or</li>
<li>similar functions for the type of data: <code>all_nominal</code>, <code>all_numeric</code>, and <code>has_type</code>.</li>
</ul>
<p>Note that the functions listed above are the only ones that can be used to select variables inside the steps. Also, minus signs can be used to deselect variables.</p>
<p>For our data, we can add the two operations for all of the predictors:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">standardized <-<span class="st"> </span>rec_obj <span class="op">%>%</span>
<span class="st"> </span><span class="kw">step_center</span>(<span class="kw">all_predictors</span>()) <span class="op">%>%</span>
<span class="st"> </span><span class="kw">step_scale</span>(<span class="kw">all_predictors</span>())
standardized
<span class="co">#> Data Recipe</span>
<span class="co">#> </span>
<span class="co">#> Inputs:</span>
<span class="co">#> </span>
<span class="co">#> role #variables</span>
<span class="co">#> outcome 1</span>
<span class="co">#> predictor 58</span>
<span class="co">#> </span>
<span class="co">#> Steps:</span>
<span class="co">#> </span>
<span class="co">#> Centering for all_predictors()</span>
<span class="co">#> Scaling for all_predictors()</span></code></pre></div>
<p>It is important to realize that the <em>specific</em> variables have not been declared yet (in this example). In some preprocessing steps, variables will be added or removed from the current list of possible variables.</p>
<p>If these are the only preprocessing steps for the predictors, we can now estimate the means and standard deviations from the training set. The <code>prep</code> function is used with a recipe and a data set:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">trained_rec <-<span class="st"> </span><span class="kw">prep</span>(standardized, <span class="dt">training =</span> seg_train)
<span class="co">#> step 1 center training </span>
<span class="co">#> step 2 scale training</span></code></pre></div>
<p>Now that the statistics have been estimated, the preprocessing can be applied to the training and test set:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">train_data <-<span class="st"> </span><span class="kw">bake</span>(trained_rec, <span class="dt">newdata =</span> seg_train)
test_data <-<span class="st"> </span><span class="kw">bake</span>(trained_rec, <span class="dt">newdata =</span> seg_test)</code></pre></div>
<p><code>bake</code> returns a tibble:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">class</span>(test_data)
<span class="co">#> [1] "tbl_df" "tbl" "data.frame"</span>
test_data
<span class="co">#> # A tibble: 1,010 x 58</span>
<span class="co">#> AngleCh1 AreaCh1 AvgIntenCh1 AvgIntenCh2 AvgIntenCh3 AvgIntenCh4</span>
<span class="co">#> &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt;</span>
<span class="co">#> 1 1.0656 -0.647 -0.684 -1.177 -0.926 -0.9238</span>
<span class="co">#> 2 -1.8040 -0.185 -0.632 -0.479 -0.809 -0.6666</span>
<span class="co">#> 3 -1.0300 -0.707 1.207 3.035 0.348 1.3864</span>
<span class="co">#> 4 1.6935 -0.684 0.806 2.664 0.296 0.8934</span>
<span class="co">#> 5 1.8129 -0.342 -0.668 -1.172 -0.843 -0.9282</span>
<span class="co">#> 6 -1.4759 0.784 -0.682 -0.628 -0.881 -0.5939</span>
<span class="co">#> 7 1.2702 0.272 -0.672 -0.625 -0.809 -0.5156</span>
<span class="co">#> 8 -1.5837 0.457 0.283 1.320 -0.613 -0.0891</span>
<span class="co">#> 9 -0.7957 -0.412 -0.669 -1.168 -0.845 -0.9258</span>
<span class="co">#> 10 0.0363 -0.638 -0.535 0.182 -0.555 -0.0253</span>
<span class="co">#> # ... with 1,000 more rows, and 52 more variables:</span>
<span class="co">#> # ConvexHullAreaRatioCh1 &lt;dbl&gt;, ConvexHullPerimRatioCh1 &lt;dbl&gt;,</span>
<span class="co">#> # DiffIntenDensityCh1 &lt;dbl&gt;, DiffIntenDensityCh3 &lt;dbl&gt;,</span>
<span class="co">#> # DiffIntenDensityCh4 &lt;dbl&gt;, EntropyIntenCh1 &lt;dbl&gt;,</span>
<span class="co">#> # EntropyIntenCh3 &lt;dbl&gt;, EntropyIntenCh4 &lt;dbl&gt;, EqCircDiamCh1 &lt;dbl&gt;,</span>
<span class="co">#> # EqEllipseLWRCh1 &lt;dbl&gt;, EqEllipseOblateVolCh1 &lt;dbl&gt;,</span>
<span class="co">#> # EqEllipseProlateVolCh1 &lt;dbl&gt;, EqSphereAreaCh1 &lt;dbl&gt;,</span>
<span class="co">#> # EqSphereVolCh1 &lt;dbl&gt;, FiberAlign2Ch3 &lt;dbl&gt;, FiberAlign2Ch4 &lt;dbl&gt;,</span>
<span class="co">#> # FiberLengthCh1 &lt;dbl&gt;, FiberWidthCh1 &lt;dbl&gt;, IntenCoocASMCh3 &lt;dbl&gt;,</span>
<span class="co">#> # IntenCoocASMCh4 &lt;dbl&gt;, IntenCoocContrastCh3 &lt;dbl&gt;,</span>
<span class="co">#> # IntenCoocContrastCh4 &lt;dbl&gt;, IntenCoocEntropyCh3 &lt;dbl&gt;,</span>
<span class="co">#> # IntenCoocEntropyCh4 &lt;dbl&gt;, IntenCoocMaxCh3 &lt;dbl&gt;,</span>
<span class="co">#> # IntenCoocMaxCh4 &lt;dbl&gt;, KurtIntenCh1 &lt;dbl&gt;, KurtIntenCh3 &lt;dbl&gt;,</span>
<span class="co">#> # KurtIntenCh4 &lt;dbl&gt;, LengthCh1 &lt;dbl&gt;, NeighborAvgDistCh1 &lt;dbl&gt;,</span>
<span class="co">#> # NeighborMinDistCh1 &lt;dbl&gt;, NeighborVarDistCh1 &lt;dbl&gt;, PerimCh1 &lt;dbl&gt;,</span>
<span class="co">#> # ShapeBFRCh1 &lt;dbl&gt;, ShapeLWRCh1 &lt;dbl&gt;, ShapeP2ACh1 &lt;dbl&gt;,</span>
<span class="co">#> # SkewIntenCh1 &lt;dbl&gt;, SkewIntenCh3 &lt;dbl&gt;, SkewIntenCh4 &lt;dbl&gt;,</span>
<span class="co">#> # SpotFiberCountCh3 &lt;dbl&gt;, SpotFiberCountCh4 &lt;dbl&gt;, TotalIntenCh1 &lt;dbl&gt;,</span>
<span class="co">#> # TotalIntenCh2 &lt;dbl&gt;, TotalIntenCh3 &lt;dbl&gt;, TotalIntenCh4 &lt;dbl&gt;,</span>
<span class="co">#> # VarIntenCh1 &lt;dbl&gt;, VarIntenCh3 &lt;dbl&gt;, VarIntenCh4 &lt;dbl&gt;,</span>
<span class="co">#> # WidthCh1 &lt;dbl&gt;, XCentroid &lt;dbl&gt;, YCentroid &lt;dbl&gt;</span></code></pre></div>
</div>
<div id="adding-steps" class="section level2">
<h2>Adding Steps</h2>
<p>After exploring the data, more preprocessing might be required. Steps can be added to the trained recipe. Suppose that we need to create PCA components but only from the predictors from channel 1 and any predictors that are areas:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">trained_rec <-<span class="st"> </span>trained_rec <span class="op">%>%</span>
<span class="st"> </span><span class="kw">step_pca</span>(<span class="kw">ends_with</span>(<span class="st">"Ch1"</span>), <span class="kw">contains</span>(<span class="st">"area"</span>), <span class="dt">num =</span> <span class="dv">5</span>)
trained_rec
<span class="co">#> Data Recipe</span>
<span class="co">#> </span>
<span class="co">#> Inputs:</span>
<span class="co">#> </span>
<span class="co">#> role #variables</span>
<span class="co">#> outcome 1</span>
<span class="co">#> predictor 58</span>
<span class="co">#> </span>
<span class="co">#> Training data contained 1009 data points and no missing data.</span>
<span class="co">#> </span>
<span class="co">#> Steps:</span>
<span class="co">#> </span>
<span class="co">#> Centering for AngleCh1, AreaCh1, ... [trained]</span>
<span class="co">#> Scaling for AngleCh1, AreaCh1, ... [trained]</span>
<span class="co">#> PCA extraction with ends_with("Ch1"), contains("area")</span></code></pre></div>
<p>Note that only the last step has yet to be estimated; the first two were previously trained and these activities are not duplicated. We can add the PCA estimates using <code>prep</code> again:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">trained_rec <-<span class="st"> </span><span class="kw">prep</span>(trained_rec, <span class="dt">training =</span> seg_train)
<span class="co">#> step 1 center [pre-trained]</span>
<span class="co">#> step 2 scale [pre-trained]</span>
<span class="co">#> step 3 pca training</span></code></pre></div>
<p><code>bake</code> can be reapplied to get the principal components in addition to the other variables:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">test_data <-<span class="st"> </span><span class="kw">bake</span>(trained_rec, <span class="dt">newdata =</span> seg_test)
<span class="kw">names</span>(test_data)
<span class="co">#> [1] "AvgIntenCh2" "AvgIntenCh3" "AvgIntenCh4" </span>
<span class="co">#> [4] "DiffIntenDensityCh3" "DiffIntenDensityCh4" "EntropyIntenCh3" </span>
<span class="co">#> [7] "EntropyIntenCh4" "FiberAlign2Ch3" "FiberAlign2Ch4" </span>
<span class="co">#> [10] "IntenCoocASMCh3" "IntenCoocASMCh4" "IntenCoocContrastCh3"</span>
<span class="co">#> [13] "IntenCoocContrastCh4" "IntenCoocEntropyCh3" "IntenCoocEntropyCh4" </span>
<span class="co">#> [16] "IntenCoocMaxCh3" "IntenCoocMaxCh4" "KurtIntenCh3" </span>
<span class="co">#> [19] "KurtIntenCh4" "SkewIntenCh3" "SkewIntenCh4" </span>
<span class="co">#> [22] "SpotFiberCountCh3" "SpotFiberCountCh4" "TotalIntenCh2" </span>
<span class="co">#> [25] "TotalIntenCh3" "TotalIntenCh4" "VarIntenCh3" </span>
<span class="co">#> [28] "VarIntenCh4" "XCentroid" "YCentroid" </span>
<span class="co">#> [31] "PC1" "PC2" "PC3" </span>
<span class="co">#> [34] "PC4" "PC5"</span></code></pre></div>
<p>Note that the PCA components have replaced the original variables that were from channel 1 or measured an area aspect of the cells.</p>
<p>There are a number of different steps included in the package:</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">steps <-<span class="st"> </span><span class="kw">apropos</span>(<span class="st">"^step_"</span>)
steps[<span class="op">!</span><span class="kw">grepl</span>(<span class="st">"new$"</span>, steps)]
<span class="co">#> [1] "step_BoxCox" "step_YeoJohnson" "step_bagimpute" </span>
<span class="co">#> [4] "step_bin2factor" "step_center" "step_classdist" </span>
<span class="co">#> [7] "step_corr" "step_date" "step_depth" </span>
<span class="co">#> [10] "step_discretize" "step_dummy" "step_holiday" </span>
<span class="co">#> [13] "step_hyperbolic" "step_ica" "step_interact" </span>
<span class="co">#> [16] "step_intercept" "step_invlogit" "step_isomap" </span>
<span class="co">#> [19] "step_knnimpute" "step_kpca" "step_lincomb" </span>
<span class="co">#> [22] "step_log" "step_logit" "step_meanimpute" </span>
<span class="co">#> [25] "step_modeimpute" "step_ns" "step_nzv" </span>
<span class="co">#> [28] "step_ordinalscore" "step_other" "step_pca" </span>
<span class="co">#> [31] "step_percentile" "step_poly" "step_range" </span>
<span class="co">#> [34] "step_ratio" "step_regex" "step_rm" </span>
<span class="co">#> [37] "step_scale" "step_shuffle" "step_spatialsign" </span>
<span class="co">#> [40] "step_sqrt" "step_window"</span></code></pre></div>
</div>
<script type="text/javascript">
// Number figure and table captions once the page has finished loading.
// Registered with addEventListener rather than by assigning window.onload,
// so any other load handler on the page keeps working.
window.addEventListener('load', function() {
  // Prepend "<span>LABEL N:</span> " to a caption using DOM nodes.
  // Unlike rewriting innerHTML, this leaves the caption's existing child
  // nodes in place instead of re-parsing them.
  function prefixCaption(caption, label, number) {
    var tag = document.createElement('span');
    tag.appendChild(document.createTextNode(label + ' ' + number + ':'));
    // Insert the separating space first, then the span in front of it.
    caption.insertBefore(document.createTextNode(' '), caption.firstChild);
    caption.insertBefore(tag, caption.firstChild);
  }

  var i, caption, parent, count;

  // Figures: <p class="caption"> whose parent carries the "figure" class.
  // classList.contains also matches parents that have additional classes,
  // which an exact className comparison would skip.
  var figureCaptions = document.getElementsByClassName('caption');
  count = 1;
  for (i = 0; i < figureCaptions.length; i++) {
    caption = figureCaptions[i];
    parent = caption.parentElement;
    if (caption.nodeName !== 'P' || !parent || !parent.classList.contains('figure'))
      continue;
    prefixCaption(caption, 'Figure', count);
    count++;
  }

  // Tables: <caption> elements that sit directly inside a <table>.
  // Numbering restarts at 1, independently of the figure numbering.
  var tableCaptions = document.getElementsByTagName('caption');
  count = 1;
  for (i = 0; i < tableCaptions.length; i++) {
    caption = tableCaptions[i];
    parent = caption.parentElement;
    if (!parent || parent.nodeName !== 'TABLE') continue;
    prefixCaption(caption, 'Table', count);
    count++;
  }
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
  // Create the MathJax <script> element at runtime and attach it to the
  // document's first <head> element; the browser starts fetching the
  // script once the element is inserted.
  var headElement = document.getElementsByTagName("head")[0];
  var mathjaxScript = document.createElement("script");
  mathjaxScript.type = "text/javascript";
  mathjaxScript.src = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
  headElement.appendChild(mathjaxScript);
})();
</script>
</body>
</html>
|