This file is indexed.

/usr/lib/R/site-library/multcomp/doc/chfls1.Rnw is in r-cran-multcomp 1.4-8-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
\documentclass[11pt]{article}

%%\VignetteIndexEntry{Supplementary Material for "A re-evaluation of the model selection procedure in Pollet \& Nettle (2009)"}
%%\VignetteDepends{xtable,car,MASS,multcomp,foreign,TH.data}

\usepackage{amsmath}
\usepackage[round,authoryear]{natbib}
\usepackage{tabularx} 
\usepackage{rotating}
\usepackage{wasysym}
\usepackage[utf8x]{inputenc}
\usepackage[left=3.5cm,right=3.5cm, bottom=3.5cm]{geometry}
\usepackage[justification=justified,singlelinecheck=false,labe lfont={bf,small,sf},font={small,sf},
aboveskip=0em,belowskip=0em]{caption}
\renewcommand{\captionfont}{\small}
\title{Supplementary Material for \emph{A re-evaluation of the model selection 
       procedure in Pollet \& Nettle (2009)}}
\author{Esther Herberich, Torsten Hothorn, Daniel Nettle \& Thomas Pollet}
\date{}
\begin{document}
  \maketitle 

\SweaveOpts{engine = R, echo = FALSE, eps = TRUE}

<<setup, results = tex, cache = TRUE >>=
options(SweaveHooks = list(leftpar = 
    function() par(mai = par("mai") * c(1, 1.1, 1, 1))))
#options(width = 70)
library("xtable")
library("car")
library("MASS")
library("multcomp")
library("foreign")

#dataurl <- "http://www.src.uchicago.edu/datalib/chfls/data/chfls1.sav"
#td <- tempdir()
#derror <- try(download.file(dataurl, destfile = file.path(td, "chfls1.sav"),
#                           mode = "wb"))
#if (inherits(derror, "try-error")) {
#    cat("Vignette could not be processed -- download error.\n",
#        "\\end{document}\n")
#} else {
#### data see http://popcenter.uchicago.edu/data/chfls.shtml
#chfls1 <- read.spss(file.path(td, "chfls1.sav"), to.data.frame = TRUE)
#}

library("TH.data")
load(file.path(path.package(package="TH.data"), "rda", "CHFLS.rda"))

### warnings: Variables DC04, MZ09, and MZ11 contain duplicated
### levels. These are not needed anyway, so we ignore the warning
### for the time being.

### choose neccessary variables
org <- chfls1[, c("REGION6", "ZJ05", "ZJ06", "A35", "ZJ07", "ZJ16M", "INCRM",
                  "JK01", "JK02", "JK20", "HY04", "HY07", "A02", "AGEGAPM", 
                  "A07M", "A14", "A21", "A22M", "A23", "AX16", "INCAM", "SEXNOW", "ZW04")]

names(org) <- c("Region",
                "Rgender",               ### gender of respondent
                "Rage",                  ### age of respondent
		"RagestartA",		 ### age of respondent at beginning of relationship with partner A
                "Redu",                  ### education of respondent
                "RincomeM",              ### rounded monthly income of respondent
		"RincomeComp",		 ### inputed monthly income of respondent
                "Rhealth",               ### health condition respondent
                "Rheight",               ### respondent's height
                "Rhappy",                ### respondent's happiness
                "Rmartial",              ### respondent's marital status
                "RhasA",                 ### R has current A partner
                "Agender",               ### gender of partner A
                "RAagegap",              ### age gap
                "RAstartage",            ### age at marriage
                "Aheight",               ### height of partner A
                "Aedu",                  ### education of partner A
                "AincomeM",              ### rounded partner A income
                "AincomeEst",            ### estimated partner A income
                "orgasm",                ### orgasm frequency
                "AincomeComp",           ### imputed partner A income
                "Rsexnow",               ### has sex last year
                "Rhomosexual")           ### R is homosexual

### duration of partnership 
org$RAduration <- org$Rage - org$RagestartA

### code missing values
org$AincomeM[org$AincomeM < 0] <- NA
org$RincomeM[org$RincomeM < 0] <- NA
org$Aheight[org$Aheight < 0] <- NA

olevels <- c("never", "rarely", "sometimes", "often", "always")
orgA <- subset(org, Rgender == "female" & Rhomosexual != "yes" & orgasm %in% olevels)

orgA$orgasm <- ordered(as.character(orgA$orgasm),
        levels = c("never", "rarely", "sometimes", "often", "always"))

orgA$Redu <- factor(as.character(orgA$Redu),
        levels = c("univ/grad", "j col", "up mid", "low mid", "primary", "no school"))
levels(orgA$Redu) <-  c("univ", "jcol", "upmid", "lowmid", "primary", "noschool")

orgA$Aedu <- factor(as.character(orgA$Aedu),
        levels = c("univ/grad", "j col", "up mid", "low mid", "primary", "no school"))

orgA$Rhappy <- factor(as.character(orgA$Rhappy),
        levels = c("v unhappy", "not too", "relatively", "very"))

orgA$Rhealth <- factor(as.character(orgA$Rhealth),
        levels = c("poor", "not good", "fair", "good", "excellent"))

orgA$Region <- factor(as.character(orgA$Region),
        levels = c("CentralW", "Northeast", "North", "InlandS", "CoastalE", "CoastalS"))

orgA$AincomeSD <- orgA$AincomeComp/sd(orgA$AincomeComp)
orgA$AheightSD <- orgA$Aheight/sd(orgA$Aheight)
orgA$RageSD <- orgA$Rage/sd(orgA$Rage)
orgA$edudiff <- as.numeric(orgA$Aedu) - as.numeric(orgA$Redu)
orgA$edudiffSD <- orgA$edudiff/sd(orgA$edudiff, na.rm=TRUE)
orgA$wealthdiff <- orgA$RincomeComp - orgA$AincomeComp
orgA$wealthdiffSD <- orgA$wealthdiff/sd(orgA$wealthdiff, na.rm=TRUE)
orgA$RAdurationSD <- orgA$RAduration/sd(orgA$RAduration, na.rm=TRUE)

### Data set as used by Pollet & Nettle (2009)
save(orgA, file = "orgA.Rda")

@

\section*{Summary}
In this paper, we first explain the statistical model underlying the ordinal regression technique used by \citet{Pollet2009}, including the two possible 
ways of calculating the likelihood function (section 1). We then show that the model fit criteria reported were in fact invalid, and calculate the correct 
ones, showing that this leads to a different choice of best model (section 2). We then suggest two other strategies of model selection for these data, and 
show that these also lead to different best-fitting models than that reported by \citet{Pollet2009} (sections 3 and 4).

\section{Ordinal regression: The cumulative Logit Model}
The appropriate model for a dependent variable $Y_i \in \{1, \ldots, R\}, \, i=1, \ldots, n$, consisting of ranked outcome categories is a cumulative 
logit model \citep{agresti02}:
\begin{eqnarray*}P(Y_i \leq r | x_i) = \frac{\exp(\beta_{0r} - x_i^\top \beta)}{1 + \exp(\beta_{0r} - x_i^\top \beta)}, \quad r = 1, \dots, R-1.
\end{eqnarray*}
The model includes intercepts $\beta_{0r}$ for each category and a global parameter vector $\beta = (\beta_1, \ldots, \beta_p)$ for the $p$ covariates. \\
To obtain parameter estimates the maximum-likelihood method is used. The responses are conditionally independent and follow a multinomial distribution 
with 
\begin{eqnarray*} y_i|x_i &\sim& \mathcal{M}(1,\pi_i), \\
y_i &=& (y_{i1}, \ldots, y_{i R-1}) = (0, \ldots, 0, \underbrace{1}_{r-\text{th position}}, 0, \ldots, 0) \quad \Leftrightarrow \quad Y_i = r,\\
\pi_i &=& (\pi_{i1}, \ldots, \pi_{i R-1}) \quad \text{with} \\
\pi_{ir} &=& P(Y_i = r | x_i) = P(Y_i \leq r | x_i) - P(Y_i \leq r-1 | x_i), \; r = 1, \ldots, R-1. \end{eqnarray*}
The associated likelihood function is \begin{eqnarray*}\mathcal{L}(\beta_{01}, \ldots \beta_{0R-1}, \beta; x_1, \ldots x_n) = \quad \quad \quad \quad 
\quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \\ \quad \quad \quad \quad \quad \quad \quad \quad \quad 
\prod_{i=1}^n{\pi_{i1}}^{y_{i1}} \cdot {\pi_{i2}}^{y_{i2}} \cdot \ldots \cdot (1 - \pi_{i1} - \ldots - \pi_{iR-1})^{1 - y_{i1} - \ldots - y_{iR-1}}.
\end{eqnarray*}
To obtain the parameter estimates, the data are often (as by default in SPSS 15.0) pooled in $K$ groups,  and the likelihood of the grouped data is 
maximized, instead of the likelihood of the individual data. 
Group $k, \; k = 1, \ldots K,$ includes all $h_k$ observations with the value $\tilde{x}_k = (\tilde{x}_{k1}, \ldots, \tilde{x}_{kp})$ of the covariates 
$x = (x_1, \ldots, x_p)$. The responses again 
follow a multinomial distribution: \begin{eqnarray*} \tilde{y}_k | \tilde{x}_k &\sim& \mathcal{M}(h_k, \tilde{\pi}_k), \\
\tilde{y}_k  &=& (\tilde{y}_{k1}, \ldots, \tilde{y}_{kR-1}), \\
\tilde{\pi}_k  &=& (\tilde{\pi}_{k1}, \ldots, \tilde{\pi}_{kR-1}). \end{eqnarray*}
The vector $\tilde{y}_k$ contains the observed frequencies of the categories $1$ to $R-1$ in group $k$. $\tilde{\pi}_{kr}$ is the probability of an 
individual of group $k$ being in category $r$. The likelihood function of the grouped data results in \begin{eqnarray*}\mathcal{L}(\beta_{01}, \ldots 
\beta_{0R-1}, \beta; \tilde{x}_1, \ldots \tilde{x}_K) = \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad \quad 
\quad \quad \quad \quad \\ \underbrace{\prod_{k=1}^K{\frac{h_k!}{\tilde{y}_{k1}! \cdot \ldots \cdot \tilde{y}_{kR-1}}}}_{\text{multinomial constant}} 
\cdot \underbrace{\prod_{k=1}^K{{\tilde{\pi}_{k1}}\hspace{0.001cm}^{\tilde{y}_{k1}} \cdot {\tilde{\pi}_{k2}}\hspace{0.001cm}^{\tilde{y}_{k2}} \cdot \ldots 
\cdot (1 - \tilde{\pi}_{k1} - \ldots - \tilde{\pi}_{kR-1})^{1 - \tilde{y}_{k1} \ldots - \tilde{y}_{kR-1}}}}_{\text{kernel}}.\end{eqnarray*} 
The kernel of the likelihood function of the grouped data equals the likelihood function of the individual data. Both likelihood functions only differ by 
the multinomial constant in the likelihood for grouped data. Maximization of both likelihood functions results in the same parameter estimates.
\section{Variable Selection according to Pollet and Nettle\label{VarSelPollet}}
The analytical strategy of \citet{Pollet2009} was as follows: \\
\underline{Start}: Inclusion of partner income and partner height as independent variables. \\
\underline{Step 1}: Omission of any independent variable not significant in the start model. Significance is assessed by the Wald test without adjusting 
for multiplicity.  \\ 
\underline{Subsequent steps}: Stepwise inclusion of the remaining variables in the order in which they improve model fit the most compared to the start 
model. The procedure stops, when model fit cannot be improved further by including another covariate.

Model fit was assessed by the criteria AIC and BIC: \begin{eqnarray*} \text{AIC} &=& - 2 \cdot \ell(\hat{\theta}) + 2 \cdot \dim(\theta), \\ 
\text{BIC} &=& - 2 \cdot \ell(\hat{\theta}) + \log(n) \cdot \dim(\theta). \end{eqnarray*}
$\ell$ denotes the logarithmized likelihood function. In the cumulative logit model the parameter vector $\theta$ is $\theta = (\beta_{01}, \ldots, 
\beta_{0R-1}, \beta_1, \ldots, \beta_p)$. 

In SPSS 15.0, the likelihood function for multinomial distributed responses is calculated by pooling the data according to the covariates (see above). 
Parameter estimates are the same whether they are obtained by maximization of the likelihood function for individual or grouped data. To compare several 
models, which differ in terms of their covariates, by the (log) likelihood function or by criteria calculated by the (log) likelihood function (like AIC 
and BIC), the multinomial constant has to be omitted. As grouping differs among the models due to different covariates in the models, the multinomial 
constant differs as well and the models cannot be compared by the likelihood which includes the constant.

As SPSS 15.0 provides only $- 2 \cdot \ell(\hat{\theta})$, \citet{Pollet2009} calculated AIC and BIC by adding the penalization terms $2 \cdot 
\dim(\theta)$ and $\log(n) \cdot \dim(\theta)$ respectively 
to -2 log likelihood of the grouped data including the multinomial constant, leading to an invalid model choice.

Table \ref{Pollet} shows the progress of model choice following the strategy of \citet{Pollet2009}. The invalid model fit criteria used in the paper, as 
well as the correctly calculated criteria, are shown. The number of model parameters differs, because Pollet and Nettle did not account for the category 
specific intercepts $\beta_{01}, \ldots, \beta_{0R-1}$.


<<table-summary-PN, cache = TRUE>>=
start <- polr(orgasm ~ AincomeSD + AheightSD, data=orgA, Hess=TRUE)
step1 <- polr(orgasm ~ AincomeSD, data=orgA, Hess=TRUE)
step2 <- polr(orgasm ~ AincomeSD + Rhappy, data=orgA, Hess=TRUE)
aic <- formatC(c(AIC(start), AIC(step1), AIC(step2)), digits = 1, format = "f")
bic <- formatC(c(AIC(start, k=log(nrow(orgA))), AIC(step1, k=log(nrow(orgA))), AIC(step2, k=log(nrow(orgA)))), digits = 1, format = "f")
dim_theta  <- c(start$edf, step1$edf, step2$edf)
logLikel <- formatC(-2* c(logLik(start), logLik(step1), logLik(step2)), digits =  1, format = "f")

@

\begin{table}[t!]
\centering
\small
\begin{tabularx}{\textwidth}{llll}
\hline
\vspace*{-0.3cm} \\ 
& Start & Step 1 & Step 2 \\
\vspace*{-0.3cm} \\ 
\hline
\vspace*{-0.3cm} \\ 
Partner income                & $ \surd$        & $ \surd$      & $ \surd$ \\
Partner height                & $ \surd^1$      & ---           & --- \\
Happiness                     & ---             & ---           & $\surd$ \\
\vspace*{-0.3cm} \\ 
\hline	
\vspace*{-0.3cm} \\
Calculations by \citet{Pollet2009}: \\
$-2 \cdot \ell(\hat{\theta})$ & 1868.1          & 405.6         & 752.4 \\
$\dim(\theta)$                & 2               & 1             & 4 \\
AIC                           & 1872.1          & 407.6         & 760.4$^2$ \\
BIC                           & 1882.8          & 412.9         & 781.7$^2$ \\
\vspace*{-0.3cm} \\ 	
Correct calculations: \\
$-2 \cdot \ell(\hat{\theta})$ & \Sexpr{paste(logLikel, collapse = " & ")}\\
$\dim(\theta)$                & \Sexpr{paste(dim_theta, collapse = " & ")}\\
AIC                           & \Sexpr{paste(aic, collapse = " & ")}\\
BIC                           & \Sexpr{paste(bic, collapse = " & ")}\\
\vspace*{-0.3cm} \\
\hline	
\vspace*{-0.3cm} \\
\multicolumn{4}{l}{$^1$ Coefficient of this variable not significant based on Wald test.} \\
\multicolumn{4}{l}{$^2$ No reduction of AIC and BIC by adding a further variable} \\
\vspace*{-0.3cm} \\
\hline	
\vspace*{-0.3cm} 				
\end{tabularx}
\caption{\label{Pollet}Summary of variable selection in \citet{Pollet2009}.}
\end{table}
Start model and step 1 are the same as in table \ref{Pollet}. In the subsequent models further variables were added one at a time starting with the 
variable which improved model fit the most. The selected variables were the same using AIC and BIC to assess model fit except for step 4a/4b. Using BIC 
the model in step 5 was chosen as the best model including the variables partner income, education, age, happiness and difference in education. Using AIC 
as model fit criterion, inclusion of region and health could further improve model fit.
The start model included partner income and partner height. The variable partner income was significant based on the Wald test and remained in the model 
while the variable partner height was excluded  from the model due to non-significance. In step 2 inclusion of the variable self-reported happiness 
resulted in the best improvement of model fit compared to the start model. Inclusion of further variables did not improve model fit. Therefore the model 
with partner income and happiness was chosen as the best model with partner income being the only significant variable based on the 
Wald test. 

When using the correctly calculated criteria AIC and BIC, a different model is chosen. In step 2 the variable education instead of happiness is included. 
The progress of variable selection following 
to the analytical strategy of \citet{Pollet2009} using the correctly calculated criteria, is shown in table \ref{Pollet_korr}. Start model and step 1 are 
the same as in table \ref{Pollet}. In the subsequent models further variables were added one at a time starting with the variable which improved model fit 
the most. The selected variables were the same using AIC and BIC to assess model fit except for step 4a/4b. Using BIC the model in step 5 was chosen as 
the best model including the variables partner income, education, age, happiness and difference in education. Using AIC as model fit criterion, inclusion 
of region and health could further improve model fit. In the next section a further method of variable selection based on the AIC is used to determine the 
important factors for orgasm frequency. 

\newpage

<<table-summary-PN_corr, cache = TRUE>>=

step2 <- polr(orgasm ~ AincomeSD + Redu, data=orgA, Hess=TRUE)
step3 <- polr(orgasm ~ AincomeSD + Redu + RageSD, data=orgA, Hess=TRUE)
step4a <- polr(orgasm ~ AincomeSD + Redu + RageSD + Rhappy, data=orgA, Hess=TRUE)
step4b <- polr(orgasm ~ AincomeSD + Redu  + RageSD + edudiffSD, data=orgA, Hess=TRUE)
step5 <- polr(orgasm ~ AincomeSD + Redu + RageSD + Rhappy + edudiffSD, data=orgA, Hess=TRUE)
step6 <- polr(orgasm ~ AincomeSD + Redu + RageSD + Rhappy + edudiffSD + Region, data=orgA, Hess=TRUE)
step7 <- polr(orgasm ~ AincomeSD + Redu + RageSD + Rhappy + edudiffSD + Region + Rhealth, data=orgA, Hess=TRUE)
aic <- formatC(c(AIC(start), AIC(step1), AIC(step2), AIC(step3), AIC(step4a), AIC(step5), AIC(step6), AIC(step7)), digits = 1, format = "f")
bic <- formatC(c(AIC(start, k=log(nrow(orgA))), AIC(step1, k=log(nrow(orgA))), AIC(step2, k=log(nrow(orgA))), AIC(step3, k=log(nrow(orgA))), 
AIC(step4b, k=log(nrow(orgA))), AIC(step5, k=log(nrow(orgA)))), digits = 1, format = "f")

@

\rotatebox{90}{\parbox{\textheight}{%
\begin{center}
\small
\vspace*{2cm}
\begin{tabular}{lccccccccc}
\hline
\vspace*{-0.3cm} \\ 
                        & Start   & Step 1  & Step 2  & Step 3  & Step 4a & Step 4b & Step 5  & Step 6  & Step 7\\
\vspace*{-0.3cm} \\  
\hline
\vspace*{-0.3cm} \\ 
Partner income          & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Partner height          & $\surd$ & ---     & ---     & ---     & ---     & ---     & ---     & ---     & --- \\
Education \female       & ---     & ---     & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Age \female             & ---     & ---     & ---     & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Happiness \female       & ---     & ---     & ---     & ---     & $\surd$ & ---     & $\surd$ & $\surd$ & $\surd$ \\
Difference in Education & ---     & ---     & ---     & ---     & ---     & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Region                  & ---     & ---     & ---     & ---     & ---     & ---     & ---     & $\surd$ & $\surd$ \\
Health \female          & ---     & ---     & ---     & ---     & ---     & ---     & ---     & ---     & $\surd$ \\
\vspace*{-0.3cm} \\ 
\hline	
\vspace*{-0.3cm} \\
AIC                     & \Sexpr{paste(aic[1:5], collapse = " & ")}$^1$ &  & \Sexpr{paste(aic[6:8], collapse = " & ")}$^4$ \\
BIC                     & \Sexpr{paste(bic[1:4], collapse = " & ")} & & \Sexpr{bic[5]}$^2$ & \Sexpr{bic[6]}$^3$ & &\\
\vspace*{-0.4cm} \\
\hline	
\vspace*{-0.3cm} \\
\multicolumn{9}{l}{$^1$ AIC for step 4a.} \\
\multicolumn{9}{l}{$^2$ BIC for step 4b.} \\
\multicolumn{9}{l}{$^3$ No reduction of BIC by adding a further variable.} \\
\multicolumn{9}{l}{$^4$ No reduction of AIC by adding a further variable.} \\
\vspace*{-0.4cm} \\
\hline	
\vspace*{-0.3cm} 
\end{tabular}
\end{center}
\vspace{-1em}
\captionof{table}{Summary of variable selection following the strategy of \citet{Pollet2009} using the correctly calculated AIC and BIC.\newline 
\vspace{0.7cm}\label{Pollet_korr}}}}\newpage

\section{Stepwise Backward Selection \label{VarSelstepAIC}}

<<table-summary-stepAIC, cache = TRUE>>=

### stepAIC does not automatically remove missing values as of R 2.13.0
orgAtmp <- orgA[, c("orgasm", "AincomeSD", "AheightSD", "RAdurationSD",
                 "RageSD", "edudiffSD", "wealthdiffSD", "Redu", "Rhealth",
                 "Rhappy", "Region")]
cc <- complete.cases(orgAtmp)
summary(cc)
orgAcc <- subset(orgA, cc)

step_AIC <- stepAIC(polr(orgasm ~ AincomeSD + AheightSD + RAdurationSD + RageSD + edudiffSD 
+ wealthdiffSD + Redu + Rhealth + Rhappy + Region, data=orgAcc, Hess=TRUE), trace = FALSE)
aic <- formatC(step_AIC$anova[,6], digits = 1, format = "f")

@

The stepwise backward selection starts with the saturated model, which includes all variables. Variables are omitted one at a time starting with the 
variable that reduces the AIC most. Variable 
selection stops, when the AIC cannot be reduced further by removing a variable. Note that the original data
contains three missing values in variable \texttt{edudiffSD}. The corresponding observations have been
removed from the data set before fitting all models presented in Table~\ref{stepAIC} but only for
models involving these variable presented in Table~\ref{Pollet_korr} (since we assume the same approach
was taken in SPSS).

In our data a stepwise backward selection results in a reduction of the AIC from \Sexpr{aic[1]} in the saturated model to \Sexpr{aic[5]} in the reduced 
model. The steps of the backwise selection are shown
in table \ref{stepAIC}. The variable partner income, which was included in all models when following the strategy of \citet{Pollet2009}, is here dropped 
in step 2. By stepwise backward selection the same variables except for partner income are chosen as by the strategy of Pollet and Nettle using the 
correctly calculated AIC. 

\begin{table}[h]
\centering
\small
\vspace*{0.5cm}
\begin{tabular}{lccccc}
\hline
\vspace*{-0.3cm} \\ 
Model                    & Start   & Step 1  & Step 2  & Step 3  & Step 4\\
\vspace*{-0.3cm} \\ 
\hline
\vspace*{-0.3cm} \\ 
Partner height           & $\surd$ & ---     & ---     & ---     & ---  \\
Partner income           & $\surd$ & $\surd$ & ---     & ---     & --- \\
Duration of relationship & $\surd$ & $\surd$ & $\surd$ & ---     & --- \\
Difference in income     & $\surd$ & $\surd$ & $\surd$ & $\surd$ & --- \\
Age \female              & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Diffference in education & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Education \female        & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Happiness \female        & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Region                   & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
Health \female           & $\surd$ & $\surd$ & $\surd$ & $\surd$ & $\surd$ \\
\vspace*{-0.3cm} \\ 
\hline	
\vspace*{-0.3cm} \\
AIC                      & \Sexpr{paste(aic, collapse = " & ")} \\
\vspace*{-0.4cm} \\
\hline	
\vspace*{-0.3cm} 
\end{tabular}
\caption{\label{stepAIC}Steps of backward variable selection based on the AIC.}
\end{table}

\section{Variable Selection by Simultaneous Inference}

<<table-summary-siminf, cache = TRUE>>=

ordRegr <- polr(orgasm ~ AincomeSD + AheightSD + RAdurationSD 
+ RageSD + edudiffSD + wealthdiffSD + Redu + Rhealth + Rhappy 
+ Region, data=orgA, Hess=TRUE)
K <- diag(1,length(coef(ordRegr)))
rownames(K) <- names(coef(ordRegr))
s <- summary(glht(ordRegr, linfct = K))
variable <- c("Partner income", "Partner height", "Duration of relationship", "Age", "Difference in education", "Difference in income", 
"Education", "$\\quad$ University (reference category)", "$\\quad$ Junior college", "$\\quad$ Upper middle", "$\\quad$ Lower middle", "$\\quad$ Primary", 
"$\\quad$ No school", 
"Health", "$\\quad$ Poor (reference category)", "$\\quad$ Not good", "$\\quad$ Fair", "$\\quad$ Good", "$\\quad$ Excellent",
"Happiness", "$\\quad$ Very unhappy (reference category)", "$\\quad$ Not too happy", "$\\quad$ Relatively happy", "$\\quad$ Very happy",
"Region", "$\\quad$ Central West (reference category)", "$\\quad$ North East", "$\\quad$ North", "$\\quad$ Inland South", "$\\quad$ Coastal East", 
"$\\quad$ Coastal South")
estimate <- formatC(as.vector(s$coef), digits = 2, format = "f")
estimate <- c(estimate[1:6], "", "NA", estimate[7:11], "", "NA", estimate[12:15], "", "NA", estimate[16:18], "", "NA", estimate[19:23])
padj <- formatC(s$test$pvalue, digits = 3, format = "f")
padj <- c(padj[1:6], "", "---", padj[7:11], "", "---", padj[12:15], "", "---", padj[16:18], "", "---", padj[19:23])
siminf <- cbind(variable, estimate, padj)
colnames(siminf) <- c("Variable", "Estimate", "Adjusted $p$-value")

@

In the following, the relevant factors for orgasm frequency are assessed using the procedure for simultaneous inference introduced by \citet{Hothorn2008b} 
instead of using model fit criterions like AIC and BIC. Therefore, we fit a cumulative logit model, which includes all covariates and use the max-$t$-test 
to select important variables based on adjusted $p$-values. The hypotheses are $$H_j^0: \beta_j = 0, \; j = 1, \ldots, p,$$ and can be specified as linear 
hypotheses $K \beta = 0$ with the matrix $K$ being the $p \times p$ identity matrix. Three observations with
missings in variable \texttt{edudiffSD} have been removed prior to fitting the model.

The parameter estimates and associated adjusted $p$-values are shown in table \ref{simOrgA}. The respondant's education is the relevant factor for orgasm 
frequency with a cumulative odds ratio of
$\exp(\Sexpr{formatC(s$test$coef[11], digits = 2, format = "f")})  =  \Sexpr{formatC(exp(s$test$coef[11]), digits = 2, format = "f")}$ comparing the 
categories ``No school'' and ``University''. Women with university degree have a higher chance of having an orgasm more frequently than women without 
school education. Associated with this is the significance of the variable ``difference in education'' with women having less orgasms the higher their 
partners' level of education is above their own. Further differences in orgasm frequency exist between two regions of China.

<<table-summary-siminf-tex, results = tex>>=

siminfPrint <- xtable(siminf, caption="Parameter estimates of the saturated cumulative logit model with associated adjusted $p$-values of the max-$t$-test.",
                  label="simOrgA")
align(siminfPrint) <- "llcc"
print(siminfPrint, table.placement = "h!", include.rownames = FALSE, sanitize.text.function = function(x) {x})

@ 

Not only when selecting important variables by simultaneous inference of all parameter estimates the respondent's education was chosen as the relevant 
factor for orgasm frequency, but also the methods described in sections \ref{VarSelPollet} and \ref{VarSelstepAIC} selected education as an important 
variable among others. Therefore we further investigate the effect of education and take a look at the cumulative odds ratios when comparing the levels 
of the respondent's education. Again we fit a cumulative logit model including all covariates. The matrix of linear functions $K$, which sets up the 
linear hypothesis of model parameters, is defined in the form that consecutive levels of education are compared. The estimated log odds ratios and 
associated $p$-values of the simultaneous comparisons based on the max-$t$-test are summarized in table \ref{simRedu}.

<<table-summary-comp-edu, cache = TRUE>>=
s <- summary(glht(ordRegr, linfct = mcp(Redu = c("univ - jcol = 0",
                                                 "jcol - upmid = 0",
                                                 "upmid - lowmid = 0",
                                                 "lowmid - primary = 0",
                                                 "primary - noschool = 0"))))
comparison <- c("University - Junior college", "Junior college - Upper middle", "Upper middle - Lower middle", "Lower middle - Primary", 
"Primary - No school")
estimate <- formatC(as.vector(s$test$coef), digits = 2, format = "f")
padj <- formatC(s$test$pvalue, digits = 3, format = "f")
comp_edu <- cbind(comparison, estimate, padj)
colnames(comp_edu) <- c("Compared levels of education", "Estimated log odds ratio", "Adjusted $p$-value")

@

<<table-summary-comp-edu-tex, results = tex>>=

comp_eduPrint <- xtable(comp_edu, caption="Estimated log odds ratios for comparisons of consecutive levels of education and associated adjusted $p$-values 
of the simultaneous comparisons.",
                  label="simRedu")
align(comp_eduPrint) <- "llcc"
print(comp_eduPrint, table.placement = "h!", include.rownames = FALSE, sanitize.text.function = function(x) {x})

@ 

When comparing levels of education from ``No school'' to ``Upper middle school'' women with the respective higher level of education tend to have more 
frequent orgasms with cumulative odds ratios of \Sexpr{formatC(exp(s$test$coef[5]), digits = 2, format = "f")} (Comparison Primary school - No school), 
\Sexpr{formatC(exp(s$test$coef[4]), digits = 2, format = "f")} (Comparison Lower middle school - Primary school) und 
\Sexpr{formatC(exp(s$test$coef[3]), digits = 2, format = "f")} (Comparison Upper middle school - Lower middle school). 


\bibliographystyle{jss}
\bibliography{chfls1}
\end{document}