This file is indexed.

/usr/lib/R/site-library/VariantAnnotation/scripts/test_Rplinkseq.R is in r-bioc-variantannotation 1.20.2-1+b1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
### ======================================================================
### Testing Rplinkseq and VariantAnnotation
### ======================================================================
###
### February 2014
### 'rhino03' Ubuntu server, 387 Gb RAM
### 16 processors with the following configuration:
###
### vendor_id	: GenuineIntel
### cpu family	: 6
### model		: 45
### model name	: Intel(R) Xeon(R) CPU E5-2690 0 @ 2.90GHz
### stepping	: 7
### microcode	: 0x70d
### cpu MHz		: 2900.142
### cache size	: 20480 KB
### physical id	: 1
### siblings	: 8
### core id		: 0
### cpu cores	: 8
### apicid		: 32
### initial apicid	: 32
### fpu		: yes
### fpu_exception	: yes
### cpuid level	: 13
### wp		: yes
### flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
### pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb
### rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
### aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr
### pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx
### lahf_lm ida arat xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid
### bogomips	: 5799.87
### clflush size	: 64
### cache_alignment	: 64
### address sizes	: 46 bits physical, 48 bits virtual
### power management:

### --------------------------------------------------------------------------
### Objectives
### --------------------------------------------------------------------------

### Compare runtimes between Rplinkseq and VariantAnnotation packages.
###
###
### Test functions:
###
### load.vcf:    Data are loaded from a vcf file into a list of lists 
###              and accessed with x.consensus* functions. (Rplinkseq) 
###
### var.fetch:   Data are loaded from a PLINK/Seq 'project' into a list of
###              lists and accessed with x.consensus* functions. (Rplinkseq) 
###
### meta.fetch:  Data are loaded from a PLINK/Seq 'project' and parsed into
###              a data.frame. (Rplinkseq)
###
### var.iterate: Applies a function to data from a PLINK/Seq 'project'
###              while iterating. Result of function is returned.
###              (Rplinkseq)
###
### scanVcf:     Data are loaded from a vcf file. Info and geno fields are 
###              parsed into a list of lists; other 'core' fields are
###              returned as a GRAnges object. (VariantAnnotation)
###
###
### Test cases:
###
### Test I:      Query by range.
###              Import data by randomly chosen genomic range. 
###              (Functions: load.vcf, var.fetch, scanVcf)
###
### Test II:     Query by range and fields.
###              Import data by genomic range and 4 randomly
###              chosen info (2) and geno (2) fields.
###              (Functions: meta.fetch, scanVcf)
### 
### Test III.    Iterate through file.
###              A simple function is applied to all records in the
###              file in an iterative fashion.
###              (Functions: var.iterate, scanVcf)
###

### --------------------------------------------------------------------------
### Data
### --------------------------------------------------------------------------

### Test file:
### Size on disk: 1.8G compressed
### Contents: 494328 variants, 1092 samples, 22 INFO, and 3 GENO fields
### Download location: 
### ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz
###
### Test subset:
### An arbitrary subset of data was chosen for testing because the full file 
### requires >250G in memory. The subset range is 2e7 to 2.5e7 and contains 
### 63088 records. This subset is the 'range' in the mask and param objects. 
###
### PLINK/Seq 'project':
### From the raw vcf on disk we creted a PLINK/Seq 'project' so we could use 
### the var.iterate function. A 'project' creates compressed, indexed, SQLite
### database from input files. This one took ~30 minutes to create.
### pseq proj new-project
### pseq proj load-vcf --vcf 'ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz'

### --------------------------------------------------------------------------
### Set up 
### --------------------------------------------------------------------------
library(microbenchmark)
library(VariantAnnotation)
library(Rplinkseq)  ## version 0.08
fl <- "/loc/no-backup/vobencha/ALL.chr22.phase1_release_v3.20101123.snps_indels_svs.genotypes.vcf.gz"
### Attach the 'project' to the R session:
pseq.project("proj") 

### --------------------------------------------------------------------------
### Test I. Query by range
### --------------------------------------------------------------------------

### load.vcf:
mask <- "reg=22:20000000..25000000"
loadvcf_1 <- function()
    load.vcf(fl, mask=mask, limit=200000)

### var.fetch:
varfetch_1 <- function()
     var.fetch(mask=mask, limit=200000)

### scanVcf:
tfile <- TabixFile(fl)
which <- GRanges("22", IRanges(2e7, 2.5e7))
param <- ScanVcfParam(which=which)
scanvcf_1 <- function() 
    scanVcf(tfile, param=param)

micro <- microbenchmark(loadvcf_1(), varfetch_1(), scanvcf_1(), times=5)
micro
###Unit: seconds
###         expr      min       lq   median       uq      max neval
###  loadvcf_1() 313.1729 335.5738 359.7979 368.4054 369.9397     5
### varfetch_1() 264.4101 283.2388 291.7533 305.7028 318.7879     5
###  scanvcf_1() 300.5585 308.5507 359.0814 400.2049 678.2296     5

### --------------------------------------------------------------------------
### Test II. Query by range and fields
### --------------------------------------------------------------------------

### Randomly select 2 INFO and 2 GENO fields:
info_var <- c("RSQ", "THETA")
geno_var <- c("GT", "DS")
### Other fields parsed by scanVcf:
other_var <- c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER")

### load.vcf: To the best of our knowledge, this function can import
###           by range but not by select fields. To isolate a field,
###           all fields are read in and parsed with the x.consensus 
###           functions. There is no equivalent comparision for 
###           load.vcf in this test case.

### meta.fetch:
mask <- "reg=22:20000000..25000000"
metafetch_2 <- function()
    meta.fetch(c(info_var, geno_var, other_var), mask=mask) 

### scanVcf:  NOTE: Fields in 'other_var' are imported and parsed by
###           scanVcf() under names of 'rowRanges' and 'fixed'.
tfile <- TabixFile(fl)
which <- GRanges("22", IRanges(2e7, 2.5e7))
param <- ScanVcfParam(which=which, info=info_var, geno=geno_var)
scanvcf_2 <- function()
    scanVcf(tfile, param=param)

micro <- microbenchmark(metafetch_2(), scanvcf_2(), times=5)
micro
###Unit: seconds
###          expr       min        lq    median        uq       max neval
### metafetch_2() 118.14090 120.65009 120.87346 120.88615 121.13340     5
###   scanvcf_2()  35.45512  35.46179  35.51305  35.59804  37.41809     5


### --------------------------------------------------------------------------
### Test III. Iterate through file 
### --------------------------------------------------------------------------

### load.vcf: To the best of our knowledge Rplinkseq cannot iterate on 
###           raw vcf files. Iteration must be done on a 'project' with
###           var.iterate. There is no equivalent comparision for load.vcf
###           in this test case.

### var.iterate:
simple_ct <- function(v)
    ct <<- ct + length(v$ID) 

variterate_3 <- function()
{
    ct <<- 0
    var.iterate(simple_ct)
}

### scanVcf:
tfile <- TabixFile(fl, yieldSize=10000)
param <- ScanVcfParam(fixed=c("ALT"), info=NA, geno=NA)
scanvcf_3 <- function()
{
    ct2 <<- 0
    open(tfile)
    while((len <- length(scanVcf(tfile, param=param)[[1]]$rowRanges)) > 0) 
        ct2 <<- ct2 + len 
    close(tfile)
}

micro <- microbenchmark(variterate_3(), scanvcf_3(), times=5)
micro
##Unit: seconds
##           expr        min         lq     median         uq        max neval
## variterate_3() 1552.88520 1576.55856 1583.13120 1585.65281 1591.96375     5
##    scanvcf_3()   50.04566   50.06945   50.27194   50.56145   50.70112     5


### --------------------------------------------------------------------------
### Scaling with number of variants and samples (scanVcf only)
### --------------------------------------------------------------------------

### Linear scaling by variants:
tfile <- TabixFile(fl)
yieldSize <- c(100000, 200000, 300000, 400000, 500000) 
param <- ScanVcfParam(info=NA, geno=NA)
fun0 <- function(tfile, param) scanVcf(tfile, param=param)

res <- lapply(yieldSize, function(i) {
              tf <- TabixFile(fl, yieldSize=i)
              microbenchmark(fun0(tf, param), times=5)})
res
### [[1]]
### Unit: seconds
###             expr      min       lq   median       uq      max neval
###  fun0(tf, param) 9.883719 10.13574 10.15165 10.17318 10.70124     5
### 
### [[2]]
### Unit: seconds
###             expr    min      lq   median       uq      max neval
###  fun0(tf, param) 19.385 19.6715 19.71968 19.72206 20.23362     5
### 
### [[3]]
### Unit: seconds
###             expr      min       lq   median       uq      max neval
###  fun0(tf, param) 29.24268 29.33945 29.36927 29.38207 30.01012     5
### 
### [[4]]
### Unit: seconds
###             expr      min       lq   median      uq      max neval
###  fun0(tf, param) 38.71675 38.73608 38.77965 38.8696 39.27382     5
### 
### [[5]]
### Unit: seconds
###             expr      min       lq   median       uq      max neval
###  fun0(tf, param) 47.94605 48.76884 49.03272 49.03859 49.08618     5


### Linear scaling by sample:
tf <- TabixFile(fl)
ids <- samples(scanVcfHeader(fl))
sampleSize <- c(200, 400, 600, 800, 1000)
which <- GRanges("22", IRanges(2e7, 2.5e7)) ## 63088 records
fun0 <- function(tf, param) scanVcf(tf, param=param) 
 
res <- lapply(sampleSize, function(i) {
              param <- ScanVcfParam(which=which, samples=ids[1:i])
              microbenchmark(fun0(tf, param), times=5)})
res
### [[1]]
### Unit: seconds
###             expr      min       lq   median       uq      max neval
###  fun0(tf, param) 61.80377 61.91141 69.62993 70.10218 103.1949     5
### 
### [[2]]
### Unit: seconds
###             expr      min       lq   median      uq      max neval
###  fun0(tf, param) 128.3595 132.1819 135.7386 135.966 150.7588     5
### 
### [[3]]
### Unit: seconds
###             expr      min       lq   median       uq     max neval
###  fun0(tf, param) 204.0067 211.0546 219.2069 232.3388 249.188     5
### 
### [[4]]
### Unit: seconds
###             expr      min       lq   median       uq      max neval
###  fun0(tf, param) 266.7383 269.8461 273.7986 299.7805 330.6505     5
### 
### [[5]]
### Unit: seconds
###             expr      min       lq   median       uq      max neval
###  fun0(tf, param) 329.6348 337.1841 348.7968 359.5972 364.2781     5