/usr/lib/R/site-library/Biostrings/doc/SolexaYi2-benchmark.txt is in r-bioc-biostrings 2.30.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 | Some PDict()/matchPDict() benchmarks using Yi's second Solexa data set
======================================================================
"Yi's second data set" is originally available at:
lamprey:/mnt/fred/solexa/ycao/080623_HWI-EAS88_0001/
Lane1. Myoblasts IPed with 7311 Antibody
Lane2: Myotubes IPed with 7311 Antibody
Lane3. Myoblasts IPed with 6975 Antibody
Lane4: Myotubes IPed with 6975 Antibody
Lane5: phiX control
Lane6. Myoblasts IPed with 6196 Antibody
Lane7: Myotubes IPed with 6196 Antibody
Lane8: Myotubes IPed with preimmune serum
===============================================================================
On george1 (16G of RAM, 8 CPUs)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8 lanes together
----------------
After cleaning, dict0 is 35567623 x 35.
> library(ShortRead)
> path0 <- "~/SolexaYi2"
> list.files(path=path0, pattern="^s_._sequence.txt$") # 8 lanes
[1] "s_1_sequence.txt" "s_2_sequence.txt" "s_3_sequence.txt" "s_4_sequence.txt"
[5] "s_5_sequence.txt" "s_6_sequence.txt" "s_7_sequence.txt" "s_8_sequence.txt"
## Loading and cleaning
> system.time(rfq <- readFastq(path0, pattern="^s_._sequence.txt$"))
user system elapsed
382.303 21.193 414.080
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 442339 23.7 83093856 4437.7 107396566 5735.6
Vcells 528200241 4029.9 1715252697 13086.4 1704641212 13005.4
> system.time(dict0 <- clean(sread(rfq)))
user system elapsed
5.892 0.804 31.320
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 445623 23.8 53180067 2840.2 107396566 5735.6
Vcells 563768713 4301.3 1715252697 13086.4 1704641212 13005.4
## Preprocessing
> pdict0 <- PDict(dict0) # ERROR: Trusted Band is too big!
> pdict1 <- PDict(dict0, max.mismatch=1) # takes forever (lots of swapping)
lane 1 only
-----------
After cleaning, dict0 is 4469757 x 35.
> library(ShortRead)
> path0 <- "~/SolexaYi2"
## Loading and cleaning
> system.time(rfq <- readFastq(path0, pattern="s_1_sequence.txt"))
user system elapsed
44.450 2.120 49.867
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 442269 23.7 10798339 576.7 13860972 740.3
Vcells 66506225 507.5 215613665 1645.1 214056559 1633.2
> system.time(dict0 <- clean(sread(rfq)))
user system elapsed
0.793 0.000 0.791
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 443715 23.7 6910936 369.1 13860972 740.3
Vcells 70976449 541.6 215613665 1645.1 214056559 1633.2
## Preprocessing for exact matching
> system.time(pdict0 <- PDict(dict0))
user system elapsed
14.520 2.856 17.385
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 874117 46.7 2830718 151.2 13860972 740.3
Vcells 442722844 3377.8 569051730 4341.6 515966682 3936.6
## This shows that the size of pdict0 is about 2837 Mb (unreliable
## object.size() reports 3448 Mb).
## Preprocessing for max.mismatching=1
> system.time(pdict1 <- PDict(dict0, max.mismatch=1))
user system elapsed
26.966 2.164 29.130
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1894849 101.2 4065505 217.2 13860972 740.3
Vcells 684867192 5225.2 1023208512 7806.5 797303398 6083.0
## This shows that the size of pdict1 is about 1848 Mb (unreliable
## object.size() reports 2928 Mb).
## Preprocessing for max.mismatching=2
> system.time(pdict2 <- PDict(dict0, max.mismatch=2))
user system elapsed
56.760 1.532 58.288
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4409170 235.5 9820776 524.5 13860972 740.3
Vcells 815612649 6222.7 1371741416 10465.6 977847099 7460.4
## This shows that the size of pdict2 is about 998 Mb (unreliable
## object.size() reports 2582 Mb).
## Preprocessing for max.mismatching=3
> system.time(pdict3 <- PDict(dict0, max.mismatch=3))
user system elapsed
29.694 2.656 32.349
Warning message:
In .MTB_PDict(x, max.mismatch, type) :
given the characteristics of dictionary 'x', this value of 'max.mismatch' will give poor (or very poor) performance when you use this MTB_PDict with matchPDict() (it will of course depend on the length of the subject)
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 5555999 296.8 10351814 552.9 13860972 740.3
Vcells 922524858 7038.4 1440408486 10989.5 1370974508 10459.8
## This shows that the size of pdict3 is about 816 Mb (unreliable
## object.size() reports 2800 Mb).
## Loading Mouse chr1
> library(BSgenome.Mmusculus.UCSC.mm9)
> chr1 <- Mmusculus$chr1
## Exact matching
> system.time(m0 <- matchPDict(pdict0, chr1))
user system elapsed
51.455 0.956 52.410
> object.size(m0)/1024/1024
[1] 468.3648
> system.time(cm0 <- countIndex(m0))
user system elapsed
23.217 0.000 23.218
> sum(cm0)
[1] 110578790
## Matching with max.mismatch=1
> system.time(m1 <- matchPDict(pdict1, chr1, max.mismatch=1))
user system elapsed
641.280 4.552 649.840
> object.size(m1)/1024/1024
[1] 1038.721
> system.time(cm1 <- countIndex(m1))
user system elapsed
11.412 0.072 13.297
> sum(cm1)
[1] 258783032
> sum(cm0 == 0 & cm1 != 0)
[1] 118600
## Matching with max.mismatch=2
> system.time(m2 <- matchPDict(pdict2, chr1, max.mismatch=2))
user system elapsed
12290.748 42.695 12442.805
> object.size(m2)/1024/1024
[1] 2007.031
> system.time(cm2 <- countIndex(m2))
user system elapsed
8.116 0.416 56.458
> sum(cm2)
[1] 511651541
## Some sanity checking
m10 <- matchPDict(pdict1, chr1, max.mismatch=0)
identical(m10, m0)
m20 <- matchPDict(pdict2, chr1, max.mismatch=0)
identical(m20, m0)
m30 <- matchPDict(pdict3, chr1, max.mismatch=0)
identical(m30, m0)
m21 <- matchPDict(pdict2, chr1, max.mismatch=1)
identical(m21, m1)
m31 <- matchPDict(pdict3, chr1, max.mismatch=1)
identical(m31, m1)
m32 <- matchPDict(pdict3, chr1, max.mismatch=2)
identical(m32, m2)
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6776903 362.0 19227046 1026.9 19227046 1026.9
Vcells 1413290587 10782.6 4217248080 32175.1 3591384996 27400.1
## Max used was 27400.1 Mb so with only 16G of RAM there was a lot
## of swapping!
> sessionInfo()
R version 2.8.0 Under development (unstable) (2008-06-17 r45946)
x86_64-unknown-linux-gnu
locale:
LC_CTYPE=en_CA.UTF-8;LC_NUMERIC=C;LC_TIME=en_CA.UTF-8;LC_COLLATE=en_CA.UTF-8;LC_MONETARY=C;LC_MESSAGES=en_CA.UTF-8;LC_PAPER=en_CA.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_CA.UTF-8;LC_IDENTIFICATION=C
attached base packages:
[1] tools stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] BSgenome.Mmusculus.UCSC.mm9_1.3.7 BSgenome_1.9.3
[3] ShortRead_0.1.26 lattice_0.17-8
[5] Biobase_2.1.7 Biostrings_2.9.31
loaded via a namespace (and not attached):
[1] grid_2.8.0
===============================================================================
On lamprey (128G of RAM, 16 CPUs)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
lane 1 only
-----------
After cleaning, dict0 is 4469757 x 35.
> library(ShortRead)
> path0 <- "."
## Loading and cleaning
> system.time(rfq <- readFastq(path0, pattern="s_1_sequence.txt"))
user system elapsed
58.147 2.572 81.392
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 428340 22.9 10798339 576.7 13844467 739.4
Vcells 66500371 507.4 215606846 1645.0 214050084 1633.1
> system.time(dict0 <- clean(sread(rfq)))
user system elapsed
0.788 0.000 0.789
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 428828 23.0 6910936 369.1 13844467 739.4
Vcells 70970334 541.5 215606846 1645.0 214050084 1633.1
## Preprocessing for exact matching
> system.time(pdict0 <- PDict(dict0))
user system elapsed
17.834 2.688 20.520
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 861818 46.1 2830718 151.2 13844467 739.4
Vcells 442717290 3377.7 569044281 4341.5 515961030 3936.5
## This shows that the size of pdict0 is about 2837 Mb (unreliable
## object.size() reports 3448 Mb).
## Preprocessing for max.mismatching=1
> system.time(pdict1 <- PDict(dict0, max.mismatch=1))
user system elapsed
32.982 1.960 34.945
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1841306 98.4 3833815 204.8 13844467 739.4
Vcells 684845356 5225.0 974395366 7434.1 797342043 6083.3
## This shows that the size of pdict1 is about 1848 Mb (unreliable
## object.size() reports 2928 Mb).
## Preprocessing for max.mismatching=2
> system.time(pdict2 <- PDict(dict0, max.mismatch=2))
user system elapsed
73.860 1.760 75.619
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 4355627 232.7 9820776 524.5 13844467 739.4
Vcells 815590813 6222.5 1371723488 10465.5 966735831 7375.7
## This shows that the size of pdict2 is about 998 Mb (unreliable
## object.size() reports 2582 Mb).
## Preprocessing for max.mismatching=3
> system.time(pdict3 <- PDict(dict0, max.mismatch=3))
user system elapsed
34.590 3.068 37.657
Warning message:
In .MTB_PDict(x, max.mismatch, type) :
given the characteristics of dictionary 'x', this value of 'max.mismatch' will give poor (or very poor) performance when you use this MTB_PDict with matchPDict() (it will of course depend on the length of the subject)
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 5502456 293.9 9820776 524.5 13844467 739.4
Vcells 922503022 7038.2 1371723488 10465.5 1370952672 10459.6
## This shows that the size of pdict3 is about 816 Mb (unreliable
## object.size() reports 2800 Mb).
## Loading Mouse chr1
> library(BSgenome.Mmusculus.UCSC.mm9)
> chr1 <- Mmusculus$chr1
## Exact matching
> system.time(m0 <- matchPDict(pdict0, chr1))
user system elapsed
60.604 0.024 60.628
> object.size(m0)/1024/1024
[1] 468.3648
> system.time(cm0 <- countIndex(m0))
user system elapsed
30.946 0.000 30.945
> sum(cm0)
[1] 110578790
## Matching with max.mismatch=1
> system.time(m1 <- matchPDict(pdict1, chr1, max.mismatch=1))
user system elapsed
643.588 3.665 647.242
> object.size(m1)/1024/1024
[1] 1038.721
> system.time(cm1 <- countIndex(m1))
user system elapsed
12.153 0.024 12.180
> sum(cm1)
[1] 258783032
> sum(cm0 == 0 & cm1 != 0)
[1] 118600
## Matching with max.mismatch=2
> system.time(m2 <- matchPDict(pdict2, chr1, max.mismatch=2))
user system elapsed
16388.369 14.624 16402.895
> object.size(m2)/1024/1024
[1] 2007.031
> system.time(cm2 <- countIndex(m2))
user system elapsed
9.944 0.028 9.973
> sum(cm2)
[1] 511651541
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6748852 360.5 19227046 1026.9 19227046 1026.9
Vcells 1408812643 10748.4 4217175576 32174.5 3584563517 27348.1
> sessionInfo()
R version 2.8.0 Under development (unstable) (2008-05-13 r45683)
x86_64-unknown-linux-gnu
locale:
LC_CTYPE=en_CA.UTF-8;LC_NUMERIC=C;LC_TIME=en_CA.UTF-8;LC_COLLATE=en_CA.UTF-8;LC_MONETARY=C;LC_MESSAGES=en_CA.UTF-8;LC_PAPER=en_CA.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_CA.UTF-8;LC_IDENTIFICATION=C
attached base packages:
[1] tools stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] BSgenome.Mmusculus.UCSC.mm9_1.3.6 BSgenome_1.9.1
[3] ShortRead_0.1.26 lattice_0.17-7
[5] Biobase_2.1.3 Biostrings_2.9.31
loaded via a namespace (and not attached):
[1] grid_2.8.0
===============================================================================
Summary for lane 1 / Mouse chr1+
--------------------------------
NM0 <- cm0
NM1 <- cm1 - NM0
NM2 <- cm2 - NM1
==================================================
NM0 | NM1 | NM2 | NM3 | nb of reads | % of reads
==================================================
> 1 | any | 186274 4.17%
-----|-----------------|--------------------------
= 1 | any | 106373 2.38%
-----|-----------------|--------------------------
| > 1 | any | 76754 1.72%
|-----|-----------|--------------------------
| = 1 | any | 41846 0.94%
|-----|-----------|--------------------------
| | > 1 | any | 53419 1.20%
| |-----|-----|--------------------------
= 0 | | = 1 | any | 33474 0.75%
| |-----|-----|--------------------------
| = 0 | | > 1 |
| | |-----|
| | = 0 | = 1 | 3971617 88.86%
| | |-----|
| | | = 0 |
-----------------------|--------------------------
T O T A L | 4469757 100.00%
==================================================
===============================================================================
More benchmarks
---------------
on george1:
library(BSgenome.Mmusculus.UCSC.mm9)
chr1 <- unmasked(Mmusculus$chr1)
library(ShortRead)
path0 <- "~hpages/SolexaYi2"
rfq <- readFastq(path0, pattern="s_1_sequence.txt")
dict0 <- clean(sread(rfq))
pdict10 <- PDict(dict0[1:1000])
pdict11 <- PDict(dict0[1:1000], max.mismatch=1)
pdict12 <- PDict(dict0[1:1000], max.mismatch=2)
pdict13 <- PDict(dict0[1:1000], max.mismatch=3)
pdict20 <- PDict(dict0[1:10000])
pdict21 <- PDict(dict0[1:10000], max.mismatch=1)
pdict22 <- PDict(dict0[1:10000], max.mismatch=2)
pdict23 <- PDict(dict0[1:10000], max.mismatch=3)
pdict30 <- PDict(dict0[1:100000])
pdict31 <- PDict(dict0[1:100000], max.mismatch=1)
pdict32 <- PDict(dict0[1:100000], max.mismatch=2)
pdict33 <- PDict(dict0[1:100000], max.mismatch=3)
pdict40 <- PDict(dict0[1:1000000])
pdict41 <- PDict(dict0[1:1000000], max.mismatch=1)
pdict42 <- PDict(dict0[1:1000000], max.mismatch=2)
pdict43 <- PDict(dict0[1:1000000], max.mismatch=3)
| | | | | | avg nb of matches
| | | | nb of reads | total nb | for reads with
dict | subject | max.mismatch | time | with matches | of matches | matches
--------------------------------------------------------------------------------------------------------
dict0[1:1000] | Mouse chr1+ | 0 | 1.673 | 68 | 8273 | 121.6618
| | 1 | 12.987 | 111 | 24408 | 219.8919
| | 2 | 38.301 | 131 | 45897 | 350.3588
| | 3 | 158.322 | 151 | 105827 | 700.841
--------------------------------------------------------------------------------------------------------
dict0[1:10000] | Mouse chr1+ | 0 | 2.995 | 645 | 85745 | 132.9380
| | 1 | 40.133 | 938 | 220686 | 235.2729
| | 2 | 137.215 | 1154 | 416988 | 361.3414
| | 3 | 494.440 | 1315 | 762942 | 580.184
--------------------------------------------------------------------------------------------------------
dict0[1:100000] | Mouse chr1+ | 0 | 15.534 | 6448 | 1695089 | 262.886
| | 1 | 76.388 | 9220 | 4618443 | 500.9157
| | 2 | 224.011 | 11206 | 9319253 | 831.6306
| | 3 | 1204.326 | 13037 | 16226338 | 1244.637
--------------------------------------------------------------------------------------------------------
dict0[1:1000000] | Mouse chr1+ | 0 | 32.423 | 64249 | 20751467 | 322.9851
| | 1 | 148.992 | 90811 | 51550694 | 567.6702
| | 2 | 1120.742 | 110296 | 102207683 | 926.6672
| | 3 | 2457.180 | 127862 | 176974849 | 1384.108
--------------------------------------------------------------------------------------------------------
|