This file is indexed.

/usr/lib/R/site-library/ensembldb/doc/ensembldb.R is in r-bioc-ensembldb 2.2.2-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
## ----load-libs, warning=FALSE, message=FALSE-------------------------------
library(EnsDb.Hsapiens.v75)

## Making a "short cut"
edb <- EnsDb.Hsapiens.v75
## print some informations for this package
edb

## For what organism was the database generated?
organism(edb)
 

## ----no-network, echo = FALSE, results = "hide"----------------------------
## Disable code chunks that require network connection - conditionally
## disable this on Windows only. This is to avoid TIMEOUT errors on the
## Bioconductor Windows build maching (issue #47).
use_network <- FALSE
 

## ----filters---------------------------------------------------------------
supportedFilters(edb)
 

## ----transcripts-----------------------------------------------------------
Tx <- transcripts(edb, filter = list(GenenameFilter("BCL2L11")))

Tx

## as this is a GRanges object we can access e.g. the start coordinates with
head(start(Tx))

## or extract the biotype with
head(Tx$tx_biotype)
 

## ----transcripts-filter-expression-----------------------------------------
## Use a filter expression to perform the filtering.
transcripts(edb, filter = ~ genename == "ZBTB16")
 

## ----transcripts-filter----------------------------------------------------
library(magrittr)

filter(edb, ~ symbol == "BCL2" & tx_biotype != "protein_coding") %>% transcripts
 

## ----filter-Y--------------------------------------------------------------
edb_y <- addFilter(edb, SeqNameFilter("Y"))

## All subsequent filters on that EnsDb will only work on features encoded on
## chromosome Y
genes(edb_y)

## Get all lincRNAs on chromosome Y
genes(edb_y, filter = ~ gene_biotype == "lincRNA")
 

## ----list-columns----------------------------------------------------------
## list all database tables along with their columns
listTables(edb)

## list columns from a specific table
listColumns(edb, "tx")
 

## ----transcripts-example2--------------------------------------------------
Tx <- transcripts(edb,
                  columns = c(listColumns(edb , "tx"), "gene_name"),
                  filter = TxBiotypeFilter("nonsense_mediated_decay"),
                  return.type = "DataFrame")
nrow(Tx)
Tx
 

## ----cdsBy-----------------------------------------------------------------
yCds <- cdsBy(edb, filter = SeqNameFilter("Y"))
yCds
 

## ----genes-GRangesFilter---------------------------------------------------
## Define the filter
grf <- GRangesFilter(GRanges("11", ranges = IRanges(114000000, 114000050),
                             strand = "+"), type = "any")

## Query genes:
gn <- genes(edb, filter = grf)
gn

## Next we retrieve all transcripts for that gene so that we can plot them.
txs <- transcripts(edb, filter = GenenameFilter(gn$gene_name))
 

## ----tx-for-zbtb16, message=FALSE, fig.align='center', fig.width=7.5, fig.height=5----
plot(3, 3, pch = NA, xlim = c(start(gn), end(gn)), ylim = c(0, length(txs)),
     yaxt = "n", ylab = "")
## Highlight the GRangesFilter region
rect(xleft = start(grf), xright = end(grf), ybottom = 0, ytop = length(txs),
     col = "red", border = "red")
for(i in 1:length(txs)) {
    current <- txs[i]
    rect(xleft = start(current), xright = end(current), ybottom = i-0.975,
         ytop = i-0.125, border = "grey")
    text(start(current), y = i-0.5, pos = 4, cex = 0.75, labels = current$tx_id)
}
 

## ----transcripts-GRangesFilter---------------------------------------------
transcripts(edb, filter = grf)
 

## ----biotypes--------------------------------------------------------------
## Get all gene biotypes from the database. The GeneBiotypeFilter
## allows to filter on these values.
listGenebiotypes(edb)

## Get all transcript biotypes from the database.
listTxbiotypes(edb)
 

## ----genes-BCL2------------------------------------------------------------
## We're going to fetch all genes which names start with BCL. To this end
## we define a GenenameFilter with partial matching, i.e. condition "like"
## and a % for any character/string.
BCLs <- genes(edb,
	      columns = c("gene_name", "entrezid", "gene_biotype"),
	      filter = GenenameFilter("BCL", condition = "startsWith"),
	      return.type = "DataFrame")
nrow(BCLs)
BCLs
 

## ----example-AnnotationFilterList------------------------------------------
## determine the average length of snRNA, snoRNA and rRNA genes encoded on
## chromosomes X and Y.
mean(lengthOf(edb, of = "tx", filter = AnnotationFilterList(
                                  GeneBiotypeFilter(c("snRNA", "snoRNA", "rRNA")),
                                  SeqNameFilter(c("X", "Y")))))

## determine the average length of protein coding genes encoded on the same
## chromosomes.
mean(lengthOf(edb, of = "tx", filter = ~ gene_biotype == "protein_coding" &
                                  seq_name %in% c("X", "Y")))
 

## ----example-first-two-exons-----------------------------------------------
## Extract all exons 1 and (if present) 2 for all genes encoded on the
## Y chromosome
exons(edb, columns = c("tx_id", "exon_idx"),
      filter = list(SeqNameFilter("Y"),
                    ExonRankFilter(3, condition = "<")))
 

## ----transcriptsBy-X-Y-----------------------------------------------------
TxByGns <- transcriptsBy(edb, by = "gene", filter = SeqNameFilter(c("X", "Y")))
TxByGns
 

## ----exonsBy-RNAseq, message = FALSE, eval = FALSE-------------------------
#  ## will just get exons for all genes on chromosomes 1 to 22, X and Y.
#  ## Note: want to get rid of the "LRG" genes!!!
#  EnsGenes <- exonsBy(edb, by = "gene", filter = AnnotationFilterList(
#                                            SeqNameFilter(c(1:22, "X", "Y")),
#                                            GeneIdFilter("ENSG", "startsWith")))
#  

## ----toSAF-RNAseq, message = FALSE, eval=FALSE-----------------------------
#  ## Transforming the GRangesList into a data.frame in SAF format
#  EnsGenes.SAF <- toSAF(EnsGenes)
#  

## ----disjointExons, message = FALSE, eval=FALSE----------------------------
#  ## Create a GRanges of non-overlapping exon parts.
#  DJE <- disjointExons(edb, filter = AnnotationFilterList(
#  			      SeqNameFilter(c(1:22, "X", "Y")),
#  			      GeneIdFilter("ENSG%", "startsWith")))
#  

## ----transcript-sequence-AnnotationHub, message = FALSE, eval = FALSE------
#  library(EnsDb.Hsapiens.v75)
#  library(Rsamtools)
#  edb <- EnsDb.Hsapiens.v75
#  
#  ## Get the FaFile with the genomic sequence matching the Ensembl version
#  ## using the AnnotationHub package.
#  Dna <- getGenomeFaFile(edb)
#  
#  ## Get start/end coordinates of all genes.
#  genes <- genes(edb)
#  ## Subset to all genes that are encoded on chromosomes for which
#  ## we do have DNA sequence available.
#  genes <- genes[seqnames(genes) %in% seqnames(seqinfo(Dna))]
#  
#  ## Get the gene sequences, i.e. the sequence including the sequence of
#  ## all of the gene's exons and introns.
#  geneSeqs <- getSeq(Dna, genes)
#  

## ----transcript-sequence-extractTranscriptSeqs, message = FALSE, eval = FALSE----
#  ## get all exons of all transcripts encoded on chromosome Y
#  yTx <- exonsBy(edb, filter = SeqNameFilter("Y"))
#  
#  ## Retrieve the sequences for these transcripts from the FaFile.
#  library(GenomicFeatures)
#  yTxSeqs <- extractTranscriptSeqs(Dna, yTx)
#  yTxSeqs
#  
#  ## Extract the sequences of all transcripts encoded on chromosome Y.
#  yTx <- extractTranscriptSeqs(Dna, edb, filter = SeqNameFilter("Y"))
#  
#  ## Along these lines, we could use the method also to retrieve the coding sequence
#  ## of all transcripts on the Y chromosome.
#  cdsY <- cdsBy(edb, filter = SeqNameFilter("Y"))
#  extractTranscriptSeqs(Dna, cdsY)
#  

## ----seqlevelsStyle, message = FALSE---------------------------------------
## Change the seqlevels style form Ensembl (default) to UCSC:
seqlevelsStyle(edb) <- "UCSC"

## Now we can use UCSC style seqnames in SeqNameFilters or GRangesFilter:
genesY <- genes(edb, filter = ~ seq_name == "chrY")
## The seqlevels of the returned GRanges are also in UCSC style
seqlevels(genesY)
 

## ----seqlevelsStyle-2, message = FALSE-------------------------------------
seqlevelsStyle(edb) <- "UCSC"

## Getting the default option:
getOption("ensembldb.seqnameNotFound")

## Listing all seqlevels in the database.
seqlevels(edb)[1:30]

## Setting the option to NA, thus, for each seqname for which no mapping is available,
## NA is returned.
options(ensembldb.seqnameNotFound=NA)
seqlevels(edb)[1:30]

## Resetting the option.
options(ensembldb.seqnameNotFound = "ORIGINAL")
 

## ----extractTranscriptSeqs-BSGenome, warning = FALSE, message = FALSE------
library(BSgenome.Hsapiens.UCSC.hg19)
bsg <- BSgenome.Hsapiens.UCSC.hg19

## Get the genome version
unique(genome(bsg))
unique(genome(edb))
## Although differently named, both represent genome build GRCh37.

## Extract the full transcript sequences.
yTxSeqs <- extractTranscriptSeqs(bsg, exonsBy(edb, "tx",
					      filter = SeqNameFilter("chrY")))

yTxSeqs

## Extract just the CDS
Test <- cdsBy(edb, "tx", filter = SeqNameFilter("chrY"))
yTxCds <- extractTranscriptSeqs(bsg, cdsBy(edb, "tx",
                                           filter = SeqNameFilter("chrY")))
yTxCds
 

## ----seqlevelsStyle-restore------------------------------------------------
seqlevelsStyle(edb) <- "Ensembl"
 

## ----gviz-plot, message=FALSE, fig.align='center', fig.width=7.5, fig.height=2.3----
## Loading the Gviz library
library(Gviz)
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

## Retrieving a Gviz compatible GRanges object with all genes
## encoded on chromosome Y.
gr <- getGeneRegionTrackForGviz(edb, chromosome = "Y",
                                start = 20400000, end = 21400000)
## Define a genome axis track
gat <- GenomeAxisTrack()

## We have to change the ucscChromosomeNames option to FALSE to enable Gviz usage
## with non-UCSC chromosome names.
options(ucscChromosomeNames = FALSE)

plotTracks(list(gat, GeneRegionTrack(gr)))

options(ucscChromosomeNames = TRUE)
 

## ----message=FALSE---------------------------------------------------------
seqlevelsStyle(edb) <- "UCSC"
## Retrieving the GRanges objects with seqnames corresponding to UCSC chromosome names.
gr <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
                                start = 20400000, end = 21400000)
seqnames(gr)
## Define a genome axis track
gat <- GenomeAxisTrack()
plotTracks(list(gat, GeneRegionTrack(gr)))
 

## ----gviz-separate-tracks, message=FALSE, warning=FALSE, fig.align='center', fig.width=7.5, fig.height=2.25----
protCod <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
                                     start = 20400000, end = 21400000,
                                     filter = GeneBiotypeFilter("protein_coding"))
lincs <- getGeneRegionTrackForGviz(edb, chromosome = "chrY",
                                   start = 20400000, end = 21400000,
                                   filter = GeneBiotypeFilter("lincRNA"))

plotTracks(list(gat, GeneRegionTrack(protCod, name = "protein coding"),
                GeneRegionTrack(lincs, name = "lincRNAs")), transcriptAnnotation = "symbol")

## At last we change the seqlevels style again to Ensembl
seqlevelsStyle <- "Ensembl"
 

## ----pplot-plot, message=FALSE, fig.align='center', fig.width=7.5, fig.height=4----
library(ggbio)

## Create a plot for all transcripts of the gene SKA2
autoplot(edb, ~ genename == "SKA2")
 

## ----pplot-plot-2, message=FALSE, fig.align='center', fig.width=7.5, fig.height=4----
## Get the chromosomal region in which the gene is encoded
ska2 <- genes(edb, filter = ~ genename == "SKA2")
strand(ska2) <- "*"
autoplot(edb, GRangesFilter(ska2), names.expr = "gene_name")
 

## ----AnnotationDbi, message = FALSE----------------------------------------
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75

## List all available columns in the database.
columns(edb)

## Note that these do *not* correspond to the actual column names
## of the database that can be passed to methods like exons, genes,
## transcripts etc. These column names can be listed with the listColumns
## method.
listColumns(edb)

## List all of the supported key types.
keytypes(edb)

## Get all gene ids from the database.
gids <- keys(edb, keytype = "GENEID")
length(gids)

## Get all gene names for genes encoded on chromosome Y.
gnames <- keys(edb, keytype = "GENENAME", filter = SeqNameFilter("Y"))
head(gnames)
 

## ----select, message = FALSE, warning=FALSE--------------------------------
## Use the /standard/ way to fetch data.
select(edb, keys = c("BCL2", "BCL2L11"), keytype = "GENENAME",
       columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE"))

## Use the filtering system of ensembldb
select(edb, keys = ~ genename %in% c("BCL2", "BCL2L11") &
                tx_biotype == "protein_coding",
       columns = c("GENEID", "GENENAME", "TXID", "TXBIOTYPE"))
 

## ----mapIds, message = FALSE-----------------------------------------------
## Use the default method, which just returns the first value for multi mappings.
mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME")

## Alternatively, specify multiVals="list" to return all mappings.
mapIds(edb, keys = c("BCL2", "BCL2L11"), column = "TXID", keytype = "GENENAME",
       multiVals = "list")

## And, just like before, we can use filters to map only to protein coding transcripts.
mapIds(edb, keys = list(GenenameFilter(c("BCL2", "BCL2L11")),
                        TxBiotypeFilter("protein_coding")), column = "TXID",
       multiVals = "list")
 

## ----AnnotationHub-query, message = FALSE, eval = use_network--------------
#  library(AnnotationHub)
#  ## Load the annotation resource.
#  ah <- AnnotationHub()
#  
#  ## Query for all available EnsDb databases
#  query(ah, "EnsDb")
#  

## ----AnnotationHub-query-2, message = FALSE, eval = use_network------------
#  ahDb <- query(ah, pattern = c("Xiphophorus Maculatus", "EnsDb", 87))
#  ## What have we got
#  ahDb
#  

## ----AnnotationHub-fetch, message = FALSE, eval = FALSE--------------------
#  ahEdb <- ahDb[[1]]
#  
#  ## retriebe all genes
#  gns <- genes(ahEdb)
#  

## ----edb-from-ensembl, message = FALSE, eval = FALSE-----------------------
#  library(ensembldb)
#  
#  ## get all human gene/transcript/exon annotations from Ensembl (75)
#  ## the resulting tables will be stored by default to the current working
#  ## directory
#  fetchTablesFromEnsembl(75, species = "human")
#  
#  ## These tables can then be processed to generate a SQLite database
#  ## containing the annotations (again, the function assumes the required
#  ## txt files to be present in the current working directory)
#  DBFile <- makeEnsemblSQLiteFromTables()
#  
#  ## and finally we can generate the package
#  makeEnsembldbPackage(ensdb = DBFile, version = "0.99.12",
#                       maintainer = "Johannes Rainer <johannes.rainer@eurac.edu>",
#                       author = "J Rainer")
#  

## ----gtf-gff-edb, message = FALSE, eval = FALSE----------------------------
#  ## Load the AnnotationHub data.
#  library(AnnotationHub)
#  ah <- AnnotationHub()
#  
#  ## Query all available files for Ensembl release 77 for
#  ## Mus musculus.
#  query(ah, c("Mus musculus", "release-77"))
#  
#  ## Get the resource for the gtf file with the gene/transcript definitions.
#  Gtf <- ah["AH28822"]
#  ## Create a EnsDb database file from this.
#  DbFile <- ensDbFromAH(Gtf)
#  ## We can either generate a database package, or directly load the data
#  edb <- EnsDb(DbFile)
#  
#  
#  ## Identify and get the FaFile object with the genomic DNA sequence matching
#  ## the EnsDb annotation.
#  Dna <- getGenomeFaFile(edb)
#  library(Rsamtools)
#  ## We next retrieve the sequence of all exons on chromosome Y.
#  exons <- exons(edb, filter = SeqNameFilter("Y"))
#  exonSeq <- getSeq(Dna, exons)
#  
#  ## Alternatively, look up and retrieve the toplevel DNA sequence manually.
#  Dna <- ah[["AH22042"]]
#  

## ----EnsDb-from-Y-GRanges, message = FALSE, eval = use_network-------------
#  ## Generate a sqlite database from a GRanges object specifying
#  ## genes encoded on chromosome Y
#  load(system.file("YGRanges.RData", package = "ensembldb"))
#  Y
#  
#  ## Create the EnsDb database file
#  DB <- ensDbFromGRanges(Y, path = tempdir(), version = 75,
#  		       organism = "Homo_sapiens")
#  
#  ## Load the database
#  edb <- EnsDb(DB)
#  edb
#  

## ----EnsDb-from-GTF, message = FALSE, eval = FALSE-------------------------
#  library(ensembldb)
#  
#  ## the GTF file can be downloaded from
#  ## ftp://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/
#  gtffile <- "Homo_sapiens.GRCh37.75.gtf.gz"
#  ## generate the SQLite database file
#  DB <- ensDbFromGtf(gtf = gtffile)
#  
#  ## load the DB file directly
#  EDB <- EnsDb(DB)
#  
#  ## alternatively, build the annotation package
#  ## and finally we can generate the package
#  makeEnsembldbPackage(ensdb = DB, version = "0.99.12",
#                       maintainer = "Johannes Rainer <johannes.rainer@eurac.edu>",
#                       author = "J Rainer")
#