/usr/lib/R/site-library/ensembldb/extended_tests/performance_tests.R is in r-bioc-ensembldb 2.2.2-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
############################################################
## Compare MySQL vs SQLite backends:
## Amazing how inefficient the MySQL backend seems to be! Most
## likely it's due to RMySQL, not MySQL.
dontrun_test_MySQL_vs_SQLite <- function() {
    ## Benchmarks the same ensembldb queries on the SQLite backend (`edb`)
    ## and on the MySQL backend (`edb_mysql`, created from `edb` below).
    ##
    ## `edb` is a free variable here and has to exist outside this function.
    ## NOTE(review): when called as a function only the final
    ## microbenchmark() result is returned; the earlier ones are not
    ## auto-printed. Run the statements interactively to see all of them.
    edb_mysql <- useMySQL(edb, user = "anonuser", pass = "")
    library(microbenchmark)

    ## Number of repetitions used for every comparison.
    n_times <- 5

    ## genes: all genes, then filtered by biotype, by chromosome, with an
    ## extra "tx_id" column, and by gene name.
    microbenchmark(genes(edb), genes(edb_mysql), times = n_times)
    microbenchmark(
        genes(edb, filter = GeneBiotypeFilter("lincRNA")),
        genes(edb_mysql, filter = GeneBiotypeFilter("lincRNA")),
        times = n_times
    )
    microbenchmark(
        genes(edb, filter = SeqNameFilter(20:23)),
        genes(edb_mysql, filter = SeqNameFilter(20:23)),
        times = n_times
    )
    microbenchmark(
        genes(edb, columns = "tx_id"),
        genes(edb_mysql, columns = "tx_id"),
        times = n_times
    )
    microbenchmark(
        genes(edb, filter = GenenameFilter("BCL2L11")),
        genes(edb_mysql, filter = GenenameFilter("BCL2L11")),
        times = n_times
    )

    ## transcripts: unfiltered, then restricted to a single gene.
    microbenchmark(
        transcripts(edb),
        transcripts(edb_mysql),
        times = n_times
    )
    microbenchmark(
        transcripts(edb, filter = GenenameFilter("BCL2L11")),
        transcripts(edb_mysql, filter = GenenameFilter("BCL2L11")),
        times = n_times
    )

    ## exons: unfiltered, then restricted to a single gene.
    microbenchmark(
        exons(edb),
        exons(edb_mysql),
        times = n_times
    )
    microbenchmark(
        exons(edb, filter = GenenameFilter("BCL2L11")),
        exons(edb_mysql, filter = GenenameFilter("BCL2L11")),
        times = n_times
    )

    ## exonsBy: unfiltered, then restricted to chromosome Y.
    microbenchmark(
        exonsBy(edb),
        exonsBy(edb_mysql),
        times = n_times
    )
    microbenchmark(
        exonsBy(edb, filter = SeqNameFilter("Y")),
        exonsBy(edb_mysql, filter = SeqNameFilter("Y")),
        times = n_times
    )

    ## cdsBy: default grouping, grouped by gene, and minus strand only.
    microbenchmark(cdsBy(edb), cdsBy(edb_mysql), times = n_times)
    microbenchmark(
        cdsBy(edb, by = "gene"),
        cdsBy(edb_mysql, by = "gene"),
        times = n_times
    )
    microbenchmark(
        cdsBy(edb, filter = SeqStrandFilter("-")),
        cdsBy(edb_mysql, filter = SeqStrandFilter("-")),
        times = n_times
    )
}
## Compare the performance of doing the sorting within R or
## directly in the SQL query.
dontrun_test_ordering_performance <- function() {
    ## Times the same query when its rows are sorted by the database
    ## ("order by" appended to the SQL) versus sorted in R after fetching,
    ## and checks that both routes return the same data.frame.
    ##
    ## `edb` is a free variable here and has to exist outside this function.
    ## NOTE(review): the system.time() values are not auto-printed when this
    ## is called as a function; run the statements interactively to see
    ## them. The "SQL: x, R: y" comments are the timings recorded in the
    ## original file (presumably elapsed seconds).
    library(RUnit)
    library(RSQLite)
    db_con <- dbconn(edb)

    ## Run `query` on connection `con` and return the result as a data.frame
    ## with row names reset.
    ##   con      database connection passed to dbGetQuery().
    ##   query    SQL query string (without an "order by" clause).
    ##   orderBy  character vector of column names to sort by; "" (the
    ##            default) or NULL means no sorting.
    ##   orderSQL TRUE: sort in the database by appending "order by" to the
    ##            query; FALSE: fetch unsorted and sort in R.
    ## Stops with an error if sorting in R is requested and `orderBy` names a
    ## column that is not in the result.
    .callWithOrder <- function(con, query, orderBy = "",
                               orderSQL = TRUE) {
        if (all(orderBy == ""))
            orderBy <- NULL
        if (orderSQL && !is.null(orderBy)) {
            query <- paste0(query, " order by ",
                            paste(orderBy, collapse = ", "))
        }
        res <- dbGetQuery(con, query)
        if (!orderSQL && !is.null(orderBy)) {
            if (!all(orderBy %in% colnames(res)))
                stop("orderBy not in columns!")
            ## Do the ordering in R. The sort keys are passed unnamed so that
            ## a column called e.g. "method" or "decreasing" cannot be taken
            ## for an argument of order(). method = "radix" sorts character
            ## keys in the C locale (see ?order) -- presumably chosen to
            ## match the database's ordering; TODO confirm.
            sort_keys <- unname(as.list(res[, orderBy, drop = FALSE]))
            idx <- do.call(order, c(sort_keys, list(method = "radix")))
            ## drop = FALSE keeps a data.frame even for one-column results;
            ## otherwise the rownames<- call below would fail on a vector.
            res <- res[idx, , drop = FALSE]
        }
        rownames(res) <- NULL
        res
    }

    #######################
    ## gene table
    ## Simple condition: no ordering requested, both calls run the same query.
    the_q <- "select * from gene"
    system.time(res1 <- .callWithOrder(db_con, query = the_q))
    system.time(res2 <- .callWithOrder(db_con, query = the_q,
                                       orderSQL = FALSE))
    checkIdentical(res1, res2)

    ## order by gene_id
    orderBy <- "gene_id"
    system.time(res1 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy))
    system.time(res2 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy, orderSQL = FALSE))
    ## SQL: 0.16, R: 0.164.
    checkIdentical(res1, res2)

    ## order by gene_name
    orderBy <- "gene_name"
    system.time(res1 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy))
    system.time(res2 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy, orderSQL = FALSE))
    checkIdentical(res1, res2)
    ## SQL: 0.245, R: 0.185

    ## sort by gene_name and gene_seq_start
    orderBy <- c("gene_name", "gene_seq_start")
    system.time(res1 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy))
    system.time(res2 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy, orderSQL = FALSE))
    ## SQL: 0.26, R: 0.188
    checkEquals(res1, res2)

    ## with subsetting:
    the_q <- "select * from gene where seq_name in ('5', 'Y')"
    orderBy <- c("gene_name", "gene_seq_start")
    system.time(res1 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy))
    system.time(res2 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy, orderSQL = FALSE))
    ## SQL: 0.031, R: 0.024
    checkEquals(res1, res2)

    ########################
    ## joining tables.
    the_q <- paste0("select * from gene join tx on (gene.gene_id = tx.gene_id)",
                    " join tx2exon on (tx.tx_id = tx2exon.tx_id)")
    orderBy <- c("tx_id", "exon_id")
    system.time(res1 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy))
    system.time(res2 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy, orderSQL = FALSE))
    ## SQL: 9.6, R: 9.032
    checkEquals(res1, res2)

    ## subsetting.
    the_q <- paste0("select * from gene join tx on (gene.gene_id = tx.gene_id)",
                    " join tx2exon on (tx.tx_id = tx2exon.tx_id) where",
                    " seq_name = 'Y'")
    orderBy <- c("tx_id", "exon_id")
    system.time(res1 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy))
    system.time(res2 <- .callWithOrder(db_con, query = the_q,
                                       orderBy = orderBy, orderSQL = FALSE))
    ## SQL: 0.9, R: 1.6
    checkEquals(res1, res2)
}
## Compare the performance of inner join with left outer join.
dontrun_test_outer_join_performance <- function() {
    ## Benchmarks fetching all rows for three join expressions built by
    ## ensembldb:::joinQueryOnTables2(): the default call (Q_1), the same
    ## tables with an explicit `startWith` table (Q_2), and that variant
    ## using "left outer join" (Q_3). First for gene/exon, then for tx/gene.
    ##
    ## `edb` is a free variable here and has to exist outside this function.
    ## NOTE(review): when called as a function only the second
    ## microbenchmark() result is returned; the first is not auto-printed.

    ## Number of repetitions used for both comparisons.
    n_times <- 10

    ## gene and exon tables.
    Q_1 <- ensembldb:::joinQueryOnTables2(edb, tab = c("gene", "exon"))
    Q_2 <- ensembldb:::joinQueryOnTables2(
        edb,
        tab = c("gene", "exon"),
        startWith = "exon"
    )
    Q_3 <- ensembldb:::joinQueryOnTables2(
        edb,
        tab = c("gene", "exon"),
        startWith = "exon",
        join = "left outer join"
    )
    library(microbenchmark)
    library(RSQLite)
    microbenchmark(
        dbGetQuery(dbconn(edb), paste0("select * from ", Q_1)),
        dbGetQuery(dbconn(edb), paste0("select * from ", Q_2)),
        dbGetQuery(dbconn(edb), paste0("select * from ", Q_3)),
        times = n_times
    )
    ## Result: Q_1 is a second faster (13 instead of 14).

    ## Check performance joining tx and genes.
    Q_1 <- ensembldb:::joinQueryOnTables2(edb, tab = c("tx", "gene"))
    Q_2 <- ensembldb:::joinQueryOnTables2(
        edb,
        tab = c("tx", "gene"),
        startWith = "tx"
    )
    Q_3 <- ensembldb:::joinQueryOnTables2(
        edb,
        tab = c("tx", "gene"),
        startWith = "tx",
        join = "left outer join"
    )
    microbenchmark(
        dbGetQuery(dbconn(edb), paste0("select * from ", Q_1)),
        dbGetQuery(dbconn(edb), paste0("select * from ", Q_2)),
        dbGetQuery(dbconn(edb), paste0("select * from ", Q_3)),
        times = n_times
    )
    ## No difference.
}
|