/usr/bin/rsem-control-fdr is in rsem 1.2.31+dfsg-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/env perl
use Getopt::Long;
use Pod::Usage;
use strict;
# rsem-control-fdr main script.
#
# Reads a tab-separated result table (input_file), locates its "PPDE" column,
# and copies the header plus the leading data rows to output_file, stopping at
# the first row that would violate the requested false discovery rate.
#
# Command line: rsem-control-fdr [options] input_file fdr_rate output_file
#
# Exit status: 2 on usage errors (via pod2usage), -1 when the PPDE column is
# missing, non-zero (die) on I/O failures, 0 otherwise.

use Scalar::Util qw(looks_like_number);

my $hard = 0;
my $soft = 0;
my $help = 0;

GetOptions(
    "hard-threshold" => \$hard,
    "soft-threshold" => \$soft,
    "h|help"         => \$help,
) or pod2usage(-exitval => 2, -verbose => 2);

pod2usage(-verbose => 2) if ($help == 1);
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
pod2usage(-msg => "--hard-threshold and --soft-threshold cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($hard && $soft);

# Hard thresholding is the default when neither method is requested.
if ($hard == 0 && $soft == 0) { $hard = 1; }

my ($input_file, $fdr, $output_file) = @ARGV;

# A non-numeric rate would silently numify to 0 and filter out every row, so
# reject it up front instead of producing an empty result.
pod2usage(-msg => "fdr_rate must be a number!", -exitval => 2, -verbose => 2)
    unless looks_like_number($fdr);

# Three-argument open with lexical handles: file names are never interpreted
# as open modes, and failures are reported instead of being ignored.
open(my $in, '<', $input_file)
    or die "Error: Cannot open $input_file for reading: $!\n";
open(my $out, '>', $output_file)
    or die "Error: Cannot open $output_file for writing: $!\n";

# An empty input has no header line; treat it as an empty header so that it
# falls through to the "Cannot find column PPDE" error below.
my $header = <$in>;
$header = "" unless defined $header;
chomp($header);

# Locate the header cell that is literally "PPDE" (including the quotes).
my @columns = split(/\t/, $header);
my $pos = 0;
while ($pos <= $#columns && $columns[$pos] ne "\"PPDE\"") { ++$pos; }

# Message stream and exit code are kept as in earlier versions of the script.
if ($pos > $#columns) { print "Error: Cannot find column PPDE!\n"; exit(-1); }

# Data rows are indexed one past the header position -- presumably because
# each data row starts with a row-name field that has no header cell
# (NOTE(review): confirm against the rsem-run-ebseq output format).
++$pos;

print {$out} "$header\n"
    or die "Error: Cannot write to $output_file: $!\n";

# $n   : number of rows reported so far
# $sum : running sum of (1 - PPDE) over the reported rows (soft method only)
my ($n, $sum) = (0, 0);

# The loop stops at the first row that fails the criterion, so it presumably
# relies on the rows being sorted by decreasing PPDE (NOTE(review): confirm).
while (my $line = <$in>) {
    chomp($line);
    my @fields = split(/\t/, $line);

    # A row too short to contain the PPDE field is treated as PPDE = 0.
    my $ppde = defined($fields[$pos]) ? $fields[$pos] : 0;
    my $ppee = 1.0 - $ppde;

    if ($hard) {
        # Hard threshold: every reported row must satisfy 1 - PPDE <= fdr.
        last if ($ppee > $fdr);
    }
    else {
        # Soft threshold: the mean of (1 - PPDE) over the reported rows,
        # including this one, must stay <= fdr.
        last if ($sum + $ppee > $fdr * ($n + 1));
        $sum += $ppee;
    }

    ++$n;
    print {$out} "$line\n"
        or die "Error: Cannot write to $output_file: $!\n";
}

close($in);

# Buffered write errors (e.g. a full disk) only surface at close, so check it
# before reporting success.
close($out)
    or die "Error: Cannot close $output_file: $!\n";

print "There are $n genes/transcripts reported at FDR = $fdr.\n";
__END__
=head1 NAME
rsem-control-fdr
=head1 PURPOSE
Filter EBSeq output for statistical significance.
=head1 SYNOPSIS
rsem-control-fdr [options] input_file fdr_rate output_file
=head1 ARGUMENTS
=over
=item B<input_file>
This should be the main result file generated by 'rsem-run-ebseq', which contains all genes/transcripts and their associated statistics.
=item B<fdr_rate>
The desired false discovery rate (FDR).
=item B<output_file>
This file is a subset of the 'input_file'. It only contains the genes/transcripts called as differentially expressed (DE). When more than two conditions exist, a gene/transcript is called DE if it is not equally expressed across all conditions. Because statistical significance does not necessarily mean biological significance, users should also refer to the fold changes to decide which genes/transcripts are biologically significant. When more than two conditions exist, this file will not contain fold change information and users need to calculate it from 'input_file.condmeans' by themselves.
=back
=head1 OPTIONS
=over
=item B<--hard-threshold>
Use hard threshold method to control FDR. If this option is set, only those genes/transcripts with their PPDE >= 1 - fdr_rate are called as DE. (Default: on)
=item B<--soft-threshold>
Use the soft threshold method to control FDR. If this option is set, this program will try to report as many genes/transcripts as possible, as long as their average PPDE >= 1 - fdr_rate. This option is equivalent to using EBSeq's 'crit_fun' for FDR control. (Default: off)
=item B<-h/--help>
Show help information.
=back
=head1 DESCRIPTION
This program controls the false discovery rate and reports differentially expressed genes/transcripts.
=head1 EXAMPLES
We assume that we have 'GeneMat.results' as input. We want to control FDR at 0.05 using hard threshold method and name the output file as 'GeneMat.de.txt':
rsem-control-fdr GeneMat.results 0.05 GeneMat.de.txt
=cut
|