This file is indexed.

/usr/share/pyshared/cogent/app/blat.py is in python-cogent 1.5.3-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python

"""Application controller for BLAT v34"""

from cogent.app.parameters import FlagParameter, ValuedParameter, \
                                  MixedParameter, FilePath
from cogent.app.util import CommandLineApplication, ResultPath, \
                            ApplicationError, get_tmp_filename
from cogent import DNA, PROTEIN
from cogent.core.genetic_code import DEFAULT as standard_code
from cogent.parse.fasta import MinimalFastaParser
from os import remove
from os.path import isabs

# Module-level authorship, licensing and release metadata for this
# application controller.
__author__ = "Adam Robbins-Pianka"
__copyright__ = "Copyright 2007-2012, The QIIME Project"
__credits__ = ["Adam Robbins-Pianka", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Adam Robbins-Pianka"
__email__ = "adam.robbinspianka@colorado.edu"
__status__ = "Prototype"

class Blat(CommandLineApplication):
    """BLAT generic application controller.

    The controller is called with a three element list of absolute paths,
    ``[query_file_path, database_file_path, output_file_path]``.  The
    command line that is built places the database before the query,
    followed by the options and finally the output path.

    Options that only accept a fixed set of values (-t, -q, -mask, -qMask,
    -repeats, -out) are validated when the input is handled, and an
    ApplicationError is raised for any value outside the allowed set.
    """

    _command = 'blat'
    _input_handler = "_input_as_list"

    # Allowed values for the options that take an enumerated value.
    _database_types = ['dna', 'prot', 'dnax']
    _query_types = ['dna', 'rna', 'prot', 'dnax', 'rnax']
    _mask_types = ['lower', 'upper', 'out', 'file.out']
    _out_types = ['psl', 'pslx', 'axt', 'maf', 'sim4', 'wublast', 'blast',
                 'blast8', 'blast9']
    # Allowed (database type, query type) pairs.
    _valid_combinations = [('dna', 'dna'), ('dna', 'rna'), ('prot', 'prot'),
                           ('dnax', 'prot'), ('dnax', 'dnax'), 
                           ('dnax', 'rnax')]
    # Positional arguments; populated by _input_as_list.
    _database = None
    _query = None
    _output = None

    _parameters = {
        # database type (dna, prot, or dnax, where dnax is DNA sequence
        # translated in six frames to protein)
        '-t':ValuedParameter('-',Delimiter='=',Name='t'),

        # query type (dna, rna, prot, dnax, rnax, where rnax is DNA sequence
        # translated in three frames to protein)
        '-q':ValuedParameter('-',Delimiter='=',Name='q'),

        # Use overused tile file N.ooc, and N should correspond to the tileSize
        '-ooc':ValuedParameter('-',Delimiter='=',Name='ooc', IsPath=True),

        # Sets the size of a match that triggers an alignment
        '-tileSize':ValuedParameter('-',Delimiter='=',Name='tileSize'),

        # Spacing between tiles.
        '-stepSize':ValuedParameter('-',Delimiter='=',Name='stepSize'),

        # If set to 1, allows one mismatch in the tile and still triggers
        # an alignment.
        '-oneOff':ValuedParameter('-',Delimiter='=',Name='oneOff'),

        # sets the number of tile matches
        '-minMatch':ValuedParameter('-',Delimiter='=',Name='minMatch'),

        # sets the minimum score
        '-minScore':ValuedParameter('-',Delimiter='=',Name='minScore'),

        # sets the minimum sequence identity in percent
        '-minIdentity':ValuedParameter('-',Delimiter='=',Name='minIdentity'),

        # sets the size of the maximum gap between tiles in a clump
        '-maxGap':ValuedParameter('-',Delimiter='=',Name='maxGap'),

        # make an overused tile file. Target needs to be complete genome.
        '-makeOoc':ValuedParameter('-',Delimiter='=',Name='makeOoc',
                                   IsPath=True),

        # sets the number of repetitions of a tile allowed before it is marked
        # as overused
        '-repMatch':ValuedParameter('-',Delimiter='=',Name='repMatch'),

        # mask out repeats.  Alignments won't be started in masked region but
        # may extend through it in nucleotide searches.  Masked areas are
        # ignored entirely in protein or translated searches.  Types are:
        # lower, upper, out, file.out (file.out - mask database according to
        # RepeatMasker file.out)
        '-mask':ValuedParameter('-',Delimiter='=',Name='mask'),

        # Mask out repeats in query sequence.  similar to -mask but for query
        # rather than target sequence
        '-qMask':ValuedParameter('-',Delimiter='=',Name='qMask'),

        # repeat bases will not be masked in any way, but matches in
        # repeat areas will be reported separately from matches in other
        # areas in the psl output
        '-repeats':ValuedParameter('-',Delimiter='=',Name='repeats'),

        # minimum percent divergence of repeats to allow them to be unmasked
        '-minRepDivergence':ValuedParameter('-',Delimiter='=',
                                            Name='minRepDivergence'),

        # output dot every N sequences to show program's progress
        '-dots':ValuedParameter('-',Delimiter='=',Name='dots'),

        # controls output file format.  One of:
        # psl - Default.  Tab separated format, no sequence
        # pslx - Tab separated format with sequence
        # axt - blastz-associated axt format
        # maf - multiz-associated maf format
        # sim4 - similar to sim4 format
        # wublast - similar to wublast format
        # blast - similar to NCBI blast format
        # blast8 - NCBI blast tabular format
        # blast9 - NCBI blast tabular format with comments
        '-out':ValuedParameter('-',Delimiter='=',Name='out'),

        # sets maximum intron size
        '-maxIntron':ValuedParameter('-',Delimiter='=',Name='maxIntron'),

        # suppress column headers in psl output
        '-noHead':FlagParameter('-',Name='noHead'),

        # trim leading poly-T
        '-trimT':FlagParameter('-',Name='trimT'),

        # do not trim trailing poly-A
        '-noTrimA':FlagParameter('-',Name='noTrimA'),

        # Remove poly-A tail from qSize as well as alignments in psl output
        '-trimHardA':FlagParameter('-',Name='trimHardA'),

        # run for fast DNA/DNA remapping - not allowing introns,
        # requiring high %ID
        '-fastMap':FlagParameter('-',Name='fastMap'),

        # for high quality mRNAs, look harder for small initial and terminal
        # exons
        '-fine':FlagParameter('-',Name='fine'),

        # Allows extension of alignment through large blocks of N's
        '-extendThroughN':FlagParameter('-',Name='extendThroughN')
        }
    
    def _get_result_paths(self, data):
        """Returns the file location for result output

        data: the input list handed to the controller; its third element
              (index 2) is the output file path.
        """

        return {'output':ResultPath(data[2], IsWritten=True)}

    def _get_base_command(self):
        """Gets the command that will be run when the app controller is
        called.

        The command is assembled as: ``cd <WorkingDir>;`` then the
        executable, the database path, the query path, the sorted string
        forms of all parameters that render to a non-empty string, and
        finally the output path (when one has been set).

        Raises ApplicationError if _command is None.
        """
        if self._command is None:
            raise ApplicationError('_command has not been set.')

        cd_command = ''.join(['cd ',str(self.WorkingDir),';'])
        # Parameters that are off render as an empty string and are dropped.
        parameters = sorted([str(x) for x in self.Parameters.values() 
                            if str(x)])

        command_parts = [cd_command, self._command]
        command_parts.append(self._database) # Positional argument
        command_parts.append(self._query) # Positional argument
        command_parts += parameters
        if self._output:
            command_parts.append(self._output.Path) # Positional argument

        # filter(None, ...) drops positional arguments that are still unset.
        return self._command_delimiter.join(filter(None,command_parts)).strip()

    BaseCommand = property(_get_base_command)

    def _input_as_list(self, data):
        '''Takes the positional arguments as input in a list.
        
        The list input here should be [query_file_path, database_file_path, 
        output_file_path]

        All three paths must be absolute.  The paths are stored on the
        instance for use by _get_base_command, and the options restricted
        to a fixed set of values are validated.

        Returns an empty string; the positional arguments are added to the
        command line by _get_base_command instead.

        Raises ApplicationError if a path is not absolute or if an
        enumerated option holds a value outside its allowed set.
        '''
        query, database, output = data
        if (not isabs(database)) \
          or (not isabs(query)) \
          or (not isabs(output)):
            raise ApplicationError("Only absolute paths allowed.\n%s" %
                                   ', '.join(data))

        self._database = FilePath(database)
        self._query = FilePath(query)
        self._output = ResultPath(output, IsWritten=True)

        ## check parameters that can only take a particular set of values
        # check combination of database and query type
        if self.Parameters['-t'].isOn() and self.Parameters['-q'].isOn() and \
         (self.Parameters['-t'].Value, self.Parameters['-q'].Value) not in \
         self._valid_combinations:
            error_message = "Invalid combination of database and query " + \
                            "types ('%s', '%s').\n" % \
                            (self.Parameters['-t'].Value, 
                            self.Parameters['-q'].Value)

            error_message += "Must be one of: %s\n" % \
                             repr(self._valid_combinations)

            raise ApplicationError(error_message)

        # check database type
        if self.Parameters['-t'].isOn() and \
         self.Parameters['-t'].Value not in self._database_types:
            error_message = "Invalid database type %s\n" % \
                            self.Parameters['-t'].Value

            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._database_types)

            raise ApplicationError(error_message)

        # check query type
        if self.Parameters['-q'].isOn() and \
         self.Parameters['-q'].Value not in self._query_types:
            error_message = "Invalid query type %s\n" % \
                            self.Parameters['-q'].Value

            error_message += "Allowed values: %s\n" % \
                            ', '.join(self._query_types)

            raise ApplicationError(error_message)

        # check mask type
        if self.Parameters['-mask'].isOn() and \
         self.Parameters['-mask'].Value not in self._mask_types:
            # report the offending value, as the other checks do
            error_message = "Invalid mask type %s\n" % \
                            self.Parameters['-mask'].Value
            
            error_message += "Allowed Values: %s\n" % \
                            ', '.join(self._mask_types)

            raise ApplicationError(error_message)

        # check qmask type
        if self.Parameters['-qMask'].isOn() and \
         self.Parameters['-qMask'].Value not in self._mask_types:
            error_message = "Invalid qMask type %s\n" % \
                            self.Parameters['-qMask'].Value

            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._mask_types)

            raise ApplicationError(error_message)

        # check repeat type (validated against the same set as the mask types)
        if self.Parameters['-repeats'].isOn() and \
         self.Parameters['-repeats'].Value not in self._mask_types:
            error_message = "Invalid repeat type %s\n" % \
                            self.Parameters['-repeats'].Value

            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._mask_types)

            raise ApplicationError(error_message)

        # check output format
        if self.Parameters['-out'].isOn() and \
         self.Parameters['-out'].Value not in self._out_types:
            # report the offending value, as the other checks do
            error_message = "Invalid output type %s\n" % \
                            self.Parameters['-out'].Value
 
            error_message += "Allowed values: %s\n" % \
                             ', '.join(self._out_types)

            raise ApplicationError(error_message)

        return ''

def assign_reads_to_database(query_fasta_fp, database_fasta_fp, output_fp, 
                             params=None):
    """Assign a set of query sequences to a reference database
    
    query_fasta_fp : absolute file path to query sequences
    database_fasta_fp : absolute file path to the reference database
    output_fp : absolute file path of the output file to write
    params : dict of BLAT specific parameters. The dict passed in is not
             modified; a copy is used internally.
    
    Returns the 'output' entry of the result produced by the Blat
    application controller (documented by the original authors as an open
    file object). The output format defaults to blast9 unless '-out' is
    given in params, and should be parsable by the PyCogent BLAST parsers.
    """
    # Work on a copy so that adding the default '-out' does not leak into
    # the caller's dict.
    if params is None:
        params = {}
    else:
        params = dict(params)
    params.setdefault('-out', 'blast9')

    blat = Blat(params = params)

    result = blat([query_fasta_fp, database_fasta_fp, output_fp])
    return result['output']

def assign_dna_reads_to_dna_database(query_fasta_fp, database_fasta_fp, 
                        output_fp, params=None):
    """Assign DNA reads to a database fasta of DNA sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      DNA sequences.
    output_fp: absolute path where the output file will be generated.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from dna and dna, respectively. None (the
                  default) is treated as an empty dict.

    Returns whatever assign_reads_to_database returns (documented there as
    an open file object). The output format defaults to blast9 and should
    be parsable by the PyCogent BLAST parsers.

    Raises ApplicationError if params contains '-t' or '-q'.
    """
    if params is None:
        params = {}

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError("Cannot change database or query types when " +\
                               "using assign_dna_reads_to_dna_database. " +\
                               "Use assign_reads_to_database instead.\n")

    # Start from the fixed database/query types, then layer the caller's
    # settings on top; the caller's dict itself is left untouched.
    my_params = {'-t': 'dna',
                 '-q': 'dna'
                }
    my_params.update(params)

    return assign_reads_to_database(query_fasta_fp, database_fasta_fp, 
                                    output_fp, my_params)

def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp, 
                        output_fp, temp_dir = "/tmp", params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to 
              /tmp.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from protein and dna, respectively. None (the
                  default) is treated as an empty dict.

    Returns whatever assign_reads_to_database returns (documented there as
    an open file object). The output format defaults to blast9 and should
    be parsable by the PyCogent BLAST parsers.

    Raises ApplicationError if temp_dir is not absolute or if params
    contains '-t' or '-q'.
    """
    if params is None:
        params = {}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError("Cannot change database or query types " + \
                               "when using " + \
                               "assign_dna_reads_to_protein_database. " + \
                               "Use assign_reads_to_database instead.")

    # Both sides are protein from BLAT's point of view: the DNA query is
    # translated here before BLAT is run.
    my_params = {'-t': 'prot',
                 '-q': 'prot'
                }
    my_params.update(params)

    # Frame labels paired positionally with the translations; assumes
    # sixframes yields them in this order, as the original code did.
    frames = [1,2,3,-1,-2,-3]
    entry_template = '>{seq_id}_frame_{frame}\n{trans}\n'

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')

    # From here on the temporary file exists, so always remove it again,
    # including when reading, translating or running BLAT fails.
    try:
        with tmp_out:
            with open(query_fasta_fp) as query_file:
                for label, sequence in MinimalFastaParser(query_file):
                    seq_id = label.split()[0]

                    s = DNA.makeSequence(sequence)
                    translations = standard_code.sixframes(s)

                    # frames are unique, so sorting the pairs orders the
                    # entries by frame number (-3 .. 3) only.
                    for frame, translation in sorted(zip(frames,
                                                         translations)):
                        tmp_out.write(entry_template.format(
                            seq_id=seq_id, frame=frame, trans=translation))

        result = assign_reads_to_database(tmp, database_fasta_fp, output_fp, \
                                          params = my_params)
    finally:
        remove(tmp)

    return result