/usr/share/pyshared/qiime/split.py is in qiime 1.8.0+dfsg-4ubuntu1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
# File created on 24 Feb 2012
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso", "Daniel McDonald"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
from cogent.parse.fasta import MinimalFastaParser
from cogent.util.misc import create_dir
from qiime.parse import parse_mapping_file
from qiime.filter import filter_mapping_file, sample_ids_from_metadata_description
from qiime.format import format_mapping_file, format_biom_table
from biom.parse import parse_biom_table
from biom.table import TableException
def get_mapping_values(mapping_f, mapping_field):
    """Return the set of values observed in mapping_field of a mapping file.

    mapping_f: the mapping file (open file object or list of lines)
    mapping_field: header name of the column to collect values from

    Raises KeyError if mapping_field is not one of the file's headers.
    """
    rows, headers, _ = parse_mapping_file(mapping_f)
    if mapping_field not in headers:
        raise KeyError("Field is not in mapping file (search is case "+\
         "and white-space sensitive). \n\tProvided field: "+\
         "%s. \n\tValid fields: %s" % (mapping_field,' '.join(headers)))
    column = headers.index(mapping_field)
    return {row[column] for row in rows}
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ split mapping file based on value in field

    mapping_f: the mapping file (open file object or list of lines)
    mapping_field: header name of the column whose values define the split
    column_rename_ids: optional header name; if given, that column is used to
     rename sample ids in the filtered output (passed through, as a column
     index, to filter_mapping_file)
    include_repeat_cols: passed through to filter_mapping_file

    Yields (value_as_filepath_string, formatted mapping file text) pairs, one
    per distinct value of mapping_field. Raises KeyError if mapping_field or
    column_rename_ids is not a header in the mapping file.
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            # Bug fix: report the bad column_rename_ids value here -- the
            # original message reported mapping_field (which was already
            # validated above), which misled the user about what was wrong.
            raise KeyError("Field is not in mapping file (search is case "+\
             "and white-space sensitive). \n\tProvided field: "+\
             "%s. \n\tValid fields: %s" % (column_rename_ids,
                                           ' '.join(mapping_headers)))
    for v in mapping_values:
        # values may contain spaces; make a filepath-safe variant for the key
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))
        # parse mapping file each time though the loop as filtering operates
        # on the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
def split_otu_table_on_sample_metadata(otu_table_f, mapping_f, mapping_field):
    """ split otu table into sub otu tables where each represent samples corresponding to only a certain value in mapping_field
    """
    mapping_f = list(mapping_f)
    full_table = parse_biom_table(otu_table_f)
    for value in get_mapping_values(mapping_f, mapping_field):
        ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, value))
        try:
            sub_table = full_table.filterSamples(
                lambda values, id_, metadata: id_ in ids_to_keep)
        except TableException:
            # all samples are filtered out, so no otu table to write
            continue
        # spaces in the value are replaced so it can be used in a filepath
        yield value.replace(' ', '_'), format_biom_table(sub_table)
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each

    infile: list of fasta lines or open file object
    seqs_per_file: the number of sequences to include in each file
    out_fileprefix: string used to create output filepath - output filepaths
     are <out_prefix>.<i>.fasta where i runs from 0 to number of output files
    working_dir: directory to prepend to temp filepaths (defaults to
     empty string -- files written to cwd)

    List of output filepaths is returned.

    Raises ValueError if seqs_per_file is not positive.
    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")
    seq_counter = 0
    out_files = []
    # Bug fix: bind current_out_file up front so an empty infile does not
    # raise NameError at the final close below.
    current_out_file = None
    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        # only create the directory when a working_dir was actually supplied
        create_dir(working_dir)
    for seq_id, seq in MinimalFastaParser(infile):
        if seq_counter == 0:
            # start a new output file; its index is the count of files so far
            current_out_fp = '%s%s.%d.fasta' \
                % (working_dir, outfile_prefix, len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1
        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0
    # close the last (possibly partially filled) output file, if any
    if current_out_file is not None and not current_out_file.closed:
        current_out_file.close()
    return out_files
|