/usr/lib/python2.7/dist-packages/biom/cli/uc_processor.py is in python-biom-format 2.1.5+dfsg-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# ----------------------------------------------------------------------------
# Copyright (c) 2011-2013, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from __future__ import division
import click
from biom.cli import cli
from biom.cli.util import write_biom_table
from biom.parse import parse_uc
from biom.exception import TableException
@cli.command('from-uc')
@click.option('-i', '--input-fp', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='The input uc filepath.')
@click.option('-o', '--output-fp', default=None,
              type=click.Path(writable=True),
              help='The output BIOM filepath', required=False)
@click.option('--rep-set-fp', type=click.Path(exists=True, dir_okay=False),
              help="Fasta file containing representative sequences with "
                   "where sequences are labeled with OTU identifiers, and "
                   "description fields contain original sequence identifiers. "
                   "This output is created, for example, by vsearch with the "
                   "--relabel_sha1 --relabel_keep options.",
              required=False)
def from_uc(input_fp, output_fp, rep_set_fp):
    """Create a BIOM table from a vsearch/uclust/usearch uc file.

    Example usage:

    Simple BIOM creation:

    $ biom from-uc -i in.uc -o out.biom

    BIOM creation with OTU re-naming:

    $ biom from-uc -i in.uc -o out.biom --rep-set-fp rep-set.fna

    """
    # The docstring above doubles as the command's --help text, so the
    # developer-facing notes live in comments instead:
    #   input_fp   -- path of the uc file to read (required).
    #   output_fp  -- path handed to write_biom_table; None when -o is omitted.
    #   rep_set_fp -- optional fasta path used to relabel observations;
    #                 None when --rep-set-fp is omitted.
    #
    # Both inputs are opened through context managers so the handles are
    # released even if parsing, relabeling or writing raises.
    #
    # NOTE(review): the 'U' (universal newlines) mode is kept for Python 2
    # behaviour; it was removed in Python 3.11, where plain 'r' already
    # provides universal newlines.
    with open(input_fp, 'U') as input_f:
        if rep_set_fp is None:
            table = _from_uc(input_f, None)
        else:
            # The fasta handle is only needed while _from_uc builds the id
            # map, so it can be closed before the table is written out.
            with open(rep_set_fp, 'U') as rep_set_f:
                table = _from_uc(input_f, rep_set_f)
        write_biom_table(table, 'hdf5', output_fp)
def _id_map_from_fasta(fasta_lines):
result = {}
for line in fasta_lines:
if line.startswith('>'):
try:
obs_id, seq_id = line.split()[:2]
except ValueError:
raise ValueError('Sequence identifiers in fasta file '
'must contain at least two space-'
'separated fields.')
result[seq_id] = obs_id[1:]
else:
pass
return result
def _from_uc(input_f, rep_set_f=None):
    """Parse uc input into a table, optionally relabeling its observations.

    Parameters
    ----------
    input_f : iterable of str
        uc-formatted lines; passed straight to ``parse_uc``.
    rep_set_f : iterable of str, optional
        Lines of a representative-sequence fasta file. When given, the
        mapping built by ``_id_map_from_fasta`` is applied to the observation
        axis of the table, strictly and in place.

    Returns
    -------
    The table produced by ``parse_uc``, relabeled when ``rep_set_f`` is
    provided.

    Raises
    ------
    ValueError
        If ``update_ids`` raises ``TableException`` while relabeling.
    """
    table = parse_uc(input_f)

    # Nothing to rename without a representative set.
    if rep_set_f is None:
        return table

    relabeling = _id_map_from_fasta(rep_set_f)
    try:
        table.update_ids(relabeling, axis='observation', strict=True,
                         inplace=True)
    except TableException:
        # NOTE(review): the message says "BIOM file" although the input here
        # is parsed by parse_uc -- wording kept unchanged for compatibility.
        raise ValueError('Not all sequence identifiers in the input BIOM '
                         'file are present in description fields in the '
                         'representative sequence fasta file.')
    return table
|