This file is indexed.

/usr/lib/python2.7/dist-packages/biom/parse.py is in python-biom-format 2.1.5+dfsg-7.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2011-2013, The BIOM Format Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

from __future__ import division

import numpy as np
from future.utils import string_types

from biom.exception import BiomParseException, UnknownAxisError
from biom.table import Table
from biom.util import biom_open, __version__
import json
import collections

__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011-2013, The BIOM Format Development Team"
__credits__ = ["Justin Kuczynski", "Daniel McDonald", "Greg Caporaso",
               "Jose Carlos Clemente Litran", "Adam Robbins-Pianka",
               "Jose Antonio Navas Molina"]
__license__ = "BSD"
__url__ = "http://biom-format.org"
__maintainer__ = "Daniel McDonald"
__email__ = "daniel.mcdonald@colorado.edu"

MATRIX_ELEMENT_TYPE = {'int': int, 'float': float, 'unicode': str,
                       u'int': int, u'float': float, u'unicode': str}

QUOTE = '"'
JSON_OPEN = set(["[", "{"])
JSON_CLOSE = set(["]", "}"])
JSON_SKIP = set([" ", "\t", "\n", ","])
JSON_START = set(
    ["0",
     "1",
     "2",
     "3",
     "4",
     "5",
     "6",
     "7",
     "8",
     "9",
     "{",
     "[",
     '"'])


def direct_parse_key(biom_str, key):
    """Returns key:value from the biom string, or ""

    This method pulls an arbitrary key/value pair out from a BIOM string
    """
    base_idx = biom_str.find('"%s":' % key)
    if base_idx == -1:
        return ""
    else:
        start_idx = base_idx + len(key) + 3  # shift over "key":

    # find the start token
    cur_idx = start_idx
    while biom_str[cur_idx] not in JSON_START:
        cur_idx += 1

    if biom_str[cur_idx] not in JSON_OPEN:
        # do we have a number?
        while biom_str[cur_idx] not in [",", "{", "}"]:
            cur_idx += 1

    else:
        # we have an object
        stack = [biom_str[cur_idx]]
        cur_idx += 1
        while stack:
            cur_char = biom_str[cur_idx]

            if cur_char == QUOTE:
                if stack[-1] == QUOTE:
                    stack.pop()
                else:
                    stack.append(cur_char)
            elif cur_char in JSON_CLOSE:
                try:
                    stack.pop()
                except IndexError:  # got an int or float?
                    cur_idx -= 1
                    break
            elif cur_char in JSON_OPEN:
                stack.append(cur_char)
            cur_idx += 1

    return biom_str[base_idx:cur_idx]


def direct_slice_data(biom_str, to_keep, axis):
    """Pull out specific slices from a BIOM string

    biom_str : JSON-formatted BIOM string
    to_keep  : indices to keep
    axis     : either 'samples' or 'observations'

    Will raise IndexError if the inices are out of bounds. Fully zerod rows
    or columns are possible and this is _not_ checked.
    """
    if axis not in ['observation', 'sample']:
        raise IndexError("Unknown axis type")

    # it would be nice if all of these lookups could be done in a single
    # traversal of biom_str, but it likely is at the cost of code complexity
    shape_kv_pair = direct_parse_key(biom_str, "shape")
    if shape_kv_pair == "":
        raise ValueError("biom_str does not appear to be in BIOM format!")

    data_fields = direct_parse_key(biom_str, "data")
    if data_fields == "":
        raise ValueError("biom_str does not appear to be in BIOM format!")

    matrix_type_kv_pair = direct_parse_key(biom_str, "matrix_type")
    if matrix_type_kv_pair == "":
        raise ValueError("biom_str does not appear to be in BIOM format!")

    # determine shape
    raw_shape = shape_kv_pair.split(':')[-1].replace("[", "").replace("]", "")
    n_rows, n_cols = list(map(int, raw_shape.split(",")))

    # slice to just data
    data_start = data_fields.find('[') + 1
    # trim trailing ]
    data_fields = data_fields[data_start:len(data_fields) - 1]

    # bounds check
    if min(to_keep) < 0:
        raise IndexError("Observations to keep are out of bounds!")

    # more bounds check and set new shape
    new_shape = "[%d, %d]"
    if axis == 'observation':
        if max(to_keep) >= n_rows:
            raise IndexError("Observations to keep are out of bounds!")
        new_shape = new_shape % (len(to_keep), n_cols)
    elif axis == 'sample':
        if max(to_keep) >= n_cols:
            raise IndexError("Samples to keep are out of bounds!")
        new_shape = new_shape % (n_rows, len(to_keep))

    to_keep = set(to_keep)
    new_data = []

    if axis == 'observation':
        new_data = _direct_slice_data_sparse_obs(data_fields, to_keep)
    elif axis == 'sample':
        new_data = _direct_slice_data_sparse_samp(data_fields, to_keep)

    return '"data": %s, "shape": %s' % (new_data, new_shape)


def strip_f(x):
    return x.strip("[] \n\t")


def _remap_axis_sparse_obs(rcv, lookup):
    """Remap a sparse observation axis"""
    row, col, value = list(map(strip_f, rcv.split(',')))
    return "%s,%s,%s" % (lookup[row], col, value)


def _remap_axis_sparse_samp(rcv, lookup):
    """Remap a sparse sample axis"""
    row, col, value = list(map(strip_f, rcv.split(',')))
    return "%s,%s,%s" % (row, lookup[col], value)


def _direct_slice_data_sparse_obs(data, to_keep):
    """slice observations from data

    data : raw data string from a biom file
    to_keep : rows to keep
    """
    # interogate all the datas
    new_data = []
    remap_lookup = dict([(str(v), i) for i, v in enumerate(sorted(to_keep))])
    for rcv in data.split('],'):
        r, c, v = strip_f(rcv).split(',')
        if r in remap_lookup:
            new_data.append(_remap_axis_sparse_obs(rcv, remap_lookup))
    return '[[%s]]' % '],['.join(new_data)


def _direct_slice_data_sparse_samp(data, to_keep):
    """slice samples from data

    data : raw data string from a biom file
    to_keep : columns to keep
    """
    # could do sparse obs/samp in one forloop, but then theres the
    # expense of the additional if-statement in the loop
    new_data = []
    remap_lookup = dict([(str(v), i) for i, v in enumerate(sorted(to_keep))])
    for rcv in data.split('],'):
        r, c, v = rcv.split(',')
        if c in remap_lookup:
            new_data.append(_remap_axis_sparse_samp(rcv, remap_lookup))
    return '[[%s]]' % '],['.join(new_data)


def get_axis_indices(biom_str, to_keep, axis):
    """Returns the indices for the associated ids to keep

    biom_str : a BIOM formatted JSON string
    to_keep  : a list of IDs to get indices for
    axis     : either 'samples' or 'observations'

    Raises KeyError if unknown key is specified
    """
    to_keep = set(to_keep)
    if axis == 'observation':
        axis_key = 'rows'
        axis_data = direct_parse_key(biom_str, axis_key)
    elif axis == "sample":
        axis_key = 'columns'
        axis_data = direct_parse_key(biom_str, axis_key)
    else:
        raise ValueError("Unknown axis!")

    if axis_data == "":
        raise ValueError("biom_str does not appear to be in BIOM format!")

    axis_data = json.loads("{%s}" % axis_data)

    all_ids = set([v['id'] for v in axis_data[axis_key]])
    if not to_keep.issubset(all_ids):
        raise KeyError("Not all of the to_keep ids are in biom_str!")

    idxs = [i for i, v in enumerate(axis_data[axis_key]) if v['id'] in to_keep]
    idxs_lookup = set(idxs)

    subset = {axis_key: []}
    for i, v in enumerate(axis_data[axis_key]):
        if i in idxs_lookup:
            subset[axis_key].append(v)

    return idxs, json.dumps(subset)[1:-1]  # trim off { and }


def parse_uc(fh):
    """ Create a Table object from a uclust/usearch/vsearch uc file.

        Parameters
        ----------
        fh : file handle
            The ``.uc`` file to be parsed.

        Returns
        -------
        biom.Table : The resulting BIOM table.

        Raises
        ------
        ValueError
            If a sequence identifier is encountered that doesn't have at least
            one underscore in it (see Notes).

        Notes
        -----
        This function assumes sequence identifiers in this file are in QIIME's
        "post-split-libraries" format, where the identifiers are of the form
        ``<sample-id>_<sequence-id>``. Everything before the first underscore
        will be used as the sample identifier in the resulting ``Table``.
        The information after the first underscore is not used directly, though
        the full identifiers of seeds will be used as the observation
        identifier in the resulting ``Table``.

    """
    data = collections.defaultdict(int)
    sample_idxs = {}
    sample_ids = []
    observation_idxs = {}
    observation_ids = []
    # The types of hit lines we need here are hit (H), seed (S) and
    # library seed (L). Store these in a set for quick reference.
    line_types = set('HSL')
    for line in fh:
        # determine if the current line is one that we need
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')

        line_type = fields[0]
        if line_type not in line_types:
            continue

        # grab the fields we care about
        observation_id = fields[9].split()[0]
        query_id = fields[8].split()[0]

        if observation_id == '*':
            # S and L lines don't have a separate observation id
            observation_id = query_id

        # get the index of the current observation id, or create it if it's
        # the first time we're seeing this id
        if observation_id in observation_idxs:
            observation_idx = observation_idxs[observation_id]
        else:
            observation_idx = len(observation_ids)
            observation_ids.append(observation_id)
            observation_idxs[observation_id] = observation_idx

        if line_type == 'H' or line_type == 'S':
            # get the sample id
            try:
                underscore_index = query_id.index('_')
            except ValueError:
                raise ValueError(
                 "A query sequence was encountered that does not have an "
                 "underscore. An underscore is required in all query "
                 "sequence identifiers to indicate the sample identifier.")
            # get the sample id and its index, creating the index if it is the
            # first time we're seeing this id
            sample_id = query_id[:underscore_index]
            if sample_id in sample_idxs:
                sample_idx = sample_idxs[sample_id]
            else:
                sample_idx = len(sample_ids)
                sample_idxs[sample_id] = sample_idx
                sample_ids.append(sample_id)
            # increment the count of the current observation in the current
            # sample by one.
            data[(observation_idx, sample_idx)] += 1
        else:
            # nothing else needs to be done for 'L' records
            pass
    return Table(data, observation_ids=observation_ids, sample_ids=sample_ids)


def parse_biom_table(fp, ids=None, axis='sample', input_is_dense=False):
    r"""Parses the biom table stored in the filepath `fp`

    Parameters
    ----------
    fp : file like
        File alike object storing the BIOM table
    ids : iterable
        The sample/observation ids of the samples/observations that we need
        to retrieve from the biom table
    axis : {'sample', 'observation'}, optional
        The axis to subset on
    input_is_dense : boolean
        Indicates if the BIOM table is dense or sparse. Valid only for JSON
        tables.

    Returns
    -------
    Table
        The BIOM table stored at fp

    Raises
    ------
    ValueError
        If `samples` and `observations` are provided.

    Notes
    -----
    Subsetting from the BIOM table is only supported in one axis

    Examples
    --------
    Parse a hdf5 biom table

    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f) # doctest: +SKIP

    Parse a hdf5 biom table subsetting observations
    >>> from h5py import File # doctest: +SKIP
    >>> from biom.parse import parse_biom_table
    >>> f = File('rich_sparse_otu_table_hdf5.biom') # doctest: +SKIP
    >>> t = parse_biom_table(f, ids=["GG_OTU_1"],
    ...                      axis='observation') # doctest: +SKIP
    """
    if axis not in ['observation', 'sample']:
        UnknownAxisError(axis)

    try:
        return Table.from_hdf5(fp, ids=ids, axis=axis)
    except:
        pass
    if hasattr(fp, 'read'):
        old_pos = fp.tell()
        # Read in characters until first non-whitespace
        # If it is a {, then this is (most likely) JSON
        c = fp.read(1)
        while c.isspace():
            c = fp.read(1)
        if c == '{':
            fp.seek(old_pos)
            t = Table.from_json(json.load(fp), input_is_dense=input_is_dense)
        else:
            fp.seek(old_pos)
            t = Table.from_tsv(fp, None, None, lambda x: x)
    elif isinstance(fp, list):
        try:
            t = Table.from_json(json.loads(''.join(fp)),
                                input_is_dense=input_is_dense)
        except ValueError:
            t = Table.from_tsv(fp, None, None, lambda x: x)
    else:
        t = Table.from_json(json.loads(fp), input_is_dense=input_is_dense)

    def subset_ids(data, id_, md):
        return id_ in ids

    def gt_zero(vals, id_, md):
        return np.any(vals)

    if ids is not None:
        t.filter(subset_ids, axis=axis)
        axis = 'observation' if axis == 'sample' else 'sample'
        t.filter(gt_zero, axis=axis)

    return t


def sc_pipe_separated(x):
    complex_metadata = []
    for y in x.split('|'):
        simple_metadata = []
        for e in y.split(';'):
            simple_metadata.append(e.strip())
        complex_metadata.append(simple_metadata)
    return complex_metadata


class MetadataMap(dict):

    @classmethod
    def from_file(cls, lines, strip_quotes=True, suppress_stripping=False,
                  header=None, process_fns=None):
        """Parse mapping file that relates samples or observations to metadata.

        Format: header line with fields
                optionally other comment lines starting with #
                tab-delimited fields

        process_fns: a dictionary of functions to apply to metadata categories.
         the keys should be the column headings, and the values should be
         functions which take a single value. For example, if the values in a
         column called "taxonomy" should be split on semi-colons before being
         added as metadata, and all other columns should be left as-is,
         process_fns should be:
          {'taxonomy': lambda x: x.split(';')}

        Assumes the first column in the mapping file is the id.

        This method is ported from QIIME (http://www.qiime.org), previously
        named parse_mapping_file/parse_mapping_file_to_dict. QIIME is a GPL
        project, but we obtained permission from the authors of this method
        to port it to the BIOM Format project (and keep it under BIOM's BSD
        license).
        """
        if hasattr(lines, "upper"):
            # Try opening if a string was passed
            try:
                lines = open(lines, 'U')
            except IOError:
                raise BiomParseException("A string was passed that doesn't "
                                         "refer to an accessible filepath.")

        if strip_quotes:
            if suppress_stripping:
                def strip_f(x):
                    # remove quotes but not spaces
                    return x.replace('"', '')
            else:
                def strip_f(x):
                    # remove quotes and spaces
                    return x.replace('"', '').strip()
        else:
            if suppress_stripping:
                def strip_f(x):
                    # don't remove quotes or spaces
                    return x
            else:
                def strip_f(x):
                    # remove spaces but not quotes
                    return x.strip()

        # if the user didn't provide process functions, initialize as
        # an empty dict
        if process_fns is None:
            process_fns = {}

        # Create lists to store the results
        mapping_data = []
        header = header or []
        comments = []

        # Begin iterating over lines
        for line in lines:
            line = strip_f(line)
            if not line or (suppress_stripping and not line.strip()):
                # skip blank lines when not stripping lines
                continue

            if line.startswith('#'):
                line = line[1:]
                if not header:
                    header = line.strip().split('\t')
                else:
                    comments.append(line)
            else:
                # Will add empty string to empty fields
                tmp_line = list(map(strip_f, line.split('\t')))
                if len(tmp_line) < len(header):
                    tmp_line.extend([''] * (len(header) - len(tmp_line)))
                mapping_data.append(tmp_line)

        if not header:
            raise BiomParseException("No header line was found in mapping "
                                     "file.")
        if not mapping_data:
            raise BiomParseException("No data found in mapping file.")

        first_col = [i[0] for i in mapping_data]
        if len(first_col) != len(set(first_col)):
            raise BiomParseException("First column values are not unique! "
                                     "Cannot be ids.")

        mapping = {}
        for vals in mapping_data:
            current_d = {}
            for k, v in zip(header[1:], vals[1:]):
                try:
                    current_d[k] = process_fns[k](v)
                except KeyError:
                    current_d[k] = v
            mapping[vals[0]] = current_d

        return cls(mapping)

    def __init__(self, mapping):
        """Accepts dictionary mapping IDs to metadata.

        ``mapping`` should be a dictionary mapping an ID to a dictionary of
        metadata. For example:

        {'Sample1': {'Treatment': 'Fast'}, 'Sample2': {'Treatment': 'Control'}}
        """
        super(MetadataMap, self).__init__(mapping)


def generatedby():
    """Returns a generated by string"""
    return 'BIOM-Format %s' % __version__


def convert_table_to_biom(table_f, sample_mapping, obs_mapping,
                          process_func, **kwargs):
    """Convert a contigency table to a biom table

    sample_mapping : dict of {'sample_id':metadata} or None
    obs_mapping : dict of {'obs_id':metadata} or None
    process_func: a function to transform observation metadata
    dtype : type of table data
    """
    otu_table = Table.from_tsv(table_f, obs_mapping, sample_mapping,
                               process_func, **kwargs)
    return otu_table.to_json(generatedby())


def biom_meta_to_string(metadata, replace_str=':'):
    """Determine which format the metadata is and then convert to a string"""

    # Note that since ';' and '|' are used as seperators we must replace them
    # if they exist

    if isinstance(metadata, string_types):
        return metadata.replace(';', replace_str)
    elif isinstance(metadata, list):
        transtab = bytes.maketrans(';|', ''.join([replace_str, replace_str]))
        # metadata is list of lists
        if isinstance(metadata[0], list):
            new_metadata = []
            for x in metadata:
                # replace erroneus delimiters
                values = [y.strip().trans(transtab) for y in x]
                new_metadata.append("; ".join(values))
            return "|".join(new_metadata)

        # metadata is list (of strings)
        else:
            return (
                "; ".join(x.replace(';', replace_str).strip()
                          for x in metadata)
            )


def convert_biom_to_table(biom_f, header_key=None, header_value=None,
                          md_format=None):
    """Convert a biom table to a contigency table"""
    table = load_table(biom_f)
    if md_format is None:
        md_format = biom_meta_to_string

    if table.metadata(axis='observation') is None:
        return table.delimited_self()

    if header_key in table.metadata(axis='observation')[0]:
        return table.delimited_self(header_key=header_key,
                                    header_value=header_value,
                                    metadata_formatter=md_format)
    else:
        return table.delimited_self()


def load_table(f):
    r"""Load a `Table` from a path

    Parameters
    ----------
    f : str

    Returns
    -------
    Table

    Raises
    ------
    IOError
        If the path does not exist
    TypeError
        If the data in the path does not appear to be a BIOM table

    Examples
    --------
    Parse a table from a path. BIOM will attempt to determine if the fhe file
    is either in TSV, HDF5, JSON, gzip'd JSON or gzip'd TSV and parse
    accordingly:

    >>> from biom import load_table
    >>> table = load_table('path/to/table.biom') # doctest: +SKIP

    """
    with biom_open(f) as fp:
        try:
            table = parse_biom_table(fp)
        except (IndexError, TypeError):
            raise TypeError("%s does not appear to be a BIOM file!" % f)
    return table