/usr/share/pyshared/cogent/parse/cigar.py is in python-cogent 1.5.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | #!/usr/bin/env python
"""Parsers for the cigar format
Cigar stands for Compact Idiosyncratic Gapped Alignment Report and defines the sequence
of matches/mismatches and deletions (or gaps). Cigar line is used in Ensembl database
for both multiple and pairwise genomic alignment.
for example, this cigar line 2MD3M2D2M will mean that the alignment contains 2 matches/
mismatches, 1 deletion (number 1 is omitted in order to save some spaces), 3 matches/
mismatches, 2 deletion and 2 matches/mismatches.
if the original sequence is: AACGCTT
the cigar line is: 2MD3M2D2M
the aligned sequence will be:
M M D M M M D D M M
A A - C G C - - T T
"""
import re
from cogent.core.location import LostSpan, Span, Map, _LostSpan
from cogent import DNA, LoadSeqs
__author__ = "Hua Ying"
__copyright__ = "Copyright 2007-2011, The Cogent Project"
__credits__ = ["Hua Ying"]
__license__ = "GPL"
__version__ = "1.5.1"
__maintainer__ = "Hua Ying"
__email__ = "hua.ying@anu.edu.au"
__status__ = "Production"
pattern = re.compile('([0-9]*)([DM])')
def map_to_cigar(map):
"""convert a Map into a cigar string"""
cigar = ''
for span in map.spans:
if isinstance(span, Span):
num_chars = span.End-span.Start
char = 'M'
else:
num_chars = span.length
char = 'D'
if num_chars == 1:
cigar += char
else:
cigar += str(num_chars)+char
return cigar
def cigar_to_map(cigar_text):
"""convert cigar string into Map"""
assert 'I' not in cigar_text
spans, posn = [], 0
for n, c in pattern.findall(cigar_text):
if n:
n = int(n)
else:
n = 1
if c == 'M':
spans.append(Span(posn, posn+n))
posn += n
else:
spans.append(LostSpan(n))
map = Map(spans = spans, parent_length = posn)
return map
def aligned_from_cigar(cigar_text, seq, moltype=DNA):
"""returns an Aligned sequence from a cigar string, sequence and moltype"""
if isinstance(seq, str):
seq = moltype.makeSequence(seq)
map = cigar_to_map(cigar_text)
aligned_seq = seq.gappedByMap(map)
return aligned_seq
def _slice_by_aln(map, left, right):
slicemap = map[left:right]
if hasattr(slicemap, 'Start'):
location = [slicemap.Start, slicemap.End]
else:
location = []
return slicemap, location
def _slice_by_seq(map, start, end):
re_map = map.inverse()
slicemap = re_map[start:end]
aln_start, aln_end = slicemap.Start, slicemap.End
new_map = map[aln_start:aln_end]
return new_map, [aln_start, aln_end]
def _remap(map):
start = map.Start
if start == 0:
new_map = map
new_map.parent_length = map.End
else:
spans = []
for span in map.spans:
if span.lost:
spans.append(span)
else:
span.Start = span.Start - start
span.End = span.End - start
length = span.End
spans.append(span)
new_map = Map(spans = spans, parent_length = length)
return new_map
def slice_cigar(cigar_text, start, end, by_align=True):
"""slices a cigar string as an alignment"""
map = cigar_to_map(cigar_text)
if by_align:
new_map, location = _slice_by_aln(map, start, end)
else:
new_map, location = _slice_by_seq(map, start, end)
if hasattr(new_map, 'Start'):
new_map = _remap(new_map)
return new_map, location
def CigarParser(seqs, cigars, sliced = False, ref_seqname = None, start = None, end = None, moltype=DNA):
"""return an alignment from raw sequences and cigar strings
if sliced, will return an alignment correspondent to ref sequence start to end
Arguments:
seqs - raw sequences as {seqname: seq}
cigars - corresponding cigar text as {seqname: cigar_text}
cigars and seqs should have the same seqnames
MolType - optional default to DNA
"""
data = {}
if not sliced:
for seqname in seqs.keys():
aligned_seq = aligned_from_cigar(cigars[seqname],
seqs[seqname], moltype=moltype)
data[seqname] = aligned_seq
else:
ref_aln_seq = aligned_from_cigar(cigars[ref_seqname],
seqs[ref_seqname], moltype=moltype)
m, aln_loc = slice_cigar(cigars[ref_seqname], start, end, by_align = False)
data[ref_seqname] = ref_aln_seq[aln_loc[0]:aln_loc[1]]
for seqname in [seqname for seqname in seqs.keys() if seqname != ref_seqname]:
m, seq_loc = slice_cigar(cigars[seqname], aln_loc[0], aln_loc[1])
if seq_loc:
seq = seqs[seqname]
if isinstance(seq, str):
seq = moltype.makeSequence(seq)
data[seqname] = seq[seq_loc[0]:seq_loc[1]].gappedByMap(m)
else:
data[seqname] = DNA.makeSequence('-'*(aln_loc[1] - aln_loc[0]))
aln = LoadSeqs(data = data, aligned = True)
return aln
|