/usr/lib/python2.7/dist-packages/GenomicConsensus/reference.py is in python-pbgenomicconsensus 2.1.0-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#################################################################################
# Copyright (c) 2011-2013, Pacific Biosciences of California, Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Pacific Biosciences nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
# THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#################################################################################
# Author: David Alexander
from __future__ import absolute_import
import logging, re, numpy as np
from collections import OrderedDict
from pbcore.io import ReferenceSet
from .windows import holes, kCoveredIntervals, enumerateIntervals
from .utils import die, nub
class WorkChunk(object):
"""
A chunk of the reference
"""
def __init__(self, window, hasCoverage):
self.window = window
self.hasCoverage = hasCoverage
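# Illustrative sketch (not part of the original module; the window tuples and
# flags below are hypothetical): a WorkChunk simply pairs a
# (refId, refStart, refEnd) window with a flag saying whether it has coverage.
#   WorkChunk(("chr1", 0, 500), True)     # window with adequate coverage
#   WorkChunk(("chr1", 500, 750), False)  # coverage cutout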
class UppercasingMmappedFastaSequence(object):
def __init__(self, mmappedFastaSequence):
self.other = mmappedFastaSequence
def __getitem__(self, spec):
snip = self.other.__getitem__(spec)
return snip.upper()
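# Illustrative sketch (an assumption, not in the original file): this wrapper
# delegates indexing/slicing to the underlying mmapped sequence and uppercases
# the result, normalizing soft-masked (lowercase) reference bases.  Assuming
# the underlying slice returns a string:
#   UppercasingMmappedFastaSequence(seq)[10:20] == seq[10:20].upper()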
class ReferenceContig(object):
"""
A contig from a reference (i.e. FASTA) file.
"""
def __init__(self, id, name, fullName, sequence, length):
self.id = id # CmpH5-local id
        self.name = name              # Prefix of FASTA header
self.fullName = fullName
self.sequence = UppercasingMmappedFastaSequence(sequence)
self.length = length
byName       = OrderedDict()   # FASTA header name (string, e.g. "chr1") -> ReferenceContig
byId         = OrderedDict()   # cmp.h5-local id (integer)               -> ReferenceContig
byPacBioName = OrderedDict()   # PacBio name (e.g. "ref000001")          -> ReferenceContig
def idToName(_id):
# At this point ids should always be names
return byName[_id].name
def idToFullName(_id):
# At this point ids should always be names
return byName[_id].fullName
# Interpret a string key (one of name, or id (as string))
# and find the associated id. Only to be used in interpretation of
# command-line input!
def anyKeyToId(stringKey):
assert isLoaded()
if stringKey in byName:
return byName[stringKey].name
elif stringKey in byPacBioName:
return byPacBioName[stringKey].name
elif stringKey.isdigit():
refId = int(stringKey)
# at this point, refId can still be the old numeric identifier
return byId[refId].name
else:
        raise Exception("Unknown reference name: %s" % stringKey)
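# Illustrative examples (hypothetical contig; not part of the original file).
# Assuming a contig whose FASTA name is "chr1", PacBio name is "ref000001",
# and cmp.h5-local id is 1, each spelling resolves to the same FASTA name:
#   anyKeyToId("chr1")      -> "chr1"   # FASTA header name
#   anyKeyToId("ref000001") -> "chr1"   # PacBio name
#   anyKeyToId("1")         -> "chr1"   # legacy numeric id, via byId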
def sequenceInWindow(window):
refId, refStart, refEnd = window
return byName[refId].sequence[refStart:refEnd]
filename = None
def isLoaded():
    return filename is not None
def loadFromFile(filename_, alnFile):
"""
Reads reference from FASTA file, loading
lookup tables that can be used any time later.
"""
    # Contigs in the FASTA may disagree with those in the cmp.h5 ref info
    # table, for instance if the FASTA has been edited.  Here's how we
    # handle things:
    #
    #   |fastaContigs \ cmpContigs| > 0  : OK, extra FASTA contigs are just ignored
    #   |cmpContigs \ fastaContigs| > 0  : Not necessarily OK---a warning is
    #                                      issued, and we then proceed to operate
    #                                      on the contigs that are in both.
    #   |cmpContigs & fastaContigs| == 0 : Nothing to work with.  This is an error.
    #
    # While we formerly used MD5s to vouch for the identity of a
    # contig, we now use the name.  This is an inferior approach but
    # is necessary when using the FastaTable.
# Load contigs
assert not isLoaded()
try:
f = ReferenceSet(filename_)
f.assertIndexed()
except IOError as e:
die(e)
cmpContigNames = set(alnFile.refNames)
for fastaRecord in f.contigs:
refName = fastaRecord.id
if refName in cmpContigNames:
refEntry = alnFile.referenceInfo(refName)
refId = refEntry.ID
pacBioName = refEntry.Name
refFullName = refEntry.FullName
sequence = UppercasingMmappedFastaSequence(fastaRecord.sequence)
length = len(fastaRecord.sequence)
contig = ReferenceContig(refId, refName, refFullName, sequence, length)
byId[refId] = contig
byName[refName] = contig
byPacBioName[pacBioName] = contig
loadedFastaContigNames = set(byName.keys())
logging.info("Loaded %d of %d reference groups from %s " %
(len(byName), len(loadedFastaContigNames), filename_))
if len(byName) == 0:
die("No reference groups in the FASTA file were aligned against. " \
"Did you select the wrong reference FASTA file?")
elif (cmpContigNames - loadedFastaContigNames):
logging.warn(
"Some reference contigs aligned against are not found in " \
"the reference FASTA. Will process only those contigs " \
"supported by the reference FASTA.")
global filename
filename = filename_
assert isLoaded()
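# Illustrative usage sketch (hypothetical paths; not part of the original
# file): the reference is loaded once, before any window arithmetic, e.g.
#   loadFromFile("/path/to/reference.fasta", alnFile)  # FASTA must be indexed
#   assert isLoaded()
# after which byName/byId/byPacBioName are populated for the contigs present
# in both the FASTA and the alignment file; extra FASTA contigs are ignored.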
def stringToWindow(s):
assert isLoaded()
if s is None:
return None
m = re.match("(.*):(.*)-(.*)", s)
if m:
refId = anyKeyToId(m.group(1))
refStart = int(m.group(2))
refEnd = min(int(m.group(3)), byName[refId].length)
else:
refId = anyKeyToId(s)
refStart = 0
refEnd = byName[refId].length
return (refId, refStart, refEnd)
def windowToString(referenceWindow):
assert isLoaded()
refId, refStart, refEnd = referenceWindow
return "%s:%d-%d" % (idToName(refId),
refStart,
refEnd)
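# Illustrative round trip (hypothetical contig "chr1" of length 10000; not
# part of the original file):
#   stringToWindow("chr1:100-200")      -> ("chr1", 100, 200)
#   stringToWindow("chr1")              -> ("chr1", 0, 10000)   # whole contig
#   windowToString(("chr1", 100, 200))  -> "chr1:100-200"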
def enumerateSpans(refId, referenceWindows=()):
"""
Enumerate the contiguous spans along this reference contig that
are to be analyzed.
"""
assert isLoaded()
referenceEntry = byName[refId]
referenceEntrySpan = (refId, 0, referenceEntry.length)
for refWin in (referenceWindows or [referenceEntrySpan]):
refWinId, start, end = refWin
if refWinId == refId:
yield (refId, start, end)
def enumerateChunks(refId, referenceStride, referenceWindows=()):
"""
Enumerate all work chunks on this reference contig (restricted to
the windows, if provided).
"""
for span in enumerateSpans(refId, referenceWindows):
for (s, e) in enumerateIntervals(span[1:], referenceStride):
yield WorkChunk((refId, s, e), True)
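# Illustrative sketch (hypothetical contig and stride; not part of the
# original file): a 2500-base span with referenceStride=1000 would typically
# yield chunks such as
#   WorkChunk(("chr1", 0, 1000), True)
#   WorkChunk(("chr1", 1000, 2000), True)
#   WorkChunk(("chr1", 2000, 2500), True)
# with the exact boundaries delegated to enumerateIntervals in .windows.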
def fancyEnumerateChunks(alnFile, refId, referenceStride,
minCoverage, minMapQV, referenceWindows=()):
"""
Enumerate chunks, creating chunks with hasCoverage=False for
coverage cutouts.
"""
# Pull out rows with this refId and good enough MapQV
rows = alnFile.index[
((alnFile.tId == alnFile.referenceInfo(refId).ID) &
(alnFile.mapQV >= minMapQV))]
unsorted_tStart = rows.tStart
unsorted_tEnd = rows.tEnd
# Sort (expected by CoveredIntervals)
sort_order = np.lexsort((unsorted_tEnd, unsorted_tStart))
tStart = unsorted_tStart[sort_order].tolist()
tEnd = unsorted_tEnd[sort_order].tolist()
for span in enumerateSpans(refId, referenceWindows):
_, spanStart, spanEnd = span
coveredIntervals = kCoveredIntervals(minCoverage, tStart, tEnd, spanStart, spanEnd)
unCoveredIntervals = holes(span, coveredIntervals)
for (s, e) in sorted(list(coveredIntervals) + unCoveredIntervals):
win = (refId, s, e)
if (s, e) in coveredIntervals:
for chunk in enumerateChunks(refId, referenceStride, [(refId, s, e)]):
yield chunk
else:
yield WorkChunk(win, False)
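# Illustrative sketch (hypothetical coverage profile; not part of the original
# file): if only bases 0-600 of a 1000-base span reach minCoverage at minMapQV,
# the generator would yield roughly
#   WorkChunk(("chr1", 0, 600), True)      # covered; subdivided per referenceStride
#   WorkChunk(("chr1", 600, 1000), False)  # coverage cutout, emitted as one chunk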
def numReferenceBases(refId, referenceWindows=()):
"""
Termination is determined to be when the result collector has
built consensus corresponding to the exact number of reference
bases in the window under consideration.
"""
return sum((end - start)
for (_, start, end) in enumerateSpans(refId, referenceWindows))
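# Illustrative arithmetic (hypothetical windows; not part of the original file):
#   numReferenceBases("chr1", [("chr1", 0, 100), ("chr1", 500, 650)]) == 250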
def enumerateIds(referenceWindows=()):
"""
Enumerate all refIds (subject to the referenceWindows restriction,
if provided).
"""
assert isLoaded()
if referenceWindows == ():
for refId in byName:
yield refId
else:
for refId in nub(refId for (refId, _, _) in referenceWindows):
yield refId
def enlargedReferenceWindow(refWin, overlap):
assert isLoaded()
refId, refStart, refEnd = refWin
contigLength = byName[refId].length
return (refId,
max(0, refStart - overlap),
min(refEnd + overlap, contigLength))
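# Illustrative sketch (hypothetical contig "chr1" of length 10000; not part of
# the original file): the enlarged window is clamped to the contig bounds:
#   enlargedReferenceWindow(("chr1", 0, 500), 5)      -> ("chr1", 0, 505)
#   enlargedReferenceWindow(("chr1", 9900, 10000), 5) -> ("chr1", 9895, 10000)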