/usr/lib/python2.7/dist-packages/biotools/analysis/cluster.py is in python-biotools 1.2.12-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | #!/usr/bin/env python
import biotools.sequence as sequ
import biotools.IO as io
import biotools.translate as tran
import biotools.clustal as clustal
import biotools.analysis.options as options
try:
import Queue as queue
except ImportError:
import queue
import hashlib
import subprocess
import threading
from os import sep, mkdir
def run(direc, inputs):
'''
Takes a collection of files generated by gene prediction, creates clusters
based off of the genes that have homology to those predicted genes, and
creates new fasta files in the clusters sub directory under the given
directory and separated according to whether they are nucleotide or amino
acid sequnces. These new fasta files are then used to create clustalw
alignments of the genes if more than 1 sequence exists in the fasta file.
'''
clusters = {}
all_ids = set()
ids = {}
q = queue.Queue()
filenames = []
def run_clustal():
while not q.empty():
cid = q.get()
dig = hashlib.md5()
dig.update(' '.join(cid))
dig = dig.hexdigest()
fpre = direc + 'nt' + sep + dig
apre = direc + 'aa' + sep + dig
fname = fpre + ".fasta"
aname = apre + ".fasta"
fh = io.open(fname, 'w')
ah = io.open(aname, 'w')
for ipt in clusters:
counter = 0
name = '_'.join(ipt.split(sep)[-1].split('.')[0].split())
for cluster in clusters[ipt]:
if cid & cluster[0]:
nm = name + '_' + str(counter)
seq = cluster[1]
curr = sequ.Sequence(nm, seq, defline=', '.join(cid))
tr = tran.translate(curr)
tr.name = curr.name
fh.write(curr)
ah.write(tr)
counter += 1
fh.close()
ah.close()
try:
clustal.run(fname, fpre + '.clustalw')
clustal.run(aname, apre + '.clustalw')
filenames.append(dig + '.fasta')
except ValueError:
pass
q.task_done()
if direc:
for d in [direc, direc + 'nt' + sep, direc + 'aa' + sep]:
try:
mkdir(d)
except OSError:
pass
for ipt in inputs:
seqs = {}
ids[ipt] = set()
for seq in io.open(ipt, 'r'):
ids[ipt].add(seq.name)
all_ids.add(seq.name)
if seq.seq not in seqs:
seqs[seq.seq] = set()
seqs[seq.seq].add(seq.name)
clusters[ipt] = [(seqs[k], k) for k in seqs]
del seqs
sub_ids = []
while all_ids:
cid = all_ids.pop()
subcluster = (all_ids | set([cid])) & \
set(i for ipt in clusters for cluster in clusters[ipt]
for i in cluster[0] if cid in cluster[0])
for ipt in clusters:
for cluster in clusters[ipt]:
if cid in cluster[0]:
subcluster = (subcluster & cluster[0]) | \
(subcluster - ids[ipt])
sub_ids.append(subcluster)
all_ids -= subcluster
for cid in sub_ids:
q.put(cid)
threads = []
for i in xrange(options.NUM_PROCESSES - 1):
curr = threading.Thread(target=run_clustal)
threads.append(curr)
curr.start()
run_clustal()
q.join()
return filenames
|