/usr/share/pyshared/cogent/db/util.py is in python-cogent 1.5.3-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
"""Retrieve information from web databases.
"""
from urllib import urlopen, urlretrieve, quote_plus
# Module metadata: authorship, copyright, license, release version and
# maintainer contact, exposed as module-level dunder attributes.
__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Rob Knight"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Production"
class UrlGetter(object):
    """Builds a URL from instance attributes and fetches it.

    Every instance attribute whose name appears in PrintedFields is rendered
    as a key/value pair (both sides passed through quote_plus) and the pairs
    are appended to BaseUrl.  Keyword arguments given to open/read/retrieve
    override the instance attributes for that single call only.
    """
    Defaults = {}           #override in derived classes -- default values
    PrintedFields = {}      #override in derived classes -- fields to print
    BaseUrl = ''            #override in derived classes
    KeyValDelimiter = '='
    FieldDelimiter = '&'

    def __init__(self, **kwargs):
        """Returns new instance with arbitrary kwargs.

        Defaults are applied first, so kwargs win on a name clash.
        """
        self.__dict__.update(self.Defaults)
        self.__dict__.update(kwargs)
        self._temp_args = {}

    def __str__(self):
        """Returns the full URL: BaseUrl followed by the printed fields."""
        to_get = self.__dict__.copy()
        #per-call overrides take precedence over the stored attributes
        to_get.update(self._temp_args)
        pairs = [quote_plus(k) + self.KeyValDelimiter + quote_plus(str(v))
                 for k, v in to_get.items() if k in self.PrintedFields]
        return self.BaseUrl + self.FieldDelimiter.join(pairs)

    def open(self, **kwargs):
        """Returns a stream handle to URL result, temporarily overriding kwargs.

        The overrides are cleared even if the request raises.
        """
        self._temp_args = kwargs
        try:
            return urlopen(str(self))
        finally:
            self._temp_args = {}

    def read(self, **kwargs):
        """Gets URL and reads into memory, temporarily overriding kwargs."""
        result = self.open(**kwargs)
        try:
            return result.read()
        finally:
            #always release the handle, even if the read fails part-way
            result.close()

    def retrieve(self, fname, **kwargs):
        """Gets URL and writes to file fname, temporarily overriding kwargs.

        Note: produces no return value.
        """
        self._temp_args = kwargs
        try:
            urlretrieve(str(self), fname)
        finally:
            #must be reset to a dict (not None): __str__ calls dict.update on it
            self._temp_args = {}
def expand_slice(s):
    """Takes a start and end accession, and gets the whole range.

    WARNING: Unlike standard slices, includes the last item in the range.
    In other words, obj[AF1001:AF1010] will include AF1010.

    Both accessions must have the same non-numeric prefix.

    -s : slice whose start and stop are accession strings; step is optional
         and defaults to 1.

    Returns a list of accession strings.  The numeric part is zero-padded to
    the width of the numeric part of the start accession.

    Raises TypeError if the prefixes differ, and ValueError if either
    accession does not end in digits.
    """
    start, step, end = s.start, s.step, s.stop
    #find where the number is
    start_index = last_nondigit_index(start)
    end_index = last_nondigit_index(end)
    prefix = start[:start_index]
    if prefix != end[:end_index]:
        raise TypeError("Range start and end don't have same prefix")
    #None means "no trailing digits"; fail clearly rather than in int()
    if start_index is None or end_index is None:
        raise ValueError("Range start and end must both end in digits")
    if not step:
        step = 1
    range_start = int(start[start_index:])
    range_end = int(end[end_index:])
    field_width = len(start) - start_index
    #width and precision are both set so the number is zero-padded
    format_string = '%{0}.{0}d'.format(field_width)
    return [prefix + format_string % i
            for i in range(range_start, range_end + 1, step)]
def make_lists_of_expanded_slices_of_set_size(s,size_limit=200):
    """Returns a list of Accessions terms from 'expand_slice'.

    GenBank URLs are limited in size. This helps break up larger lists
    of Accessions (e.g. thousands) into GenBank friendly sizes for down
    stream fetching.

    -s : slice of accessions
    -size_limit : max items each list should contain

    Each returned term is a space-joined string of at most size_limit
    accessions.  No empty term is produced, so an empty expansion gives [].

    Raises ValueError if size_limit is less than 1.
    """
    if size_limit < 1:
        raise ValueError("size_limit must be a positive integer")
    full_list = expand_slice(s)
    #stepping by size_limit never yields an empty trailing chunk
    return [' '.join(full_list[start:start + size_limit])
            for start in range(0, len(full_list), size_limit)]
def make_lists_of_accessions_of_set_size(s,size_limit=200):
    """Returns list of search terms that contain accessions up to the size
    'size_limit'

    This is to help make friendly GenBank urls for fetching large lists
    of accessions (1000s).

    -s : list of accessions
    -size_limit : max items each list should contain

    Each returned term is a space-joined string of at most size_limit
    accessions.  No empty term is produced, so an empty input gives [].

    Raises ValueError if size_limit is less than 1.
    """
    if size_limit < 1:
        raise ValueError("size_limit must be a positive integer")
    #stepping by size_limit never yields an empty trailing chunk
    return [' '.join(s[start:start + size_limit])
            for start in range(0, len(s), size_limit)]
def last_nondigit_index(s):
    """Returns the smallest index i such that s[i:] is all digits, or None.

    In other words, the position where the trailing run of digits starts.
    Returns 0 when s consists only of digits, and None when s is empty or
    does not end in a digit.
    """
    i = len(s)
    #walk back over the trailing digits; one-char slices avoid indexing
    while i > 0 and s[i - 1:i].isdigit():
        i -= 1
    if i == len(s):
        #if we get here, there weren't any trailing digits
        return None
    return i
|