/usr/share/pyshared/geopy/parsers/html.py is in python-geopy 0.95.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | import re
from BeautifulSoup import BeautifulSoup, SoupStrainer
from geopy import Point, Location
from geopy.parsers import Parser
from geopy.util import unescape
# Optionally signed decimal number (integer digits optional, e.g. ".5").
# The trailing "$" plus the use of .match() by callers in this module means
# the number must span the whole string being tested.
FLOAT_RE = re.compile(r'([+-]?\d*\.?\d+)$')
class ICBMMetaTag(Parser):
    """Parser yielding a Location for each ``<meta name="ICBM">`` tag.

    The tag's ``content`` attribute is unescaped and handed to ``Point``;
    the resulting locations carry no name.
    """

    META_NAME = 'ICBM'

    def __init__(self, ignore_invalid=True):
        """Create the parser.

        ignore_invalid -- when true, tags whose content is missing, empty
        or rejected by ``Point`` are skipped; when false, the ``TypeError``
        or ``ValueError`` raised while building the point propagates.
        """
        self.ignore_invalid = ignore_invalid

    def find(self, document):
        """Generate ``Location(None, point)`` for each matching meta tag.

        document -- either an existing ``BeautifulSoup`` instance, which is
        searched in place, or markup that is parsed here with a strainer so
        that only the relevant ``<meta>`` tags are kept.
        """
        strainer = SoupStrainer('meta', attrs={'name': self.META_NAME})
        if isinstance(document, BeautifulSoup):
            tags = document.findAll(strainer)
        else:
            tags = BeautifulSoup(document, parseOnlyThese=strainer)

        for tag in tags:
            content = tag.get('content')
            # Missing/empty content is only attempted in strict mode, where
            # the failure is meant to surface to the caller.
            if self.ignore_invalid and not content:
                continue
            try:
                point = Point(unescape(content))
            except (TypeError, ValueError):
                if self.ignore_invalid:
                    continue
                raise
            yield Location(None, point)
class GeoMetaTag(Parser):
    """Parser yielding Locations described by ``geo.*`` ``<meta>`` tags.

    Consecutive tags such as ``geo.position`` and ``geo.placename`` are
    gathered into one attribute set.  A repeated attribute name starts a
    new set, so several locations can be described in one document.
    """

    META_NAME = re.compile(r'geo\.(\w+)')

    def __init__(self, ignore_invalid=True):
        """Create the parser.

        ignore_invalid -- when true, attribute sets whose position is empty
        or rejected by ``Point`` are skipped; when false, the ``TypeError``
        or ``ValueError`` raised while building the point propagates.
        """
        self.ignore_invalid = ignore_invalid

    def find(self, document):
        """Generate a Location for each complete group of geo meta tags.

        document -- either an existing ``BeautifulSoup`` instance, which is
        searched in place, or markup that is parsed here with a strainer so
        that only the relevant ``<meta>`` tags are kept.

        Documents without any usable ``geo.position`` tag yield nothing.
        """
        strainer = SoupStrainer('meta', attrs={'name': self.META_NAME})
        if not isinstance(document, BeautifulSoup):
            elements = BeautifulSoup(document, parseOnlyThese=strainer)
        else:
            elements = document.findAll(strainer)

        attrs = {}
        for element in elements:
            name_match = re.match(self.META_NAME, element['name'])
            if name_match is None:
                # NOTE(review): the strainer's regex test appears not to be
                # anchored at the start of the name, so a tag can get here
                # without matching from position 0.  Skip it instead of
                # failing on ``None.group``.
                continue
            attr_name = name_match.group(1)
            value = element.get('content')
            if attr_name in attrs:
                # A repeated attribute marks the start of the next location:
                # flush what has been collected so far.
                location = self._get_location(attrs)
                if location is not None:
                    yield location
                attrs.clear()
            attrs[attr_name] = value and unescape(value)

        # Flush the final (possibly empty) group.
        location = self._get_location(attrs)
        if location is not None:
            yield location

    def _get_location(self, attrs):
        """Build a Location from collected attributes, or return None.

        ``position`` and ``placename`` are removed from ``attrs``; whatever
        remains is passed along as the location's extra attributes.  Either
        key may be absent: a missing position yields ``None`` and a missing
        placename yields an unnamed location.

        Raises ``TypeError``/``ValueError`` from ``Point`` only when
        ``ignore_invalid`` is false.
        """
        # Defaults matter: the group may be empty (no geo tags at all) or
        # may contain only one of the two keys.
        position = attrs.pop('position', None)
        name = attrs.pop('placename', None)
        if position is None:
            return None
        if not position and self.ignore_invalid:
            return None
        try:
            point = Point(position)
        except (TypeError, ValueError):
            if not self.ignore_invalid:
                raise
            return None
        # Hand over a copy: the caller clears ``attrs`` right after the
        # location is yielded.
        return Location(name, point, dict(attrs))
class GeoMicroformat(Parser):
    """Parser yielding Locations from elements carrying the ``geo`` class.

    Coordinates are taken from nested ``latitude``/``longitude`` elements
    or, when ``shorthand`` is enabled, from the element's own value split
    on ``;``.  The location name is the element's whitespace-normalised
    text.
    """

    # Class attributes hold whitespace-separated tokens, and these patterns
    # are tested against the raw attribute string, so each one is anchored
    # to a whole token.  An unanchored pattern would also accept classes
    # such as "geography" or "values".
    GEO_CLASS = re.compile(r'(?:^|\s)geo(?:\s|$)')
    LATITUDE_CLASS = re.compile(r'(?:^|\s)latitude(?:\s|$)')
    LONGITUDE_CLASS = re.compile(r'(?:^|\s)longitude(?:\s|$)')
    VALUE_CLASS = re.compile(r'(?:^|\s)value(?:\s|$)')
    SEP = re.compile(r'\s*;\s*')

    def __init__(self, ignore_invalid=True, shorthand=True, abbr_title=True, value_excerpting=True):
        """Create the parser.

        ignore_invalid -- stored on the instance; ``find`` in this class
        does not consult it and always skips unparsable coordinates.
        shorthand -- accept ``latitude;longitude`` as the element's value
        when nested latitude/longitude elements are not both present.
        abbr_title -- for ``<abbr>`` elements, prefer the ``title``
        attribute over the element text.
        value_excerpting -- when descendants carry the ``value`` class, use
        only their concatenated text as the value.
        """
        self.ignore_invalid = ignore_invalid
        self.shorthand = shorthand
        self.abbr_title = abbr_title
        self.value_excerpting = value_excerpting

    def find(self, document):
        """Generate ``Location(name, (latitude, longitude))`` objects.

        document -- either an existing ``BeautifulSoup`` instance, which is
        searched in place, or markup that is parsed here with a strainer so
        that only elements with the geo class are kept.

        Elements whose latitude or longitude is missing or is not a plain
        decimal number are skipped.
        """
        strainer = SoupStrainer(attrs={'class': self.GEO_CLASS})
        if not isinstance(document, BeautifulSoup):
            elements = BeautifulSoup(document, parseOnlyThese=strainer)
        else:
            elements = document.findAll(strainer)

        for element in elements:
            preformatted = element.name == 'pre'
            lat_element = element.find(attrs={'class': self.LATITUDE_CLASS})
            long_element = element.find(attrs={'class': self.LONGITUDE_CLASS})
            latitude = None
            longitude = None
            if lat_element and long_element:
                latitude = self._get_value(lat_element, preformatted)
                longitude = self._get_value(long_element, preformatted)
            elif self.shorthand:
                lat_long = re.split(self.SEP, self._get_value(element), 1)
                if len(lat_long) == 2:
                    latitude, longitude = lat_long
            if latitude and longitude:
                # FLOAT_RE must match the whole string, so surrounding
                # whitespace (e.g. from indented markup) has to go first.
                lat_match = FLOAT_RE.match(unescape(latitude).strip())
                long_match = FLOAT_RE.match(unescape(longitude).strip())
                if lat_match and long_match:
                    latitude = float(lat_match.group(1))
                    longitude = float(long_match.group(1))
                    text = unescape(self._get_text(element).strip())
                    name = re.sub(r'\s+', ' ', text)
                    yield Location(name, (latitude, longitude))

    def _get_text(self, element, preformatted=False):
        """Return the text of ``element`` and its descendants.

        Strings have runs of whitespace collapsed to a single space unless
        ``preformatted`` is true or an enclosing ``<pre>`` is encountered;
        ``<br>`` elements contribute a newline.
        """
        if isinstance(element, basestring):
            if not preformatted:
                return re.sub(r'\s+', ' ', element)
            else:
                return element
        elif element.name == 'br':
            return '\n'
        else:
            pre = preformatted or element.name == 'pre'
            return "".join([self._get_text(node, pre) for node in element])

    def _get_value(self, element, preformatted=False):
        """Return the value of ``element``.

        In order of preference: the concatenated text of descendants with
        the ``value`` class (if ``value_excerpting``), the ``title`` of an
        ``<abbr>`` (if ``abbr_title`` and the attribute exists), otherwise
        the element's own text.
        """
        if self.value_excerpting:
            value_nodes = element.findAll(attrs={'class': self.VALUE_CLASS})
            if value_nodes:
                pre = preformatted or element.name == 'pre'
                values = [self._get_text(node, pre) for node in value_nodes]
                return "".join(values)
        if self.abbr_title and element.name == 'abbr':
            value = element.get('title')
            if value is not None:
                return value
        return self._get_text(element, preformatted)
|