This file is indexed.

/usr/lib/python2.7/dist-packages/lazr/uri/_uri.py is in python-lazr.uri 1.0.3-2build1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
# Copyright 2009 Canonical Ltd.  All rights reserved.
#
# This file is part of lazr.uri
#
# lazr.uri is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# lazr.uri is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with lazr.uri.  If not, see <http://www.gnu.org/licenses/>.

"""Functions for working with generic syntax URIs."""

# On Python 2 this makes every class defined in this module new-style.
__metaclass__ = type
# Public names exported by ``from <module> import *``.
__all__ = [
    'URI',
    'InvalidURIError',
    'find_uris_in_text',
    'possible_uri_re',
    'merge',
    'remove_dot_segments',
    ]

import re

# Python 3 has no separate ``unicode`` type; alias it to ``str`` so that
# the isinstance() check in URI.__init__ works on both major versions.
try:
    unicode
except NameError:
    unicode = str

# Default port numbers for different URI schemes.
# The registered URI schemes come from
#    http://www.iana.org/assignments/uri-schemes.html
# The default ports come from the relevant RFCs.
#
# Ports are stored as strings so that they can be compared directly with
# the port text held on a URI (URI._normalise drops a port that equals
# the default for the URI's scheme).

_default_port = {
    # Official schemes
    'acap': '674',
    'dav': '80',
    'dict': '2628',
    'dns': '53',
    'ftp': '21',
    'go': '1096',
    'gopher': '70',
    'h323': '1720',
    'http': '80',
    'https': '443',
    'imap': '143',
    'ipp': '631',
    'iris.beep': '702',
    'ldap': '389',
    'mtqp': '1038',
    'mupdate': '3905',
    'nfs': '2049',
    'nntp': '119',
    'pop': '110',
    'rtsp': '554',
    'sip': '5060',
    'sips': '5061',
    'snmp': '161',
    'soap.beep': '605',
    'soap.beeps': '605',
    'telnet': '23',
    'tftp': '69',
    'tip': '3372',
    'vemmi': '575',
    'xmlrpc.beep': '602',
    'xmlrpc.beeps': '602',
    'z39.50r': '210',
    'z39.50s': '210',

    # Historical schemes
    'prospero': '1525',
    'wais': '210',

    # Common but unregistered schemes
    'bzr+http': '80',
    'bzr+ssh': '22',
    'irc': '6667',
    'sftp': '22',
    'ssh': '22',
    'svn': '3690',
    'svn+ssh': '22',
    }

# Regular expressions adapted from the ABNF in the RFC.
#
# The fragments are written with lower case letters only; the compiled
# patterns at the end of this section use re.IGNORECASE so that upper
# case input matches as well.

scheme_re = r"(?P<scheme>[a-z][-a-z0-9+.]*)"

userinfo_re = r"(?P<userinfo>(?:[-a-z0-9._~!$&\'()*+,;=:]|%[0-9a-f]{2})*)"
# The following regular expression will match some IP address style
# host names that the RFC would not (e.g. leading zeros on the
# components), but is significantly simpler.
host_re = (r"(?P<host>[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|"
           r"(?:[-a-z0-9._~!$&\'()*+,;=]|%[0-9a-f]{2})*|"
           r"\[[0-9a-z:.]+\])")
port_re = r"(?P<port>[0-9]*)"

authority_re = r"(?P<authority>(?:%s@)?%s(?::%s)?)" % (
    userinfo_re, host_re, port_re)

# The path forms.  path_noscheme_re differs from path_rootless_re only
# in that its first segment may not contain a ':'.
path_abempty_re = r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*"
path_noscheme_re = (r"(?:[-a-z0-9._~!$&\'()*+,;=@]|%[0-9a-f]{2})+"
                    r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*")
path_rootless_re = (r"(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})+"
                    r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*")
path_absolute_re = r"/(?:%s)?" % path_rootless_re
path_empty_re = r""

# hier-part is used for full URIs, relative-part for relative
# references; they differ in using path_rootless_re versus
# path_noscheme_re for the authority-less, non-absolute form.
hier_part_re = r"(?P<hierpart>//%s%s|%s|%s|%s)" % (
    authority_re, path_abempty_re, path_absolute_re, path_rootless_re,
    path_empty_re)

relative_part_re = r"(?P<relativepart>//%s%s|%s|%s|%s)" % (
    authority_re, path_abempty_re, path_absolute_re, path_noscheme_re,
    path_empty_re)

# Additionally we also permit square brackets in the query portion to
# accommodate real-world URIs.
query_re = r"(?P<query>(?:[-a-z0-9._~!$&\'()*+,;=:@/?\[\]]|%[0-9a-f]{2})*)"
fragment_re = r"(?P<fragment>(?:[-a-z0-9._~!$&\'()*+,;=:@/?]|%[0-9a-f]{2})*)"

# Both complete patterns end in '$'; callers use .match(), which anchors
# the start, so the whole input has to be a URI / relative reference.
uri_re = r"%s:%s(?:\?%s)?(?:#%s)?$" % (
    scheme_re, hier_part_re, query_re, fragment_re)

relative_ref_re = r"%s(?:\?%s)?(?:#%s)?$" % (
    relative_part_re, query_re, fragment_re)

uri_pat = re.compile(uri_re, re.IGNORECASE)
relative_ref_pat = re.compile(relative_ref_re, re.IGNORECASE)


def merge(basepath, relpath, has_authority):
    """Combine a base path and a relative path into one path component.

    This is the "merge" routine of RFC 3986, Section 5.2.3.

    :param basepath: path component of the base URI.
    :param relpath: path component of the relative reference.
    :param has_authority: whether the base URI has an authority part.
        The RFC handles an empty base path differently depending on
        this, hence the extra argument.
    :return: ``relpath`` placed after everything in ``basepath`` up to
        and including its last '/' (or ``relpath`` alone when
        ``basepath`` has no '/'), except that an empty ``basepath`` on a
        URI with an authority yields ``'/' + relpath``.
    """
    if basepath == '' and has_authority:
        return '/' + relpath
    # rpartition() gives ('', '', basepath) when there is no slash, so
    # the base path is then discarded entirely.
    directory, separator, _last_segment = basepath.rpartition('/')
    return directory + separator + relpath


def remove_dot_segments(path):
    """Strip '.' and '..' segments out of a URI path.

    Implements the algorithm given in Section 5.2.4 of RFC 3986: the
    input is consumed from the left, whole segments are moved to an
    output buffer, and each '..' segment discards the most recently
    buffered segment (if there is one).

    :param path: the path component to process.
    :return: the path with dot segments removed.
    """
    kept = []
    rest = path
    while rest:
        # Leading "../" or "./": drop the prefix.
        if rest.startswith('../'):
            rest = rest[3:]
            continue
        if rest.startswith('./'):
            rest = rest[2:]
            continue
        # "/./" or a final "/.": collapse to a single "/".
        if rest == '/.' or rest.startswith('/./'):
            rest = '/' + rest[3:]
            continue
        # "/../" or a final "/..": collapse to "/" and back up one
        # segment in the output.
        if rest == '/..' or rest.startswith('/../'):
            rest = '/' + rest[4:]
            if kept:
                kept.pop()
            continue
        # Nothing left but a lone dot segment.
        if rest in ('.', '..'):
            break
        # Move the first segment (with its leading slash, if any) to
        # the output.  Searching from index 1 skips a leading slash and
        # is harmless otherwise, since index 0 is then not a slash.
        end = rest.find('/', 1)
        if end == -1:
            end = len(rest)
        kept.append(rest[:end])
        rest = rest[end:]
    return ''.join(kept)


def normalise_unreserved(string):
    """Return a version of 'string' where no unreserved characters are encoded.

    Unreserved characters are defined in Section 2.3 of RFC 3986.

    Percent encoded sequences are normalised to upper case.

    Only well-formed escapes (a '%' followed by exactly two hexadecimal
    digits) are touched.  Any other '%' is left exactly as it was, so
    malformed input is passed through unchanged rather than having its
    '%' dropped or being rewritten.

    :param string: the URI component text to normalise.
    :return: the normalised text.
    """
    unreserved = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  'abcdefghijklmnopqrstuvwxyz'
                  '0123456789-._~')

    def normalise_escape(match):
        # Decode the escape if it stands for an unreserved character,
        # otherwise keep it but with upper case hex digits.
        hex_digits = match.group(1)
        char = chr(int(hex_digits, 16))
        if char in unreserved:
            return char
        return '%' + hex_digits.upper()

    # Matches are non-overlapping and the replacement text is not
    # rescanned, so "%2541" stays "%2541" rather than becoming "%A".
    return re.sub(r'%([0-9A-Fa-f]{2})', normalise_escape, string)


class InvalidURIError(Exception):
    """Raised when a string or set of components is not a valid URI.

    URI() raises this for text that contains non-ASCII characters or
    does not match the generic URI syntax, for keyword construction that
    lacks a required component, and for a scheme in its host-requiring
    list that has no host.  URI.resolve() raises it for an invalid
    relative reference.
    """


class URI:
    """A class that represents a URI.

    This class can represent arbitrary URIs that conform to the
    generic syntax described in RFC 3986.

    The parsed and normalised components are available as the
    attributes scheme, userinfo, host, port, path, query and fragment.
    An absent component is None, except for path which is always a
    string.  Instances compare equal, and hash equal, when their scheme,
    authority, path, query and fragment are all equal.
    """

    def __init__(self, uri=None, scheme=None, userinfo=None, host=None,
                 port=None, path=None, query=None, fragment=None):
        """Create a URI instance.

        Can be called with either a string URI or the component parts
        of the URI as keyword arguments.

        In either case, all arguments are expected to be appropriately
        URI encoded.

        :raises InvalidURIError: if the string contains non-ASCII
            characters or is not a syntactically valid URI; if, in
            keyword form, the scheme or path is missing or userinfo/port
            is given without a host; or if the scheme is one that
            requires a host name and there is none.
        """
        assert (uri is not None and scheme is None and userinfo is None and
                host is None and port is None and path is None and
                query is None and fragment is None) or uri is None, (
            "URI() must be called with a single string argument or "
            "with URI components given as keyword arguments.")

        if uri is not None:
            if isinstance(uri, unicode):
                try:
                    uri.encode('ASCII')
                except UnicodeEncodeError:
                    raise InvalidURIError(
                        'URIs must consist of ASCII characters')
            match = uri_pat.match(uri)
            if match is None:
                raise InvalidURIError('"%s" is not a valid URI' % uri)
            self.scheme = match.group('scheme')
            self.userinfo = match.group('userinfo')
            self.host = match.group('host')
            self.port = match.group('port')
            hierpart = match.group('hierpart')
            authority = match.group('authority')
            if authority is None:
                self.path = hierpart
            else:
                # Skip past the //authority part
                self.path = hierpart[2+len(authority):]
            self.query = match.group('query')
            self.fragment = match.group('fragment')
        else:
            if scheme is None:
                raise InvalidURIError('URIs must have a scheme')
            if host is None and (userinfo is not None or port is not None):
                raise InvalidURIError(
                    'host must be given if userinfo or port are')
            if path is None:
                raise InvalidURIError('URIs must have a path')
            self.scheme = scheme
            self.userinfo = userinfo
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

        self._normalise()

        # These schemes are only accepted with a non-empty host.  The
        # check runs after normalisation so the scheme is lower case.
        if (self.scheme in ['http', 'https', 'ftp', 'gopher', 'telnet',
                           'imap', 'mms', 'rtsp', 'svn', 'svn+ssh',
                           'bzr', 'bzr+http', 'bzr+ssh'] and
            not self.host):
            raise InvalidURIError('%s URIs must have a host name' %
                                  self.scheme)

    def _normalise(self):
        """Perform normalisation of URI components.

        Lower-cases the scheme and host, decodes percent-encoded
        unreserved characters, drops an empty port or one equal to the
        scheme's default, gives a URI with a host and an empty path the
        path '/', and removes dot segments from the path.
        """
        self.scheme = self.scheme.lower()

        if self.userinfo is not None:
            self.userinfo = normalise_unreserved(self.userinfo)
        if self.host is not None:
            # Decode unreserved characters *before* lower-casing so that
            # an escaped upper case letter (e.g. "%41") is case-folded
            # like a literal one.  lower() also lower-cases the hex
            # digits of the escapes that remain, so normalise once more
            # to put those back into upper case.
            self.host = normalise_unreserved(
                normalise_unreserved(self.host).lower())
        if self.port == '':
            self.port = None
        elif self.port is not None:
            if self.port == _default_port.get(self.scheme):
                self.port = None
        if self.host is not None and self.path == '':
            self.path = '/'
        self.path = normalise_unreserved(remove_dot_segments(self.path))

        if self.query is not None:
            self.query = normalise_unreserved(self.query)
        if self.fragment is not None:
            self.fragment = normalise_unreserved(self.fragment)

    @property
    def authority(self):
        """The authority part of the URI, or None if there is no host.

        Built as ``[userinfo@]host[:port]`` from the current attributes.
        """
        if self.host is None:
            return None
        authority = self.host
        if self.userinfo is not None:
            authority = '%s@%s' % (self.userinfo, authority)
        if self.port is not None:
            authority = '%s:%s' % (authority, self.port)
        return authority

    @property
    def hier_part(self):
        """The hierarchical part of the URI.

        This is ``//authority`` followed by the path when there is an
        authority, and just the path otherwise.
        """
        authority = self.authority
        if authority is None:
            return self.path
        else:
            return '//%s%s' % (authority, self.path)

    def __str__(self):
        """Return the URI reassembled from its components."""
        uri = '%s:%s' % (self.scheme, self.hier_part)
        if self.query is not None:
            uri += '?%s' % self.query
        if self.fragment is not None:
            uri += '#%s' % self.fragment
        return uri

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, str(self))

    def __eq__(self, other):
        """Compare component-wise with another instance of this class."""
        if isinstance(other, self.__class__):
            return (self.scheme == other.scheme and
                    self.authority == other.authority and
                    self.path == other.path and
                    self.query == other.query and
                    self.fragment == other.fragment)
        else:
            return NotImplemented

    def __ne__(self, other):
        equal = self.__eq__(other)
        # NotImplemented is a singleton, so test identity, not equality.
        if equal is NotImplemented:
            return NotImplemented
        else:
            return not equal

    def __hash__(self):
        """Hash on the same components that __eq__ compares.

        Defining __eq__ alone leaves instances unhashable on Python 3
        and hashed by identity on Python 2, which disagrees with
        equality.  The hash reflects the current attribute values, so
        an instance should not be mutated while it is used as a
        dictionary key or set member.
        """
        return hash((self.scheme, self.authority, self.path,
                     self.query, self.fragment))

    def replace(self, **parts):
        """Replace one or more parts of the URI, returning the result.

        :param parts: keyword arguments named after the URI components
            (scheme, userinfo, host, port, path, query, fragment).
        :return: this same object when no parts are given, otherwise a
            new instance of this class built from the current
            components with the given ones substituted.
        """
        if not parts:
            return self
        baseparts = dict(
            scheme=self.scheme,
            userinfo=self.userinfo,
            host=self.host,
            port=self.port,
            path=self.path,
            query=self.query,
            fragment=self.fragment)
        baseparts.update(parts)
        return self.__class__(**baseparts)

    def resolve(self, reference):
        """Resolve the given URI reference relative to this URI.

        Uses the rules from Section 5.2 of RFC 3986 to resolve the new
        URI.

        :param reference: a URI or relative reference, as a string.
        :return: a new instance of this class.
        :raises InvalidURIError: if the reference is neither a valid URI
            nor a valid relative reference, or if the resolved
            components do not make a valid URI.
        """
        # If the reference is a full URI, then return it as is.
        try:
            return self.__class__(reference)
        except InvalidURIError:
            pass

        match = relative_ref_pat.match(reference)
        if match is None:
            raise InvalidURIError("Invalid relative reference")

        parts = dict(scheme=self.scheme)
        authority = match.group('authority')
        if authority is not None:
            # The reference carries its own authority: only the scheme
            # is inherited from this URI.
            parts['userinfo'] = match.group('userinfo')
            parts['host'] = match.group('host')
            parts['port'] = match.group('port')
            # Skip over the //authority part
            parts['path'] = remove_dot_segments(
                match.group('relativepart')[2+len(authority):])
            parts['query'] = match.group('query')
        else:
            path = match.group('relativepart')
            query = match.group('query')
            if path == '':
                # Empty path: keep this URI's path, and its query too
                # unless the reference supplies one.
                parts['path'] = self.path
                if query is not None:
                    parts['query'] = query
                else:
                    parts['query'] = self.query
            else:
                if path.startswith('/'):
                    parts['path'] = remove_dot_segments(path)
                else:
                    parts['path'] = merge(self.path, path,
                                          has_authority=self.host is not None)
                    parts['path'] = remove_dot_segments(parts['path'])
                parts['query'] = query
            parts['userinfo'] = self.userinfo
            parts['host'] = self.host
            parts['port'] = self.port
        parts['fragment'] = match.group('fragment')

        return self.__class__(**parts)

    def append(self, path):
        """Append the given path to this URI.

        The path must not start with a slash, but a slash is added to
        base URI (before appending the path), in case it doesn't end
        with a slash.
        """
        assert not path.startswith('/')
        return self.ensureSlash().resolve(path)

    def contains(self, other):
        """Returns True if the URI 'other' is contained by this one.

        'other' is contained when it has the same scheme and authority
        and its path is equal to, or lies beneath, this URI's path.
        Query and fragment are not considered.
        """
        if (self.scheme != other.scheme or
            self.authority != other.authority):
            return False
        if self.path == other.path:
            return True
        # Compare with trailing slashes so that "/foo" contains
        # "/foo/bar" but not "/foobar".
        basepath = self.path
        if not basepath.endswith('/'):
            basepath += '/'
        otherpath = other.path
        if not otherpath.endswith('/'):
            otherpath += '/'
        return otherpath.startswith(basepath)

    def underDomain(self, domain):
        """Return True if the given domain name is a parent of the URI's host.

        The comparison is made on whole dot-separated labels, so the
        host "www.example.com" is under "example.com" but not under
        "ample.com".  An empty domain is a parent of everything; a URI
        without a host is under no non-empty domain.
        """
        if len(domain) == 0:
            return True
        if self.host is None:
            # No host to compare; previously this raised AttributeError.
            return False
        our_segments = self.host.split('.')
        domain_segments = domain.split('.')
        return our_segments[-len(domain_segments):] == domain_segments

    def ensureSlash(self):
        """Return a URI with the path normalised to end with a slash."""
        if self.path.endswith('/'):
            return self
        else:
            return self.replace(path=self.path + '/')

    def ensureNoSlash(self):
        """Return a URI with the path normalised to not end with a slash."""
        if self.path.endswith('/'):
            return self.replace(path=self.path.rstrip('/'))
        else:
            return self


# Regular expression for finding URIs in a body of text:
#
# From RFC 3986 ABNF for URIs:
#
#   URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
#   hier-part     = "//" authority path-abempty
#                 / path-absolute
#                 / path-rootless
#                 / path-empty
#
#   authority     = [ userinfo "@" ] host [ ":" port ]
#   userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
#   host          = IP-literal / IPv4address / reg-name
#   reg-name      = *( unreserved / pct-encoded / sub-delims )
#   port          = *DIGIT
#
#   path-abempty  = *( "/" segment )
#   path-absolute = "/" [ segment-nz *( "/" segment ) ]
#   path-rootless = segment-nz *( "/" segment )
#   path-empty    = 0<pchar>
#
#   segment       = *pchar
#   segment-nz    = 1*pchar
#   pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
#
#   query         = *( pchar / "/" / "?" )
#   fragment      = *( pchar / "/" / "?" )
#
#   unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
#   pct-encoded   = "%" HEXDIG HEXDIG
#   sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
#                 / "*" / "+" / "," / ";" / "="
#
# We only match a set of known scheme names.  We don't handle
# IP-literal either.
#
# We will simplify "unreserved / pct-encoded / sub-delims" as the
# following regular expression:
#   [-a-zA-Z0-9._~%!$&'()*+,;=]
#
# We also require that the path-rootless form not begin with a
# colon to avoid matching strings like "http::foo" (to avoid bug
# #40255).
#
# The path-empty pattern is not matched either, due to false
# positives.
#
# Some allowed URI punctuation characters will be trimmed if they
# appear at the end of the URI since they may be incidental in the
# flow of the text.
#
# apport has at one time produced query strings containing square
# brackets (that are not percent-encoded). In RFC 3986 they seem to be
# allowed by section 2.2 "Reserved Characters", yet section 3.4
# "Query" appears to provide a strict definition of the query string
# that would forbid square brackets. Either way, links with
# non-percent-encoded square brackets are being used on Launchpad so
# it's probably best to accommodate them.

possible_uri_re = r'''
\b
(?:about|gopher|http|https|sftp|news|ftp|mailto|file|irc|jabber|xmpp)
:
(?:
  (?:
    # "//" authority path-abempty
    //
    (?: # userinfo
      [%(unreserved)s:]*
      @
    )?
    (?: # host
      \d+\.\d+\.\d+\.\d+ |
      [%(unreserved)s]*
    )
    (?: # port
      : \d*
    )?
    (?: / [%(unreserved)s:@]* )*
  ) | (?:
    # path-absolute
    /
    (?: [%(unreserved)s:@]+
        (?: / [%(unreserved)s:@]* )* )?
  ) | (?:
    # path-rootless
    [%(unreserved)s@]
    [%(unreserved)s:@]*
    (?: / [%(unreserved)s:@]* )*
  )
)
(?: # query
  \?
  [%(unreserved)s:@/\?\[\]]*
)?
(?: # fragment
  \#
  [%(unreserved)s:@/\?]*
)?
''' % {'unreserved': "-a-zA-Z0-9._~%!$&'()*+,;="}

# Compiled form of possible_uri_re.  re.VERBOSE is needed because that
# pattern is laid out with whitespace and inline comments.
possible_uri_pat = re.compile(possible_uri_re, re.IGNORECASE | re.VERBOSE)
# Matches a run of the punctuation characters , . ? : ) ; > at the very
# end of a string; find_uris_in_text() uses it to trim candidate URIs.
uri_trailers_pat = re.compile(r'([,.?:);>]+)$')

def find_uris_in_text(text):
    """Generate a URI object for each URI found in a block of text.

    Candidates are located with possible_uri_pat, stripped of trailing
    punctuation that is unlikely to belong to the URI, and then parsed.
    Candidates that fail to parse are silently skipped.

    :param text: the text to scan.
    :return: a generator of URI instances, in order of appearance.
    """
    for candidate in possible_uri_pat.finditer(text):
        # Trailing characters such as "." or ")" are usually part of
        # the surrounding sentence rather than of the URI itself.
        trimmed = uri_trailers_pat.sub('', candidate.group())
        try:
            found = URI(trimmed)
        except InvalidURIError:
            pass
        else:
            yield found