This file is indexed.

/usr/lib/python2.7/dist-packages/pex/crawler.py is in python-pex 1.1.14-2ubuntu2.

This file is owned by root:root, with mode 0o644 (readable by everyone, writable only by the owner).

The actual contents of the file can be viewed below.

# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

"""Support for webpage parsing and crawling."""

import os
import re
import threading
import traceback

from .compatibility import PY3
from .http import Context
from .link import Link
from .tracer import TRACER
from .util import Memoizer

if PY3:
  from queue import Empty, Queue
  from urllib.parse import urlparse
else:
  from Queue import Empty, Queue
  from urlparse import urlparse


def unescape(s):
  """Unescapes html. Taken from https://wiki.python.org/moin/EscapingHtml"""
  s = s.replace("&lt;", "<")
  s = s.replace("&gt;", ">")
  # this has to be last:
  s = s.replace("&amp;", "&")
  return s


class PageParser(object):
  """A helper class to extract and differentiate ordinary and download links from webpages."""

  HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S)
  REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
  REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe'])
  REL_TYPES = frozenset(['homepage', 'download'])

  @classmethod
  def href_match_to_url(cls, match):
    def pick(group):
      return '' if group is None else group
    return unescape(pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3)))

  @classmethod
  def rel_links(cls, page):
    """return rel= links that should be scraped, skipping obviously data links."""
    for match in cls.REL_RE.finditer(page):
      href, rel = match.group(0), match.group(1)
      if rel not in cls.REL_TYPES:
        continue
      href_match = cls.HREF_RE.search(href)
      if href_match:
        href = cls.href_match_to_url(href_match)
        parsed_href = urlparse(href)
        if any(parsed_href.path.endswith(ext) for ext in cls.REL_SKIP_EXTENSIONS):
          continue
        yield href

  @classmethod
  def links(cls, page):
    """return all links on a page, including potentially rel= links."""
    for match in cls.HREF_RE.finditer(page):
      yield cls.href_match_to_url(match)
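
  # For example, given a page fragment such as
  #   <a href="pkg-1.0.tar.gz">pkg</a> <a rel="homepage" href="http://example.com">home</a>
  # links() yields both hrefs, while rel_links() yields only
  # 'http://example.com': the second tag carries rel="homepage" (one of
  # REL_TYPES) and its path does not end in a REL_SKIP_EXTENSIONS suffix.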


def partition(L, pred):
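  # Splits L into (items failing pred, items passing pred). Under Python 2,
  # filter() returns lists, so both halves are fully materialized.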
  return filter(lambda v: not pred(v), L), filter(lambda v: pred(v), L)


class Crawler(object):
  """A multi-threaded crawler that supports local (disk) and remote (web) crawling."""

  # Memoizer for calls to Crawler.crawl().
  _CRAWL_CACHE = Memoizer()

  @classmethod
  def reset_cache(cls):
    """Reset the internal crawl cache. This is intended primarily for tests."""
    cls._CRAWL_CACHE = Memoizer()

  @classmethod
  def crawl_local(cls, link):
    try:
      dirents = os.listdir(link.local_path)
    except OSError as e:
      TRACER.log('Failed to read %s: %s' % (link.local_path, e), V=1)
      return set(), set()
    files, dirs = partition([os.path.join(link.local_path, fn) for fn in dirents], os.path.isdir)
    return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs))

  @classmethod
  def crawl_remote(cls, context, link):
    try:
      content = context.content(link)
    except context.Error as e:
      TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
      return set(), set()
    links = set(link.join(href) for href in PageParser.links(content))
    rel_links = set(link.join(href) for href in PageParser.rel_links(content))
    return links, rel_links

  @classmethod
  def crawl_link(cls, context, link):
    if link.local:
      return cls.crawl_local(link)
    elif link.remote:
      return cls.crawl_remote(context, link)
    else:
      TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme))
      return set(), set()

  def __init__(self, context=None, threads=1):
    self._threads = threads
    self.context = context or Context.get()

  def _make_cache_key(self, links, follow_links):
    return (follow_links,) + tuple(links)

  def crawl(self, link_or_links, follow_links=False):
    links = list(Link.wrap_iterable(link_or_links))
    cache_key = self._make_cache_key(links, follow_links)

    # Memoize crawling to a global Memoizer (Crawler._CRAWL_CACHE).
    result = self._CRAWL_CACHE.get(cache_key)
    if result is None:
      result = self._crawl(links, follow_links)
      self._CRAWL_CACHE.store(cache_key, result)

    return result

  def _crawl(self, link_or_links, follow_links):
    links, seen = set(), set()
    queue = Queue()
    converged = threading.Event()
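    # 'converged' signals the daemonized workers to stop polling the queue; it
    # is only set after queue.join() below confirms every link was processed.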

    def execute():
      while not converged.is_set():
        try:
          link = queue.get(timeout=0.01)
        except Empty:
          continue
        if link not in seen:
          seen.add(link)
          try:
            roots, rels = self.crawl_link(self.context, link)
          except Exception as e:
            TRACER.log('Unknown exception encountered: %s' % e)
            for line in traceback.format_exc().splitlines():
              TRACER.log(line)
            queue.task_done()
            continue
          links.update(roots)
          if follow_links:
            for rel in rels:
              if rel not in seen:
                queue.put(rel)
        queue.task_done()

    for i, link in enumerate(link_or_links):
      TRACER.log('crawling link i=%s link=%s follow_links=%s' % (i, link, follow_links), V=3)
      queue.put(link)

    workers = []
    for _ in range(self._threads):
      worker = threading.Thread(target=execute)
      workers.append(worker)
      worker.daemon = True
      worker.start()

    queue.join()
    converged.set()

    # We deliberately do not join the worker threads, since they are no longer of any use to us.
    return links
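
A minimal sketch of how this module might be driven follows. The index URL, local path, and thread count are arbitrary examples, and plain strings are passed on the assumption that Link.wrap_iterable (which crawl() applies to its arguments) accepts them.

from pex.crawler import Crawler

crawler = Crawler(threads=4)
# Crawl one remote index page and one local directory; follow_links=True also
# walks rel="homepage"/"download" links found on the remote page.
links = crawler.crawl([
    'https://pypi.example.org/simple/pex/',  # example URL, not a real index
    '/tmp/wheelhouse',                       # example local directory
], follow_links=True)
for link in links:
    print(link.url)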