/usr/lib/python2.7/dist-packages/pex/crawler.py is in python-pex 1.1.14-2ubuntu2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).
"""Support for webpage parsing and crawling."""
import os
import re
import threading
import traceback
from .compatibility import PY3
from .http import Context
from .link import Link
from .tracer import TRACER
from .util import Memoizer
if PY3:
from queue import Empty, Queue
from urllib.parse import urlparse
else:
from Queue import Empty, Queue
from urlparse import urlparse
def unescape(s):
  """Unescapes html. Taken from https://wiki.python.org/moin/EscapingHtml"""
  s = s.replace("&lt;", "<")
  s = s.replace("&gt;", ">")
  # this has to be last:
  s = s.replace("&amp;", "&")
  return s
class PageParser(object):
  """A helper class to extract and differentiate ordinary and download links from webpages."""

  HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S)
  REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
  REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe'])
  REL_TYPES = frozenset(['homepage', 'download'])

  @classmethod
  def href_match_to_url(cls, match):
    def pick(group):
      return '' if group is None else group
    return unescape(pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3)))

  @classmethod
  def rel_links(cls, page):
    """return rel= links that should be scraped, skipping obviously data links."""
    for match in cls.REL_RE.finditer(page):
      href, rel = match.group(0), match.group(1)
      if rel not in cls.REL_TYPES:
        continue
      href_match = cls.HREF_RE.search(href)
      if href_match:
        href = cls.href_match_to_url(href_match)
        parsed_href = urlparse(href)
        if any(parsed_href.path.endswith(ext) for ext in cls.REL_SKIP_EXTENSIONS):
          continue
        yield href

  @classmethod
  def links(cls, page):
    """return all links on a page, including potentially rel= links."""
    for match in cls.HREF_RE.finditer(page):
      yield cls.href_match_to_url(match)
def partition(L, pred):
  return filter(lambda v: not pred(v), L), filter(lambda v: pred(v), L)
class Crawler(object):
  """A multi-threaded crawler that supports local (disk) and remote (web) crawling."""

  # Memoizer for calls to Crawler.crawl().
  _CRAWL_CACHE = Memoizer()

  @classmethod
  def reset_cache(cls):
    """Reset the internal crawl cache. This is intended primarily for tests."""
    cls._CRAWL_CACHE = Memoizer()

  @classmethod
  def crawl_local(cls, link):
    try:
      dirents = os.listdir(link.local_path)
    except OSError as e:
      TRACER.log('Failed to read %s: %s' % (link.local_path, e), V=1)
      return set(), set()
    files, dirs = partition([os.path.join(link.local_path, fn) for fn in dirents], os.path.isdir)
    return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs))

  @classmethod
  def crawl_remote(cls, context, link):
    try:
      content = context.content(link)
    except context.Error as e:
      TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
      return set(), set()
    links = set(link.join(href) for href in PageParser.links(content))
    rel_links = set(link.join(href) for href in PageParser.rel_links(content))
    return links, rel_links

  @classmethod
  def crawl_link(cls, context, link):
    if link.local:
      return cls.crawl_local(link)
    elif link.remote:
      return cls.crawl_remote(context, link)
    else:
      TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme))
      return set(), set()

  def __init__(self, context=None, threads=1):
    self._threads = threads
    self.context = context or Context.get()

  def _make_cache_key(self, links, follow_links):
    return (follow_links,) + tuple(links)

  def crawl(self, link_or_links, follow_links=False):
    links = list(Link.wrap_iterable(link_or_links))
    cache_key = self._make_cache_key(links, follow_links)

    # Memoize crawling to a global Memoizer (Crawler._CRAWL_CACHE).
    result = self._CRAWL_CACHE.get(cache_key)
    if result is None:
      result = self._crawl(links, follow_links)
      self._CRAWL_CACHE.store(cache_key, result)

    return result

  def _crawl(self, link_or_links, follow_links):
    links, seen = set(), set()
    queue = Queue()
    converged = threading.Event()

    def execute():
      while not converged.is_set():
        try:
          link = queue.get(timeout=0.01)
        except Empty:
          continue
        if link not in seen:
          seen.add(link)
          try:
            roots, rels = self.crawl_link(self.context, link)
          except Exception as e:
            TRACER.log('Unknown exception encountered: %s' % e)
            for line in traceback.format_exc().splitlines():
              TRACER.log(line)
            queue.task_done()
            continue
          links.update(roots)
          if follow_links:
            for rel in rels:
              if rel not in seen:
                queue.put(rel)
        queue.task_done()

    for i, link in enumerate(link_or_links):
      TRACER.log('crawling link i=%s link=%s follow_links=%s' % (i, link, follow_links), V=3)
      queue.put(link)

    workers = []
    for _ in range(self._threads):
      worker = threading.Thread(target=execute)
      workers.append(worker)
      worker.daemon = True
      worker.start()

    queue.join()
    converged.set()
    # We deliberately do not join the worker threads, since they are no longer of any use to us.
    return links
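
For illustration only, and not part of the packaged file: a minimal sketch of driving the crawler, assuming Link.wrap_iterable() in pex.link accepts plain URL strings (as the call inside Crawler.crawl() suggests) and that a default fetch context is available via Context.get(); the index URL below is a hypothetical placeholder.

  from pex.crawler import Crawler

  # Crawl a hypothetical "simple" index page with four worker threads.
  # follow_links=True also follows the rel="homepage"/"download" links
  # that PageParser.rel_links() extracts from each fetched page.
  crawler = Crawler(threads=4)
  links = crawler.crawl(['https://example.org/simple/pex/'], follow_links=True)
  for link in links:
    print(link.url)

Because results are memoized in Crawler._CRAWL_CACHE under the key (follow_links,) + tuple(links), a repeated call with the same arguments returns the cached link set; Crawler.reset_cache() clears it.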