/usr/share/pyshared/scrapy/contrib/throttle.py is in python-scrapy 0.14.4-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.httpobj import urlparse_cached
from scrapy.resolver import dnscache


class AutoThrottle(object):
    """
    ============
    AutoThrottle
    ============

    This is an extension for automatically throttling crawling speed based on
    load.

    Design goals
    ============

    1. be nicer to sites instead of using default download delay of zero
    2. automatically adjust scrapy to the optimum crawling speed, so the user
       doesn't have to tune the download delays and concurrent requests to
       find the optimum one. the user only needs to specify the maximum
       concurrent requests it allows, and the extension does the rest.

    Download latencies
    ==================

    In Scrapy, the download latency is the (real) time elapsed between
    establishing the TCP connection and receiving the HTTP headers.

    Note that these latencies are very hard to measure accurately in a
    cooperative multitasking environment because Scrapy may be busy processing
    a spider callback, for example, and unable to attend downloads. However,
    the latencies should give a reasonable estimate of how busy Scrapy (and,
    ultimately, the server) is. This extension builds on that premise.

    Throttling rules
    ================

    This adjusts download delays and concurrency based on the following rules:

    1. spiders always start with one concurrent request and a download delay
       of START_DELAY
    2. when a response is received, the download delay is adjusted to the
       average of the previous download delay and the latency of the response.
    3. after CONCURRENCY_CHECK_PERIOD responses have passed, the average
       latency of this period is checked against the previous one and:

       3.1. if the latency remained constant (within standard deviation
            limits) and the concurrency is lower than MAX_CONCURRENCY, the
            concurrency is increased
       3.2. if the latency has increased (beyond standard deviation limits)
            and the concurrency is higher than 1, the concurrency is decreased
    """

    def __init__(self, crawler):
        settings = crawler.settings
        if not settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured
        self.crawler = crawler
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.response_received, signal=signals.response_received)
        self.START_DELAY = settings.getfloat("AUTOTHROTTLE_START_DELAY", 5.0)
        self.CONCURRENCY_CHECK_PERIOD = settings.getint("AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD", 10)
        self.MAX_CONCURRENCY = settings.getint("AUTOTHROTTLE_MAX_CONCURRENCY", 8)
        self.DEBUG = settings.getint("AUTOTHROTTLE_DEBUG", False)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def spider_opened(self, spider):
        spider.download_delay = self.START_DELAY
        if hasattr(spider, "max_concurrent_requests"):
            self.MAX_CONCURRENCY = spider.max_concurrent_requests
        # override in order to avoid initializing the slot with concurrency > 1
        spider.max_concurrent_requests = 1
        self.last_latencies = [self.START_DELAY]
        self.last_lat = self.START_DELAY, 0.0

    def response_received(self, response, spider):
        slot = self._get_slot(response.request)
        latency = response.meta.get('download_latency')
        if not latency or not slot:
            return
        self._adjust_delay(slot, latency, response)
        self._check_concurrency(slot, latency)
        if self.DEBUG:
            spider.log("conc:%2d | delay:%5d ms | latency:%5d ms | size:%6d bytes" %
                       (slot.concurrency, slot.delay * 1000,
                        latency * 1000, len(response.body)))

    def _get_slot(self, request):
        downloader = self.crawler.engine.downloader
        key = urlparse_cached(request).hostname or ''
        if downloader.ip_concurrency:
            key = dnscache.get(key, key)
        return downloader.slots.get(key)

    def _check_concurrency(self, slot, latency):
        latencies = self.last_latencies
        latencies.append(latency)
        if len(latencies) == self.CONCURRENCY_CHECK_PERIOD:
            curavg, curdev = avg_stdev(latencies)
            preavg, predev = self.last_lat
            self.last_lat = curavg, curdev
            del latencies[:]
            if curavg > preavg + predev:
                if slot.concurrency > 1:
                    slot.concurrency -= 1
            elif slot.concurrency < self.MAX_CONCURRENCY:
                slot.concurrency += 1
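
    # Editor's note (illustrative, not part of the upstream 0.14.4 file): with
    # hypothetical values, if the previous check period averaged 0.8s latency
    # with a standard deviation of 0.1s and the current period averages 1.0s,
    # then 1.0 > 0.8 + 0.1 and the slot's concurrency is lowered by one (never
    # below 1); otherwise it is raised by one, up to MAX_CONCURRENCY.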

    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""
        # If the latency is bigger than the old delay, use the latency instead
        # of the mean; this works better with problematic sites.
        new_delay = (slot.delay + latency) / 2.0 if latency < slot.delay else latency
        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirections)
        # are usually small and so tend to reduce latency, which would create
        # a positive feedback loop by reducing the delay instead of increasing it.
        if response.status == 200 or new_delay > slot.delay:
            slot.delay = new_delay


def avg_stdev(lst):
    """Return average and standard deviation of the given list"""
    avg = sum(lst) / len(lst)
    sdsq = sum((x - avg) ** 2 for x in lst)
    stdev = (sdsq / (len(lst) - 1)) ** 0.5
    return avg, stdev
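
The extension above reads its configuration from the setting names visible in
__init__, so as a hedged usage sketch (not part of the file shown above), a
Scrapy 0.14 project could enable and tune it from its settings.py roughly as
follows. The numeric values are illustrative rather than recommendations, and
the EXTENSIONS entry is only needed if scrapy.contrib.throttle.AutoThrottle is
not already registered by default in this version.

    AUTOTHROTTLE_ENABLED = True                  # required; otherwise __init__ raises NotConfigured
    AUTOTHROTTLE_START_DELAY = 5.0               # initial download delay, in seconds
    AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD = 10   # responses per latency-check window
    AUTOTHROTTLE_MAX_CONCURRENCY = 8             # upper bound on per-slot concurrency
    AUTOTHROTTLE_DEBUG = True                    # log concurrency/delay/latency per response
    EXTENSIONS = {'scrapy.contrib.throttle.AutoThrottle': 0}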