/usr/share/pyshared/scrapy/contrib/logstats.py is in python-scrapy 0.14.4-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
from twisted.internet import task
from scrapy.xlib.pydispatch import dispatcher
from scrapy.exceptions import NotConfigured
from scrapy.conf import settings
from scrapy import log, signals
class Slot(object):
    """Per-spider counters used to compute scraping rates.

    ``items`` and ``pages`` are running totals. ``itemsprev`` and
    ``pagesprev`` hold the totals recorded at the previous log tick, so
    the difference between each pair is the count for the last interval.
    """

    def __init__(self):
        """Start every counter at zero."""
        self.items = 0
        self.itemsprev = 0
        self.pages = 0
        self.pagesprev = 0
class LogStats(object):
    """Log basic scraping stats periodically"""

    # Class-level default so engine_stopped() is safe even when
    # engine_started() never ran and no LoopingCall was ever created.
    tsk = None

    def __init__(self):
        """Read LOGSTATS_INTERVAL and connect the handlers to their signals.

        Raises NotConfigured when the interval is falsy (unset or 0).
        """
        self.interval = settings.getfloat('LOGSTATS_INTERVAL')
        if not self.interval:
            raise NotConfigured
        # Maps each open spider to its Slot of counters.
        self.slots = {}
        # Scales a per-interval count up to a per-minute rate.
        self.multiplier = 60.0 / self.interval
        dispatcher.connect(self.item_scraped, signal=signals.item_scraped)
        dispatcher.connect(self.response_received, signal=signals.response_received)
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        dispatcher.connect(self.engine_started, signal=signals.engine_started)
        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped)

    def item_scraped(self, spider):
        """Count one more scraped item for ``spider``."""
        self.slots[spider].items += 1

    def response_received(self, spider):
        """Count one more received page for ``spider``."""
        self.slots[spider].pages += 1

    def spider_opened(self, spider):
        """Create a fresh set of counters for ``spider``."""
        self.slots[spider] = Slot()

    def spider_closed(self, spider):
        """Drop the counters kept for ``spider``."""
        del self.slots[spider]

    def engine_started(self):
        """Start the looping call that runs log() every ``interval``."""
        self.tsk = task.LoopingCall(self.log)
        self.tsk.start(self.interval)

    def log(self):
        """Log totals and per-minute rates for every tracked spider."""
        for spider, slot in self.slots.items():
            # Rate = count since the previous tick, scaled to one minute.
            irate = (slot.items - slot.itemsprev) * self.multiplier
            prate = (slot.pages - slot.pagesprev) * self.multiplier
            # Remember the current totals as the baseline for the next tick.
            slot.pagesprev, slot.itemsprev = slot.pages, slot.items
            msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \
                % (slot.pages, prate, slot.items, irate)
            log.msg(msg, spider=spider)

    def engine_stopped(self):
        """Stop the looping call if one was created and is still running."""
        tsk = self.tsk
        # tsk stays None when engine_started() never ran; nothing to stop.
        if tsk is not None and tsk.running:
            tsk.stop()
|