/usr/lib/python2.7/dist-packages/scrapy/crawler.py is in python-scrapy 0.24.2-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

import signal

from twisted.internet import reactor, defer

from scrapy.core.engine import ExecutionEngine
from scrapy.resolver import CachingThreadedResolver
from scrapy.extension import ExtensionManager
from scrapy.signalmanager import SignalManager
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
from scrapy.utils.misc import load_object
from scrapy import log, signals


class Crawler(object):
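    """Tie together the components of a single crawl: settings, signals,
    stats, the spider manager and the execution engine.
    """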

    def __init__(self, settings):
        self.configured = False
        self.settings = settings
        self.signals = SignalManager(self)
        self.stats = load_object(settings['STATS_CLASS'])(self)
        self._start_requests = lambda: ()
        self._spider = None
        # TODO: move SpiderManager to CrawlerProcess
        spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
        self.spiders = spman_cls.from_crawler(self)

    def install(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert not hasattr(scrapy.project, 'crawler'), "crawler already installed"
        scrapy.project.crawler = self

    def uninstall(self):
        # TODO: remove together with scrapy.project.crawler usage
        import scrapy.project
        assert hasattr(scrapy.project, 'crawler'), "crawler not installed"
        del scrapy.project.crawler

    def configure(self):
        if self.configured:
            return

        self.configured = True
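        # Build the log formatter, extensions and execution engine from settings.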
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.engine = ExecutionEngine(self, self._spider_closed)

    def crawl(self, spider, requests=None):
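        # Attach the spider; start requests come from spider.start_requests()
        # unless an explicit iterable of requests is given.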
        assert self._spider is None, 'Spider already attached'
        self._spider = spider
        spider.set_crawler(self)
        if requests is None:
            self._start_requests = spider.start_requests
        else:
            self._start_requests = lambda: requests

    def _spider_closed(self, spider=None):
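        # Engine callback: stop this crawler once no spiders remain open.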
        if not self.engine.open_spiders:
            self.stop()

    @defer.inlineCallbacks
    def start(self):
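        # Configure lazily, open the attached spider (if any), then start the engine.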
        yield defer.maybeDeferred(self.configure)
        if self._spider:
            yield self.engine.open_spider(self._spider, self._start_requests())
        yield defer.maybeDeferred(self.engine.start)

    @defer.inlineCallbacks
    def stop(self):
        if self.configured and self.engine.running:
            yield defer.maybeDeferred(self.engine.stop)


class CrawlerProcess(object):
    """ A class to run multiple scrapy crawlers in a process sequentially"""

    def __init__(self, settings):
        install_shutdown_handlers(self._signal_shutdown)
        self.settings = settings
        self.crawlers = {}
        self.stopping = False
        self._started = None
        self._active_crawler = None  # set once _start_crawler() runs a crawler

    def create_crawler(self, name=None):
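        # Create and cache one Crawler per name (None is the anonymous default).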
        if name not in self.crawlers:
            self.crawlers[name] = Crawler(self.settings)

        return self.crawlers[name]

    def start(self):
        if self.start_crawling():
            self.start_reactor()

    @defer.inlineCallbacks
    def stop(self):
        self.stopping = True
        if self._active_crawler:
            yield self._active_crawler.stop()

    def _signal_shutdown(self, signum, _):
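        # First SIGINT/SIGTERM: swap in the kill handler and stop gracefully.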
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        log.msg(format="Received %(signame)s, shutting down gracefully. Send again to force unclean shutdown",
                level=log.INFO, signame=signame)
        reactor.callFromThread(self.stop)

    def _signal_kill(self, signum, _):
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        log.msg(format='Received %(signame)s twice, forcing unclean shutdown',
                level=log.INFO, signame=signame)
        reactor.callFromThread(self._stop_reactor)

    # ------------------------------------------------------------------------#
    # The following public methods can't be considered stable and may change at
    # any moment.
    #
    # start_crawling and start_reactor are called from scrapy.commands.shell
    # They are split because the reactor is started on a different thread
    # than the IPython shell.
    #
    def start_crawling(self):
        log.scrapy_info(self.settings)
        return self._start_crawler() is not None

    def start_reactor(self):
        if self.settings.getbool('DNSCACHE_ENABLED'):
            reactor.installResolver(CachingThreadedResolver(reactor))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call

    def _start_crawler(self):
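        # Pop the next queued crawler, wire its shutdown signals and start it;
        # returns None when there is nothing left to run.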
        if not self.crawlers or self.stopping:
            return

        name, crawler = self.crawlers.popitem()
        self._active_crawler = crawler
        sflo = log.start_from_crawler(crawler)
        crawler.configure()
        crawler.install()
        crawler.signals.connect(crawler.uninstall, signals.engine_stopped)
        if sflo:
            crawler.signals.connect(sflo.stop, signals.engine_stopped)
        crawler.signals.connect(self._check_done, signals.engine_stopped)
        crawler.start()
        return name, crawler

    def _check_done(self, **kwargs):
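        # engine_stopped handler: start the next crawler, or stop the reactor
        # once all crawlers have run.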
        if not self._start_crawler():
            self._stop_reactor()

    def _stop_reactor(self, _=None):
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass
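
For context, the snippet below is a minimal sketch of how a Crawler of this vintage was typically driven from a script, following the "run Scrapy from a script" pattern documented for the 0.24 series. ExampleSpider, its name and its start URL are hypothetical, made up for illustration.

import scrapy
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings


class ExampleSpider(scrapy.Spider):
    # Hypothetical spider, defined only for this example.
    name = 'example'
    start_urls = ['http://example.com']

    def parse(self, response):
        self.log('visited %s' % response.url)


crawler = Crawler(get_project_settings())
# Stop the reactor when the spider closes, so reactor.run() returns.
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(ExampleSpider())
crawler.start()
log.start()
reactor.run()  # blocks until reactor.stop() is called

The CrawlerProcess class above plays the same role for the scrapy command-line tool, running its queued crawlers one after another on a shared reactor.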