/usr/share/pyshared/scrapy/commands/parse.py is part of the python-scrapy 0.14.4-1 package.
This file is owned by root:root, with mode 0o644.
The contents of the file are shown below.
from w3lib.url import is_url

from scrapy.command import ScrapyCommand
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils import display
from scrapy.utils.spider import iterate_spider_output, create_spider_for_request
from scrapy.exceptions import UsageError
from scrapy import log

class Command(ScrapyCommand):

    requires_project = True

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse URL (using its spider) and print the results"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", default=None, \
            help="use this spider without looking for one")
        parser.add_option("--nolinks", dest="nolinks", action="store_true", \
            help="don't show links to follow (extracted requests)")
        parser.add_option("--noitems", dest="noitems", action="store_true", \
            help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true", \
            help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true", \
            help="use CrawlSpider rules to discover the callback")
        parser.add_option("-c", "--callback", dest="callback", \
            help="use this callback for parsing, instead of looking for a callback")
    def pipeline_process(self, item, spider, opts):
        return item
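
    # Run the given callback on the response and sort its output into
    # scraped items and follow-up requests.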
    def run_callback(self, spider, response, callback, opts):
        cb = callback if callable(callback) else getattr(spider, callback, None)
        if not cb:
            log.msg('Cannot find callback %r in spider: %s' % (callback, spider.name))
            return (), ()

        items, requests = [], []
        for x in iterate_spider_output(cb(response)):
            if isinstance(x, BaseItem):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests
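
    # Walk the spider's CrawlSpider rules and return the callback of the
    # first rule whose link extractor matches the response URL.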
    def get_callback_from_rules(self, spider, response):
        if getattr(spider, 'rules', None):
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url) and rule.callback:
                    return rule.callback
        else:
            log.msg("No CrawlSpider rules found in spider %r, please specify "
                "a callback to use for parsing" % spider.name, log.ERROR)
    def print_results(self, items, requests, cb_name, opts):
        if not opts.noitems:
            print "# Scraped Items - callback: %s" % cb_name, "-"*60
            display.pprint([dict(x) for x in items], colorize=not opts.nocolour)

        if not opts.nolinks:
            print "# Requests - callback: %s" % cb_name, "-"*68
            display.pprint(requests, colorize=not opts.nocolour)
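
    # Return the spider named by --spider, or else the spider registered
    # for the request's URL; log an error and return None if neither exists.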
    def get_spider(self, request, opts):
        if opts.spider:
            try:
                return self.crawler.spiders.create(opts.spider)
            except KeyError:
                log.msg('Unable to find spider: %s' % opts.spider, log.ERROR)
        else:
            spider = create_spider_for_request(self.crawler.spiders, request)
            if spider:
                return spider
            log.msg('Unable to find spider for: %s' % request, log.ERROR)
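
    # Download the URL with the chosen spider and return the (response,
    # spider) pair, or (None, None) if no spider was found or the
    # download produced no response.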
    def get_response_and_spider(self, url, opts):
        responses = []  # to collect downloaded responses
        request = Request(url, callback=responses.append)
        spider = self.get_spider(request, opts)
        if not spider:
            return None, None
        self.crawler.crawl(spider, [request])
        self.crawler.start()
        if not responses:
            log.msg('No response downloaded for: %s' % request, log.ERROR, \
                spider=spider)
            return None, None
        return responses[0], spider
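
    # Command entry point: validate the URL argument, fetch it, pick the
    # callback (explicit -c option, CrawlSpider rules, or the spider's
    # default 'parse' method) and print whatever it yields.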
    def run(self, args, opts):
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        response, spider = self.get_response_and_spider(args[0], opts)
        if not response:
            return
        callback = None
        if opts.callback:
            callback = opts.callback
        elif opts.rules:
            callback = self.get_callback_from_rules(spider, response)
        items, requests = self.run_callback(spider, response, callback or 'parse', \
            opts)
        self.print_results(items, requests, callback, opts)
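
For reference, a typical invocation of this command from inside a Scrapy
project looks like the following (the spider name "example", the callback
name "parse_item" and the URL are placeholders, not taken from the file
above):

    scrapy parse --spider=example -c parse_item --nolinks http://www.example.com/page.html

This downloads the page with the "example" spider, runs its parse_item
method on the response, and prints the scraped items while suppressing
the extracted links.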