/usr/share/pyshared/scrapy/utils/request.py is in python-scrapy 0.14.4-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
"""
This module provides some useful functions for working with
scrapy.http.Request objects
"""
import hashlib
import weakref
from urlparse import urlunparse
from twisted.internet.defer import Deferred
from w3lib.http import basic_auth_header
from scrapy.utils.url import canonicalize_url
from scrapy.utils.httpobj import urlparse_cached
# Cache of already computed fingerprints: maps each request object to a dict
# keyed by the include_headers selection used for that computation. Keys are
# held weakly, so an entry does not keep its request alive.
_fingerprint_cache = weakref.WeakKeyDictionary()
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs both point to the same resource
    and are equivalent (ie. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.
    Header names are compared case-insensitively and their order in the list
    does not matter.

    The returned value is the SHA1 hex digest of the request method, the
    canonicalized url, the body and the selected headers. It is cached per
    request and per header selection, so repeated calls are cheap.
    """
    if include_headers:
        # Lower-case *before* sorting: sorting the raw names first would make
        # the order (and therefore the hash) depend on the caller's casing.
        include_headers = tuple(sorted(h.lower() for h in include_headers))
    else:
        # Collapse every "no headers" spelling (None, [], ()) into None. An
        # empty list is unhashable and would make the cache lookup below
        # raise TypeError.
        include_headers = None
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(request.method)
        fp.update(canonicalize_url(request.url))
        fp.update(request.body or '')
        if include_headers:
            for hdr in include_headers:
                # Headers absent from the request contribute nothing.
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
def request_authenticate(request, username, password):
    """Authenticate the given request (in place) using the HTTP basic access
    authentication mechanism (RFC 2617) and the given username and password.

    The request's 'Authorization' header is overwritten with the value built
    from the credentials; nothing is returned.
    """
    auth_value = basic_auth_header(username, password)
    request.headers['Authorization'] = auth_value
def request_httprepr(request):
    """Return the raw HTTP representation (as string) of the given request.

    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).

    The result consists of the request line, a Host header, the request's own
    headers (if any), a blank line and the body.
    """
    parsed = urlparse_cached(request)
    # Request target: only path (defaulting to "/"), params and query; the
    # scheme, network location and fragment are left out.
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    parts = [
        "%s %s HTTP/1.1\r\n" % (request.method, path),
        # NOTE(review): presumably `parsed` behaves like a urlparse result,
        # whose hostname is None when the URL has no host; fall back to an
        # empty value instead of rendering the literal text "None".
        "Host: %s\r\n" % (parsed.hostname or ''),
    ]
    if request.headers:
        parts.append(request.headers.to_string() + "\r\n")
    parts.append("\r\n")
    # Same empty-body guard that request_fingerprint applies.
    parts.append(request.body or '')
    return "".join(parts)
def request_deferred(request):
    """Wrap a request inside a Deferred.

    A new Deferred is created and, when the request has a callback, the
    request's callback and errback are added to it as its first pair of
    callbacks. The request's callback/errback are then replaced by the
    Deferred's own, so the Deferred fires when the request callback/errback
    is executed (ie. when the request is downloaded).

    Returns the new Deferred; the request is modified in place.
    """
    wrapper = Deferred()
    original_callback = request.callback
    if original_callback:
        wrapper.addCallbacks(original_callback, request.errback)
    # Redirect the request's hooks to the Deferred itself.
    request.callback = wrapper.callback
    request.errback = wrapper.errback
    return wrapper
|