Scrapy源码分析(四):请求Request

来源:互联网 发布:买保险 知乎 编辑:程序博客网 时间:2024/05/16 01:26

本次我们要分析的Scrapy源码为Request模块,模块的位置:

from scrapytest.http import Request

首先把Request的源码附上:

class Request(object_ref):
    """An HTTP request description: URL, method, headers, body and the
    callback/errback plus bookkeeping attributes given to the constructor.
    """

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None):

        # this one has to be set first: _set_url() and _set_body() read
        # self.encoding
        self._encoding = encoding
        self.method = str(method).upper()  # upper-cased, e.g. GET, POST
        self._set_url(url)  # validate/escape the URL and store it in self._url
        self._set_body(body)  # store the body as bytes in self._body
        # priority must be an int
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        # meta is copied into a new dict; an empty/None meta is created
        # lazily by the `meta` property below
        self._meta = dict(meta) if meta else None

    @property  # expose meta as an attribute
    def meta(self):
        """Return the meta dict, creating an empty one on first access."""
        if self._meta is None:
            self._meta = {}
        return self._meta

    def _get_url(self):
        return self._url

    def _set_url(self, url):
        """Validate `url`, escape it and store the result in self._url.

        Raises TypeError if `url` is not a string type and ValueError if the
        escaped URL contains no ':'.
        """
        # the URL must be a string type
        if not isinstance(url, six.string_types):
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

        s = safe_url_string(url, self.encoding)  # URL-encode unsafe characters
        self._url = escape_ajax(s)  # rewrite AJAX-crawlable "#!" fragments

        # a scheme is required, e.g. http:// or https://
        if ':' not in self._url:
            raise ValueError('Missing scheme in request url: %s' % self._url)

    # url is read-only; to change it, build a new request with .replace()
    url = property(_get_url, obsolete_setter(_set_url, 'url'))

    def _get_body(self):
        return self._body

    def _set_body(self, body):
        """Store `body` as bytes in self._body (b'' when `body` is None)."""
        if body is None:
            # b'' is the same as a plain str on Python 2.7; the b prefix is
            # there for Python 3 compatibility
            self._body = b''
        else:
            # encode unicode text to bytes using the request encoding
            self._body = to_bytes(body, self.encoding)

    # body is read-only as well; to change it, use .replace()
    body = property(_get_body, obsolete_setter(_set_body, 'body'))

    @property
    def encoding(self):
        return self._encoding

    def __str__(self):
        return "<%s %s>" % (self.method, self.url)

    __repr__ = __str__

    def copy(self):
        """Return a copy of this Request"""
        return self.replace()

    def replace(self, *args, **kwargs):
        """Create a new Request with the same attributes except for those
        given new values.
        """
        # keyword arguments supplied by the caller win; every attribute not
        # supplied is taken from this object
        for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta',
                  'encoding', 'priority', 'dont_filter', 'callback', 'errback']:
            kwargs.setdefault(x, getattr(self, x))
        # an optional 'cls' keyword selects the class to instantiate
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)

下面我们来逐个函数分析:

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None):

        # this one has to be set first: it is the encoding used by the request
        self._encoding = encoding
        self.method = str(method).upper()  # upper-cased, e.g. GET, POST
        self._set_url(url)  # call the URL handling function to set the url
        self._set_body(body)  # set the body
        # priority must be an int
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        # a non-empty meta is copied into a new dict; otherwise None is stored
        self._meta = dict(meta) if meta else None
主要做的工作是变量赋值,来看_set_url和_set_body两个函数:

    def _set_url(self, url):
        """Validate `url`, escape it and store the result in self._url.

        Raises TypeError if `url` is not a string type and ValueError if the
        escaped URL contains no ':'.
        """
        # the URL must be a string type
        if not isinstance(url, six.string_types):
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

        s = safe_url_string(url, self.encoding)  # URL-encode unsafe characters
        self._url = escape_ajax(s)  # rewrite AJAX-crawlable "#!" fragments

        # a scheme is required, e.g. http:// or https://
        if ':' not in self._url:
            raise ValueError('Missing scheme in request url: %s' % self._url)

    def _set_body(self, body):
        """Store `body` as bytes in self._body (b'' when `body` is None)."""
        if body is None:
            # b'' is the same as a plain str on Python 2.7; the b prefix is
            # there for Python 3 compatibility
            self._body = b''
        else:
            # encode unicode text to bytes using the request encoding
            self._body = to_bytes(body, self.encoding)

six.string_types在py2中是basestring(包含str型和unicode型),在py3中只包含str型;py2的str型相当于py3的bytes型(也就是b'')。

再来看一下用到的几个工具类函数safe_url_string,to_bytes,escape_ajax

def safe_url_string(url, encoding='utf8'):
    """Convert the given url into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a unicode url is given, it is first converted to str using the given
    encoding (which defaults to 'utf-8'). When passing an encoding, you should
    use the encoding of the original page (the page from which the url was
    extracted from).

    Calling this function on an already "safe" url will return the url
    unmodified.

    Always returns a str.
    """
    s = unicode_to_str(url, encoding)
    return moves.urllib.parse.quote(s, _safe_chars)


def escape_ajax(url):
    """
    Return the crawleable url according to:
    http://code.google.com/web/ajaxcrawling/docs/getting-started.html

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    # only fragments starting with '!' are rewritten
    if not frag.startswith('!'):
        return url
    # frag[1:] drops the leading '!' before it becomes the parameter value
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])


def to_bytes(text, encoding=None, errors='strict'):
    """Return the binary representation of `text`. If `text`
    is already a bytes object, return it as-is."""
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    # fall back to UTF-8 when no encoding is given
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

safe_url_string是用来给含有非法字符的url做urlencode,

to_bytes将字符串统一成byte型,

escape_ajax处理url中含有#!的情况。

再来看一下Headers类:

class Headers(CaselessDict):
    """Case insensitive http headers dictionary"""

    def __init__(self, seq=None, encoding='utf-8'):
        # encoding must be set before the base constructor runs: the base
        # constructor calls update(), which normalizes keys and values
        # through _tobytes()
        self.encoding = encoding
        super(Headers, self).__init__(seq)

    def normkey(self, key):
        """Normalize key to bytes"""
        return self._tobytes(key.title())

    def normvalue(self, value):
        """Normalize values to bytes"""
        # every value is stored as a list of bytes
        if value is None:
            value = []
        elif isinstance(value, (six.text_type, bytes)):
            value = [value]
        elif not hasattr(value, '__iter__'):
            value = [value]

        return [self._tobytes(x) for x in value]

    def _tobytes(self, x):
        """Convert bytes, text or int `x` to bytes; raise TypeError otherwise."""
        if isinstance(x, bytes):
            return x
        elif isinstance(x, six.text_type):
            return x.encode(self.encoding)
        elif isinstance(x, int):
            return six.text_type(x).encode(self.encoding)
        else:
            raise TypeError('Unsupported value type: {}'.format(type(x)))

    def __getitem__(self, key):
        """Return the last value stored for `key`, or None if its list is empty."""
        try:
            return super(Headers, self).__getitem__(key)[-1]
        except IndexError:
            return None

    def get(self, key, def_val=None):
        """Return the last value for `key`, or the last element of the
        normalized default; None when that list is empty."""
        try:
            return super(Headers, self).get(key, def_val)[-1]
        except IndexError:
            return None

    def getlist(self, key, def_val=None):
        """Return the whole list of values for `key`.

        For a missing key, return the normalized `def_val` if it is not None,
        otherwise an empty list.
        """
        try:
            return super(Headers, self).__getitem__(key)
        except KeyError:
            if def_val is not None:
                return self.normvalue(def_val)
            return []

    def setlist(self, key, list_):
        self[key] = list_

    def setlistdefault(self, key, default_list=()):
        return self.setdefault(key, default_list)

    def appendlist(self, key, value):
        """Append `value` (a single value or an iterable of values) to the
        list stored for `key`."""
        lst = self.getlist(key)
        lst.extend(self.normvalue(value))
        self[key] = lst

    def items(self):
        return list(self.iteritems())

    def iteritems(self):
        return ((k, self.getlist(k)) for k in self.keys())

    def values(self):
        return [self[k] for k in self.keys()]

    def to_string(self):
        return headers_dict_to_raw(self)

    def to_unicode_dict(self):
        """ Return headers as a CaselessDict with unicode keys
        and unicode values. Multiple values are joined with ','.
        """
        return CaselessDict(
            (to_unicode(key, encoding=self.encoding),
             to_unicode(b','.join(value), encoding=self.encoding))
            for key, value in self.items())

    def __copy__(self):
        return self.__class__(self)
    copy = __copy__

基类是CaselessDict,是一个不区分key大小写的字典:

class CaselessDict(dict):
    """Dictionary with case-insensitive keys.

    Keys pass through normkey() (lower-cased here) and values through
    normvalue() (unchanged here) on the access paths overridden below.
    """

    __slots__ = ()

    def __init__(self, seq=None):
        super(CaselessDict, self).__init__()
        # go through update() so the initial items are normalized too
        if seq:
            self.update(seq)

    def __getitem__(self, key):
        return dict.__getitem__(self, self.normkey(key))

    def __setitem__(self, key, value):
        dict.__setitem__(self, self.normkey(key), self.normvalue(value))

    def __delitem__(self, key):
        dict.__delitem__(self, self.normkey(key))

    def __contains__(self, key):
        return dict.__contains__(self, self.normkey(key))
    has_key = __contains__

    def __copy__(self):
        return self.__class__(self)
    copy = __copy__

    def normkey(self, key):
        """Method to normalize dictionary key access"""
        return key.lower()

    def normvalue(self, value):
        """Method to normalize values prior to be setted"""
        return value

    def get(self, key, def_val=None):
        # the default is normalized as well before being returned
        return dict.get(self, self.normkey(key), self.normvalue(def_val))

    def setdefault(self, key, def_val=None):
        return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))

    def update(self, seq):
        # accept either a dict or an iterable of (key, value) pairs
        seq = seq.items() if isinstance(seq, dict) else seq
        iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
        super(CaselessDict, self).update(iseq)

    @classmethod
    def fromkeys(cls, keys, value=None):
        return cls((k, value) for k in keys)

    def pop(self, key, *args):
        return dict.pop(self, self.normkey(key), *args)

主要是定义了normkey,normvalue,使得对字典的访问(get、set等等)要经过这两个函数,从而处理了key,不区分大小写。

再来看一下Headers中的函数:

    def normkey(self, key):
        """Normalize key to bytes"""
        # title() capitalizes the key before it is converted to bytes
        return self._tobytes(key.title())

    def normvalue(self, value):
        """Normalize values to bytes"""
        # wrap None, a single string/bytes, or a non-iterable value in a list
        if value is None:
            value = []
        elif isinstance(value, (six.text_type, bytes)):
            value = [value]
        elif not hasattr(value, '__iter__'):
            value = [value]

        return [self._tobytes(x) for x in value]

将key定义为byte型,首字母大写。value使用list包装,字符串都是用byte型。value使用list是为了可以一个key对应多个值。

    def __getitem__(self, key):
        """Return the last value stored for `key`, or None if its list is empty."""
        try:
            return super(Headers, self).__getitem__(key)[-1]
        except IndexError:
            return None

    def get(self, key, def_val=None):
        """Return the last element of what the base-class get() returns for
        `key`, or None if that list is empty."""
        try:
            return super(Headers, self).get(key, def_val)[-1]
        except IndexError:
            return None

    def getlist(self, key, def_val=None):
        """Return the whole list of values for `key`.

        For a missing key, return the normalized `def_val` if it is not None,
        otherwise an empty list.
        """
        try:
            return super(Headers, self).__getitem__(key)
        except KeyError:
            if def_val is not None:
                return self.normvalue(def_val)
            return []

定义的get操作都是返回list的最后一个元素,getlist直接返回整个list。

    def setlist(self, key, list_):
        """Store `list_` under `key` through item assignment."""
        self[key] = list_

    def setlistdefault(self, key, default_list=()):
        """Delegate to setdefault() with `default_list` as the default."""
        return self.setdefault(key, default_list)

    def appendlist(self, key, value):
        """Extend the list stored for `key` with the normalized `value`."""
        lst = self.getlist(key)
        lst.extend(self.normvalue(value))
        self[key] = lst

appendlist可以接受单个值或一个list类型的参数,并把它追加到该key已有的值列表之后。

    def to_string(self):
        """Return the result of headers_dict_to_raw() for these headers."""
        return headers_dict_to_raw(self)

    def to_unicode_dict(self):
        """ Return headers as a CaselessDict with unicode keys
        and unicode values. Multiple values are joined with ','.
        """
        return CaselessDict(
            (to_unicode(key, encoding=self.encoding),
             to_unicode(b','.join(value), encoding=self.encoding))
            for key, value in self.items())

to_string函数将字典转换为字符串:

def headers_dict_to_raw(headers_dict):
    r"""
    Returns a raw HTTP headers representation of headers

    For example:

    >>> import w3lib.http
    >>> w3lib.http.headers_dict_to_raw({b'Content-type': b'text/html', b'Accept': b'gzip'}) # doctest: +SKIP
    'Content-type: text/html\\r\\nAccept: gzip'
    >>>

    Note that keys and values must be bytes.

    Argument is ``None`` (returns ``None``):

    >>> w3lib.http.headers_dict_to_raw(None)
    >>>

    """
    if headers_dict is None:
        return None
    raw_lines = []
    for key, value in headers_dict.items():
        if isinstance(value, bytes):
            raw_lines.append(b": ".join([key, value]))
        elif isinstance(value, (list, tuple)):
            # a list/tuple value produces one "key: value" line per element
            for v in value:
                raw_lines.append(b": ".join([key, v]))
    return b'\r\n'.join(raw_lines)

list中的元素被拆开成多行显示。

to_unicode_dict将header转换为一个unicode类型的CaselessDict。

回到Request来:

    # url is a read-only property; to change it, build a new request with
    # .replace()
    url = property(_get_url, obsolete_setter(_set_url, 'url'))
    # body is a read-only property as well; to change it, use .replace()
    body = property(_get_body, obsolete_setter(_set_body, 'body'))

url和body被设置为只读属性,修改会报错。

def obsolete_setter(setter, attrname):
    """Return a setter function that always raises AttributeError.

    The error message names the owning class and `attrname` and points to
    ``replace()``. `setter` is accepted but not called by the returned
    function.
    """
    def newsetter(self, value):
        c = self.__class__.__name__
        msg = "%s.%s is not modifiable, use %s.replace() instead" % (c, attrname, c)
        raise AttributeError(msg)
    return newsetter


看看replace函数,创建一个新的Request对象,对于新给的kwargs,使用新的,没给的使用原有对象的参数
    def replace(self, *args, **kwargs):
        """Create a new Request with the same attributes except for those
        given new values.
        """
        # keyword arguments supplied by the caller win; every attribute not
        # supplied is taken from this object
        for x in ['url', 'method', 'headers', 'body', 'cookies', 'meta',
                  'encoding', 'priority', 'dont_filter', 'callback', 'errback']:
            kwargs.setdefault(x, getattr(self, x))
        # an optional 'cls' keyword selects the class to instantiate
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)




0 0