Middleware is a core part of Scrapy: passing data around and maintaining the queues both go through middleware, so it is well worth understanding its lifecycle.

https://github.com/scrapy/scrapy/blob/master/scrapy/core/spidermw.py

        def _add_middleware(self, mw):
            super(SpiderMiddlewareManager, self)._add_middleware(mw)
            if hasattr(mw, 'process_spider_input'):
                self.methods['process_spider_input'].append(mw.process_spider_input)
            if hasattr(mw, 'process_spider_output'):
                self.methods['process_spider_output'].insert(0, mw.process_spider_output)
            if hasattr(mw, 'process_spider_exception'):
                self.methods['process_spider_exception'].insert(0, mw.process_spider_exception)
            if hasattr(mw, 'process_start_requests'):
                self.methods['process_start_requests'].insert(0, mw.process_start_requests)

Each built-in method of the middleware is registered on the corresponding method chain. Note the difference between append and insert(0): for process_spider_input, a middleware with a lower order number runs earlier, while for the output/exception/start_requests hooks the order is reversed (the same pattern applies to the downloader middleware below).
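
As a rough illustration (this class is not part of Scrapy; only the hook names follow its documented interface), a custom spider middleware simply defines whichever of these hooks it needs, and the hasattr checks above pick them up:

    class LoggingSpiderMiddleware(object):
        """Illustrative middleware; the manager discovers each hook via hasattr."""

        def process_spider_input(self, response, spider):
            # Runs for every response on its way into the spider.
            spider.logger.debug('input: %s', response.url)
            return None  # None means "keep going"

        def process_spider_output(self, response, result, spider):
            # Runs over whatever the spider yields; must re-yield the results.
            for request_or_item in result:
                yield request_or_item

        def process_spider_exception(self, response, exception, spider):
            spider.logger.warning('spider raised %r', exception)
            return None

        def process_start_requests(self, start_requests, spider):
            for request in start_requests:
                yield request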

https://github.com/scrapy/scrapy/blob/master/scrapy/core/scraper.py

This is where the spider middleware gets handled:

    class Scraper(object):
    
        def __init__(self, crawler):
            self.slot = None
            self.spidermw = SpiderMiddlewareManager.from_crawler(crawler)
            itemproc_cls = load_object(crawler.settings['ITEM_PROCESSOR'])
            self.itemproc = itemproc_cls.from_crawler(crawler)
            self.concurrent_items = crawler.settings.getint('CONCURRENT_ITEMS')
            self.crawler = crawler
            self.signals = crawler.signals
            self.logformatter = crawler.logformatter

itemproc_cls is where the item pipelines get loaded.
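
By default ITEM_PROCESSOR points at scrapy.pipelines.ItemPipelineManager, which in turn instantiates every class listed in ITEM_PIPELINES. A minimal sketch of what that line does, assuming the default setting:

    from scrapy.utils.misc import load_object

    # load_object just imports a dotted path and returns the object it names.
    itemproc_cls = load_object('scrapy.pipelines.ItemPipelineManager')
    print(itemproc_cls)
    # Scraper then calls itemproc_cls.from_crawler(crawler), which builds the
    # pipeline chain from the ITEM_PIPELINES setting.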

It also processes the Items and Requests returned by the spider:

        def _process_spidermw_output(self, output, request, response, spider):
            """Process each Request/Item (given in the output parameter) returned
            from the given spider
            """
            if isinstance(output, Request):
                self.crawler.engine.crawl(request=output, spider=spider)
            elif isinstance(output, (BaseItem, dict)):
                self.slot.itemproc_size += 1
                dfd = self.itemproc.process_item(output, spider)
                dfd.addBoth(self._itemproc_finished, output, response, spider)
                return dfd
            elif output is None:
                pass
            else:
                typename = type(output).__name__
                logger.error('Spider must return Request, BaseItem, dict or None, '
                             'got %(typename)r in %(request)s',
                             {'request': request, 'typename': typename},
                             extra={'spider': spider})
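
For context, here is the kind of spider callback whose output flows through the branching above, one object at a time (the site and selectors are purely illustrative):

    import scrapy

    class QuotesSpider(scrapy.Spider):
        name = 'quotes'
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            # dicts/Items take the itemproc branch (item pipelines)...
            yield {'title': response.css('title::text').get()}
            # ...while Requests take the engine.crawl branch.
            next_page = response.css('li.next a::attr(href)').get()
            if next_page:
                yield response.follow(next_page, callback=self.parse)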

Reference source: https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/__init__.py

    class Downloader(object):
    
        def __init__(self, crawler):
            self.settings = crawler.settings
            self.signals = crawler.signals
            self.slots = {}
            self.active = set()
            self.handlers = DownloadHandlers(crawler)
            self.total_concurrency = self.settings.getint('CONCURRENT_REQUESTS')
            self.domain_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
            self.ip_concurrency = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
            self.randomize_delay = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
            self.middleware = DownloaderMiddlewareManager.from_crawler(crawler)
            self._slot_gc_loop = task.LoopingCall(self._slot_gc)
            self._slot_gc_loop.start(60)
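
The throttling knobs read above map directly onto familiar settings; an illustrative settings.py fragment (the values are examples):

    CONCURRENT_REQUESTS = 32             # total_concurrency
    CONCURRENT_REQUESTS_PER_DOMAIN = 8   # domain_concurrency
    CONCURRENT_REQUESTS_PER_IP = 0       # ip_concurrency (0 disables per-IP limits)
    RANDOMIZE_DOWNLOAD_DELAY = True      # randomize_delay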

Then we turn to the engine. Put simply, the engine loads the downloader (and with it the downloader middleware) at runtime:

    class ExecutionEngine(object):
    
        def __init__(self, crawler, spider_closed_callback):
            self.crawler = crawler
            self.settings = crawler.settings
            self.signals = crawler.signals
            self.logformatter = crawler.logformatter
            self.slot = None
            self.spider = None
            self.running = False
            self.paused = False
            self.scheduler_cls = load_object(self.settings['SCHEDULER'])
            downloader_cls = load_object(self.settings['DOWNLOADER'])
            self.downloader = downloader_cls(crawler)
            self.scraper = Scraper(crawler)
            self._spider_closed_callback = spider_closed_callback

The Scraper is also attached to the ExecutionEngine.
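
Putting it together, a spider can inspect this wiring at runtime. A small sketch (the spider and placeholder URL are illustrative, not part of Scrapy):

    from scrapy import Spider
    from scrapy.crawler import CrawlerProcess

    class DemoSpider(Spider):
        name = 'demo'
        start_urls = ['http://example.com']  # placeholder URL

        def parse(self, response):
            engine = self.crawler.engine
            self.logger.info('downloader: %r', engine.downloader)
            self.logger.info('scraper:    %r', engine.scraper)
            self.logger.info('itemproc:   %r', engine.scraper.itemproc)

    process = CrawlerProcess()
    process.crawl(DemoSpider)
    process.start()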