Source code for zyte_spider_templates._incremental.middleware

import logging
from typing import AsyncGenerator, Union

from scrapinghub.client.exceptions import Unauthorized
from scrapy.crawler import Crawler
from scrapy.exceptions import CloseSpider, NotConfigured
from scrapy.http import Request
from zyte_common_items import Item

from .manager import CollectionsFingerprintsManager, IncrementalCrawlingManager

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class IncrementalCrawlMiddleware:
    """:ref:`Spider middleware <topics-spider-middleware>` to skip items seen
    in previous crawls.

    To enable this middleware, set the :setting:`INCREMENTAL_CRAWL_ENABLED`
    setting to ``True``.

    This middleware keeps a record of URLs of crawled items in the :ref:`Zyte
    Scrapy Cloud collection <api-collections>` specified in the
    :setting:`INCREMENTAL_CRAWL_COLLECTION_NAME` setting, and skips items,
    responses and requests with matching URLs.

    Use :setting:`INCREMENTAL_CRAWL_BATCH_SIZE` to fine-tune interactions with
    the collection for performance.
    """

    def __init__(self, crawler: Crawler):
        """Build the middleware for *crawler*.

        :raises NotConfigured: if ``INCREMENTAL_CRAWL_ENABLED`` is not set to
            a true value in the spider settings.
        :raises CloseSpider: if the incremental manager cannot be prepared
            (see :meth:`prepare_incremental_manager`).
        """
        # The spider must already be attached to the crawler: its settings
        # decide whether this middleware is active at all.
        assert crawler.spider
        if not crawler.spider.settings.getbool("INCREMENTAL_CRAWL_ENABLED", False):
            raise NotConfigured
        self.inc_manager: IncrementalCrawlingManager = (
            self.prepare_incremental_manager(crawler)
        )

    @staticmethod
    def prepare_incremental_manager(crawler):
        """Create the :class:`IncrementalCrawlingManager` for *crawler*.

        A :class:`CollectionsFingerprintsManager` is built first and handed to
        the incremental manager together with the crawler.

        :raises CloseSpider: with reason
            ``incremental_crawling_middleware_collection_issue`` when building
            the collections manager fails with ``AttributeError``,
            ``Unauthorized``, ``RuntimeError`` or ``ValueError``. The original
            error is logged and attached as the exception cause.
        """
        try:
            collection_fp = CollectionsFingerprintsManager(crawler)
        except (AttributeError, Unauthorized, RuntimeError, ValueError) as exc_info:
            logger.error(
                "IncrementalCrawlMiddleware is enabled, but something went wrong with Collections.\n"
                "The reason: %s",
                exc_info,
            )
            # Chain explicitly so the underlying Collections failure stays
            # visible in the traceback as the direct cause.
            raise CloseSpider(
                "incremental_crawling_middleware_collection_issue"
            ) from exc_info
        return IncrementalCrawlingManager(crawler, collection_fp)

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        """Alternate constructor: return ``cls(crawler)``."""
        return cls(crawler)

    async def process_spider_output(
        self, response, result, spider
    ) -> AsyncGenerator[Union[Request, Item], None]:
        """Pass the callback output through the incremental manager.

        The whole asynchronous *result* is consumed into a list before
        anything is yielded, because the manager is given the complete output
        of the callback at once, together with ``response.request``. Whatever
        the manager returns is then yielded one entry at a time (presumably
        the entries not matched against previous crawls; the filtering logic
        itself lives in the manager).

        *spider* is accepted for signature compatibility and is not used.
        """
        result_list = [item_or_request async for item_or_request in result]
        unique_items_or_requests = await self.inc_manager.process_incremental_async(
            response.request, result_list
        )
        for item_or_request in unique_items_or_requests:
            yield item_or_request