diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 1524d7aa..ce031f67 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -80,12 +80,21 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ elif scrapy_request.meta.get('apify_request_unique_key'): request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key'] + # Serialize the Scrapy request now, before `Request.from_url()` runs below. `from_url()` mutates the + # `user_data` dict it receives in place (it injects a live `CrawleeRequestData` under `__crawlee`), and that + # dict can be the spider's own `meta['userData']`. Capturing `to_dict()` first keeps the stored blob free of + # those injected internals, and copying `user_data` below leaves the spider's request untouched. + scrapy_request_dict = scrapy_request.to_dict(spider=spider) + user_data = scrapy_request.meta.get('userData', {}) # Convert UserData Pydantic model to a plain dict to prevent CrawleeRequestData objects from leaking - # into Request.from_url() during Scrapy-Apify roundtrips. + # into Request.from_url() during Scrapy-Apify roundtrips. `model_dump()` already returns a fresh dict; the + # plain-dict case is copied so the `pop` and `from_url()` mutations below never touch the spider's meta. if isinstance(user_data, UserData): user_data = user_data.model_dump(by_alias=True) + elif isinstance(user_data, dict): + user_data = dict(user_data) # Remove internal Crawlee data since it's managed by Request.from_url() and values from previous roundtrips # cause incorrect state. @@ -117,7 +126,6 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ ) apify_request = ApifyRequest.from_url(**request_kwargs) - scrapy_request_dict = scrapy_request.to_dict(spider=spider) except Exception as exc: logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}') diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py index 482d5c5d..c7e94f21 100644 --- a/tests/unit/scrapy/requests/test_to_apify_request.py +++ b/tests/unit/scrapy/requests/test_to_apify_request.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import logging from typing import cast @@ -140,6 +141,28 @@ def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) -> assert follow_up_apify_request.url == 'https://example.com/image.png' +def test_does_not_mutate_spider_request_user_data(spider: Spider) -> None: + """Conversion must not mutate the spider's own `meta['userData']` by injecting Crawlee internals.""" + user_data = {'some_user_data': 'test'} + scrapy_request = Request(url='https://example.com', meta={'userData': user_data}) + + to_apify_request(scrapy_request, spider) + + assert user_data == {'some_user_data': 'test'} + assert '__crawlee' not in user_data + + +def test_serialized_request_omits_injected_crawlee_data(spider: Spider) -> None: + """The stored `scrapy_request` blob must not embed the `__crawlee` data `Request.from_url()` injects.""" + scrapy_request = Request(url='https://example.com', meta={'userData': {'some_user_data': 'test'}}) + + apify_request = to_apify_request(scrapy_request, spider) + assert apify_request is not None + + stored = json.loads(cast('str', apify_request.user_data['scrapy_request'])) + assert '__crawlee' not in stored['meta'].get('userData', {}) + + def test_dont_filter_request_is_always_enqueued(spider: Spider) -> None: """A `dont_filter=True` request is always enqueued: each conversion gets a fresh unique key, bypassing dedup.""" first = to_apify_request(Request(url='https://example.com', dont_filter=True), spider)