Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/apify/scrapy/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,21 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
elif scrapy_request.meta.get('apify_request_unique_key'):
request_kwargs['unique_key'] = scrapy_request.meta['apify_request_unique_key']

# Serialize the Scrapy request now, before `Request.from_url()` runs below. `from_url()` mutates the
# `user_data` dict it receives in place (it injects a live `CrawleeRequestData` under `__crawlee`), and that
# dict can be the spider's own `meta['userData']`. Capturing `to_dict()` first keeps the stored blob free of
# those injected internals, and copying `user_data` below leaves the spider's request untouched.
scrapy_request_dict = scrapy_request.to_dict(spider=spider)

user_data = scrapy_request.meta.get('userData', {})

# Convert a `UserData` Pydantic model to a plain dict to prevent CrawleeRequestData objects from leaking
# into Request.from_url() during Scrapy-Apify roundtrips. `model_dump()` already returns a fresh dict; the
# plain-dict case is copied so the `pop` and `from_url()` mutations below never touch the spider's meta.
if isinstance(user_data, UserData):
user_data = user_data.model_dump(by_alias=True)
elif isinstance(user_data, dict):
user_data = dict(user_data)

# Remove internal Crawlee data since it's managed by Request.from_url() and values from previous roundtrips
# cause incorrect state.
Expand Down Expand Up @@ -117,7 +126,6 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
)

apify_request = ApifyRequest.from_url(**request_kwargs)
scrapy_request_dict = scrapy_request.to_dict(spider=spider)

except Exception as exc:
logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
Expand Down
23 changes: 23 additions & 0 deletions tests/unit/scrapy/requests/test_to_apify_request.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import json
import logging
from typing import cast

Expand Down Expand Up @@ -140,6 +141,28 @@ def test_roundtrip_follow_up_request_with_propagated_userdata(spider: Spider) ->
assert follow_up_apify_request.url == 'https://example.com/image.png'


def test_does_not_mutate_spider_request_user_data(spider: Spider) -> None:
    """The caller-owned `meta['userData']` dict must come out of the conversion exactly as it went in.

    The dict passed in through `meta` is the same object inspected afterwards, so an in-place change
    made during conversion (such as an added `__crawlee` key) fails the test.
    """
    original_user_data = {'some_user_data': 'test'}
    request_meta = {'userData': original_user_data}

    to_apify_request(Request(url='https://example.com', meta=request_meta), spider)

    # The equality check covers any change; the key check names the specific leak being guarded against.
    assert original_user_data == {'some_user_data': 'test'}
    assert '__crawlee' not in original_user_data


def test_serialized_request_omits_injected_crawlee_data(spider: Spider) -> None:
    """The `scrapy_request` blob stored on the Apify request must carry no `__crawlee` key in its `userData`."""
    request_meta = {'userData': {'some_user_data': 'test'}}
    converted = to_apify_request(Request(url='https://example.com', meta=request_meta), spider)
    assert converted is not None

    # Decode the stored blob and look only at the user data that was serialized with the Scrapy request.
    serialized = cast('str', converted.user_data['scrapy_request'])
    decoded_user_data = json.loads(serialized)['meta'].get('userData', {})
    assert '__crawlee' not in decoded_user_data


def test_dont_filter_request_is_always_enqueued(spider: Spider) -> None:
"""A `dont_filter=True` request is always enqueued: each conversion gets a fresh unique key, bypassing dedup."""
first = to_apify_request(Request(url='https://example.com', dont_filter=True), spider)
Expand Down