From 9a8347136fc41fb23f58c4c093226799c23ce4bb Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 26 Apr 2022 20:59:18 +0800 Subject: [PATCH 01/28] create ZyteAPITextResponse and ZyteAPIResponse --- scrapy_zyte_api/handler.py | 37 +++++----------- scrapy_zyte_api/responses.py | 52 +++++++++++++++++++++++ tests/test_responses.py | 81 ++++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+), 27 deletions(-) create mode 100644 scrapy_zyte_api/responses.py create mode 100644 tests/test_responses.py diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 92035c41..c80878ba 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,14 +1,13 @@ import json import logging import os -from base64 import b64decode -from typing import Any, Dict, Generator, List, Optional +from typing import Any, Dict, Generator, Union from scrapy import Spider from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.crawler import Crawler from scrapy.exceptions import IgnoreRequest, NotConfigured -from scrapy.http import Request, Response, TextResponse +from scrapy.http import Request from scrapy.settings import Settings from scrapy.utils.defer import deferred_from_coro from scrapy.utils.reactor import verify_installed_reactor @@ -16,6 +15,8 @@ from zyte_api.aio.client import AsyncClient, create_session from zyte_api.aio.errors import RequestError +from .responses import ZyteAPIResponse, ZyteAPITextResponse + logger = logging.getLogger(__name__) @@ -53,7 +54,9 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: else: return super().download_request(request, spider) - async def _download_request(self, request: Request, spider: Spider) -> Response: + async def _download_request( + self, request: Request, spider: Spider + ) -> Union[ZyteAPITextResponse, ZyteAPIResponse]: api_params: Dict[str, Any] = request.meta["zyte_api"] if not isinstance(api_params, dict): logger.error( @@ -81,30 +84,16 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: ) raise IgnoreRequest() self._stats.inc_value("scrapy-zyte-api/request_count") - headers = self._prepare_headers(api_response.get("httpResponseHeaders")) # browserHtml and httpResponseBody are not allowed at the same time, # but at least one of them should be present if api_response.get("browserHtml"): # Using TextResponse because browserHtml always returns a browser-rendered page # even when requesting files (like images) - return TextResponse( - url=api_response["url"], - status=200, - body=api_response["browserHtml"].encode(self._encoding), - encoding=self._encoding, - request=request, - flags=["zyte-api"], - headers=headers, + return ZyteAPITextResponse.from_api_response( + api_response, request=request, encoding=self._encoding ) else: - return Response( - url=api_response["url"], - status=200, - body=b64decode(api_response["httpResponseBody"]), - request=request, - flags=["zyte-api"], - headers=headers, - ) + return ZyteAPIResponse.from_api_response(api_response, request=request) @inlineCallbacks def close(self) -> Generator: @@ -129,9 +118,3 @@ def _get_request_error_message(error: RequestError) -> str: if error_data.get("detail"): return error_data["detail"] return base_message - - @staticmethod - def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]): - if not init_headers: - return None - return {h["name"]: h["value"] for h in init_headers} diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py new file mode 100644 index 00000000..dcff522d --- /dev/null +++ b/scrapy_zyte_api/responses.py @@ -0,0 +1,52 @@ +from base64 import b64decode +from typing import Dict, List, Optional + +from scrapy import Request +from scrapy.http import Response, TextResponse + + +class ZyteAPIMixin: + def __init__(self, *args, zyte_api_response=None, **kwargs): + super().__init__(*args, **kwargs) + self._zyte_api_response = zyte_api_response + + @property + def zyte_api_response(self): + return self._zyte_api_response + + @staticmethod + def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]): + if not init_headers: + return None + return {h["name"]: h["value"] for h in init_headers} + + +class ZyteAPITextResponse(ZyteAPIMixin, TextResponse): + @classmethod + def from_api_response( + cls, api_response: Dict, *, request: Request = None, encoding: str = "utf-8" + ): + return cls( + url=api_response["url"], + status=200, + body=api_response["browserHtml"].encode(encoding), + encoding=encoding, + request=request, + flags=["zyte-api"], + headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), + zyte_api_response=api_response, + ) + + +class ZyteAPIResponse(ZyteAPIMixin, Response): + @classmethod + def from_api_response(cls, api_response: Dict, *, request: Request = None): + return cls( + url=api_response["url"], + status=200, + body=b64decode(api_response["httpResponseBody"]), + request=request, + flags=["zyte-api"], + headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), + zyte_api_response=api_response, + ) diff --git a/tests/test_responses.py b/tests/test_responses.py new file mode 100644 index 00000000..47067edf --- /dev/null +++ b/tests/test_responses.py @@ -0,0 +1,81 @@ +from base64 import b64encode + +import pytest + +from scrapy_zyte_api.responses import ZyteAPIResponse, ZyteAPITextResponse + +PAGE_CONTENT = "The cake is a lie!" +URL = "https://example.com" + + +def api_response_browser(): + return { + "url": URL, + "browserHtml": PAGE_CONTENT, + "javascript": True, + "echoData": {"some_value": "here"}, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + ], + } + + +def api_response_body(): + return { + "url": "https://example.com", + "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), + "echoData": {"some_value": "here"}, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + ], + } + + +EXPECTED_HEADERS = {b"Content-Type": [b"text/html"], b"Content-Length": [b"44"]} +EXPECTED_BODY = PAGE_CONTENT.encode("utf-8") + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (api_response_browser, ZyteAPITextResponse), + (api_response_body, ZyteAPIResponse), + ], +) +def test_init(api_response, cls): + response = cls(URL, zyte_api_response=api_response()) + assert response.zyte_api_response == api_response() + + assert response.url == URL + assert response.status == 200 + assert not response.headers + assert response.body == b"" + assert not response.flags + assert response.request is None + assert response.certificate is None + assert response.ip_address is None + assert response.protocol is None + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (api_response_browser, ZyteAPITextResponse), + (api_response_body, ZyteAPIResponse), + ], +) +def test_text_from_api_response(api_response, cls): + response = cls.from_api_response(api_response()) + assert response.zyte_api_response == api_response() + + assert response.url == URL + assert response.status == 200 + assert response.headers == EXPECTED_HEADERS + assert response.body == EXPECTED_BODY + assert response.flags == ["zyte-api"] + assert response.request is None + assert response.certificate is None + assert response.ip_address is None + assert response.protocol is None From 8909473a97e3708cf59fdb1035cfe1c350be1eb9 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 27 Apr 2022 19:41:17 +0800 Subject: [PATCH 02/28] update README and CHANGES with notes on new response classes --- CHANGES.rst | 8 ++++++ README.rst | 81 +++++++++++++++++++++++++++++++++++------------------ 2 files changed, 61 insertions(+), 28 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 53ed7f11..54bd9448 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,14 @@ Changes ======= +TBD +--- + +* Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses + of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively. + These new response classes hold the raw Zyte API response in its + ``zyte_api_response`` attribute. + 0.1.0 (2022-02-03) ------------------ diff --git a/README.rst b/README.rst index 3ef63fc9..a99fadcb 100644 --- a/README.rst +++ b/README.rst @@ -33,8 +33,8 @@ Installation This package requires Python 3.7+. -How to configure ----------------- +Configuration +------------- Replace the default ``http`` and ``https`` in Scrapy's `DOWNLOAD_HANDLERS `_ @@ -60,8 +60,8 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" -How to use ----------- +Usage +----- Set the ``zyte_api`` `Request.meta `_ @@ -70,27 +70,52 @@ key to download a request using Zyte API. Full list of parameters is provided in .. code-block:: python - import scrapy - - - class TestSpider(scrapy.Spider): - name = "test" - - def start_requests(self): - - yield scrapy.Request( - url="http://books.toscrape.com/", - callback=self.parse, - meta={ - "zyte_api": { - "browserHtml": True, - # You can set any GEOLocation region you want. - "geolocation": "US", - "javascript": True, - "echoData": {"something": True}, - } - }, - ) - - def parse(self, response): - yield {"URL": response.url, "status": response.status, "HTML": response.body} + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + + yield scrapy.Request( + url="http://books.toscrape.com/", + callback=self.parse, + meta={ + "zyte_api": { + "browserHtml": True, + "geolocation": "US", # You can set any Geolocation region you want. + "javascript": True, + "echoData": {"some_value_I_could_track": 123}, + } + }, + ) + + def parse(self, response): + yield {"URL": response.url, "status": response.status, "HTML": response.body} + + print(response.zyte_api_response) + # { + # 'url': 'https://quotes.toscrape.com/', + # 'browserHtml': ' ... ', + # 'echoData': {'some_value_I_could_track': 123}, + # 'actions': [] + # } + + print(response.request.meta) + # { + # 'zyte_api': { + # 'browserHtml': True, + # 'geolocation': 'US', + # 'javascript': True, + # 'echoData': {'some_value_I_could_track': 123} + # }, + # 'download_timeout': 180.0, + # 'download_slot': 'quotes.toscrape.com' + # } + +The raw Zyte API Response can be accessed via the ``zyte_api_response`` attribute +of the response object. Note that such responses are of ``ZyteAPIResponse`` and +``ZyteAPITextResponse`` which are respectively subclasses of ``scrapy.http.Response`` +and ``scrapy.http.TextResponse``. Such classes are needed to hold the raw Zyte API +responses. From d0dc08ddd06b6f21800ccd2dd93af199792c88e6 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 28 Apr 2022 19:44:31 +0800 Subject: [PATCH 03/28] set the encoding consistently to be 'utf-8' --- scrapy_zyte_api/handler.py | 5 +---- scrapy_zyte_api/responses.py | 7 ++----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index c80878ba..273a8e3d 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -32,7 +32,6 @@ def __init__( self._stats = crawler.stats self._job_id = crawler.settings.get("JOB") self._session = create_session() - self._encoding = "utf-8" @classmethod def from_crawler(cls, crawler): @@ -89,9 +88,7 @@ async def _download_request( if api_response.get("browserHtml"): # Using TextResponse because browserHtml always returns a browser-rendered page # even when requesting files (like images) - return ZyteAPITextResponse.from_api_response( - api_response, request=request, encoding=self._encoding - ) + return ZyteAPITextResponse.from_api_response(api_response, request=request) else: return ZyteAPIResponse.from_api_response(api_response, request=request) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index dcff522d..6e5c5735 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -23,14 +23,11 @@ def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]): class ZyteAPITextResponse(ZyteAPIMixin, TextResponse): @classmethod - def from_api_response( - cls, api_response: Dict, *, request: Request = None, encoding: str = "utf-8" - ): + def from_api_response(cls, api_response: Dict, *, request: Request = None): return cls( url=api_response["url"], status=200, - body=api_response["browserHtml"].encode(encoding), - encoding=encoding, + body=api_response["browserHtml"].encode("utf-8"), request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), From 109dbf0f7f4dd6bf54d3cdd2b08504130439c636 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 28 Apr 2022 20:00:01 +0800 Subject: [PATCH 04/28] improve example and docs --- README.rst | 1 - scrapy_zyte_api/responses.py | 13 ++++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a99fadcb..13223d3a 100644 --- a/README.rst +++ b/README.rst @@ -99,7 +99,6 @@ key to download a request using Zyte API. Full list of parameters is provided in # 'url': 'https://quotes.toscrape.com/', # 'browserHtml': ' ... ', # 'echoData': {'some_value_I_could_track': 123}, - # 'actions': [] # } print(response.request.meta) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 6e5c5735..d3d260aa 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -11,7 +11,12 @@ def __init__(self, *args, zyte_api_response=None, **kwargs): self._zyte_api_response = zyte_api_response @property - def zyte_api_response(self): + def zyte_api_response(self) -> Dict: + """Contains the raw API response from Zyte API. + + To see the full list of parameters and their description, kindly refer to the + `Zyte API Specification `_. + """ return self._zyte_api_response @staticmethod @@ -24,6 +29,9 @@ def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]): class ZyteAPITextResponse(ZyteAPIMixin, TextResponse): @classmethod def from_api_response(cls, api_response: Dict, *, request: Request = None): + """Alternative constructor to instantiate the response from the raw + Zyte API response. + """ return cls( url=api_response["url"], status=200, @@ -38,6 +46,9 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): class ZyteAPIResponse(ZyteAPIMixin, Response): @classmethod def from_api_response(cls, api_response: Dict, *, request: Request = None): + """Alternative constructor to instantiate the response from the raw + Zyte API response. + """ return cls( url=api_response["url"], status=200, From 9695880ef8d20873b06b9f6ba2da335754c54fcf Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 28 Apr 2022 20:12:52 +0800 Subject: [PATCH 05/28] override replace() to prevent 'zyte_api_response' attribute from being lost --- scrapy_zyte_api/responses.py | 12 +++++++++++- tests/test_responses.py | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index d3d260aa..65207bec 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -6,10 +6,20 @@ class ZyteAPIMixin: - def __init__(self, *args, zyte_api_response=None, **kwargs): + def __init__(self, *args, zyte_api_response: Dict = None, **kwargs): super().__init__(*args, **kwargs) self._zyte_api_response = zyte_api_response + def replace(self, *args, **kwargs): + """Create a new response with the same attributes except for those given + new values. + + NOTE: This doesn't support replacing the ``zyte_api_response`` attribute. + """ + instance = super().replace(*args, **kwargs) + instance._zyte_api_response = self.zyte_api_response + return instance + @property def zyte_api_response(self) -> Dict: """Contains the raw API response from Zyte API. diff --git a/tests/test_responses.py b/tests/test_responses.py index 47067edf..f63f1421 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -79,3 +79,23 @@ def test_text_from_api_response(api_response, cls): assert response.certificate is None assert response.ip_address is None assert response.protocol is None + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (api_response_browser, ZyteAPITextResponse), + (api_response_body, ZyteAPIResponse), + ], +) +def test_response_replace(api_response, cls): + orig_response = cls.from_api_response(api_response()) + + # The ``zyte_api_response`` should not be replaced. + new_response = orig_response.replace(zyte_api_response={"overridden": "value"}) + assert new_response.zyte_api_response == orig_response.zyte_api_response + + # It should still work the same way + new_response = orig_response.replace(status=404) + assert new_response.status == 404 + assert new_response.zyte_api_response == orig_response.zyte_api_response From 8812a05b0cf0d71a7c7069f6d3ad5b3819361f53 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 28 Apr 2022 20:20:22 +0800 Subject: [PATCH 06/28] fix mypy failures --- scrapy_zyte_api/responses.py | 2 +- tests/test_api_requests.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 65207bec..5183686f 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -21,7 +21,7 @@ def replace(self, *args, **kwargs): return instance @property - def zyte_api_response(self) -> Dict: + def zyte_api_response(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. To see the full list of parameters and their description, kindly refer to the diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index f6cca9e7..c62640ed 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -44,7 +44,7 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): coro = handler._download_request(req, Spider("test")) assert iscoroutine(coro) assert not isinstance(coro, Deferred) - resp = await coro # NOQA + resp = await coro # type: ignore assert isinstance(resp, TextResponse) assert resp.request is req @@ -81,7 +81,7 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]) coro = handler._download_request(req, Spider("test")) assert iscoroutine(coro) assert not isinstance(coro, Deferred) - resp = await coro # NOQA + resp = await coro # type: ignore assert isinstance(resp, Response) assert resp.request is req @@ -109,7 +109,7 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any coro = handler._download_request(req, Spider("test")) assert iscoroutine(coro) assert not isinstance(coro, Deferred) - resp = await coro # NOQA + resp = await coro # type: ignore assert resp.request is req assert resp.url == req.url From ba64103d617649317a59ddcb11690e82d14f236f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 28 Apr 2022 20:59:48 +0800 Subject: [PATCH 07/28] enforce 'utf-8' encoding on Text responses --- scrapy_zyte_api/responses.py | 5 ++++- tests/test_responses.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 5183686f..87cfd45a 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -4,6 +4,8 @@ from scrapy import Request from scrapy.http import Response, TextResponse +_ENCODING = "utf-8" + class ZyteAPIMixin: def __init__(self, *args, zyte_api_response: Dict = None, **kwargs): @@ -45,7 +47,8 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): return cls( url=api_response["url"], status=200, - body=api_response["browserHtml"].encode("utf-8"), + body=api_response["browserHtml"].encode(_ENCODING), + encoding=_ENCODING, request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), diff --git a/tests/test_responses.py b/tests/test_responses.py index f63f1421..2ee06cab 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -99,3 +99,24 @@ def test_response_replace(api_response, cls): new_response = orig_response.replace(status=404) assert new_response.status == 404 assert new_response.zyte_api_response == orig_response.zyte_api_response + + +def test_non_utf8_response(): + content = "Some non-ASCII ✨ chars" + sample_zyte_api_response = { + "url": URL, + "browserHtml": content, + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html; charset=iso-8859-1"}, + {"name": "Content-Length", "value": len(content)}, + ], + } + + # Encoding inference should not kick in under the hood for + # ``scrapy.http.TextResponse`` since ``ZyteAPITextResponse`` using "utf-8" + # for it. This is the default encoding for the "browserHtml" contents from + # Zyte API. Thus, even if the Response Headers or tags indicate a + # different encoding, it should still be treated as "utf-8". + response = ZyteAPITextResponse.from_api_response(sample_zyte_api_response) + assert response.text == content + assert response.encoding == "utf-8" From 84dac7de138f50f6e0727384fafccad6062817a5 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 29 Apr 2022 11:22:36 +0800 Subject: [PATCH 08/28] update expectation for replacing zyte_api_response attribute --- scrapy_zyte_api/responses.py | 6 +----- tests/test_responses.py | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 87cfd45a..a51d3c21 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -15,12 +15,8 @@ def __init__(self, *args, zyte_api_response: Dict = None, **kwargs): def replace(self, *args, **kwargs): """Create a new response with the same attributes except for those given new values. - - NOTE: This doesn't support replacing the ``zyte_api_response`` attribute. """ - instance = super().replace(*args, **kwargs) - instance._zyte_api_response = self.zyte_api_response - return instance + return super().replace(*args, **kwargs) @property def zyte_api_response(self) -> Optional[Dict]: diff --git a/tests/test_responses.py b/tests/test_responses.py index 2ee06cab..a24bac93 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -91,14 +91,29 @@ def test_text_from_api_response(api_response, cls): def test_response_replace(api_response, cls): orig_response = cls.from_api_response(api_response()) - # The ``zyte_api_response`` should not be replaced. - new_response = orig_response.replace(zyte_api_response={"overridden": "value"}) - assert new_response.zyte_api_response == orig_response.zyte_api_response - # It should still work the same way new_response = orig_response.replace(status=404) assert new_response.status == 404 - assert new_response.zyte_api_response == orig_response.zyte_api_response + + new_response = orig_response.replace(url="https://new-example.com") + assert new_response.url == "https://new-example.com" + + +@pytest.mark.xfail +@pytest.mark.parametrize( + "api_response,cls", + [ + (api_response_browser, ZyteAPITextResponse), + (api_response_body, ZyteAPIResponse), + ], +) +def test_response_replace_zyte_api_response(api_response, cls): + orig_response = cls.from_api_response(api_response()) + + # The ``zyte_api_response`` should not be replaced. + new_zyte_api_response = {"overridden": "value"} + new_response = orig_response.replace(zyte_api_response=new_zyte_api_response) + assert new_response.zyte_api_response == api_response() def test_non_utf8_response(): From 5b834437f144d76b28c5f83c960a3692954e408d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 29 Apr 2022 21:28:55 +0800 Subject: [PATCH 09/28] update README regarding default params --- README.rst | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 13223d3a..34c45dd1 100644 --- a/README.rst +++ b/README.rst @@ -60,13 +60,40 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +To enable every request to be sent through Zyte API, you can set the following +in the ``settings.py`` file: + +.. code-block:: python + + ZYTE_API_ENABLED = True + +Moreover, it can also be set inside the spider as an attribute: + +.. code-block:: python + + class MySpider: + zyte_api_enabled = True + +The default parameters sent for every request could be declared in the ``settings.py`` +file or `any other settings `_ +as: + +.. code-block:: python + + ZYTE_API_DEFAULT_PARAMS = { + "browserHtml": True, + "geolocation": "US", + } + +You can see the full list of parameters in the `Zyte API Specification +`_. + Usage ----- -Set the ``zyte_api`` `Request.meta -`_ -key to download a request using Zyte API. Full list of parameters is provided in the -`Zyte API Specification `_. +Setting ``ZYTE_API_ENABLED=True`` would enable Zyte API for every request. On the +other hand, you could also control it on a per request basis by setting the +``zyte_api`` key in `Request.meta `_. .. code-block:: python From fb0b41285da303f72be4a47fbe24b21b5aeb3494 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 13:34:12 +0800 Subject: [PATCH 10/28] remove 'Content-Encoding' header when returning responses --- scrapy_zyte_api/responses.py | 18 +++++++++++++++--- tests/test_responses.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index a51d3c21..f0fc4835 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -8,6 +8,14 @@ class ZyteAPIMixin: + + REMOVE_HEADERS = { + # Zyte API already decompresses the HTTP Response Body. Scrapy's + # HttpCompressionMiddleware will error out when it attempts to + # decompress an already decompressed body based on this header. + "content-encoding" + } + def __init__(self, *args, zyte_api_response: Dict = None, **kwargs): super().__init__(*args, **kwargs) self._zyte_api_response = zyte_api_response @@ -27,11 +35,15 @@ def zyte_api_response(self) -> Optional[Dict]: """ return self._zyte_api_response - @staticmethod - def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]): + @classmethod + def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]): if not init_headers: return None - return {h["name"]: h["value"] for h in init_headers} + return { + h["name"]: h["value"] + for h in init_headers + if h["name"].lower() not in cls.REMOVE_HEADERS + } class ZyteAPITextResponse(ZyteAPIMixin, TextResponse): diff --git a/tests/test_responses.py b/tests/test_responses.py index a24bac93..6eb949e2 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -135,3 +135,32 @@ def test_non_utf8_response(): response = ZyteAPITextResponse.from_api_response(sample_zyte_api_response) assert response.text == content assert response.encoding == "utf-8" + + +@pytest.mark.parametrize( + "api_response,cls", + [ + (api_response_browser, ZyteAPITextResponse), + (api_response_body, ZyteAPIResponse), + ], +) +def test_response_headers_removal(api_response, cls): + """Headers like 'Content-Encoding' should be removed later in the response + instance returned to Scrapy. + + However, it should still be present inside 'zyte_api_response.headers'. + """ + additional_headers = [ + {"name": "Content-Encoding", "value": "gzip"}, + {"name": "X-Some-Other-Value", "value": "123"}, + ] + raw_response = api_response() + raw_response["httpResponseHeaders"] = additional_headers + + response = cls.from_api_response(raw_response) + + assert response.headers == {b"X-Some-Other-Value": [b"123"]} + assert ( + response.zyte_api_response["httpResponseHeaders"] + == raw_response["httpResponseHeaders"] + ) From 10a460332b4a47b4ea479cd2378230dfd1378cf0 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 20:45:44 +0800 Subject: [PATCH 11/28] remove the ZYTE_API_ENABLED setting --- README.rst | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/README.rst b/README.rst index 34c45dd1..5f844e32 100644 --- a/README.rst +++ b/README.rst @@ -46,7 +46,7 @@ Lastly, make sure to `install the asyncio-based Twisted reactor `_ in the ``settings.py`` file as well: -Here's example of the things needed inside a Scrapy project's ``settings.py`` file: +Here's an example of the things needed inside a Scrapy project's ``settings.py`` file: .. code-block:: python @@ -60,39 +60,38 @@ Here's example of the things needed inside a Scrapy project's ``settings.py`` fi TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +Usage +----- + To enable every request to be sent through Zyte API, you can set the following -in the ``settings.py`` file: +in the ``settings.py`` file or `any other settings within Scrapy +`_: .. code-block:: python - ZYTE_API_ENABLED = True + ZYTE_API_DEFAULT_PARAMS = { + "browserHtml": True, + "geolocation": "US", + } Moreover, it can also be set inside the spider as an attribute: .. code-block:: python class MySpider: - zyte_api_enabled = True + zyte_api_default_params = { + "browserHtml": True, + "geolocation": "US", + } -The default parameters sent for every request could be declared in the ``settings.py`` -file or `any other settings `_ -as: - -.. code-block:: python - - ZYTE_API_DEFAULT_PARAMS = { - "browserHtml": True, - "geolocation": "US", - } +If the default parameters are both set in the ``settings.py`` and the **spider**, +the values within ``settings.py`` will take effect first. The values within the +**spider** will be next, essentially overwriting similar parameters. You can see the full list of parameters in the `Zyte API Specification `_. -Usage ------ - -Setting ``ZYTE_API_ENABLED=True`` would enable Zyte API for every request. On the -other hand, you could also control it on a per request basis by setting the +On the other hand, you could also control it on a per request basis by setting the ``zyte_api`` key in `Request.meta `_. .. code-block:: python From b7102fad4acf7ec4ba390095be4f8fc96057060a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 20:51:35 +0800 Subject: [PATCH 12/28] remove zyte_api_default_params in the spider --- README.rst | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/README.rst b/README.rst index 5f844e32..065e1529 100644 --- a/README.rst +++ b/README.rst @@ -74,20 +74,6 @@ in the ``settings.py`` file or `any other settings within Scrapy "geolocation": "US", } -Moreover, it can also be set inside the spider as an attribute: - -.. code-block:: python - - class MySpider: - zyte_api_default_params = { - "browserHtml": True, - "geolocation": "US", - } - -If the default parameters are both set in the ``settings.py`` and the **spider**, -the values within ``settings.py`` will take effect first. The values within the -**spider** will be next, essentially overwriting similar parameters. - You can see the full list of parameters in the `Zyte API Specification `_. From 2b4a0fb07caf7c763a0c2b3dc364290169198a45 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 2 May 2022 21:27:55 +0800 Subject: [PATCH 13/28] refactor TestAPI to have single producer of requests and responses --- tests/test_api_requests.py | 91 ++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index c62640ed..69d1fd1a 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -23,17 +23,9 @@ class TestAPI: - @pytest.mark.parametrize( - "meta", - [ - {"zyte_api": {"browserHtml": True}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US"}}, - {"zyte_api": {"browserHtml": True, "geolocation": "US", "echoData": 123}}, - {"zyte_api": {"browserHtml": True, "randomParameter": None}}, - ], - ) - @pytest.mark.asyncio - async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): + + @staticmethod + async def produce_request_response(meta): with MockServer() as server: async with make_handler({}, server.urljoin("/")) as handler: req = Request( @@ -45,14 +37,27 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): assert iscoroutine(coro) assert not isinstance(coro, Deferred) resp = await coro # type: ignore + return req, resp - assert isinstance(resp, TextResponse) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.text == "" + @pytest.mark.parametrize( + "meta", + [ + {"zyte_api": {"browserHtml": True}}, + {"zyte_api": {"browserHtml": True, "geolocation": "US"}}, + {"zyte_api": {"browserHtml": True, "geolocation": "US", "echoData": 123}}, + {"zyte_api": {"browserHtml": True, "randomParameter": None}}, + ], + ) + @pytest.mark.asyncio + async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, TextResponse) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" + assert resp.text == "" @pytest.mark.parametrize( "meta", @@ -71,24 +76,13 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): ) @pytest.mark.asyncio async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore - - assert isinstance(resp, Response) - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" + req, resp = await self.produce_request_response(meta) + assert isinstance(resp, Response) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" @pytest.mark.parametrize( "meta", @@ -99,24 +93,13 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]) ) @pytest.mark.asyncio async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any]]): - with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: - req = Request( - "http://example.com", - method="POST", - meta=meta, - ) - coro = handler._download_request(req, Spider("test")) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore - - assert resp.request is req - assert resp.url == req.url - assert resp.status == 200 - assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.headers == {b"Test_Header": [b"test_value"]} + req, resp = await self.produce_request_response(meta) + assert resp.request is req + assert resp.url == req.url + assert resp.status == 200 + assert "zyte-api" in resp.flags + assert resp.body == b"" + assert resp.headers == {b"Test_Header": [b"test_value"]} @pytest.mark.parametrize( "meta, api_relevant", From 97ea1e4b6ec770eeda1dcf846e7e31531ba7a64d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 3 May 2022 10:54:32 +0800 Subject: [PATCH 14/28] implement ZYTE_API_DEFAULT_PARAMS in the settings --- README.rst | 2 ++ scrapy_zyte_api/handler.py | 12 ++++++--- tests/test_api_requests.py | 55 +++++++++++++++++++++++++++++++++++--- 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 065e1529..36ae4241 100644 --- a/README.rst +++ b/README.rst @@ -79,6 +79,8 @@ You can see the full list of parameters in the `Zyte API Specification On the other hand, you could also control it on a per request basis by setting the ``zyte_api`` key in `Request.meta `_. +When doing so, it will override any parameters that was set via the setting +named ``ZYTE_API_DEFAULT_PARAMS``. .. code-block:: python diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 273a8e3d..a395f871 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -31,6 +31,7 @@ def __init__( ) self._stats = crawler.stats self._job_id = crawler.settings.get("JOB") + self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS") self._session = create_session() @classmethod @@ -56,11 +57,14 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: async def _download_request( self, request: Request, spider: Spider ) -> Union[ZyteAPITextResponse, ZyteAPIResponse]: - api_params: Dict[str, Any] = request.meta["zyte_api"] - if not isinstance(api_params, dict): + api_params: Dict[str, Any] = self._zyte_api_default_params or {} + try: + api_params.update(request.meta.get("zyte_api") or {}) + except TypeError: logger.error( - "zyte_api parameters in the request meta should be " - f"provided as dictionary, got {type(api_params)} instead ({request.url})." + f"zyte_api parameters in the request meta should be " + f"provided as dictionary, got {type(request.meta.get('zyte_api'))} " + f"instead ({request.url})." ) raise IgnoreRequest() # Define url by default diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 69d1fd1a..fe81b308 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,6 +1,7 @@ import os from asyncio import iscoroutine from typing import Any, Dict +from unittest import mock import pytest from _pytest.logging import LogCaptureFixture # NOQA @@ -23,17 +24,16 @@ class TestAPI: - @staticmethod - async def produce_request_response(meta): + async def produce_request_response(meta, custom_settings=None): with MockServer() as server: - async with make_handler({}, server.urljoin("/")) as handler: + async with make_handler(custom_settings, server.urljoin("/")) as handler: req = Request( "http://example.com", method="POST", meta=meta, ) - coro = handler._download_request(req, Spider("test")) + coro = handler._download_request(req, None) assert iscoroutine(coro) assert not isinstance(coro, Deferred) resp = await coro # type: ignore @@ -101,6 +101,53 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any assert resp.body == b"" assert resp.headers == {b"Test_Header": [b"test_value"]} + @pytest.mark.parametrize( + "meta,custom_settings,expected", + [ + ({}, {}, {}), + ({"zyte_api": {}}, {}, {}), + ( + {}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + ), + ( + {"zyte_api": {}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + ), + ( + {"zyte_api": {"javascript": True, "geolocation": "US"}}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "US", "javascript": True}, + ), + ], + ) + @mock.patch("tests.AsyncClient") + @pytest.mark.asyncio + async def test_empty_zyte_api_request_meta( + self, + mock_client, + meta: Dict[str, Dict[str, Any]], + custom_settings: Dict[str, str], + expected: Dict[str, str], + ): + try: + # This would always error out since the mocked client doesn't + # return the expected API response. + await self.produce_request_response(meta, custom_settings=custom_settings) + except: + pass + + request_call = [c for c in mock_client.mock_calls if "request_raw(" in str(c)] + if not request_call: + pytest.fail("The client's request_raw() method was not called.") + + args_used = request_call[0].args[0] + args_used.pop("url") + + assert args_used == expected + @pytest.mark.parametrize( "meta, api_relevant", [ From 5dd1bec7b9d181508f86df350d1a655d24166875 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 3 May 2022 11:07:44 +0800 Subject: [PATCH 15/28] fix failing tests --- README.rst | 4 ++-- tests/test_api_requests.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 36ae4241..a95c753d 100644 --- a/README.rst +++ b/README.rst @@ -79,8 +79,8 @@ You can see the full list of parameters in the `Zyte API Specification On the other hand, you could also control it on a per request basis by setting the ``zyte_api`` key in `Request.meta `_. -When doing so, it will override any parameters that was set via the setting -named ``ZYTE_API_DEFAULT_PARAMS``. +When doing so, it will override any parameters that was set in the +``ZYTE_API_DEFAULT_PARAMS`` setting. .. code-block:: python diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index fe81b308..316b0a59 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -1,4 +1,5 @@ import os +import sys from asyncio import iscoroutine from typing import Any, Dict from unittest import mock @@ -101,6 +102,9 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any assert resp.body == b"" assert resp.headers == {b"Test_Header": [b"test_value"]} + @pytest.mark.skipif( + sys.version_info < (3, 8), reason="Python3.7 has poor support for AsyncMocks" + ) @pytest.mark.parametrize( "meta,custom_settings,expected", [ @@ -136,9 +140,10 @@ async def test_empty_zyte_api_request_meta( # This would always error out since the mocked client doesn't # return the expected API response. await self.produce_request_response(meta, custom_settings=custom_settings) - except: + except Exception: pass + # What we're interested in is the Request call in the API request_call = [c for c in mock_client.mock_calls if "request_raw(" in str(c)] if not request_call: pytest.fail("The client's request_raw() method was not called.") From 48a476683be9e5e72bc7f654b45d6ae6f3bd21e6 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 19 May 2022 16:38:23 +0800 Subject: [PATCH 16/28] rename zyte_api_response into zyte_api --- CHANGES.rst | 2 +- README.rst | 4 ++-- scrapy_zyte_api/responses.py | 12 ++++++------ tests/test_responses.py | 25 ++++++++++++------------- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 54bd9448..9c4381fc 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -7,7 +7,7 @@ TBD * Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively. These new response classes hold the raw Zyte API response in its - ``zyte_api_response`` attribute. + ``zyte_api`` attribute. 0.1.0 (2022-02-03) ------------------ diff --git a/README.rst b/README.rst index 13223d3a..d9267f4f 100644 --- a/README.rst +++ b/README.rst @@ -94,7 +94,7 @@ key to download a request using Zyte API. Full list of parameters is provided in def parse(self, response): yield {"URL": response.url, "status": response.status, "HTML": response.body} - print(response.zyte_api_response) + print(response.zyte_api) # { # 'url': 'https://quotes.toscrape.com/', # 'browserHtml': ' ... ', @@ -113,7 +113,7 @@ key to download a request using Zyte API. Full list of parameters is provided in # 'download_slot': 'quotes.toscrape.com' # } -The raw Zyte API Response can be accessed via the ``zyte_api_response`` attribute +The raw Zyte API Response can be accessed via the ``zyte_api`` attribute of the response object. Note that such responses are of ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are respectively subclasses of ``scrapy.http.Response`` and ``scrapy.http.TextResponse``. Such classes are needed to hold the raw Zyte API diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index f0fc4835..95244b7f 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -16,9 +16,9 @@ class ZyteAPIMixin: "content-encoding" } - def __init__(self, *args, zyte_api_response: Dict = None, **kwargs): + def __init__(self, *args, zyte_api: Dict = None, **kwargs): super().__init__(*args, **kwargs) - self._zyte_api_response = zyte_api_response + self._zyte_api = zyte_api def replace(self, *args, **kwargs): """Create a new response with the same attributes except for those given @@ -27,13 +27,13 @@ def replace(self, *args, **kwargs): return super().replace(*args, **kwargs) @property - def zyte_api_response(self) -> Optional[Dict]: + def zyte_api(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. To see the full list of parameters and their description, kindly refer to the `Zyte API Specification `_. """ - return self._zyte_api_response + return self._zyte_api @classmethod def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]): @@ -60,7 +60,7 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), - zyte_api_response=api_response, + zyte_api=api_response, ) @@ -77,5 +77,5 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), - zyte_api_response=api_response, + zyte_api=api_response, ) diff --git a/tests/test_responses.py b/tests/test_responses.py index 6eb949e2..012f3662 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -45,8 +45,8 @@ def api_response_body(): ], ) def test_init(api_response, cls): - response = cls(URL, zyte_api_response=api_response()) - assert response.zyte_api_response == api_response() + response = cls(URL, zyte_api=api_response()) + assert response.zyte_api == api_response() assert response.url == URL assert response.status == 200 @@ -68,7 +68,7 @@ def test_init(api_response, cls): ) def test_text_from_api_response(api_response, cls): response = cls.from_api_response(api_response()) - assert response.zyte_api_response == api_response() + assert response.zyte_api == api_response() assert response.url == URL assert response.status == 200 @@ -107,18 +107,18 @@ def test_response_replace(api_response, cls): (api_response_body, ZyteAPIResponse), ], ) -def test_response_replace_zyte_api_response(api_response, cls): +def test_response_replace_zyte_api(api_response, cls): orig_response = cls.from_api_response(api_response()) - # The ``zyte_api_response`` should not be replaced. - new_zyte_api_response = {"overridden": "value"} - new_response = orig_response.replace(zyte_api_response=new_zyte_api_response) - assert new_response.zyte_api_response == api_response() + # The ``zyte_api`` should not be replaced. + new_zyte_api = {"overridden": "value"} + new_response = orig_response.replace(zyte_api=new_zyte_api) + assert new_response.zyte_api == api_response() def test_non_utf8_response(): content = "Some non-ASCII ✨ chars" - sample_zyte_api_response = { + sample_zyte_api = { "url": URL, "browserHtml": content, "httpResponseHeaders": [ @@ -132,7 +132,7 @@ def test_non_utf8_response(): # for it. This is the default encoding for the "browserHtml" contents from # Zyte API. Thus, even if the Response Headers or tags indicate a # different encoding, it should still be treated as "utf-8". - response = ZyteAPITextResponse.from_api_response(sample_zyte_api_response) + response = ZyteAPITextResponse.from_api_response(sample_zyte_api) assert response.text == content assert response.encoding == "utf-8" @@ -148,7 +148,7 @@ def test_response_headers_removal(api_response, cls): """Headers like 'Content-Encoding' should be removed later in the response instance returned to Scrapy. - However, it should still be present inside 'zyte_api_response.headers'. + However, it should still be present inside 'zyte_api.headers'. """ additional_headers = [ {"name": "Content-Encoding", "value": "gzip"}, @@ -161,6 +161,5 @@ def test_response_headers_removal(api_response, cls): assert response.headers == {b"X-Some-Other-Value": [b"123"]} assert ( - response.zyte_api_response["httpResponseHeaders"] - == raw_response["httpResponseHeaders"] + response.zyte_api["httpResponseHeaders"] == raw_response["httpResponseHeaders"] ) From 910085bfb749cdc9bec0c7092de3d69c280386b5 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 25 May 2022 22:10:40 +0800 Subject: [PATCH 17/28] add tests for css/xpath selectors --- tests/mockserver.py | 2 +- tests/test_api_requests.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 6ef5f5a4..e202860b 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -45,7 +45,7 @@ def do_POST(self): # NOQA {"name": "test_header", "value": "test_value"} ] if post_data.get("jobId") is None: - browser_html = "" + browser_html = "Hello

World!

" else: browser_html = f"{post_data['jobId']}" if post_data.get("browserHtml"): diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 316b0a59..40e8ba40 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -7,7 +7,7 @@ import pytest from _pytest.logging import LogCaptureFixture # NOQA from scrapy import Request, Spider -from scrapy.exceptions import IgnoreRequest, NotConfigured +from scrapy.exceptions import IgnoreRequest, NotConfigured, NotSupported from scrapy.http import Response, TextResponse from scrapy.utils.test import get_crawler from twisted.internet.asyncioreactor import install as install_asyncio_reactor @@ -57,8 +57,10 @@ async def test_browser_html_request(self, meta: Dict[str, Dict[str, Any]]): assert resp.url == req.url assert resp.status == 200 assert "zyte-api" in resp.flags - assert resp.body == b"" - assert resp.text == "" + assert resp.body == b"Hello

World!

" + assert resp.text == "Hello

World!

" + assert resp.css("h1 ::text").get() == "World!" + assert resp.xpath("//body/text()").getall() == ["Hello"] @pytest.mark.parametrize( "meta", @@ -83,7 +85,12 @@ async def test_http_response_body_request(self, meta: Dict[str, Dict[str, Any]]) assert resp.url == req.url assert resp.status == 200 assert "zyte-api" in resp.flags - assert resp.body == b"" + assert resp.body == b"Hello

World!

" + + with pytest.raises(NotSupported): + assert resp.css("h1 ::text").get() == "World!" + with pytest.raises(NotSupported): + assert resp.xpath("//body/text()").getall() == ["Hello"] @pytest.mark.parametrize( "meta", @@ -99,7 +106,7 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any assert resp.url == req.url assert resp.status == 200 assert "zyte-api" in resp.flags - assert resp.body == b"" + assert resp.body == b"Hello

World!

" assert resp.headers == {b"Test_Header": [b"test_value"]} @pytest.mark.skipif( From e3214d80178169998b06903d71e2b999f25d4eb6 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 26 May 2022 15:48:33 +0800 Subject: [PATCH 18/28] enable css/xpath selectors on httpResponseBody --- scrapy_zyte_api/handler.py | 16 +-- scrapy_zyte_api/responses.py | 56 ++++++++- tests/test_responses.py | 224 ++++++++++++++++++++++++++++++++++- 3 files changed, 280 insertions(+), 16 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index a395f871..6e9f0158 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -1,7 +1,7 @@ import json import logging import os -from typing import Any, Dict, Generator, Union +from typing import Any, Dict, Generator, Optional, Union from scrapy import Spider from scrapy.core.downloader.handlers.http import HTTPDownloadHandler @@ -15,7 +15,7 @@ from zyte_api.aio.client import AsyncClient, create_session from zyte_api.aio.errors import RequestError -from .responses import ZyteAPIResponse, ZyteAPITextResponse +from .responses import ZyteAPIResponse, ZyteAPITextResponse, process_response logger = logging.getLogger(__name__) @@ -56,7 +56,7 @@ def download_request(self, request: Request, spider: Spider) -> Deferred: async def _download_request( self, request: Request, spider: Spider - ) -> Union[ZyteAPITextResponse, ZyteAPIResponse]: + ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: api_params: Dict[str, Any] = self._zyte_api_default_params or {} try: api_params.update(request.meta.get("zyte_api") or {}) @@ -86,15 +86,9 @@ async def _download_request( f"Got an error when processing Zyte API request ({request.url}): {er}" ) raise IgnoreRequest() + self._stats.inc_value("scrapy-zyte-api/request_count") - # browserHtml and httpResponseBody are not allowed at the same time, - # but at least one of them should be present - if api_response.get("browserHtml"): - # Using TextResponse because browserHtml always returns a browser-rendered page - # even when requesting files (like images) - return ZyteAPITextResponse.from_api_response(api_response, request=request) - else: - return ZyteAPIResponse.from_api_response(api_response, request=request) + return process_response(api_response, request) @inlineCallbacks def close(self) -> Generator: diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 95244b7f..039b25c3 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,10 +1,11 @@ from base64 import b64decode -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union from scrapy import Request from scrapy.http import Response, TextResponse +from scrapy.responsetypes import responsetypes -_ENCODING = "utf-8" +_DEFAULT_ENCODING = "utf-8" class ZyteAPIMixin: @@ -52,11 +53,20 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): """Alternative constructor to instantiate the response from the raw Zyte API response. """ + body = None + encoding = None + + if api_response.get("browserHtml"): + encoding = _DEFAULT_ENCODING # Zyte API has "utf-8" by default + body = api_response["browserHtml"].encode(encoding) + elif api_response.get("httpResponseBody"): + body = b64decode(api_response["httpResponseBody"]) + return cls( url=api_response["url"], status=200, - body=api_response["browserHtml"].encode(_ENCODING), - encoding=_ENCODING, + body=body, + encoding=encoding, request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), @@ -79,3 +89,41 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), zyte_api=api_response, ) + + +def process_response( + api_response: Dict[str, Union[List[Dict], str]], request: Request +) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: + """Given a Zyte API Response and the ``scrapy.Request`` that asked for it, + this returns either a ``ZyteAPITextResponse`` or ``ZyteAPIResponse`` depending + on which if it can properly decode the HTTP Body or have access to browserHtml. + """ + + # NOTES: Currently, Zyte API does NOT only allow both 'browserHtml' and + # 'httpResponseBody' to be present at the same time. The support for both + # will be addressed in the future. Reference: + # - https://github.com/scrapy-plugins/scrapy-zyte-api/pull/10#issuecomment-1131406460 + # For now, at least one of them should be present. + + if api_response.get("browserHtml"): + # Using TextResponse because browserHtml always returns a browser-rendered page + # even when requesting files (like images) + return ZyteAPITextResponse.from_api_response(api_response, request=request) + + if api_response.get("httpResponseBody") is None: + raise ValueError( + "Can't instantiate ZyteAPITextResponse/ZyteAPIResopnse without " + "'browserHtml' or 'httpResponseBody'." + ) + + if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"): + response_cls = responsetypes.from_args( + headers=api_response["httpResponseHeaders"], + url=api_response["url"], + # FIXME: update this when python-zyte-api supports base64 decoding + body=b64decode(api_response["httpResponseBody"]), # type: ignore + ) + if issubclass(response_cls, TextResponse): + return ZyteAPITextResponse.from_api_response(api_response, request=request) + + return ZyteAPIResponse.from_api_response(api_response, request=request) diff --git a/tests/test_responses.py b/tests/test_responses.py index 012f3662..2ca54e64 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -1,8 +1,15 @@ from base64 import b64encode import pytest +from scrapy import Request +from scrapy.exceptions import NotSupported +from scrapy.http import Response, TextResponse -from scrapy_zyte_api.responses import ZyteAPIResponse, ZyteAPITextResponse +from scrapy_zyte_api.responses import ( + ZyteAPIResponse, + ZyteAPITextResponse, + process_response, +) PAGE_CONTENT = "The cake is a lie!" URL = "https://example.com" @@ -137,6 +144,13 @@ def test_non_utf8_response(): assert response.encoding == "utf-8" +BODY = "Hello

World!✨

" + + +def format_to_httpResponseBody(body, encoding="utf-8"): + return b64encode(body.encode(encoding)).decode("utf-8") + + @pytest.mark.parametrize( "api_response,cls", [ @@ -163,3 +177,211 @@ def test_response_headers_removal(api_response, cls): assert ( response.zyte_api["httpResponseHeaders"] == raw_response["httpResponseHeaders"] ) + + +def test_process_response_no_body(): + """The process_response() function cannot produce the appropriate Zyte API + Response for Scrapy if it doesn't have a 'browserHtml' or 'httpResponseBody'. + """ + api_response = {"url": "https://example.com", "product": {"name": "shoes"}} + + with pytest.raises(ValueError): + process_response(api_response, Request(api_response["url"])) + + +def test_process_response_body_only(): + """Having the Body but with no Headers won't allow us to decode the contents + with the proper encoding. + + Thus, we won't have access to css/xpath selectors. + """ + encoding = "utf-8" + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(BODY, encoding=encoding), + } + + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, Response) + with pytest.raises(NotSupported): + assert resp.css("h1 ::text") + with pytest.raises(NotSupported): + assert resp.xpath("//body/text()") + + +@pytest.mark.xfail(reason="encoding inference is not supported for now") +def test_process_response_body_only_infer_encoding(): + """The ``scrapy.TextResponse`` class has the ability to check the encoding + by inferring it in the HTML body. + + However, this is a bit tricky since we need to somehow ensure that the body + we're receiving is "text/html". We can't fully determine that without the + headers. + """ + encoding = "gb18030" + body = ( + "" + '' + "Some ✨ contents" + "" + ) + + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(body, encoding=encoding), + } + + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("body ::text").get() == "Some ✨ contents" + assert resp.xpath("//body/text()").getall() == ["Some ✨ contents"] + + +@pytest.mark.parametrize( + "encoding,content_type", + [ + ("utf-8", "text/html; charset=UTF-8"), + ("gb18030", "text/html; charset=gb2312"), + ], +) +def test_process_response_body_and_headers(encoding, content_type): + """Having access to the Headers allow us to properly decode the contents + and will have access to the css/xpath selectors. + """ + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(BODY, encoding=encoding), + "httpResponseHeaders": [{"name": "Content-Type", "value": content_type}], + } + + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("h1 ::text").get() == "World!✨" + assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.encoding == encoding + + +@pytest.mark.parametrize( + "body,expected,actual_encoding,inferred_encoding", + [ + ("plain", "plain", "cp1252", "cp1252"), + ( + "✨", + "✨", + "utf-8", + "utf-8", + ), + ( + "✨", + "✨", + "utf-16", + "utf-16-le", + ), + ( + """ + ✨""", + "✨", + "gb18030", + None, + ), + ], +) +def test_process_response_body_and_headers_but_no_encoding( + body, expected, actual_encoding, inferred_encoding +): + """Should both the body and headers are present but no 'Content-Type' encoding + can be derived, it should infer from the body contents. + """ + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(body, encoding=actual_encoding), + "httpResponseHeaders": [{"name": "X-Value", "value": "some_value"}], + } + + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + + if inferred_encoding: + assert resp.css("body ::text").get() == expected + assert resp.xpath("//body/text()").get() == expected + assert resp.encoding == inferred_encoding + + # Scrapy's ``TextResponse`` built-in inference only works on "utf-8" and + # "Latin-1" based encodings. + else: + assert resp.css("body ::text").get() != expected + assert resp.xpath("//body/text()").get() != expected + assert resp.encoding == "ascii" + + +def test_process_response_body_and_headers_mismatch(): + """If the actual contents have a mismatch in terms of its encoding, we won't + properly decode the ✨ emoji. + """ + encoding = "utf-8" + api_response = { + "url": "https://example.com", + "httpResponseBody": format_to_httpResponseBody(BODY, encoding=encoding), + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html; charset=gb2312"} + ], + } + + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("h1 ::text").get() != "World!✨" # mismatch + assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.encoding == "gb18030" + + +def test_process_response_non_text(): + """Non-textual responses like images, files, etc. won't have access to the + css/xpath selectors. + """ + api_response = { + "url": "https://example.com/sprite.gif", + "httpResponseBody": b"", + "httpResponseHeaders": [ + { + "name": "Content-Type", + "value": "image/gif", + } + ], + } + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, Response) + with pytest.raises(NotSupported): + assert resp.css("h1 ::text") + with pytest.raises(NotSupported): + assert resp.xpath("//body/text()") + + +@pytest.mark.parametrize( + "api_response", + [ + {"url": "https://example.com", "browserHtml": BODY}, + { + "url": "https://example.com", + "browserHtml": BODY, + "httpResponseHeaders": [ + { + "name": "Content-Type", + "value": "text/html; charset=UTF-8", + } + ], + }, + ], +) +def test_process_response_browserhtml(api_response): + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, TextResponse) + assert resp.css("h1 ::text").get() == "World!✨" + assert resp.xpath("//body/text()").getall() == ["Hello"] + assert resp.encoding == "utf-8" # Zyte API is consistent with this on browserHtml From e530053ecf9845ae3ac414bd7cda83f8f4525d25 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 26 May 2022 17:45:27 +0800 Subject: [PATCH 19/28] handle empty 'browserHtml' or 'httpResponseBody' --- scrapy_zyte_api/responses.py | 8 +------- tests/test_responses.py | 10 ++++++---- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 039b25c3..108d0e0e 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -83,7 +83,7 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): return cls( url=api_response["url"], status=200, - body=b64decode(api_response["httpResponseBody"]), + body=b64decode(api_response.get("httpResponseBody") or ""), request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), @@ -110,12 +110,6 @@ def process_response( # even when requesting files (like images) return ZyteAPITextResponse.from_api_response(api_response, request=request) - if api_response.get("httpResponseBody") is None: - raise ValueError( - "Can't instantiate ZyteAPITextResponse/ZyteAPIResopnse without " - "'browserHtml' or 'httpResponseBody'." - ) - if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"): response_cls = responsetypes.from_args( headers=api_response["httpResponseHeaders"], diff --git a/tests/test_responses.py b/tests/test_responses.py index 2ca54e64..92d776da 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -180,13 +180,15 @@ def test_response_headers_removal(api_response, cls): def test_process_response_no_body(): - """The process_response() function cannot produce the appropriate Zyte API - Response for Scrapy if it doesn't have a 'browserHtml' or 'httpResponseBody'. + """The process_response() function should handle missing 'browserHtml' or + 'httpResponseBody'. """ api_response = {"url": "https://example.com", "product": {"name": "shoes"}} - with pytest.raises(ValueError): - process_response(api_response, Request(api_response["url"])) + resp = process_response(api_response, Request(api_response["url"])) + + assert isinstance(resp, Response) + assert resp.body == b"" def test_process_response_body_only(): From 27c7a7d95fb8f16f4fcf12568015c2a1a3644f68 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 10:55:27 +0800 Subject: [PATCH 20/28] Fix typos in docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- CHANGES.rst | 2 +- README.rst | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9c4381fc..ced5eb54 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,7 +6,7 @@ TBD * Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively. - These new response classes hold the raw Zyte API response in its + These new response classes hold the raw Zyte Data API response in its ``zyte_api`` attribute. 0.1.0 (2022-02-03) diff --git a/README.rst b/README.rst index 410e6b86..6ed5dfed 100644 --- a/README.rst +++ b/README.rst @@ -77,9 +77,9 @@ in the ``settings.py`` file or `any other settings within Scrapy You can see the full list of parameters in the `Zyte API Specification `_. -On the other hand, you could also control it on a per request basis by setting the +On the other hand, you could also control it on a per-request basis by setting the ``zyte_api`` key in `Request.meta `_. -When doing so, it will override any parameters that was set in the +When doing so, it will override any parameters set in the ``ZYTE_API_DEFAULT_PARAMS`` setting. .. code-block:: python @@ -127,8 +127,8 @@ When doing so, it will override any parameters that was set in the # 'download_slot': 'quotes.toscrape.com' # } -The raw Zyte API Response can be accessed via the ``zyte_api`` attribute +The raw Zyte Data API response can be accessed via the ``zyte_api`` attribute of the response object. Note that such responses are of ``ZyteAPIResponse`` and -``ZyteAPITextResponse`` which are respectively subclasses of ``scrapy.http.Response`` -and ``scrapy.http.TextResponse``. Such classes are needed to hold the raw Zyte API +``ZyteAPITextResponse`` types, which are respectively subclasses of ``scrapy.http.Response`` +and ``scrapy.http.TextResponse``. Such classes are needed to hold the raw Zyte Data API responses. From 5b7cf6fded737bb08c0d03b422c50120f920e2ed Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 11:48:58 +0800 Subject: [PATCH 21/28] update how replace() works --- scrapy_zyte_api/responses.py | 14 +++++++------- tests/test_responses.py | 15 +-------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 108d0e0e..399afdf7 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,5 +1,5 @@ from base64 import b64decode -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple from scrapy import Request from scrapy.http import Response, TextResponse @@ -21,12 +21,6 @@ def __init__(self, *args, zyte_api: Dict = None, **kwargs): super().__init__(*args, **kwargs) self._zyte_api = zyte_api - def replace(self, *args, **kwargs): - """Create a new response with the same attributes except for those given - new values. - """ - return super().replace(*args, **kwargs) - @property def zyte_api(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. @@ -48,6 +42,9 @@ def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]): class ZyteAPITextResponse(ZyteAPIMixin, TextResponse): + + attributes: Tuple[str, ...] = TextResponse.attributes + ("zyte_api",) + @classmethod def from_api_response(cls, api_response: Dict, *, request: Request = None): """Alternative constructor to instantiate the response from the raw @@ -75,6 +72,9 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): class ZyteAPIResponse(ZyteAPIMixin, Response): + + attributes: Tuple[str, ...] = Response.attributes + ("zyte_api",) + @classmethod def from_api_response(cls, api_response: Dict, *, request: Request = None): """Alternative constructor to instantiate the response from the raw diff --git a/tests/test_responses.py b/tests/test_responses.py index 92d776da..1d556617 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -105,22 +105,9 @@ def test_response_replace(api_response, cls): new_response = orig_response.replace(url="https://new-example.com") assert new_response.url == "https://new-example.com" - -@pytest.mark.xfail -@pytest.mark.parametrize( - "api_response,cls", - [ - (api_response_browser, ZyteAPITextResponse), - (api_response_body, ZyteAPIResponse), - ], -) -def test_response_replace_zyte_api(api_response, cls): - orig_response = cls.from_api_response(api_response()) - - # The ``zyte_api`` should not be replaced. new_zyte_api = {"overridden": "value"} new_response = orig_response.replace(zyte_api=new_zyte_api) - assert new_response.zyte_api == api_response() + assert new_response.zyte_api == new_zyte_api def test_non_utf8_response(): From 2adc8a63b984847c693c41cc70d13794213eb237 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 14:34:56 +0800 Subject: [PATCH 22/28] update README in line with the ZYTE_API_DEFAULT_PARAMS expectations --- README.rst | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 6ed5dfed..a12715db 100644 --- a/README.rst +++ b/README.rst @@ -63,8 +63,12 @@ Here's an example of the things needed inside a Scrapy project's ``settings.py`` Usage ----- -To enable every request to be sent through Zyte API, you can set the following -in the ``settings.py`` file or `any other settings within Scrapy +To enable a ``scrapy.Request`` to go through Zyte Data API, the ``zyte_api`` key in +`Request.meta `_ +must be present and has dict-like contents. + +To set the default parameters for Zyte API enabled requests, you can set the +following in the ``settings.py`` file or `any other settings within Scrapy `_: .. code-block:: python @@ -74,12 +78,12 @@ in the ``settings.py`` file or `any other settings within Scrapy "geolocation": "US", } -You can see the full list of parameters in the `Zyte API Specification +You can see the full list of parameters in the `Zyte Data API Specification `_. -On the other hand, you could also control it on a per-request basis by setting the -``zyte_api`` key in `Request.meta `_. -When doing so, it will override any parameters set in the +Note that the ``ZYTE_API_DEFAULT_PARAMS`` would only work if the ``zyte_api`` +key in `Request.meta `_ +is set. When doing so, it will override any parameters set in the ``ZYTE_API_DEFAULT_PARAMS`` setting. .. code-block:: python @@ -90,15 +94,19 @@ When doing so, it will override any parameters set in the class SampleQuotesSpider(scrapy.Spider): name = "sample_quotes" - def start_requests(self): + custom_settings = { + "ZYTE_API_DEFAULT_PARAMS": { + "geolocation": "US", # You can set any Geolocation region you want. + } + } + def start_requests(self): yield scrapy.Request( url="http://books.toscrape.com/", callback=self.parse, meta={ "zyte_api": { "browserHtml": True, - "geolocation": "US", # You can set any Geolocation region you want. "javascript": True, "echoData": {"some_value_I_could_track": 123}, } From 32faf3d999622a1b379f355fc515d7803194af27 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 14:42:28 +0800 Subject: [PATCH 23/28] add test case to ensure zyte_api is intact when replacing other attribs --- tests/test_responses.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_responses.py b/tests/test_responses.py index 1d556617..f39c4016 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -105,6 +105,10 @@ def test_response_replace(api_response, cls): new_response = orig_response.replace(url="https://new-example.com") assert new_response.url == "https://new-example.com" + # Ensure that the Zyte API response is intact + assert new_response.zyte_api == api_response() + + # The Zyte API response must also be fully replaceable new_zyte_api = {"overridden": "value"} new_response = orig_response.replace(zyte_api=new_zyte_api) assert new_response.zyte_api == new_zyte_api From cec06773c13f1708af5f80d3533d2202a787c89f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 18:59:33 +0800 Subject: [PATCH 24/28] make process_response() private --- scrapy_zyte_api/handler.py | 4 ++-- scrapy_zyte_api/responses.py | 2 +- tests/test_responses.py | 36 ++++++++++++++++++------------------ 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 6e9f0158..76b2f4d6 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -15,7 +15,7 @@ from zyte_api.aio.client import AsyncClient, create_session from zyte_api.aio.errors import RequestError -from .responses import ZyteAPIResponse, ZyteAPITextResponse, process_response +from .responses import ZyteAPIResponse, ZyteAPITextResponse, _process_response logger = logging.getLogger(__name__) @@ -88,7 +88,7 @@ async def _download_request( raise IgnoreRequest() self._stats.inc_value("scrapy-zyte-api/request_count") - return process_response(api_response, request) + return _process_response(api_response, request) @inlineCallbacks def close(self) -> Generator: diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 399afdf7..e214b60f 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -91,7 +91,7 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): ) -def process_response( +def _process_response( api_response: Dict[str, Union[List[Dict], str]], request: Request ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: """Given a Zyte API Response and the ``scrapy.Request`` that asked for it, diff --git a/tests/test_responses.py b/tests/test_responses.py index f39c4016..9ec892b9 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -8,7 +8,7 @@ from scrapy_zyte_api.responses import ( ZyteAPIResponse, ZyteAPITextResponse, - process_response, + _process_response, ) PAGE_CONTENT = "The cake is a lie!" @@ -170,19 +170,19 @@ def test_response_headers_removal(api_response, cls): ) -def test_process_response_no_body(): - """The process_response() function should handle missing 'browserHtml' or +def test__process_response_no_body(): + """The _process_response() function should handle missing 'browserHtml' or 'httpResponseBody'. """ api_response = {"url": "https://example.com", "product": {"name": "shoes"}} - resp = process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, Response) assert resp.body == b"" -def test_process_response_body_only(): +def test__process_response_body_only(): """Having the Body but with no Headers won't allow us to decode the contents with the proper encoding. @@ -194,7 +194,7 @@ def test_process_response_body_only(): "httpResponseBody": format_to_httpResponseBody(BODY, encoding=encoding), } - resp = process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, Response) with pytest.raises(NotSupported): @@ -204,7 +204,7 @@ def test_process_response_body_only(): @pytest.mark.xfail(reason="encoding inference is not supported for now") -def test_process_response_body_only_infer_encoding(): +def test__process_response_body_only_infer_encoding(): """The ``scrapy.TextResponse`` class has the ability to check the encoding by inferring it in the HTML body. @@ -225,7 +225,7 @@ def test_process_response_body_only_infer_encoding(): "httpResponseBody": format_to_httpResponseBody(body, encoding=encoding), } - resp = process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, TextResponse) assert resp.css("body ::text").get() == "Some ✨ contents" @@ -239,7 +239,7 @@ def test_process_response_body_only_infer_encoding(): ("gb18030", "text/html; charset=gb2312"), ], ) -def test_process_response_body_and_headers(encoding, content_type): +def test__process_response_body_and_headers(encoding, content_type): """Having access to the Headers allow us to properly decode the contents and will have access to the css/xpath selectors. """ @@ -249,7 +249,7 @@ def test_process_response_body_and_headers(encoding, content_type): "httpResponseHeaders": [{"name": "Content-Type", "value": content_type}], } - resp = process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, TextResponse) assert resp.css("h1 ::text").get() == "World!✨" @@ -282,7 +282,7 @@ def test_process_response_body_and_headers(encoding, content_type): ), ], ) -def test_process_response_body_and_headers_but_no_encoding( +def test__process_response_body_and_headers_but_no_encoding( body, expected, actual_encoding, inferred_encoding ): """Should both the body and headers are present but no 'Content-Type' encoding @@ -294,7 +294,7 @@ def test_process_response_body_and_headers_but_no_encoding( "httpResponseHeaders": [{"name": "X-Value", "value": "some_value"}], } - resp = process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, TextResponse) @@ -311,7 +311,7 @@ def test_process_response_body_and_headers_but_no_encoding( assert resp.encoding == "ascii" -def test_process_response_body_and_headers_mismatch(): +def test__process_response_body_and_headers_mismatch(): """If the actual contents have a mismatch in terms of its encoding, we won't properly decode the ✨ emoji. """ @@ -324,7 +324,7 @@ def test_process_response_body_and_headers_mismatch(): ], } - resp = process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, TextResponse) assert resp.css("h1 ::text").get() != "World!✨" # mismatch @@ -332,7 +332,7 @@ def test_process_response_body_and_headers_mismatch(): assert resp.encoding == "gb18030" -def test_process_response_non_text(): +def test__process_response_non_text(): """Non-textual responses like images, files, etc. won't have access to the css/xpath selectors. """ @@ -346,7 +346,7 @@ def test_process_response_non_text(): } ], } - resp = process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, Response) with pytest.raises(NotSupported): @@ -371,8 +371,8 @@ def test_process_response_non_text(): }, ], ) -def test_process_response_browserhtml(api_response): - resp = process_response(api_response, Request(api_response["url"])) +def test__process_response_browserhtml(api_response): + resp = _process_response(api_response, Request(api_response["url"])) assert isinstance(resp, TextResponse) assert resp.css("h1 ::text").get() == "World!✨" From e0865e77e7a627b9a04b2638ca9e3241455d4421 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 20:23:10 +0800 Subject: [PATCH 25/28] update tests to ensure other response attribs are not updated on .replace() --- tests/test_responses.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_responses.py b/tests/test_responses.py index 9ec892b9..df1e5a7c 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -109,10 +109,23 @@ def test_response_replace(api_response, cls): assert new_response.zyte_api == api_response() # The Zyte API response must also be fully replaceable - new_zyte_api = {"overridden": "value"} + new_zyte_api = { + "url": "https://another-website.com", + "httpResponseHeaders": {"name": "Content-Type", "value": "application/json"} + } + if cls == ZyteAPITextResponse: + new_zyte_api["browserHtml"] = "Hello World!" + elif cls == ZyteAPIResponse: + new_zyte_api["httpResponseBody"] = "Hello World!" + new_response = orig_response.replace(zyte_api=new_zyte_api) assert new_response.zyte_api == new_zyte_api + # But it doesn't change the contextually similar attributes of the response + new_response.url == orig_response.url + new_response.body == orig_response.body + new_response.headers == orig_response.headers + def test_non_utf8_response(): content = "Some non-ASCII ✨ chars" From 34a427f84fdd6f91d11579c8868d0ff2ccda0c5b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 20:41:55 +0800 Subject: [PATCH 26/28] raise an error if zyte_api is passed to .replace() --- scrapy_zyte_api/responses.py | 5 +++++ tests/test_responses.py | 17 ++++------------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index e214b60f..f5ab6970 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -21,6 +21,11 @@ def __init__(self, *args, zyte_api: Dict = None, **kwargs): super().__init__(*args, **kwargs) self._zyte_api = zyte_api + def replace(self, *args, **kwargs): + if kwargs.get("zyte_api"): + raise ValueError("Replacing the value of 'zyte_api' isn't allowed.") + return super().replace(*args, **kwargs) + @property def zyte_api(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. diff --git a/tests/test_responses.py b/tests/test_responses.py index df1e5a7c..f2e1c307 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -108,23 +108,14 @@ def test_response_replace(api_response, cls): # Ensure that the Zyte API response is intact assert new_response.zyte_api == api_response() - # The Zyte API response must also be fully replaceable new_zyte_api = { "url": "https://another-website.com", "httpResponseHeaders": {"name": "Content-Type", "value": "application/json"} } - if cls == ZyteAPITextResponse: - new_zyte_api["browserHtml"] = "Hello World!" - elif cls == ZyteAPIResponse: - new_zyte_api["httpResponseBody"] = "Hello World!" - - new_response = orig_response.replace(zyte_api=new_zyte_api) - assert new_response.zyte_api == new_zyte_api - - # But it doesn't change the contextually similar attributes of the response - new_response.url == orig_response.url - new_response.body == orig_response.body - new_response.headers == orig_response.headers + + # Attempting to replace the zyte_api value would raise an error + with pytest.raises(ValueError): + orig_response.replace(zyte_api=new_zyte_api) def test_non_utf8_response(): From 37a4cc705d786a8d091b3066e6426b1f2dc9814c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 27 May 2022 21:13:33 +0800 Subject: [PATCH 27/28] rename '.zyte_api' attribute as '.raw_api_response' --- CHANGES.rst | 4 ++-- README.rst | 4 ++-- scrapy_zyte_api/responses.py | 22 +++++++++--------- tests/test_responses.py | 45 ++++++++++++++++++------------------ 4 files changed, 38 insertions(+), 37 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index ced5eb54..b0adbb88 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -6,8 +6,8 @@ TBD * Introduce ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` which are subclasses of ``scrapy.http.Response`` and ``scrapy.http.TextResponse`` respectively. - These new response classes hold the raw Zyte Data API response in its - ``zyte_api`` attribute. + These new response classes hold the raw Zyte Data API response in the + ``raw_api_response`` attribute. 0.1.0 (2022-02-03) ------------------ diff --git a/README.rst b/README.rst index a12715db..25b12288 100644 --- a/README.rst +++ b/README.rst @@ -116,7 +116,7 @@ is set. When doing so, it will override any parameters set in the def parse(self, response): yield {"URL": response.url, "status": response.status, "HTML": response.body} - print(response.zyte_api) + print(response.raw_api_response) # { # 'url': 'https://quotes.toscrape.com/', # 'browserHtml': ' ... ', @@ -135,7 +135,7 @@ is set. When doing so, it will override any parameters set in the # 'download_slot': 'quotes.toscrape.com' # } -The raw Zyte Data API response can be accessed via the ``zyte_api`` attribute +The raw Zyte Data API response can be accessed via the ``raw_api_response`` attribute of the response object. Note that such responses are of ``ZyteAPIResponse`` and ``ZyteAPITextResponse`` types, which are respectively subclasses of ``scrapy.http.Response`` and ``scrapy.http.TextResponse``. Such classes are needed to hold the raw Zyte Data API diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index f5ab6970..ab8ef992 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,5 +1,5 @@ from base64 import b64decode -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Tuple, Union from scrapy import Request from scrapy.http import Response, TextResponse @@ -17,23 +17,23 @@ class ZyteAPIMixin: "content-encoding" } - def __init__(self, *args, zyte_api: Dict = None, **kwargs): + def __init__(self, *args, raw_api_response: Dict = None, **kwargs): super().__init__(*args, **kwargs) - self._zyte_api = zyte_api + self._raw_api_response = raw_api_response def replace(self, *args, **kwargs): - if kwargs.get("zyte_api"): - raise ValueError("Replacing the value of 'zyte_api' isn't allowed.") + if kwargs.get("raw_api_response"): + raise ValueError("Replacing the value of 'raw_api_response' isn't allowed.") return super().replace(*args, **kwargs) @property - def zyte_api(self) -> Optional[Dict]: + def raw_api_response(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. To see the full list of parameters and their description, kindly refer to the `Zyte API Specification `_. """ - return self._zyte_api + return self._raw_api_response @classmethod def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]): @@ -48,7 +48,7 @@ def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]): class ZyteAPITextResponse(ZyteAPIMixin, TextResponse): - attributes: Tuple[str, ...] = TextResponse.attributes + ("zyte_api",) + attributes: Tuple[str, ...] = TextResponse.attributes + ("raw_api_response",) @classmethod def from_api_response(cls, api_response: Dict, *, request: Request = None): @@ -72,13 +72,13 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), - zyte_api=api_response, + raw_api_response=api_response, ) class ZyteAPIResponse(ZyteAPIMixin, Response): - attributes: Tuple[str, ...] = Response.attributes + ("zyte_api",) + attributes: Tuple[str, ...] = Response.attributes + ("raw_api_response",) @classmethod def from_api_response(cls, api_response: Dict, *, request: Request = None): @@ -92,7 +92,7 @@ def from_api_response(cls, api_response: Dict, *, request: Request = None): request=request, flags=["zyte-api"], headers=cls._prepare_headers(api_response.get("httpResponseHeaders")), - zyte_api=api_response, + raw_api_response=api_response, ) diff --git a/tests/test_responses.py b/tests/test_responses.py index f2e1c307..cc34553a 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -15,7 +15,7 @@ URL = "https://example.com" -def api_response_browser(): +def raw_api_response_browser(): return { "url": URL, "browserHtml": PAGE_CONTENT, @@ -28,7 +28,7 @@ def api_response_browser(): } -def api_response_body(): +def raw_api_response_body(): return { "url": "https://example.com", "httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8")), @@ -47,13 +47,13 @@ def api_response_body(): @pytest.mark.parametrize( "api_response,cls", [ - (api_response_browser, ZyteAPITextResponse), - (api_response_body, ZyteAPIResponse), + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), ], ) def test_init(api_response, cls): - response = cls(URL, zyte_api=api_response()) - assert response.zyte_api == api_response() + response = cls(URL, raw_api_response=api_response()) + assert response.raw_api_response == api_response() assert response.url == URL assert response.status == 200 @@ -69,13 +69,13 @@ def test_init(api_response, cls): @pytest.mark.parametrize( "api_response,cls", [ - (api_response_browser, ZyteAPITextResponse), - (api_response_body, ZyteAPIResponse), + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), ], ) def test_text_from_api_response(api_response, cls): response = cls.from_api_response(api_response()) - assert response.zyte_api == api_response() + assert response.raw_api_response == api_response() assert response.url == URL assert response.status == 200 @@ -91,8 +91,8 @@ def test_text_from_api_response(api_response, cls): @pytest.mark.parametrize( "api_response,cls", [ - (api_response_browser, ZyteAPITextResponse), - (api_response_body, ZyteAPIResponse), + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), ], ) def test_response_replace(api_response, cls): @@ -106,21 +106,21 @@ def test_response_replace(api_response, cls): assert new_response.url == "https://new-example.com" # Ensure that the Zyte API response is intact - assert new_response.zyte_api == api_response() + assert new_response.raw_api_response == api_response() - new_zyte_api = { + new_raw_api_response = { "url": "https://another-website.com", - "httpResponseHeaders": {"name": "Content-Type", "value": "application/json"} + "httpResponseHeaders": {"name": "Content-Type", "value": "application/json"}, } - # Attempting to replace the zyte_api value would raise an error + # Attempting to replace the raw_api_response value would raise an error with pytest.raises(ValueError): - orig_response.replace(zyte_api=new_zyte_api) + orig_response.replace(raw_api_response=new_raw_api_response) def test_non_utf8_response(): content = "Some non-ASCII ✨ chars" - sample_zyte_api = { + sample_raw_api_response = { "url": URL, "browserHtml": content, "httpResponseHeaders": [ @@ -134,7 +134,7 @@ def test_non_utf8_response(): # for it. This is the default encoding for the "browserHtml" contents from # Zyte API. Thus, even if the Response Headers or tags indicate a # different encoding, it should still be treated as "utf-8". - response = ZyteAPITextResponse.from_api_response(sample_zyte_api) + response = ZyteAPITextResponse.from_api_response(sample_raw_api_response) assert response.text == content assert response.encoding == "utf-8" @@ -149,15 +149,15 @@ def format_to_httpResponseBody(body, encoding="utf-8"): @pytest.mark.parametrize( "api_response,cls", [ - (api_response_browser, ZyteAPITextResponse), - (api_response_body, ZyteAPIResponse), + (raw_api_response_browser, ZyteAPITextResponse), + (raw_api_response_body, ZyteAPIResponse), ], ) def test_response_headers_removal(api_response, cls): """Headers like 'Content-Encoding' should be removed later in the response instance returned to Scrapy. - However, it should still be present inside 'zyte_api.headers'. + However, it should still be present inside 'raw_api_response.headers'. """ additional_headers = [ {"name": "Content-Encoding", "value": "gzip"}, @@ -170,7 +170,8 @@ def test_response_headers_removal(api_response, cls): assert response.headers == {b"X-Some-Other-Value": [b"123"]} assert ( - response.zyte_api["httpResponseHeaders"] == raw_response["httpResponseHeaders"] + response.raw_api_response["httpResponseHeaders"] + == raw_response["httpResponseHeaders"] ) From f5a9bb074d4f76b8ef294a84edb8c4946919c298 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 30 May 2022 18:56:04 +0800 Subject: [PATCH 28/28] refactor to accept 'True' and '{}' to trigger Zyte API Requests --- scrapy_zyte_api/handler.py | 28 ++++++++++---- tests/test_api_requests.py | 76 ++++++++++++++++++++++++++++---------- 2 files changed, 76 insertions(+), 28 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 76b2f4d6..e192d5c5 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -49,17 +49,24 @@ def from_crawler(cls, crawler): return cls(crawler.settings, crawler, client) def download_request(self, request: Request, spider: Spider) -> Deferred: - if request.meta.get("zyte_api"): - return deferred_from_coro(self._download_request(request, spider)) - else: - return super().download_request(request, spider) + api_params = self._prepare_api_params(request) + if api_params: + return deferred_from_coro( + self._download_request(api_params, request, spider) + ) + return super().download_request(request, spider) + + def _prepare_api_params(self, request: Request) -> Optional[dict]: + meta_params = request.meta.get("zyte_api") + if not meta_params and meta_params != {}: + return None + + if meta_params is True: + meta_params = {} - async def _download_request( - self, request: Request, spider: Spider - ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: api_params: Dict[str, Any] = self._zyte_api_default_params or {} try: - api_params.update(request.meta.get("zyte_api") or {}) + api_params.update(meta_params) except TypeError: logger.error( f"zyte_api parameters in the request meta should be " @@ -67,6 +74,11 @@ async def _download_request( f"instead ({request.url})." ) raise IgnoreRequest() + return api_params + + async def _download_request( + self, api_params: dict, request: Request, spider: Spider + ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]: # Define url by default api_data = {**{"url": request.url}, **api_params} if self._job_id is not None: diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 40e8ba40..84ddb342 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -9,6 +9,7 @@ from scrapy import Request, Spider from scrapy.exceptions import IgnoreRequest, NotConfigured, NotSupported from scrapy.http import Response, TextResponse +from scrapy.utils.defer import deferred_to_future from scrapy.utils.test import get_crawler from twisted.internet.asyncioreactor import install as install_asyncio_reactor from twisted.internet.defer import Deferred @@ -34,10 +35,12 @@ async def produce_request_response(meta, custom_settings=None): method="POST", meta=meta, ) - coro = handler._download_request(req, None) - assert iscoroutine(coro) - assert not isinstance(coro, Deferred) - resp = await coro # type: ignore + coro_or_deferred = handler.download_request(req, None) + if iscoroutine(coro_or_deferred): + resp = await coro_or_deferred # type: ignore + else: + resp = await deferred_to_future(coro_or_deferred) + return req, resp @pytest.mark.parametrize( @@ -113,35 +116,59 @@ async def test_http_response_headers_request(self, meta: Dict[str, Dict[str, Any sys.version_info < (3, 8), reason="Python3.7 has poor support for AsyncMocks" ) @pytest.mark.parametrize( - "meta,custom_settings,expected", + "meta,custom_settings,expected,use_zyte_api", [ - ({}, {}, {}), - ({"zyte_api": {}}, {}, {}), + ({}, {}, {}, False), + ({"zyte_api": {}}, {}, {}, False), + ({"zyte_api": True}, {}, {}, False), + ({"zyte_api": False}, {}, {}, False), ( {}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, {"browserHtml": True, "geolocation": "CA"}, + False, + ), + ( + {"zyte_api": False}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {}, + False, + ), + ( + {"zyte_api": None}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {}, + False, ), ( {"zyte_api": {}}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, {"browserHtml": True, "geolocation": "CA"}, + True, + ), + ( + {"zyte_api": True}, + {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, + {"browserHtml": True, "geolocation": "CA"}, + True, ), ( {"zyte_api": {"javascript": True, "geolocation": "US"}}, {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True, "geolocation": "CA"}}, {"browserHtml": True, "geolocation": "US", "javascript": True}, + True, ), ], ) @mock.patch("tests.AsyncClient") @pytest.mark.asyncio - async def test_empty_zyte_api_request_meta( + async def test_zyte_api_request_meta( self, mock_client, meta: Dict[str, Dict[str, Any]], custom_settings: Dict[str, str], expected: Dict[str, str], + use_zyte_api: bool, ): try: # This would always error out since the mocked client doesn't @@ -152,7 +179,12 @@ async def test_empty_zyte_api_request_meta( # What we're interested in is the Request call in the API request_call = [c for c in mock_client.mock_calls if "request_raw(" in str(c)] - if not request_call: + + if not use_zyte_api: + assert request_call == [] + return + + elif not request_call: pytest.fail("The client's request_raw() method was not called.") args_used = request_call[0].args[0] @@ -166,15 +198,18 @@ async def test_empty_zyte_api_request_meta( ({"zyte_api": {"waka": True}}, True), ({"zyte_api": True}, True), ({"zyte_api": {"browserHtml": True}}, True), - ({"zyte_api": {}}, False), + ({"zyte_api": {}}, True), + ({"zyte_api": None}, False), ({"randomParameter": True}, False), ({}, False), + (None, False), ], ) @pytest.mark.asyncio async def test_coro_handling( self, meta: Dict[str, Dict[str, Any]], api_relevant: bool ): + custom_settings = {"ZYTE_API_DEFAULT_PARAMS": {"browserHtml": True}} with MockServer() as server: async with make_handler({}, server.urljoin("/")) as handler: req = Request( @@ -182,13 +217,14 @@ async def test_coro_handling( method="POST", meta=meta, ) + handler._zyte_api_default_params = custom_settings if api_relevant: coro = handler.download_request(req, Spider("test")) assert not iscoroutine(coro) assert isinstance(coro, Deferred) else: # Non-API requests won't get into handle, but run HTTPDownloadHandler.download_request instead - # But because they're Deffered - they won't run because event loop is closed + # But because they're Deferred - they won't run because event loop is closed with pytest.raises(RuntimeError, match="Event loop is closed"): handler.download_request(req, Spider("test")) @@ -202,13 +238,6 @@ async def test_coro_handling( "Got an error when processing Zyte API request (http://example.com): " "Object of type Request is not JSON serializable", ), - ( - {"zyte_api": True}, - "/", - IgnoreRequest, - "zyte_api parameters in the request meta should be provided as " - "dictionary, got instead (http://example.com)", - ), ( {"zyte_api": {"browserHtml": True}}, "/exception/", @@ -230,8 +259,12 @@ async def test_exceptions( with MockServer() as server: async with make_handler({}, server.urljoin(server_path)) as handler: req = Request("http://example.com", method="POST", meta=meta) + api_params = handler._prepare_api_params(req) + with pytest.raises(exception_type): # NOQA - await handler._download_request(req, Spider("test")) # NOQA + await handler._download_request( + api_params, req, Spider("test") + ) # NOQA assert exception_text in caplog.text @pytest.mark.parametrize( @@ -247,7 +280,10 @@ async def test_job_id(self, job_id): method="POST", meta={"zyte_api": {"browserHtml": True}}, ) - resp = await handler._download_request(req, Spider("test")) # NOQA + api_params = handler._prepare_api_params(req) + resp = await handler._download_request( + api_params, req, Spider("test") + ) # NOQA assert resp.request is req assert resp.url == req.url