diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index f94c2874..e76ee1c4 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -680,10 +680,26 @@ def _request(self, headers: dict | None = None):
         )
 
     def read_range(self, start: int, length: int) -> bytes:
+        # Match the ``b''``-for-non-positive-length convention used by
+        # other source implementations (e.g. ``_BytesIOSource``).
+        # Without this guard, ``Range: bytes=<start>-<start - 1>`` goes on
+        # the wire, which is an invalid range and triggers a 416 from
+        # well-behaved servers (or worse, an arbitrarily large 200 body
+        # from misbehaving ones).
+        if length <= 0:
+            return b''
         end = start + length - 1
         headers = {'Range': f'bytes={start}-{end}'}
         if self._pool is not None:
-            return self._request(headers=headers).data
+            resp = self._request(headers=headers)
+            data = resp.data
+            return self._validate_range_response(
+                status=resp.status,
+                content_range=resp.headers.get('Content-Range'),
+                data=data,
+                start=start,
+                length=length,
+            )
         # Fallback: stdlib. urlopen's ``timeout`` is a single value, so
         # use the more conservative read timeout; the connect timeout
         # isn't separately controllable on stdlib urllib. The opener
@@ -691,7 +707,118 @@ def read_range(self, start: int, length: int) -> bytes:
         # validated and capped at ``_HTTP_MAX_REDIRECTS``.
         req = urllib.request.Request(self._url, headers=headers)
         with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp:
-            return resp.read()
+            data = resp.read()
+        # stdlib raises HTTPError for 4xx/5xx automatically; we still
+        # need to catch the "server ignored Range and returned 200
+        # with the whole object" case, plus any short body.
+        return self._validate_range_response(
+            status=getattr(resp, 'status', None) or resp.getcode(),
+            content_range=resp.headers.get('Content-Range'),
+            data=data,
+            start=start,
+            length=length,
+        )
+
+    @staticmethod
+    def _validate_range_response(*, status, content_range, data,
+                                 start: int, length: int) -> bytes:
+        """Reject HTTP responses that do not satisfy the Range request.
+
+        Without this, three things can go wrong silently (issue #1735):
+
+        - the server returns a 4xx/5xx body (urllib3 by default does not
+          raise on non-2xx, so the bytes would be handed to the caller);
+        - the server ignores ``Range`` for a non-zero start and returns
+          the whole object as a 200 with no ``Content-Range``, handing
+          the codec wrong-offset bytes;
+        - the body is shorter or longer than what the server advertised
+          via ``Content-Range``, which surfaces later inside a decoder
+          as an opaque error far from the real cause.
+
+        A short response near EOF is legitimate: a fixed-size header
+        prefetch (e.g. ``read_range(0, 16384)``) will hit the end of a
+        smaller file and the server returns ``Content-Range: bytes
+        0-(size-1)/size``. The validator accepts that as long as the
+        Content-Range starts at the requested offset and the body
+        length matches what Content-Range advertises.
+
+        Returns the validated bytes, sliced to at most ``length`` bytes
+        in the "server ignored Range and returned the whole object as
+        200" case (start == 0, no Content-Range). Other branches return
+        ``data`` unchanged.
+        """
+        if status is None or status not in (200, 206):
+            raise OSError(
+                f"HTTP range request returned status {status}; expected "
+                f"206 Partial Content (or 200 with full body)."
+            )
+        if content_range is None:
+            # No Content-Range. A 200 with no Content-Range is only safe
+            # when the caller asked for the beginning of the object; for
+            # any other ``start`` the bytes returned do not correspond
+            # to the requested offset.
+            if status == 206:
+                raise OSError(
+                    "HTTP 206 response missing Content-Range header."
+                )
+            if start != 0:
+                raise OSError(
+                    f"HTTP server returned status 200 with no "
+                    f"Content-Range for a range request starting at "
+                    f"byte {start}; refusing to use the body as a "
+                    f"range fetch."
+                )
+            if len(data) < min(length, 1):
+                # Empty body but caller wanted bytes.
+                raise OSError("HTTP 200 response body is empty.")
+            # Server ignored Range and returned the full object as 200.
+            # The implicit contract is "at most ``length`` bytes"; slice
+            # so a 16 KiB prefetch against a 2 GiB object doesn't drag
+            # the whole thing into memory.
+            if len(data) > length:
+                return data[:length]
+            return data
+        # Parse ``bytes <start>-<end>/<total>``. Reject anything
+        # that does not start at the requested offset; allow ``end`` to
+        # be lower than requested when EOF was hit.
+        try:
+            unit, _, spec = content_range.partition(' ')
+            rng, _, _total = spec.partition('/')
+            cr_start_s, _, cr_end_s = rng.partition('-')
+            cr_start = int(cr_start_s)
+            cr_end = int(cr_end_s)
+        except ValueError:
+            raise OSError(
+                f"HTTP Content-Range header {content_range!r} could "
+                f"not be parsed."
+            ) from None
+        if unit != 'bytes':
+            raise OSError(
+                f"HTTP Content-Range unit {unit!r} is not 'bytes'."
+            )
+        if cr_start != start:
+            raise OSError(
+                f"HTTP Content-Range {content_range!r} starts at byte "
+                f"{cr_start}, but the request started at byte {start}."
+            )
+        if cr_end < cr_start:
+            raise OSError(
+                f"HTTP Content-Range {content_range!r} has end "
+                f"({cr_end}) below start ({cr_start})."
+            )
+        if cr_end - cr_start + 1 > length:
+            raise OSError(
+                f"HTTP Content-Range {content_range!r} advertises more "
+                f"bytes than were requested (length={length})."
+            )
+        expected_len = cr_end - cr_start + 1
+        if len(data) != expected_len:
+            raise OSError(
+                f"HTTP range body length {len(data)} does not match "
+                f"the {expected_len} bytes advertised by "
+                f"Content-Range {content_range!r}."
+            )
+        return data
 
     def read_ranges(
         self,
diff --git a/xrspatial/geotiff/tests/test_http_range_validation_1735.py b/xrspatial/geotiff/tests/test_http_range_validation_1735.py
new file mode 100644
index 00000000..a74f47fa
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_http_range_validation_1735.py
@@ -0,0 +1,217 @@
+"""Regression tests for issue #1735.
+
+``_HTTPSource.read_range`` previously returned the response body
+without checking the status code, the ``Content-Range`` header, or the
+returned byte length. Three failure modes slipped through silently:
+
+- a 200 (Range ignored) or a 4xx/5xx body was handed to the caller as
+  if it were the requested range,
+- a ``Content-Range`` header pointing at a different byte range was
+  trusted as the requested one,
+- a truncated response was passed to a downstream codec where the
+  decode error appeared far from the real cause.
+
+These tests stand up tiny loopback HTTP servers that misbehave in each
+of those ways and assert that ``read_range`` raises a clear ``OSError``.
+"""
+from __future__ import annotations
+
+import http.server
+import socketserver
+import threading
+
+import pytest
+
+from xrspatial.geotiff._reader import _HTTPSource
+
+
+class _BaseHandler(http.server.BaseHTTPRequestHandler):
+    payload: bytes = b'0' * 64
+
+    def log_message(self, *_args, **_kwargs):
+        pass
+
+
+def _serve(handler_cls):
+    httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls)
+    port = httpd.server_address[1]
+    thread = threading.Thread(target=httpd.serve_forever, daemon=True)
+    thread.start()
+    return f'http://127.0.0.1:{port}/x.bin', httpd, thread
+
+
+def _stop(httpd):
+    httpd.shutdown()
+    httpd.server_close()
+
+
+@pytest.fixture(autouse=True)
+def _allow_loopback(monkeypatch):
+    monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1')
+
+
+def test_range_request_ignored_for_nonzero_start_raises():
+    """Server ignores ``Range`` for a non-zero start and returns full
+    200 -> OSError. (A 200 with start=0 is harmless because the body
+    offsets line up with what the caller wanted.)"""
+
+    class _Handler(_BaseHandler):
+        def do_GET(self):  # noqa: N802
+            # Ignore Range header; return the full object as 200.
+            self.send_response(200)
+            self.send_header('Content-Length', str(len(self.payload)))
+            self.end_headers()
+            self.wfile.write(self.payload)
+
+    url, httpd, _ = _serve(_Handler)
+    try:
+        src = _HTTPSource(url)
+        with pytest.raises(OSError, match="Content-Range|range fetch"):
+            src.read_range(8, 16)
+    finally:
+        _stop(httpd)
+
+
+def test_range_request_wrong_content_range_raises():
+    """Server returns 206 but the Content-Range header points at the
+    wrong bytes -> OSError."""
+
+    class _Handler(_BaseHandler):
+        def do_GET(self):  # noqa: N802
+            # Pretend we sent bytes 4-19/64 regardless of what was asked.
+            self.send_response(206)
+            self.send_header('Content-Length', '16')
+            self.send_header('Content-Range', 'bytes 4-19/64')
+            self.end_headers()
+            self.wfile.write(self.payload[4:20])
+
+    url, httpd, _ = _serve(_Handler)
+    try:
+        src = _HTTPSource(url)
+        # Caller asks for 0-15; server says 4-19.
+        with pytest.raises(OSError, match="Content-Range"):
+            src.read_range(0, 16)
+    finally:
+        _stop(httpd)
+
+
+def test_range_request_short_body_raises():
+    """Server returns 206 with a body shorter than the requested
+    length -> OSError."""
+
+    class _Handler(_BaseHandler):
+        def do_GET(self):  # noqa: N802
+            self.send_response(206)
+            self.send_header('Content-Length', '4')
+            self.send_header('Content-Range', 'bytes 0-15/64')
+            self.end_headers()
+            # Send only 4 bytes despite advertising a 16-byte range.
+            self.wfile.write(self.payload[:4])
+
+    url, httpd, _ = _serve(_Handler)
+    try:
+        src = _HTTPSource(url)
+        with pytest.raises(OSError, match="length"):
+            src.read_range(0, 16)
+    finally:
+        _stop(httpd)
+
+
+def test_range_request_well_formed_succeeds():
+    """A correctly-formed 206 response is accepted as-is."""
+
+    class _Handler(_BaseHandler):
+        def do_GET(self):  # noqa: N802
+            rng = self.headers.get('Range', '')
+            spec = rng[len('bytes='):]
+            s, _, e = spec.partition('-')
+            start = int(s)
+            end = int(e)
+            chunk = self.payload[start:end + 1]
+            self.send_response(206)
+            self.send_header('Content-Length', str(len(chunk)))
+            self.send_header(
+                'Content-Range',
+                f'bytes {start}-{start + len(chunk) - 1}/'
+                f'{len(self.payload)}',
+            )
+            self.end_headers()
+            self.wfile.write(chunk)
+
+    url, httpd, _ = _serve(_Handler)
+    try:
+        src = _HTTPSource(url)
+        out = src.read_range(8, 16)
+        assert out == _BaseHandler.payload[8:24]
+        assert len(out) == 16
+    finally:
+        _stop(httpd)
+
+
+def test_read_range_zero_length_returns_empty_without_request():
+    """``read_range(start, 0)`` (and negative ``length``) must short-
+    circuit to ``b''`` before any HTTP request goes on the wire.
+
+    Without the guard, ``Range: bytes=<start>-<start - 1>`` is sent, which
+    is an invalid range and either trips a 416 from a well-behaved
+    server or pulls down arbitrarily large bytes from a misbehaving one.
+    Other source implementations (e.g. ``_BytesIOSource``) already
+    follow the ``b''``-on-non-positive-length convention; this test
+    pins that contract for ``_HTTPSource`` too.
+    """
+    hit_count = {'n': 0}
+
+    class _Handler(_BaseHandler):
+        def do_GET(self):  # noqa: N802
+            # If we ever land here, the guard failed. Record the hit so
+            # the assertion below points at the right cause.
+            hit_count['n'] += 1
+            self.send_response(200)
+            self.send_header('Content-Length', str(len(self.payload)))
+            self.end_headers()
+            self.wfile.write(self.payload)
+
+    url, httpd, _ = _serve(_Handler)
+    try:
+        src = _HTTPSource(url)
+        assert src.read_range(10, 0) == b''
+        assert src.read_range(0, 0) == b''
+        assert src.read_range(10, -5) == b''
+        assert hit_count['n'] == 0, (
+            "read_range(length<=0) should not issue an HTTP request"
+        )
+    finally:
+        _stop(httpd)
+
+
+def test_range_ignored_200_with_full_body_is_sliced_to_length():
+    """Server ignores ``Range`` for ``start=0`` and returns the full
+    object as a 200 with no ``Content-Range`` -> response is sliced to
+    the requested length.
+
+    The implicit contract of ``read_range`` is "at most ``length``
+    bytes". A 16 KiB header prefetch against a 2 GiB object must not
+    drag the whole thing into memory when the server misbehaves.
+    """
+
+    class _Handler(_BaseHandler):
+        # Payload larger than the requested length so we can verify the
+        # slice happens.
+        payload = b'\xab' * 4096
+
+        def do_GET(self):  # noqa: N802
+            # Ignore Range entirely; return the full object as 200.
+            self.send_response(200)
+            self.send_header('Content-Length', str(len(self.payload)))
+            self.end_headers()
+            self.wfile.write(self.payload)
+
+    url, httpd, _ = _serve(_Handler)
+    try:
+        src = _HTTPSource(url)
+        # start=0 -> validator should slice rather than raise.
+        out = src.read_range(0, 64)
+        assert len(out) == 64
+        assert out == _Handler.payload[:64]
+    finally:
+        _stop(httpd)