Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 129 additions & 2 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,18 +680,145 @@ def _request(self, headers: dict | None = None):
)

def read_range(self, start: int, length: int) -> bytes:
    """Fetch ``length`` bytes at offset ``start`` via an HTTP Range request.

    Returns ``b''`` for non-positive ``length`` without touching the
    network, matching the convention used by the other source
    implementations (e.g. ``_BytesIOSource``).  Without this guard,
    ``Range: bytes=<start>-<start-1>`` would go on the wire, which is
    an invalid range and triggers a 416 from well-behaved servers (or
    worse, an arbitrarily large 200 body from misbehaving ones).

    Raises:
        OSError: when the response does not actually satisfy the
            requested range (see ``_validate_range_response``).
    """
    if length <= 0:
        return b''
    end = start + length - 1
    headers = {'Range': f'bytes={start}-{end}'}
    if self._pool is not None:
        # urllib3 pool path.  urllib3 does not raise on non-2xx by
        # default, so the validator must inspect the status itself.
        resp = self._request(headers=headers)
        data = resp.data
        return self._validate_range_response(
            status=resp.status,
            content_range=resp.headers.get('Content-Range'),
            data=data,
            start=start,
            length=length,
        )
    # Fallback: stdlib.  urlopen's ``timeout`` is a single value, so
    # use the more conservative read timeout; the connect timeout
    # isn't separately controllable on stdlib urllib.  The opener
    # carries ``_ValidatingRedirectHandler`` so 3xx hops are re-
    # validated and capped at ``_HTTP_MAX_REDIRECTS``.
    req = urllib.request.Request(self._url, headers=headers)
    with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp:
        data = resp.read()
        # stdlib raises HTTPError for 4xx/5xx automatically; we still
        # need to catch the "server ignored Range and returned 200
        # with the whole object" case, plus any short body.
        return self._validate_range_response(
            status=getattr(resp, 'status', None) or resp.getcode(),
            content_range=resp.headers.get('Content-Range'),
            data=data,
            start=start,
            length=length,
        )

@staticmethod
def _validate_range_response(*, status, content_range, data,
start: int, length: int) -> bytes:
"""Reject HTTP responses that do not satisfy the Range request.

Without this, three things can go wrong silently (issue #1735):

- the server returns a 4xx/5xx body (urllib3 by default does not
raise on non-2xx, so the bytes would be handed to the caller);
- the server ignores ``Range`` for a non-zero start and returns
the whole object as a 200 with no ``Content-Range``, handing
the codec wrong-offset bytes;
- the body is shorter or longer than what the server advertised
via ``Content-Range``, which surfaces later inside a decoder
as an opaque error far from the real cause.

A short response near EOF is legitimate: a fixed-size header
prefetch (e.g. ``read_range(0, 16384)``) will hit the end of a
smaller file and the server returns ``Content-Range: bytes
0-(size-1)/size``. The validator accepts that as long as the
Content-Range starts at the requested offset and the body
length matches what Content-Range advertises.

Returns the validated bytes, sliced to at most ``length`` bytes
in the "server ignored Range and returned the whole object as
200" case (start == 0, no Content-Range). Other branches return
``data`` unchanged.
"""
if status is None or status not in (200, 206):
raise OSError(
f"HTTP range request returned status {status}; expected "
f"206 Partial Content (or 200 with full body)."
)
if content_range is None:
# No Content-Range. A 200 with no Content-Range is only safe
# when the caller asked for the beginning of the object; for
# any other ``start`` the bytes returned do not correspond
# to the requested offset.
if status == 206:
raise OSError(
"HTTP 206 response missing Content-Range header."
)
if start != 0:
raise OSError(
f"HTTP server returned status 200 with no "
f"Content-Range for a range request starting at "
f"byte {start}; refusing to use the body as a "
f"range fetch."
)
if len(data) < min(length, 1):
# Empty body but caller wanted bytes.
raise OSError("HTTP 200 response body is empty.")
# Server ignored Range and returned the full object as 200.
# The implicit contract is "at most ``length`` bytes"; slice
# so a 16 KiB prefetch against a 2 GiB object doesn't drag
# the whole thing into memory.
if len(data) > length:
return data[:length]
return data
# Parse ``bytes <start>-<end>/<total-or-*>``. Reject anything
# that does not start at the requested offset; allow ``end`` to
# be lower than requested when EOF was hit.
try:
unit, _, spec = content_range.partition(' ')
rng, _, _total = spec.partition('/')
cr_start_s, _, cr_end_s = rng.partition('-')
cr_start = int(cr_start_s)
cr_end = int(cr_end_s)
except ValueError:
raise OSError(
f"HTTP Content-Range header {content_range!r} could "
f"not be parsed."
) from None
if unit != 'bytes':
raise OSError(
f"HTTP Content-Range unit {unit!r} is not 'bytes'."
)
if cr_start != start:
raise OSError(
f"HTTP Content-Range {content_range!r} starts at byte "
f"{cr_start}, but the request started at byte {start}."
)
if cr_end < cr_start:
raise OSError(
f"HTTP Content-Range {content_range!r} has end "
f"({cr_end}) below start ({cr_start})."
)
if cr_end - cr_start + 1 > length:
raise OSError(
f"HTTP Content-Range {content_range!r} advertises more "
f"bytes than were requested (length={length})."
)
expected_len = cr_end - cr_start + 1
if len(data) != expected_len:
raise OSError(
f"HTTP range body length {len(data)} does not match "
f"the {expected_len} bytes advertised by "
f"Content-Range {content_range!r}."
)
return data

def read_ranges(
self,
Expand Down
217 changes: 217 additions & 0 deletions xrspatial/geotiff/tests/test_http_range_validation_1735.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
"""Regression tests for issue #1735.

``_HTTPSource.read_range`` previously returned the response body
without checking the status code, the ``Content-Range`` header, or the
returned byte length. Three failure modes slipped through silently:

- a 200 (Range ignored) or a 4xx/5xx body was handed to the caller as
if it were the requested range,
- a ``Content-Range`` header pointing at a different byte range was
trusted as the requested one,
- a truncated response was passed to a downstream codec where the
decode error appeared far from the real cause.

These tests stand up tiny loopback HTTP servers that misbehave in each
of those ways and assert that ``read_range`` raises a clear ``OSError``.
"""
from __future__ import annotations

import http.server
import socketserver
import threading

import pytest

from xrspatial.geotiff._reader import _HTTPSource


class _BaseHandler(http.server.BaseHTTPRequestHandler):
    """Shared handler base: silent logging plus a 64-byte payload."""

    # Default object the misbehaving test servers pretend to serve.
    payload: bytes = b'0' * 64

    def log_message(self, *args, **kwargs):
        # Keep pytest output clean: drop per-request stderr logging.
        pass


def _serve(handler_cls):
    """Start *handler_cls* on an ephemeral loopback port.

    The server runs ``serve_forever`` in a daemon thread; the caller
    is responsible for tearing it down via ``_stop``.  Returns the
    ``(url, server, thread)`` triple.
    """
    server = socketserver.TCPServer(('127.0.0.1', 0), handler_cls)
    worker = threading.Thread(target=server.serve_forever, daemon=True)
    worker.start()
    url = f'http://127.0.0.1:{server.server_address[1]}/x.bin'
    return url, server, worker


def _stop(httpd):
    """Stop the serve_forever loop, then release the listening socket."""
    # Order matters: shutdown() unblocks serve_forever before the
    # socket is closed underneath it.
    for teardown in (httpd.shutdown, httpd.server_close):
        teardown()


@pytest.fixture(autouse=True)
def _allow_loopback(monkeypatch):
    # Opt every test out of the reader's private/loopback-host guard so
    # the 127.0.0.1 fixture servers are reachable.
    # NOTE(review): assumes _HTTPSource honors this env var -- confirm
    # against the implementation in _reader.py.
    monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1')


def test_range_request_ignored_for_nonzero_start_raises():
    """A full 200 body for a request starting past byte 0 must raise.

    The server drops the ``Range`` header entirely; accepting its body
    would hand the caller bytes at the wrong offset.  (The same 200 at
    start=0 is harmless -- the body offsets line up with what the
    caller wanted -- so only non-zero starts must fail.)
    """

    class _DropsRange(_BaseHandler):
        def do_GET(self):  # noqa: N802
            # Serve the whole object as a plain 200, ignoring Range.
            self.send_response(200)
            self.send_header('Content-Length', str(len(self.payload)))
            self.end_headers()
            self.wfile.write(self.payload)

    url, httpd, _thread = _serve(_DropsRange)
    try:
        source = _HTTPSource(url)
        with pytest.raises(OSError, match="Content-Range|range fetch"):
            source.read_range(8, 16)
    finally:
        _stop(httpd)


def test_range_request_wrong_content_range_raises():
    """A 206 whose Content-Range points at the wrong bytes must raise."""

    class _WrongOffset(_BaseHandler):
        def do_GET(self):  # noqa: N802
            # Always claim bytes 4-19/64, whatever was actually asked.
            body = self.payload[4:20]
            self.send_response(206)
            self.send_header('Content-Length', '16')
            self.send_header('Content-Range', 'bytes 4-19/64')
            self.end_headers()
            self.wfile.write(body)

    url, httpd, _thread = _serve(_WrongOffset)
    try:
        source = _HTTPSource(url)
        # Request 0-15; the server answers with 4-19.
        with pytest.raises(OSError, match="Content-Range"):
            source.read_range(0, 16)
    finally:
        _stop(httpd)


def test_range_request_short_body_raises():
    """A 206 whose body is shorter than its Content-Range must raise."""

    class _Truncates(_BaseHandler):
        def do_GET(self):  # noqa: N802
            # Advertise 16 bytes (0-15) but deliver only 4 of them.
            self.send_response(206)
            self.send_header('Content-Length', '4')
            self.send_header('Content-Range', 'bytes 0-15/64')
            self.end_headers()
            self.wfile.write(self.payload[:4])

    url, httpd, _thread = _serve(_Truncates)
    try:
        with pytest.raises(OSError, match="length"):
            _HTTPSource(url).read_range(0, 16)
    finally:
        _stop(httpd)


def test_range_request_well_formed_succeeds():
    """A correctly-formed 206 response is accepted as-is."""

    class _Honest(_BaseHandler):
        def do_GET(self):  # noqa: N802
            # Honor the Range header exactly.
            spec = self.headers.get('Range', '')[len('bytes='):]
            lo_text, _, hi_text = spec.partition('-')
            lo = int(lo_text)
            hi = int(hi_text)
            body = self.payload[lo:hi + 1]
            self.send_response(206)
            self.send_header('Content-Length', str(len(body)))
            self.send_header(
                'Content-Range',
                f'bytes {lo}-{lo + len(body) - 1}/{len(self.payload)}',
            )
            self.end_headers()
            self.wfile.write(body)

    url, httpd, _thread = _serve(_Honest)
    try:
        chunk = _HTTPSource(url).read_range(8, 16)
        assert chunk == _BaseHandler.payload[8:24]
        assert len(chunk) == 16
    finally:
        _stop(httpd)


def test_read_range_zero_length_returns_empty_without_request():
    """``read_range`` must short-circuit to ``b''`` for length <= 0.

    Without the guard a ``Range: bytes=<start>-<start-1>`` header is
    emitted, which is invalid: well-behaved servers answer 416 and
    misbehaving ones may return arbitrarily large bodies.  Other
    source implementations (e.g. ``_BytesIOSource``) already follow
    the ``b''``-on-non-positive-length convention; this pins the same
    contract for ``_HTTPSource``.
    """
    hits = []

    class _Recorder(_BaseHandler):
        def do_GET(self):  # noqa: N802
            # Reaching the handler means the guard failed; record the
            # hit so the final assertion names the real cause.
            hits.append(self.path)
            self.send_response(200)
            self.send_header('Content-Length', str(len(self.payload)))
            self.end_headers()
            self.wfile.write(self.payload)

    url, httpd, _thread = _serve(_Recorder)
    try:
        source = _HTTPSource(url)
        for start, length in ((10, 0), (0, 0), (10, -5)):
            assert source.read_range(start, length) == b''
        assert not hits, (
            "read_range(length<=0) should not issue an HTTP request"
        )
    finally:
        _stop(httpd)


def test_range_ignored_200_with_full_body_is_sliced_to_length():
    """A 200 full body for ``start=0`` is sliced to the requested length.

    ``read_range``'s implicit contract is "at most ``length`` bytes";
    a 16 KiB header prefetch against a 2 GiB object must not drag the
    whole thing into memory when the server ignores ``Range`` and
    returns the full object with no ``Content-Range``.
    """

    class _BigIgnorer(_BaseHandler):
        # Larger than the requested length so the slice is observable.
        payload = b'\xab' * 4096

        def do_GET(self):  # noqa: N802
            # Full object, plain 200, no Content-Range.
            self.send_response(200)
            self.send_header('Content-Length', str(len(self.payload)))
            self.end_headers()
            self.wfile.write(self.payload)

    url, httpd, _thread = _serve(_BigIgnorer)
    try:
        # start=0 -> the validator should slice rather than raise.
        chunk = _HTTPSource(url).read_range(0, 64)
        assert len(chunk) == 64
        assert chunk == _BigIgnorer.payload[:64]
    finally:
        _stop(httpd)
Loading