-
Notifications
You must be signed in to change notification settings - Fork 507
Add PuffinWriter for writing deletion vectors #3474
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
755793c
9b10a4f
c90ad38
842d6a5
9524618
e23a67d
72ebba8
4ecfd18
eb81422
a6d2f31
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,7 +14,11 @@ | |
| # KIND, either express or implied. See the License for the | ||
| # specific language governing permissions and limitations | ||
| # under the License. | ||
| import importlib.metadata | ||
| import io | ||
| import math | ||
| import zlib | ||
| from collections.abc import Iterable | ||
| from typing import TYPE_CHECKING, Literal | ||
|
|
||
| from pydantic import Field | ||
|
|
@@ -27,9 +31,13 @@ | |
|
|
||
| # Short for: Puffin Fratercula arctica, version 1 | ||
| MAGIC_BYTES = b"PFA1" | ||
| DELETION_VECTOR_MAGIC = b"\xd1\xd3\x39\x64" | ||
| EMPTY_BITMAP = FrozenBitMap() | ||
| MAX_JAVA_SIGNED = int(math.pow(2, 31)) - 1 | ||
| PROPERTY_REFERENCED_DATA_FILE = "referenced-data-file" | ||
| # Reserved field id of the row position (_pos) metadata column, referenced by | ||
| # deletion-vector-v1 blob metadata (Java: MetadataColumns.ROW_POSITION) | ||
| ROW_POSITION_FIELD_ID = 2147483645 | ||
|
|
||
|
|
||
| def _deserialize_bitmap(pl: bytes) -> list[BitMap]: | ||
|
|
@@ -62,6 +70,35 @@ def _deserialize_bitmap(pl: bytes) -> list[BitMap]: | |
| return bitmaps | ||
|
|
||
|
|
||
| def _serialize_bitmaps(bitmaps: dict[int, BitMap]) -> bytes: | ||
| """ | ||
| Serialize a dictionary of bitmaps into a byte array. | ||
|
|
||
| The format is: | ||
| - 8 bytes: number of bitmaps (little-endian) | ||
| - For each bitmap: | ||
| - 4 bytes: key (little-endian) | ||
| - n bytes: serialized bitmap | ||
| """ | ||
| with io.BytesIO() as out: | ||
| sorted_keys = sorted(bitmaps.keys()) | ||
|
|
||
| # number of bitmaps | ||
| out.write(len(sorted_keys).to_bytes(8, "little")) | ||
|
|
||
| for key in sorted_keys: | ||
| if key < 0: | ||
| raise ValueError(f"Invalid unsigned key: {key}") | ||
| if key > MAX_JAVA_SIGNED: | ||
| raise ValueError(f"Key {key} is too large, max {MAX_JAVA_SIGNED} to maintain compatibility with Java impl") | ||
|
|
||
| # key | ||
| out.write(key.to_bytes(4, "little")) | ||
| # bitmap | ||
| out.write(bitmaps[key].serialize()) | ||
| return out.getvalue() | ||
|
|
||
|
|
||
| class PuffinBlobMetadata(IcebergBaseModel): | ||
| type: Literal["deletion-vector-v1"] = Field() | ||
| fields: list[int] = Field() | ||
|
|
@@ -114,3 +151,92 @@ def __init__(self, puffin: bytes) -> None: | |
|
|
||
| def to_vector(self) -> dict[str, "pa.ChunkedArray"]: | ||
| return {path: _bitmaps_to_chunked_array(bitmaps) for path, bitmaps in self._deletion_vectors.items()} | ||
|
|
||
|
|
||
| class PuffinWriter: | ||
| """Writes a Puffin file containing a single deletion-vector-v1 blob.""" | ||
|
|
||
| _blobs: list[PuffinBlobMetadata] | ||
| _blob_payloads: list[bytes] | ||
| _created_by: str | ||
|
|
||
| def __init__(self, created_by: str | None = None) -> None: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about accepting an OutputFile or something, and writing the content to it? I think this is a better approach than returning bytes. Iceberg Java PuffinWriter also accepts an output file object.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea, done in eb81422. PuffinWriter now takes an OutputFile and finish() writes the content to it and returns the file size, following the Java PuffinWriter shape. One simplification compared to Java: the file is assembled in memory and written in one shot rather than streamed, which should be fine for DVs since they are small. Happy to revisit with a streaming implementation if needed. |
||
| self._blobs = [] | ||
| self._blob_payloads = [] | ||
| self._created_by = ( | ||
| created_by if created_by is not None else f"PyIceberg version {importlib.metadata.version('pyiceberg')}" | ||
| ) | ||
|
|
||
| def set_blob( | ||
| self, | ||
| positions: Iterable[int], | ||
| referenced_data_file: str, | ||
| ) -> None: | ||
| """Set the deletion vector blob for a data file, replacing any previously set blob. | ||
|
|
||
| Args: | ||
| positions: Zero-based positions of the deleted rows in the referenced data file. | ||
| referenced_data_file: Location of the data file the deletion vector applies to. | ||
| """ | ||
| # We only support one blob at the moment | ||
| self._blobs = [] | ||
| self._blob_payloads = [] | ||
|
|
||
| bitmaps: dict[int, BitMap] = {} | ||
| for pos in positions: | ||
| if pos < 0: | ||
| raise ValueError(f"Invalid position: {pos}, positions must be non-negative") | ||
| key = pos >> 32 | ||
| low_bits = pos & 0xFFFFFFFF | ||
| if key not in bitmaps: | ||
| bitmaps[key] = BitMap() | ||
| bitmaps[key].add(low_bits) | ||
|
|
||
| if not bitmaps: | ||
| raise ValueError("Deletion vector must contain at least one position") | ||
|
|
||
| cardinality = sum(len(bm) for bm in bitmaps.values()) | ||
| vector_payload = _serialize_bitmaps(bitmaps) | ||
|
|
||
| # deletion-vector-v1 blob layout: combined length of magic and vector (4 bytes, big-endian), | ||
| # the DV magic bytes, the serialized vector, and a CRC-32 checksum of magic + vector (4 bytes, big-endian) | ||
| blob_content = DELETION_VECTOR_MAGIC + vector_payload | ||
| self._blob_payloads.append( | ||
| len(blob_content).to_bytes(4, "big") + blob_content + zlib.crc32(blob_content).to_bytes(4, "big") | ||
| ) | ||
|
|
||
| self._blobs.append( | ||
| PuffinBlobMetadata( | ||
| type="deletion-vector-v1", | ||
| fields=[ROW_POSITION_FIELD_ID], | ||
| # -1 means the snapshot id and sequence number are inherited at commit time | ||
| snapshot_id=-1, | ||
| sequence_number=-1, | ||
| # offset and length are placeholders; finish() fills them in when assembling the file | ||
| offset=0, | ||
| length=0, | ||
| properties={PROPERTY_REFERENCED_DATA_FILE: referenced_data_file, "cardinality": str(cardinality)}, | ||
| compression_codec=None, | ||
| ) | ||
| ) | ||
|
|
||
| def finish(self) -> bytes: | ||
| """Serialize the Puffin file and return its contents as bytes.""" | ||
| with io.BytesIO() as out: | ||
| out.write(MAGIC_BYTES) | ||
|
|
||
| blobs_metadata: list[PuffinBlobMetadata] = [] | ||
| for blob_metadata, blob_payload in zip(self._blobs, self._blob_payloads, strict=True): | ||
| blobs_metadata.append(blob_metadata.model_copy(update={"offset": out.tell(), "length": len(blob_payload)})) | ||
| out.write(blob_payload) | ||
|
|
||
| footer = Footer(blobs=blobs_metadata, properties={"created-by": self._created_by}) | ||
| footer_payload_bytes = footer.model_dump_json(by_alias=True, exclude_none=True).encode("utf-8") | ||
|
|
||
| out.write(MAGIC_BYTES) | ||
| out.write(footer_payload_bytes) | ||
| out.write(len(footer_payload_bytes).to_bytes(4, "little")) | ||
| out.write((0).to_bytes(4, "little")) # flags | ||
| out.write(MAGIC_BYTES) | ||
|
|
||
| return out.getvalue() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment looks misleading. This writer doesn't write a file in my understanding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right, it didn't. Addressed in eb81422 together with the suggestion below: PuffinWriter now accepts an OutputFile and finish() writes the file, so the docstring matches the behavior now.