diff --git a/internal/encryption/cipher.go b/internal/encryption/cipher.go index fea071d42..c8f1ecc81 100644 --- a/internal/encryption/cipher.go +++ b/internal/encryption/cipher.go @@ -94,6 +94,23 @@ func (c *Cipher) Decrypt(ciphertextAndTag, aad []byte, keyID uint32, nonce []byt return plaintext, nil } +// LoadedKeyIDs returns the sorted list of key_ids currently loaded in +// the underlying keystore. Used by the storage layer's rebadge guard +// to trial-decrypt a cleartext-labelled body against every candidate +// DEK — rotation leaves multiple DEKs active at once, and the +// on-disk envelope's key_id field can be rewritten by an attacker, +// so the guard must iterate rather than trust it. +// +// Returns nil for a nil receiver or zero-value Cipher; callers +// MUST NOT treat that as "no keys" without considering the +// surrounding context. +func (c *Cipher) LoadedKeyIDs() []uint32 { + if c == nil || c.keystore == nil { + return nil + } + return c.keystore.IDs() +} + // aeadFor validates keyID and nonce length, then returns the // pre-initialized AEAD from the keystore. The hot path here is a single // atomic.Pointer load + a map lookup; AES key expansion happened once diff --git a/store/encryption_glue.go b/store/encryption_glue.go new file mode 100644 index 000000000..82f84bf9d --- /dev/null +++ b/store/encryption_glue.go @@ -0,0 +1,283 @@ +package store + +import ( + "github.com/bootjp/elastickv/internal/encryption" + "github.com/cockroachdb/errors" +) + +// ErrEncryptedReadIntegrity wraps encryption.ErrIntegrity for storage-layer +// callers (Get / scan / iterator). Per design §4.1, callers MUST treat this +// as a typed read error and never silently zero the value or skip the row. +// +// Callers can disambiguate it from any other read error with errors.Is. +var ErrEncryptedReadIntegrity = errors.New("store: encrypted value failed integrity check (GCM tag mismatch); refusing to surface plaintext") + +// NonceFactory produces unique 12-byte AES-GCM nonces for the storage +// envelope (§4.1). The factory is responsible for the cluster-wide +// uniqueness invariant across `(node_id, local_epoch, write_count)` — +// the storage layer just calls Next() and uses what comes back. +// +// Stage 7 of the encryption rollout will replace the in-tree +// reference implementation (CounterNonceFactory) with a +// writer-registry-backed factory that guarantees uniqueness across +// voters, learners, and historical replicas. The interface stays +// the same; only the construction changes. Implementations MUST +// NOT return the same nonce twice under the same DEK — AES-GCM +// nonce reuse is catastrophic (see encryption.Cipher doc). +type NonceFactory interface { + Next() ([encryption.NonceSize]byte, error) +} + +// ActiveStorageKeyID reports the currently-active storage DEK +// identifier. The bool is false when no storage DEK is active (i.e. +// the cluster has not run Phase 1 of the §7.1 rollout yet) — in that +// case the storage layer writes cleartext as if no cipher were +// configured. Stage 5/6 wires this from the sidecar's Active.Storage +// slot; Stage 2 takes it as a closure so test code can flip it +// independently. +type ActiveStorageKeyID func() (uint32, bool) + +// WithEncryption configures the pebble-backed store to wrap every +// committed value in the §4.1 storage envelope. +// +// All three arguments must be non-nil. activeKeyID is called on +// every Put — when it returns ok=false the store writes cleartext +// (encryption_state = 0b00) even though a cipher is wired, matching +// the §7.1 Phase 0 / Phase 1 split where capability is provisioned +// before activation. Reads that observe encryption_state = 0b01 +// always go through the cipher regardless of activeKeyID, so a +// cluster mid-cutover stays readable. +// +// Calling WithEncryption with any nil argument is a no-op (the +// store stays in legacy cleartext-only mode). This keeps the +// option backwards-compatible with every existing NewPebbleStore +// caller and keeps the Stage 2 wiring trivially reversible. +func WithEncryption(cipher *encryption.Cipher, nf NonceFactory, activeKeyID ActiveStorageKeyID) PebbleStoreOption { + return func(s *pebbleStore) { + if cipher == nil || nf == nil || activeKeyID == nil { + return + } + s.cipher = cipher + s.nonceFactory = nf + s.activeStorageKeyID = activeKeyID + } +} + +// encryptForKey wraps plaintext in the §4.1 storage envelope when an +// encryption key is active for the storage purpose. Returns +// (plaintext, encStateCleartext, nil) when encryption is disabled or +// no DEK is currently active so the cipher=nil fast path stays a +// single branch. +// +// AAD binds the ciphertext to: +// +// - the envelope header (envelope_version, flag, key_id), +// - the encoded Pebble key (defeats cut-and-paste / version +// substitution per §4.1 case 2/3), +// - the on-disk value-header bytes (tombstone bit, +// encryption_state, expireAt). Without binding the value-header, +// a disk attacker could flip the tombstone bit or lower expireAt +// to force GetAt/scan into a silent ErrKeyNotFound/expired +// branch BEFORE any AEAD verification runs. +// +// The expireAt argument is the value the caller will write into the +// resulting storage entry; tombstone is hard-coded false because the +// encrypt path is never invoked for tombstone writes (deletes carry +// no plaintext and are emitted as cleartext by the store +// already). +func (s *pebbleStore) encryptForKey(pebbleKey, plaintext []byte, expireAt uint64) ([]byte, byte, error) { + if s.cipher == nil || s.activeStorageKeyID == nil { + return plaintext, encStateCleartext, nil + } + keyID, ok := s.activeStorageKeyID() + if !ok { + return plaintext, encStateCleartext, nil + } + nonceArr, err := s.nonceFactory.Next() + if err != nil { + return nil, 0, errors.Wrap(err, "store: nonce factory") + } + nonce := nonceArr[:] + // flag = 0: Snappy compression deferred to Stage 9 per design §4.1. + const envelopeFlag byte = 0 + var hdr [valueHeaderSize]byte + writeValueHeaderBytes(hdr[:], false /*tombstone*/, expireAt, encStateEncrypted) + aad := buildStorageAAD(encryption.EnvelopeVersionV1, envelopeFlag, keyID, hdr[:], pebbleKey) + ciphertextAndTag, err := s.cipher.Encrypt(plaintext, aad, keyID, nonce) + if err != nil { + return nil, 0, errors.Wrap(err, "store: encrypt value") + } + env := encryption.Envelope{ + Version: encryption.EnvelopeVersionV1, + Flag: envelopeFlag, + KeyID: keyID, + Nonce: nonceArr, + Body: ciphertextAndTag, + } + encoded, err := env.Encode() + if err != nil { + return nil, 0, errors.Wrap(err, "store: encode envelope") + } + return encoded, encStateEncrypted, nil +} + +// decryptForKey is the read-side counterpart of encryptForKey. +// encState=cleartext returns the body verbatim after the +// envelope-rebadge guard below; encState=encrypted decodes the +// envelope, reconstructs the AAD over (header + value-header + +// pebble key), and unwraps via the cipher. +// +// A GCM tag mismatch surfaces as ErrEncryptedReadIntegrity. Callers +// MUST NOT silently translate this into "key not found" or "empty +// value" because that would let a disk attacker who flipped a tag +// bit (or any AAD-bound header field) silently corrupt reads. +// +// Reserved encryption_state values are rejected upstream in +// decodeValue, so this function only sees the two valid states. +// +// sv is the storedValue freshly decoded from the on-disk bytes; its +// Tombstone, ExpireAt, and EncState are reproduced into the AAD so +// any flip on disk fails GCM verification. Callers MUST run +// tombstone / expireAt visibility checks AFTER decrypt succeeds — +// the values they observe pre-decrypt are not yet authenticated. +func (s *pebbleStore) decryptForKey(pebbleKey []byte, sv storedValue, body []byte) ([]byte, error) { + if sv.EncState == encStateCleartext { + if err := s.rejectRebadgedEnvelope(pebbleKey, sv, body); err != nil { + return nil, err + } + return body, nil + } + if s.cipher == nil { + return nil, errors.New("store: encrypted value present but no cipher configured") + } + env, err := encryption.DecodeEnvelope(body) + if err != nil { + return nil, errors.Wrap(err, "store: decode envelope") + } + var hdr [valueHeaderSize]byte + writeValueHeaderBytes(hdr[:], sv.Tombstone, sv.ExpireAt, sv.EncState) + aad := buildStorageAAD(env.Version, env.Flag, env.KeyID, hdr[:], pebbleKey) + plain, err := s.cipher.Decrypt(env.Body, aad, env.KeyID, env.Nonce[:]) + if err != nil { + if errors.Is(err, encryption.ErrIntegrity) { + return nil, errors.Wrap( + errors.WithSecondaryError(ErrEncryptedReadIntegrity, err), + "store: decrypt value") + } + return nil, errors.Wrap(err, "store: decrypt value") + } + // AES-GCM Open returns a nil dst slice for an empty plaintext; + // upstream callers (notably ExistsAt) distinguish "key absent" + // from "key present with empty value" via val != nil. Normalize + // to a non-nil zero-length slice so an empty stored value + // continues to satisfy ExistsAt → true. + if plain == nil { + plain = []byte{} + } + return plain, nil +} + +// rejectRebadgedEnvelope is the cleartext-branch guard for the §4.1 +// encryption-state rebadge attack. The on-disk encryption_state bit +// is not itself authenticated, so a disk attacker who flips it from +// 0b01 to 0b00 leaves the original envelope bytes in place and tells +// the read path to skip decryption. Without a guard the caller +// would silently receive raw envelope bytes as "plaintext". +// +// The guard runs an AEAD trial decrypt under each loaded DEK: +// reconstruct the AAD that the encrypt path would have produced, then +// call cipher.Decrypt over the body's ciphertext+tag region. If the +// GCM tag verifies the bytes are unambiguously a real envelope — +// only the DEK holder can produce a tag that survives this check, so +// legitimate cleartext has a 2⁻¹²⁸ false-positive probability. On +// any other outcome (parse failure, unknown key, tag mismatch) the +// row is treated as legitimate cleartext. +// +// AAD reconstruction substitutes the values the encrypt path always +// uses, instead of trusting the on-disk header bytes the attacker +// could have flipped: +// +// - envelope_version = EnvelopeVersionV1 (the encrypt path's +// fixed value; trusting on-disk would let a corrupted version +// byte force the body through DecodeEnvelope's error path) +// - flag = 0 (Snappy compression is deferred; the +// encrypt path's fixed value) +// - tombstone = false (the encrypt path never wraps +// tombstones, so any on-disk tombstone bit on an encrypted +// entry is necessarily attacker-supplied) +// +// The remaining AAD inputs come from disk and we enumerate plausible +// candidates: +// +// - key_id: every loaded DEK (the attacker can rewrite the on-disk +// byte, so we substitute candidates rather than trust env.KeyID) +// - expireAt: {on-disk value, 0} (covers both no-flip and the +// common "no-TTL write whose expireAt was rewritten by the +// attacker" case) +// +// The body is sliced at fixed offsets rather than going through +// DecodeEnvelope so a corrupted version or flag byte cannot force +// the parse to fail and short-circuit the guard. +// +// Residual gap. The encryption_state bit cannot itself be AAD-bound +// because the AAD reconstruction depends on it for dispatch. An +// attacker who flips encState=0b01→0b00 and ALSO corrupts a byte the +// trial cannot reproduce from canonical inputs — specifically +// body[HeaderSize:] (ciphertext / tag) or expireAt when the original +// was a non-zero value the attacker also rewrote — falls through to +// the cleartext branch. The user receives garbage bytes (NOT the +// original plaintext: the attacker does not hold the DEK), so the +// gap is in integrity observability, not confidentiality. Stage 8 +// closes this by moving encryption_state into authenticated MVCC +// metadata. +// +// No-op when the store has no cipher wired (legacy single-mode +// deployments have no rebadge attack surface) or when the body is +// too short to be an envelope. +func (s *pebbleStore) rejectRebadgedEnvelope(pebbleKey []byte, sv storedValue, body []byte) error { + if s.cipher == nil { + return nil + } + if len(body) < encryption.EnvelopeOverhead { + return nil + } + nonce := body[encryption.HeaderAADSize:encryption.HeaderSize] + ct := body[encryption.HeaderSize:] + candidateExpireAts := []uint64{sv.ExpireAt} + if sv.ExpireAt != 0 { + candidateExpireAts = append(candidateExpireAts, 0) + } + for _, kid := range s.cipher.LoadedKeyIDs() { + for _, candidateExpire := range candidateExpireAts { + var hdr [valueHeaderSize]byte + writeValueHeaderBytes(hdr[:], false /*canonical*/, candidateExpire, encStateEncrypted) + aad := buildStorageAAD(encryption.EnvelopeVersionV1, 0 /*flag canonical*/, kid, hdr[:], pebbleKey) + if _, err := s.cipher.Decrypt(ct, aad, kid, nonce); err == nil { + return errors.Wrap(ErrEncryptedReadIntegrity, + "store: cleartext-labelled value verifies as a relabeled envelope under a loaded DEK") + } + } + } + // No (DEK, candidate-expireAt) combination tag-matches. The body + // is legitimate cleartext, an envelope under a retired DEK, or + // fell into the documented residual gap above. + return nil +} + +// buildStorageAAD composes the §4.1 storage-envelope AAD with a +// single allocation. Layout: +// +// envelope_version ‖ flag ‖ key_id ‖ value_header(9B) ‖ pebble_key +// +// Pre-sizing avoids re-allocation across the two appends below +// (AppendHeaderAADBytes + the value-header / pebble-key append). +// The value-header inclusion is what binds tombstone, encryption_state, +// and expireAt into the AAD so an on-disk flip of those fields fails +// GCM verification on read. +func buildStorageAAD(version, flag byte, keyID uint32, header, pebbleKey []byte) []byte { + aad := make([]byte, 0, encryption.HeaderAADSize+len(header)+len(pebbleKey)) + aad = encryption.AppendHeaderAADBytes(aad, version, flag, keyID) + aad = append(aad, header...) + aad = append(aad, pebbleKey...) + return aad +} diff --git a/store/encryption_test_helpers.go b/store/encryption_test_helpers.go new file mode 100644 index 000000000..bcba822d3 --- /dev/null +++ b/store/encryption_test_helpers.go @@ -0,0 +1,52 @@ +package store + +import ( + "encoding/binary" + "sync/atomic" + + "github.com/bootjp/elastickv/internal/encryption" +) + +// CounterNonceFactory is a test-only NonceFactory that produces the +// design §4.1 deterministic nonce shape (`node_id ‖ local_epoch ‖ +// write_count`) without the writer-registry round-trip Stage 7 +// brings. Production wiring uses the registry-backed factory; this +// implementation is only safe for tests where the caller controls +// every node_id / local_epoch combination. +// +// Exposed (vs. living in a *_test.go file) so the encryption +// integration tests in other packages can build on the same +// implementation without re-deriving the byte layout. It is +// nevertheless test-grade — the doc comment on NonceFactory +// emphasises that production callers MUST guarantee +// (node_id, local_epoch, write_count) uniqueness. +type CounterNonceFactory struct { + nodeID uint16 + localEpoch uint16 + writes atomic.Uint64 +} + +// NewCounterNonceFactory constructs a CounterNonceFactory pinned to +// the given (nodeID, localEpoch). write_count starts at 0 and +// monotonically increments on every Next(). +func NewCounterNonceFactory(nodeID, localEpoch uint16) *CounterNonceFactory { + return &CounterNonceFactory{nodeID: nodeID, localEpoch: localEpoch} +} + +// Next produces the next 12-byte nonce. Layout matches design §4.1: +// +// bytes 0-1 node_id (big-endian uint16) +// bytes 2-3 local_epoch (big-endian uint16) +// bytes 4-11 write_count (big-endian uint64) +// +// Big-endian is chosen so a hex dump of consecutive nonces is +// human-readable as a counter; the AAD does NOT include the nonce +// bytes (the cipher composes the nonce into AES-GCM directly), so +// the byte order is internal to the factory. +func (f *CounterNonceFactory) Next() ([encryption.NonceSize]byte, error) { + var n [encryption.NonceSize]byte + binary.BigEndian.PutUint16(n[0:2], f.nodeID) + binary.BigEndian.PutUint16(n[2:4], f.localEpoch) + binary.BigEndian.PutUint64(n[4:12], f.writes.Add(1)) + return n, nil +} diff --git a/store/lsm_store.go b/store/lsm_store.go index 07f86a984..f2d8d7942 100644 --- a/store/lsm_store.go +++ b/store/lsm_store.go @@ -16,21 +16,32 @@ import ( "strings" "sync" + "github.com/bootjp/elastickv/internal/encryption" "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/v2" "github.com/cockroachdb/pebble/v2/vfs" ) const ( - timestampSize = 8 - valueHeaderSize = 9 // 1 byte tombstone + 8 bytes expireAt - snapshotBatchCountLimit = 1000 - snapshotBatchByteLimit = 8 << 20 // 8 MiB; balances restore write amplification vs peak memory usage - dirPerms = 0755 - metaLastCommitTS = "_meta_last_commit_ts" - metaMinRetainedTS = "_meta_min_retained_ts" - metaPendingMinRetainedTS = "_meta_pending_min_retained_ts" - spoolBufSize = 32 * 1024 // buffer size for streaming I/O during restore + timestampSize = 8 + valueHeaderSize = 9 // 1 byte flags + 8 bytes expireAt + // First-byte (flags) layout per design §4.1: + // bit 0 tombstone + // bits 1-2 encryption_state (0b00 cleartext, 0b01 encrypted, 0b10/0b11 reserved) + // bits 3-7 reserved (must be zero) + encStateMask byte = 0b0000_0110 + encStateShift = 1 + tombstoneMask byte = 0b0000_0001 + encStateCleartext byte = 0b00 + encStateEncrypted byte = 0b01 + encStateReservedMask byte = 0b1111_1000 // bits 3-7 must stay zero + snapshotBatchCountLimit = 1000 + snapshotBatchByteLimit = 8 << 20 // 8 MiB; balances restore write amplification vs peak memory usage + dirPerms = 0755 + metaLastCommitTS = "_meta_last_commit_ts" + metaMinRetainedTS = "_meta_min_retained_ts" + metaPendingMinRetainedTS = "_meta_pending_min_retained_ts" + spoolBufSize = 32 * 1024 // buffer size for streaming I/O during restore // maxPebbleEncodedKeySize is the limit for encoded Pebble on-disk keys, // which are the user key concatenated with the 8-byte inverted timestamp. @@ -186,6 +197,14 @@ type pebbleStore struct { // write-options pointer so monitoring (elastickv_fsm_apply_sync_mode) // and log lines stay in sync with the resolved mode. fsmApplySyncModeLabel string + // cipher / nonceFactory / activeStorageKeyID drive the §4.1 + // storage envelope. nil cipher = cleartext-only legacy behaviour; + // see WithEncryption. Once wired, cipher and nonceFactory MUST + // outlive the store (the keystore behind cipher is itself + // copy-on-write so rotation does not break this invariant). + cipher *encryption.Cipher + nonceFactory NonceFactory + activeStorageKeyID ActiveStorageKeyID } // Ensure pebbleStore implements MVCCStore and RetentionController. @@ -360,17 +379,36 @@ func decodeKeyView(k []byte) ([]byte, uint64) { return k[:keyLen], ^invTs } -// Value encoding: fixed binary header [Tombstone(1)][ExpireAt(8)] followed by raw value bytes; key and timestamp are encoded in the SST key. +// Value encoding: fixed binary header +// +// byte 0: bit 0 tombstone | bits 1-2 encryption_state | bits 3-7 reserved +// bytes 1-8: ExpireAt (LittleEndian uint64) +// bytes 9..: body — either raw plaintext (encState=0b00) or the §4.1 +// authenticated envelope bytes (encState=0b01). Reserved +// encryption_state values (0b10, 0b11) are rejected at decode +// per design §7.1. +// +// The Pebble key (`encodeKey(user_key, commit_ts)`) is signed into the +// envelope's AAD so a cut-and-paste / version-substitution attack +// rejects on Decrypt; see §4.1 case 2/3. type storedValue struct { Value []byte Tombstone bool + EncState byte // 0b00 cleartext, 0b01 encrypted; reserved values rejected at decode ExpireAt uint64 } -func encodeValue(val []byte, tombstone bool, expireAt uint64) []byte { - // Format: [Tombstone(1)] [ExpireAt(8)] [Value(...)] +// ErrEncryptedValueReservedState indicates decodeValue saw an +// encryption_state value (0b10 or 0b11) that the current build does +// not know how to interpret. Per design §7.1, this is a fail-closed +// trip-wire so an old binary cannot silently treat a future-version +// encrypted entry as cleartext bytes. +var ErrEncryptedValueReservedState = errors.New("store: value header carries reserved encryption_state; binary too old to read this entry") + +func encodeValue(val []byte, tombstone bool, expireAt uint64, encState byte) []byte { + // Format: [flags(1)] [ExpireAt(8)] [Body(...)] buf := make([]byte, encodedValueLen(len(val))) - fillEncodedValue(buf, val, tombstone, expireAt) + fillEncodedValue(buf, val, tombstone, expireAt, encState) return buf } @@ -378,21 +416,43 @@ func encodedValueLen(valueLen int) int { return valueHeaderSize + valueLen } -func fillEncodedValue(dst []byte, val []byte, tombstone bool, expireAt uint64) { +func fillEncodedValue(dst []byte, val []byte, tombstone bool, expireAt uint64, encState byte) { + writeValueHeaderBytes(dst, tombstone, expireAt, encState) + copy(dst[valueHeaderSize:], val) +} + +// writeValueHeaderBytes writes only the 9-byte value-header (flags + +// expireAt) into dst[0:valueHeaderSize]. Extracted from fillEncodedValue +// so the encryption path (encryption_glue.go) can reproduce the +// header bytes for AAD without having a body slice in hand: the AAD +// must bind tombstone, encryption_state, and expireAt so a disk +// attacker cannot flip those fields to force a silent +// ErrKeyNotFound / expired read on an encrypted record. +func writeValueHeaderBytes(dst []byte, tombstone bool, expireAt uint64, encState byte) { + var flags byte if tombstone { - dst[0] = 1 - } else { - dst[0] = 0 + flags |= tombstoneMask } + flags |= (encState << encStateShift) & encStateMask + dst[0] = flags binary.LittleEndian.PutUint64(dst[1:], expireAt) - copy(dst[valueHeaderSize:], val) } func decodeValue(data []byte) (storedValue, error) { if len(data) < valueHeaderSize { return storedValue{}, errors.New("invalid value length") } - tombstone := data[0] != 0 + flags := data[0] + if flags&encStateReservedMask != 0 { + return storedValue{}, errors.Wrapf(ErrEncryptedValueReservedState, + "value header byte = %#08b", flags) + } + encState := (flags & encStateMask) >> encStateShift + if encState != encStateCleartext && encState != encStateEncrypted { + return storedValue{}, errors.Wrapf(ErrEncryptedValueReservedState, + "encryption_state=%#x is reserved", encState) + } + tombstone := (flags & tombstoneMask) != 0 expireAt := binary.LittleEndian.Uint64(data[1:]) val := make([]byte, len(data)-valueHeaderSize) copy(val, data[valueHeaderSize:]) @@ -400,6 +460,7 @@ func decodeValue(data []byte) (storedValue, error) { return storedValue{ Value: val, Tombstone: tombstone, + EncState: encState, ExpireAt: expireAt, }, nil } @@ -642,33 +703,45 @@ func (s *pebbleStore) getAt(_ context.Context, key []byte, ts uint64) ([]byte, e } defer iter.Close() - if iter.SeekGE(seekKey) { - k := iter.Key() - userKey, _ := decodeKeyView(k) - - if !bytes.Equal(userKey, key) { - // Moved to next user key - return nil, ErrKeyNotFound - } - - // Found a version. Check if valid. - valBytes := iter.Value() - sv, err := decodeValue(valBytes) - if err != nil { - return nil, errors.WithStack(err) - } - - if sv.Tombstone { - return nil, ErrKeyNotFound - } - if sv.ExpireAt != 0 && sv.ExpireAt <= ts { - return nil, ErrKeyNotFound - } - - return sv.Value, nil + if !iter.SeekGE(seekKey) { + return nil, ErrKeyNotFound } + return s.readVisibleVersion(iter, key, ts) +} - return nil, ErrKeyNotFound +// readVisibleVersion examines the iterator's current entry and +// returns the live plaintext value at ts, or ErrKeyNotFound if the +// entry is a different user key, a tombstone, or expired. +// +// For encrypted entries the decrypt step runs BEFORE the +// tombstone/expireAt visibility checks. The AAD passed to Decrypt +// includes the on-disk value-header (tombstone bit + encryption_state +// + expireAt), so a disk attacker cannot flip those fields to force +// a silent ErrKeyNotFound/expired branch — any tamper either fails +// GCM (returns ErrEncryptedReadIntegrity) or matches the original +// values, in which case the visibility checks below are operating +// on authenticated bytes. +func (s *pebbleStore) readVisibleVersion(iter *pebble.Iterator, key []byte, ts uint64) ([]byte, error) { + k := iter.Key() + userKey, _ := decodeKeyView(k) + if !bytes.Equal(userKey, key) { + return nil, ErrKeyNotFound + } + sv, err := decodeValue(iter.Value()) + if err != nil { + return nil, errors.WithStack(err) + } + plain, err := s.decryptForKey(k, sv, sv.Value) + if err != nil { + return nil, err + } + if sv.Tombstone { + return nil, ErrKeyNotFound + } + if sv.ExpireAt != 0 && sv.ExpireAt <= ts { + return nil, ErrKeyNotFound + } + return plain, nil } func (s *pebbleStore) GetAt(ctx context.Context, key []byte, ts uint64) ([]byte, error) { @@ -699,10 +772,19 @@ func (s *pebbleStore) processFoundValue(iter *pebble.Iterator, userKey []byte, t return nil, err } + // Decrypt before the tombstone/expireAt visibility checks so the + // per-value AAD authenticates the header bits we are about to + // branch on. See readVisibleVersion for the matching rationale: + // a flipped tombstone or lowered expireAt would otherwise force + // a silent skip on an encrypted entry. + plain, err := s.decryptForKey(iter.Key(), sv, sv.Value) + if err != nil { + return nil, err + } if !sv.Tombstone && (sv.ExpireAt == 0 || sv.ExpireAt > ts) { return &KVPair{ Key: userKey, - Value: sv.Value, + Value: plain, }, nil } return nil, nil @@ -937,7 +1019,11 @@ func (s *pebbleStore) PutAt(ctx context.Context, key []byte, value []byte, commi commitTS = s.alignCommitTS(commitTS) k := encodeKey(key, commitTS) - v := encodeValue(value, false, expireAt) + body, encState, err := s.encryptForKey(k, value, expireAt) + if err != nil { + return err + } + v := encodeValue(body, false, expireAt, encState) if err := s.db.Set(k, v, pebble.NoSync); err != nil { //nolint:wrapcheck return errors.WithStack(err) @@ -952,7 +1038,7 @@ func (s *pebbleStore) DeleteAt(ctx context.Context, key []byte, commitTS uint64) commitTS = s.alignCommitTS(commitTS) k := encodeKey(key, commitTS) - v := encodeValue(nil, true, 0) + v := encodeValue(nil, true, 0, encStateCleartext) if err := s.db.Set(k, v, pebble.NoSync); err != nil { return errors.WithStack(err) @@ -978,7 +1064,11 @@ func (s *pebbleStore) ExpireAt(ctx context.Context, key []byte, expireAt uint64, commitTS = s.alignCommitTS(commitTS) k := encodeKey(key, commitTS) - v := encodeValue(val, false, expireAt) + body, encState, err := s.encryptForKey(k, val, expireAt) + if err != nil { + return err + } + v := encodeValue(body, false, expireAt, encState) if err := s.db.Set(k, v, pebble.NoSync); err != nil { return errors.WithStack(err) } @@ -1075,9 +1165,13 @@ func (s *pebbleStore) applyMutationsBatch(b *pebble.Batch, mutations []*KVPairMu if err := validateValueSize(mut.Value); err != nil { return err } - v = encodeValue(mut.Value, false, mut.ExpireAt) + body, encState, encErr := s.encryptForKey(k, mut.Value, mut.ExpireAt) + if encErr != nil { + return encErr + } + v = encodeValue(body, false, mut.ExpireAt, encState) case OpTypeDelete: - v = encodeValue(nil, true, 0) + v = encodeValue(nil, true, 0, encStateCleartext) default: return ErrUnknownOp } @@ -1240,7 +1334,7 @@ func (s *pebbleStore) deletePrefixAtWithOpts(_ context.Context, prefix []byte, e } func (s *pebbleStore) scanDeletePrefix(iter *pebble.Iterator, batch *pebble.Batch, prefix, excludePrefix []byte, commitTS uint64) error { - tombstoneVal := encodeValue(nil, true, 0) + tombstoneVal := encodeValue(nil, true, 0, encStateCleartext) for iter.SeekGE(encodeKey(prefix, math.MaxUint64)); iter.Valid(); { userKey, version, ok := nextScannableUserKey(iter) @@ -1295,6 +1389,17 @@ func (s *pebbleStore) classifyDeletePrefixKey(userKey, prefix, excludePrefix []b // isVisibleLiveKey checks whether the key has a visible, non-tombstone, // non-expired version at commitTS. It advances the iterator as a side effect. +// +// Sole caller is scanDeletePrefix, which uses the bool to decide +// whether DeletePrefixAt needs to write a fresh tombstone for the +// observed live key. The read path's value-header tamper guard +// (rounds 3–5 of PR #742) is therefore reproduced here: for encrypted +// entries we run cipher.Decrypt over (header bytes ‖ pebble key) AAD +// before branching on the unauthenticated tombstone / expireAt fields. +// Without this, a disk attacker who flips the tombstone bit on an +// encrypted entry would cause DeletePrefixAt to skip writing the +// deletion tombstone — the key survives the prefix delete silently +// (a write-side integrity bypass, not just a transient wrong return). func (s *pebbleStore) isVisibleLiveKey(iter *pebble.Iterator, userKey []byte, version, commitTS uint64) (bool, error) { if !s.seekToVisibleVersion(iter, userKey, version, commitTS) { return false, nil @@ -1303,6 +1408,14 @@ func (s *pebbleStore) isVisibleLiveKey(iter *pebble.Iterator, userKey []byte, ve if err != nil { return false, errors.WithStack(err) } + // decryptForKey authenticates the value-header bytes when the + // entry is encrypted (cleartext entries no-op except for the + // rebadge guard). We discard the plaintext — we only need the + // authentication side-effect; tombstone / expireAt visibility + // is then decided on now-trusted bytes. + if _, err := s.decryptForKey(iter.Key(), sv, sv.Value); err != nil { + return false, err + } if sv.Tombstone || (sv.ExpireAt != 0 && sv.ExpireAt <= commitTS) { return false, nil } @@ -1568,7 +1681,15 @@ func readRestoreEntry(r io.Reader, keyBuf *[]byte) (kLen, vLen int, eof bool, er if _, err = io.ReadFull(r, (*keyBuf)[:kLen]); err != nil { return 0, 0, false, errors.WithStack(err) } - vLen, err = readRestoreFieldLen(r, "snapshot value", maxSnapshotValueSize+valueHeaderSize) + // Native Pebble snapshots ship raw on-disk bytes, which for an + // encrypted row is value-header(9B) + envelope-overhead(34B) + + // ciphertext. The cap must accommodate envelope overhead so a + // plaintext written at maxSnapshotValueSize round-trips through + // snapshot restore — without it, validateValueSize accepts the + // plaintext but restore rejects the encrypted body with + // ErrValueTooLarge. + vLen, err = readRestoreFieldLen(r, "snapshot value", + maxSnapshotValueSize+valueHeaderSize+encryption.EnvelopeOverhead) if err != nil { return 0, 0, false, err } @@ -1625,7 +1746,14 @@ func flushSnapshotBatch(db *pebble.DB, batch **pebble.Batch, opts *pebble.WriteO func setEncodedVersionInBatch(batch *pebble.Batch, key []byte, version VersionedValue) error { deferred := batch.SetDeferred(encodedKeyLen(key), encodedValueLen(len(version.Value))) fillEncodedKey(deferred.Key, key, version.TS) - fillEncodedValue(deferred.Value, version.Value, version.Tombstone, version.ExpireAt) + // MVCC snapshot format v2 does not carry encryption_state — Stage 8 of + // the encryption rollout (per docs/design/2026_04_29_proposed...) bumps + // the format to v3 to round-trip encrypted entries through this path. + // Until then, restored versions are written as cleartext and any node + // snapshotting/restoring an encrypted dataset must use the native + // Pebble snapshot path (snapshot_pebble.go), which ships raw bytes + // and thus preserves the on-disk envelope verbatim. + fillEncodedValue(deferred.Value, version.Value, version.Tombstone, version.ExpireAt, encStateCleartext) return errors.WithStack(deferred.Finish()) } diff --git a/store/lsm_store_encryption_prop_test.go b/store/lsm_store_encryption_prop_test.go new file mode 100644 index 000000000..194f610f8 --- /dev/null +++ b/store/lsm_store_encryption_prop_test.go @@ -0,0 +1,85 @@ +package store + +import ( + "bytes" + "context" + "crypto/rand" + "path/filepath" + "testing" + + "github.com/bootjp/elastickv/internal/encryption" + "github.com/stretchr/testify/require" + "pgregory.net/rapid" +) + +// TestEncryption_Property_PutGet is a rapid-driven round-trip +// regression for the §4.1 envelope. For every drawn (key, value, ts) +// triple, an encrypted PutAt followed by a GetAt at ts must return +// the original plaintext byte-for-byte. The property covers: +// +// - Empty plaintext (envelope still has 34 bytes of overhead) +// - Single byte, multi-KiB, and adversarial random binary keys +// - Timestamps spanning the full uint64 range (commit-ts is +// part of AAD via encodeKey, so timestamp interactions cannot +// silently corrupt decrypt) +// +// The fixture is constructed once per Check call so the same +// keystore + cipher + nonce factory exercises many writes — this +// catches any nonce-reuse regression in CounterNonceFactory's +// atomic counter. +// +// Tamper rejection is covered by the deterministic +// TestEncryption_TagTamper unit test, where the close/reopen +// dance is straightforward; reproducing it inside rapid.Check +// would tangle with Cleanup ordering for marginal extra signal. +func TestEncryption_Property_PutGet(t *testing.T) { + rapid.Check(t, func(rt *rapid.T) { + key := nonEmptyBytes.Draw(rt, "key") + value := rapid.SliceOf(rapid.Byte()).Draw(rt, "value") + ts := rapid.Uint64Range(1, ^uint64(0)>>1).Draw(rt, "ts") + + mvcc := newPropEncryptedStore(t) + ctx := context.Background() + require.NoError(rt, mvcc.PutAt(ctx, key, value, ts, 0)) + got, err := mvcc.GetAt(ctx, key, ts) + require.NoError(rt, err) + // bytes.Equal treats nil and []byte{} as equal; the AEAD's + // Open returns nil for an empty plaintext, but the input + // from rapid may have len=0 with a non-nil slice header. + // The on-disk bytes are byte-for-byte identical either way. + require.True(rt, bytes.Equal(value, got), + "round-trip mismatch: input=%q got=%q", value, got) + }) +} + +// newPropEncryptedStore builds a fresh encrypted MVCCStore for each +// rapid.Check iteration. Constructing per iteration keeps the +// CounterNonceFactory's atomic counter local to the draw — a +// regression in the factory's uniqueness guarantee would still +// surface across multiple PutAt calls within one iteration (rapid +// shrinks toward minimal failing examples). +func newPropEncryptedStore(t *testing.T) MVCCStore { + t.Helper() + ks := encryption.NewKeystore() + dek := make([]byte, encryption.KeySize) + _, _ = rand.Read(dek) + if err := ks.Set(7, dek); err != nil { + t.Fatalf("Keystore.Set: %v", err) + } + c, err := encryption.NewCipher(ks) + if err != nil { + t.Fatalf("NewCipher: %v", err) + } + dir := filepath.Join(t.TempDir(), "pebble") + mvcc, err := NewPebbleStore(dir, + WithEncryption(c, + NewCounterNonceFactory(0xCAFE, 0x0001), + func() (uint32, bool) { return 7, true }, + ), + ) + if err != nil { + t.Fatalf("NewPebbleStore: %v", err) + } + t.Cleanup(func() { _ = mvcc.Close() }) + return mvcc +} diff --git a/store/lsm_store_encryption_test.go b/store/lsm_store_encryption_test.go new file mode 100644 index 000000000..a5b65266c --- /dev/null +++ b/store/lsm_store_encryption_test.go @@ -0,0 +1,851 @@ +package store + +import ( + "bytes" + "context" + "crypto/rand" + "path/filepath" + "testing" + + "github.com/bootjp/elastickv/internal/encryption" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/v2" +) + +// encryptedStoreFixture wires a pebbleStore with a one-DEK keystore, +// a deterministic-counter nonce factory, and an "always active" key +// resolver. Stage 5/6 will replace the resolver with a sidecar-backed +// one; Stage 2 just needs a knob the test can flip. +// +// Tests that simulate a disk attacker (AAD binding, tag tamper, etc.) +// need to close the store, mutate Pebble bytes directly, then reopen. +// The fixture's Cleanup-registered close therefore goes through +// `closeIfOpen`, which is idempotent so the test can call it +// explicitly and the t.Cleanup is a no-op on the second pass. +type encryptedStoreFixture struct { + dir string + mvcc MVCCStore + cipher *encryption.Cipher + keyID uint32 + keystore *encryption.Keystore + closed bool +} + +func (f *encryptedStoreFixture) closeIfOpen(tb testing.TB) { + tb.Helper() + if f.closed { + return + } + f.closed = true + if err := f.mvcc.Close(); err != nil { + tb.Fatalf("close pebble: %v", err) + } +} + +func (f *encryptedStoreFixture) reopen(tb testing.TB) { + tb.Helper() + mvcc, err := NewPebbleStore(f.dir, + WithEncryption(f.cipher, + NewCounterNonceFactory(0xABCD, 0x0001), + func() (uint32, bool) { return f.keyID, true }, + ), + ) + if err != nil { + tb.Fatalf("reopen NewPebbleStore: %v", err) + } + f.mvcc = mvcc + f.closed = false +} + +func newEncryptedStoreFixture(t *testing.T, keyID uint32) *encryptedStoreFixture { + t.Helper() + ks := encryption.NewKeystore() + dek := make([]byte, encryption.KeySize) + if _, err := rand.Read(dek); err != nil { + t.Fatalf("rand.Read DEK: %v", err) + } + if err := ks.Set(keyID, dek); err != nil { + t.Fatalf("Keystore.Set: %v", err) + } + c, err := encryption.NewCipher(ks) + if err != nil { + t.Fatalf("NewCipher: %v", err) + } + dir := filepath.Join(t.TempDir(), "pebble") + mvcc, err := NewPebbleStore(dir, + WithEncryption(c, + NewCounterNonceFactory(0xABCD, 0x0001), + func() (uint32, bool) { return keyID, true }, + ), + ) + if err != nil { + t.Fatalf("NewPebbleStore: %v", err) + } + f := &encryptedStoreFixture{ + dir: dir, + mvcc: mvcc, + cipher: c, + keyID: keyID, + keystore: ks, + } + t.Cleanup(func() { + if !f.closed { + _ = f.mvcc.Close() + } + }) + return f +} + +// tamperPebbleValue closes the encrypted store, opens the same +// directory directly via Pebble, applies tamperFn to the on-disk +// value at (key, ts), writes it back, and reopens through +// newEncryptedStoreFixture's pattern. The fixture's `mvcc` field is +// replaced so subsequent reads go through the new handle. +// +// The pattern is necessary because Pebble holds an exclusive lock on +// the dir while open; bypassing the public API to plant adversarial +// bytes would otherwise contend with the live store. +func (f *encryptedStoreFixture) tamperPebbleValue(t *testing.T, key []byte, ts uint64, tamperFn func(raw []byte) []byte) { + t.Helper() + f.closeIfOpen(t) + pdb, err := pebble.Open(f.dir, &pebble.Options{}) + if err != nil { + t.Fatalf("pebble.Open for tamper: %v", err) + } + pebbleKey := encodeKey(key, ts) + raw, closer, err := pdb.Get(pebbleKey) + if err != nil { + _ = pdb.Close() + t.Fatalf("read raw value: %v", err) + } + tampered := tamperFn(append([]byte(nil), raw...)) + _ = closer.Close() + if err := pdb.Set(pebbleKey, tampered, pebble.Sync); err != nil { + _ = pdb.Close() + t.Fatalf("write tampered: %v", err) + } + if err := pdb.Close(); err != nil { + t.Fatalf("pebble close: %v", err) + } + f.reopen(t) +} + +func TestEncryption_PutGet_Roundtrip(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 7) + ctx := context.Background() + cases := []struct { + name string + key []byte + value []byte + }{ + {"short", []byte("k1"), []byte("hello")}, + {"empty value", []byte("k2"), []byte("")}, + {"binary", []byte{0x00, 0xff, 0x10}, []byte{0xde, 0xad, 0xbe, 0xef}}, + {"4 KiB", []byte("k4"), bytes.Repeat([]byte("A"), 4096)}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if err := f.mvcc.PutAt(ctx, tc.key, tc.value, 100, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + got, err := f.mvcc.GetAt(ctx, tc.key, 100) + if err != nil { + t.Fatalf("GetAt: %v", err) + } + if !bytes.Equal(got, tc.value) { + t.Fatalf("GetAt round-trip mismatch: got=%q want=%q", got, tc.value) + } + }) + } +} + +func TestEncryption_ApplyMutationsRoundtrip(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 11) + ctx := context.Background() + muts := []*KVPairMutation{ + {Op: OpTypePut, Key: []byte("a"), Value: []byte("alpha")}, + {Op: OpTypePut, Key: []byte("b"), Value: []byte("beta")}, + {Op: OpTypePut, Key: []byte("c"), Value: bytes.Repeat([]byte("X"), 1024)}, + } + if err := f.mvcc.ApplyMutations(ctx, muts, nil, 199, 200); err != nil { + t.Fatalf("ApplyMutations: %v", err) + } + for _, m := range muts { + got, err := f.mvcc.GetAt(ctx, m.Key, 200) + if err != nil { + t.Fatalf("GetAt %q: %v", m.Key, err) + } + if !bytes.Equal(got, m.Value) { + t.Fatalf("GetAt %q mismatch: got=%q want=%q", m.Key, got, m.Value) + } + } +} + +// TestEncryption_TombstoneIndependent confirms tombstone writes do +// NOT trip the encryption path (they carry no plaintext) and reads +// of a deleted key surface ErrKeyNotFound regardless of the cipher +// being wired. +func TestEncryption_TombstoneIndependent(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 13) + ctx := context.Background() + if err := f.mvcc.PutAt(ctx, []byte("doomed"), []byte("alive"), 100, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + if err := f.mvcc.DeleteAt(ctx, []byte("doomed"), 200); err != nil { + t.Fatalf("DeleteAt: %v", err) + } + if _, err := f.mvcc.GetAt(ctx, []byte("doomed"), 250); !errors.Is(err, ErrKeyNotFound) { + t.Fatalf("expected ErrKeyNotFound after delete, got %v", err) + } + got, err := f.mvcc.GetAt(ctx, []byte("doomed"), 150) + if err != nil { + t.Fatalf("GetAt pre-delete: %v", err) + } + if !bytes.Equal(got, []byte("alive")) { + t.Fatalf("pre-delete read mismatch: got=%q want=%q", got, "alive") + } +} + +// newToggleableEncryptedStore returns an MVCCStore wired with an +// encryption cipher whose active-key closure honours the supplied +// activeFlag pointer. Used by the activation/deactivation windows +// test. +func newToggleableEncryptedStore(t *testing.T, keyID uint32, activeFlag *bool) MVCCStore { + t.Helper() + ks := encryption.NewKeystore() + dek := make([]byte, encryption.KeySize) + dek[0] = 0xAA + if err := ks.Set(keyID, dek); err != nil { + t.Fatalf("Keystore.Set: %v", err) + } + c, err := encryption.NewCipher(ks) + if err != nil { + t.Fatalf("NewCipher: %v", err) + } + dir := filepath.Join(t.TempDir(), "pebble") + mvcc, err := NewPebbleStore(dir, + WithEncryption(c, + NewCounterNonceFactory(0x0011, 0x0002), + func() (uint32, bool) { + if !*activeFlag { + return 0, false + } + return keyID, true + }, + ), + ) + if err != nil { + t.Fatalf("NewPebbleStore: %v", err) + } + t.Cleanup(func() { _ = mvcc.Close() }) + return mvcc +} + +// mustGet calls GetAt and asserts the value matches want, surfacing +// any error via t.Fatalf so the caller test stays linear. +func mustGet(t *testing.T, mvcc MVCCStore, key []byte, ts uint64, want string) { + t.Helper() + got, err := mvcc.GetAt(context.Background(), key, ts) + if err != nil { + t.Fatalf("GetAt %q@%d: %v", key, ts, err) + } + if string(got) != want { + t.Fatalf("GetAt %q@%d: got=%q want=%q", key, ts, got, want) + } +} + +// TestEncryption_InactiveKeyWritesCleartext exercises the §7.1 +// rollout's "cipher wired but DEK not active yet" window: PutAt +// writes cleartext (encryption_state = 0b00) when the active-key +// closure returns ok=false. A later activation must not break reads +// of those cleartext entries — the read path looks at the per-value +// encryption_state, not the global active-key state. +func TestEncryption_InactiveKeyWritesCleartext(t *testing.T) { + t.Parallel() + var active bool + mvcc := newToggleableEncryptedStore(t, 42, &active) + ctx := context.Background() + + // active=false → cleartext write + if err := mvcc.PutAt(ctx, []byte("legacy"), []byte("plain"), 100, 0); err != nil { + t.Fatalf("PutAt before activation: %v", err) + } + active = true + if err := mvcc.PutAt(ctx, []byte("modern"), []byte("encrypted"), 200, 0); err != nil { + t.Fatalf("PutAt after activation: %v", err) + } + mustGet(t, mvcc, []byte("legacy"), 150, "plain") + mustGet(t, mvcc, []byte("modern"), 250, "encrypted") + // Deactivating must not break reads of the already-encrypted entry. + active = false + mustGet(t, mvcc, []byte("modern"), 250, "encrypted") +} + +// TestEncryption_AADRecordBinding is the §4.1 case-2/3 regression: +// copying a valid encrypted value from one (key, ts) slot into +// another must NOT verify under the target — the AAD includes the +// encoded Pebble key, so Decrypt returns ErrEncryptedReadIntegrity. +func TestEncryption_AADRecordBinding(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 19) + ctx := context.Background() + + if err := f.mvcc.PutAt(ctx, []byte("a"), []byte("secret-a"), 100, 0); err != nil { + t.Fatalf("PutAt a: %v", err) + } + if err := f.mvcc.PutAt(ctx, []byte("b"), []byte("secret-b"), 100, 0); err != nil { + t.Fatalf("PutAt b: %v", err) + } + + // Snapshot a's raw on-disk bytes, then overwrite b's slot with + // them. Done via the close-tamper-reopen helper so the fixture's + // idempotent close keeps the t.Cleanup safe. + f.closeIfOpen(t) + pdb, err := pebble.Open(f.dir, &pebble.Options{}) + if err != nil { + t.Fatalf("pebble.Open: %v", err) + } + rawA, closer, err := pdb.Get(encodeKey([]byte("a"), 100)) + if err != nil { + _ = pdb.Close() + t.Fatalf("read raw a: %v", err) + } + rawACopy := append([]byte(nil), rawA...) + _ = closer.Close() + if err := pdb.Set(encodeKey([]byte("b"), 100), rawACopy, pebble.Sync); err != nil { + _ = pdb.Close() + t.Fatalf("write tampered b: %v", err) + } + if err := pdb.Close(); err != nil { + t.Fatalf("pebble close: %v", err) + } + f.reopen(t) + + if _, err := f.mvcc.GetAt(ctx, []byte("b"), 100); !errors.Is(err, ErrEncryptedReadIntegrity) { + t.Fatalf("cut-and-paste ciphertext should fail integrity, got %v", err) + } +} + +// TestEncryption_TagTamper flips one byte of the GCM tag and confirms +// reads surface ErrEncryptedReadIntegrity. +func TestEncryption_TagTamper(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 23) + ctx := context.Background() + if err := f.mvcc.PutAt(ctx, []byte("tamper"), []byte("payload"), 100, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + f.tamperPebbleValue(t, []byte("tamper"), 100, func(raw []byte) []byte { + raw[len(raw)-1] ^= 0xff + return raw + }) + if _, err := f.mvcc.GetAt(ctx, []byte("tamper"), 100); !errors.Is(err, ErrEncryptedReadIntegrity) { + t.Fatalf("tag tamper should fail integrity, got %v", err) + } +} + +// TestEncryption_ValueHeaderTamperRejected is the PR742 codex P1 +// regression: the value-header (tombstone bit + encryption_state + +// expireAt) is bound into the storage envelope's AAD so a disk +// attacker who flips any of those fields fails GCM verification +// and surfaces ErrEncryptedReadIntegrity, NOT a silent +// ErrKeyNotFound or expired-skip. The original AAD only bound the +// envelope header + Pebble key, leaving these three header fields +// as a tamper bypass. +func TestEncryption_ValueHeaderTamperRejected(t *testing.T) { + t.Parallel() + cases := []struct { + name string + mutate func(raw []byte) []byte + summary string + }{ + { + name: "tombstone bit flipped", + summary: "would otherwise force silent ErrKeyNotFound on a live encrypted record", + mutate: func(raw []byte) []byte { + raw[0] |= 0b0000_0001 // set tombstone bit + return raw + }, + }, + { + name: "expireAt lowered to past", + summary: "would otherwise force a silent expired-skip on a live encrypted record", + mutate: func(raw []byte) []byte { + // Overwrite the 8-byte expireAt with a past timestamp; + // before the AAD fix this was a free attack vector. + past := []byte{0x01, 0, 0, 0, 0, 0, 0, 0} + copy(raw[1:1+timestampSize], past) + return raw + }, + }, + { + name: "expireAt advanced", + summary: "asymmetric — but any change must still fail closed", + mutate: func(raw []byte) []byte { + future := []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f} + copy(raw[1:1+timestampSize], future) + return raw + }, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + f := newEncryptedStoreFixture(t, 1009) + ctx := context.Background() + // Use a future-but-finite expireAt so a "lower to past" + // mutation has somewhere to go AND so the original entry + // is live (expireAt > read ts). + const writeTS uint64 = 100 + const readTS uint64 = 200 + const liveExpireAt uint64 = 1_000_000 + if err := f.mvcc.PutAt(ctx, []byte("vh"), []byte("payload"), writeTS, liveExpireAt); err != nil { + t.Fatalf("PutAt: %v", err) + } + f.tamperPebbleValue(t, []byte("vh"), writeTS, tc.mutate) + _, err := f.mvcc.GetAt(ctx, []byte("vh"), readTS) + if !errors.Is(err, ErrEncryptedReadIntegrity) { + t.Fatalf("%s: expected ErrEncryptedReadIntegrity (%s), got %v", + tc.name, tc.summary, err) + } + }) + } +} + +// TestEncryption_DeletePrefixHeaderTamperRejected covers the PR742 +// claude[bot] round-5 follow-up: isVisibleLiveKey is the write-side +// counterpart to readVisibleVersion / processFoundValue (read paths +// fixed in rounds 3–5). Without authenticating the value-header, +// a disk attacker who flips the tombstone bit on an encrypted entry +// would cause DeletePrefixAt to skip writing the deletion tombstone, +// silently leaving the key alive after the prefix delete — a +// write-side integrity bypass that survives across restarts. +// +// The fix runs decryptForKey inside isVisibleLiveKey; this test +// pins the contract that scanDeletePrefix surfaces the integrity +// error rather than no-oping the tombstone. +func TestEncryption_DeletePrefixHeaderTamperRejected(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 41) + ctx := context.Background() + const writeTS uint64 = 100 + const deleteTS uint64 = 200 + if err := f.mvcc.PutAt(ctx, []byte("doomed/k1"), []byte("victim"), writeTS, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + // Flip the tombstone bit on the encrypted entry's value header + // directly on disk. Pre-fix, scanDeletePrefix would observe the + // flipped sv.Tombstone and skip writing a tombstone for this key. + f.tamperPebbleValue(t, []byte("doomed/k1"), writeTS, func(raw []byte) []byte { + raw[0] |= tombstoneMask + return raw + }) + err := f.mvcc.DeletePrefixAt(ctx, []byte("doomed/"), nil, deleteTS) + if !errors.Is(err, ErrEncryptedReadIntegrity) { + t.Fatalf("DeletePrefixAt over a tampered encrypted entry should fail integrity, got %v", err) + } +} + +// TestEncryption_RebadgeAttackRejected covers the PR742 codex P1 +// rebadge attack family: a disk attacker who flips encryption_state +// from 0b01 to 0b00 leaves the envelope bytes in place but tells +// the read path to skip decryption. Without the cleartext-branch +// guard in decryptForKey, the caller would receive raw envelope +// bytes as "plaintext" — a fail-open integrity bypass. +// +// Round-3 caught the simple flip; round-4 caught the variant where +// the attacker ALSO modifies the embedded key_id bytes to any +// unloaded value. The strengthened guard rejects whenever +// DecodeEnvelope parses the body — independent of whether the +// embedded key_id is currently loaded. +func TestEncryption_RebadgeAttackRejected(t *testing.T) { + t.Parallel() + cases := []struct { + name string + mutate func(raw []byte) []byte + writeTS uint64 + }{ + { + name: "encState flipped, key_id intact", + writeTS: 314159, + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask + return raw + }, + }, + { + name: "encState flipped AND key_id rewritten to unloaded", + writeTS: 271828, + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask + // envelope key_id is at byte offset valueHeaderSize+2 + // (skip flags(1)+expireAt(8) + version(1)+flag(1)). + kidOffset := valueHeaderSize + 2 + // Set to 0xFFFFFFFF, an id that the test fixture has + // not loaded. Pre-round-4 guard returned nil here. + raw[kidOffset+0] = 0xff + raw[kidOffset+1] = 0xff + raw[kidOffset+2] = 0xff + raw[kidOffset+3] = 0xff + return raw + }, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + f := newEncryptedStoreFixture(t, 31) + ctx := context.Background() + if err := f.mvcc.PutAt(ctx, []byte("rebadge"), []byte("classified"), tc.writeTS, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + f.tamperPebbleValue(t, []byte("rebadge"), tc.writeTS, tc.mutate) + _, err := f.mvcc.GetAt(ctx, []byte("rebadge"), tc.writeTS) + if !errors.Is(err, ErrEncryptedReadIntegrity) { + t.Fatalf("rebadge attempt should fail integrity, got %v", err) + } + }) + } +} + +// TestEncryption_SnapshotRestoreAtMaxValueSize covers the PR742 +// codex P1 round-8 finding: validateValueSize accepts a plaintext +// up to maxSnapshotValueSize, but encryptForKey adds 34 bytes +// (EnvelopeOverhead) on the storage envelope, and the restore +// path's per-entry cap (`maxSnapshotValueSize + valueHeaderSize`) +// would reject the encrypted body — making it possible to persist +// data that cannot be recovered via snapshot. The fix raises the +// restore cap by EnvelopeOverhead so a plaintext written at the +// exact maxSnapshotValueSize round-trips through Pebble snapshot +// save/restore. +func TestEncryption_SnapshotRestoreAtMaxValueSize(t *testing.T) { + // NOT t.Parallel: this test mutates the package-level + // maxSnapshotValueSize var, which other tests read; running it + // alongside parallel tests trips -race. Keeping it serial is + // the same convention the existing snapshot suite uses for the + // same var. + prev := maxSnapshotValueSize + maxSnapshotValueSize = 4096 + t.Cleanup(func() { maxSnapshotValueSize = prev }) + + f := newEncryptedStoreFixture(t, 71) + ctx := context.Background() + + // Write a plaintext exactly at the (shrunk) maxSnapshotValueSize. + value := bytes.Repeat([]byte{0xA5}, maxSnapshotValueSize) + if err := f.mvcc.PutAt(ctx, []byte("max"), value, 100, 0); err != nil { + t.Fatalf("PutAt at max value size: %v", err) + } + // Capture a snapshot before tearing the source store down. + snap, err := f.mvcc.Snapshot() + if err != nil { + t.Fatalf("Snapshot: %v", err) + } + defer snap.Close() + var buf bytes.Buffer + if _, err := snap.WriteTo(&buf); err != nil { + t.Fatalf("Snapshot.WriteTo: %v", err) + } + + // Restore into a fresh store (unencrypted is fine — Pebble snapshots + // ship raw bytes and the encrypted envelope is preserved verbatim; + // we are only testing the restore size cap here, not decrypt). + dstDir := filepath.Join(t.TempDir(), "restore") + dst, err := NewPebbleStore(dstDir, + WithEncryption(f.cipher, + NewCounterNonceFactory(0xABCD, 0x0001), + func() (uint32, bool) { return f.keyID, true }, + ), + ) + if err != nil { + t.Fatalf("NewPebbleStore restore-target: %v", err) + } + t.Cleanup(func() { _ = dst.Close() }) + + if err := dst.Restore(bytes.NewReader(buf.Bytes())); err != nil { + t.Fatalf("Restore at max value size + envelope overhead: %v", err) + } + got, err := dst.GetAt(ctx, []byte("max"), 100) + if err != nil { + t.Fatalf("GetAt after restore: %v", err) + } + if !bytes.Equal(got, value) { + t.Fatalf("restored value mismatch: len=%d want=%d", len(got), len(value)) + } +} + +// TestEncryption_RebadgeAttackEnvelopeHeaderCorruption covers the +// PR742 codex P1 round-9 finding: a disk attacker who flips +// encryption_state to cleartext AND corrupts the envelope's version +// (or flag) byte forces DecodeEnvelope to fail, which the previous +// guard treated as "legitimate cleartext". The round-9 fix bypasses +// DecodeEnvelope and slices the body at fixed offsets, trial- +// decrypting with canonical (version=0x01, flag=0) so a corrupted +// version or flag byte no longer ducks the integrity check. +func TestEncryption_RebadgeAttackEnvelopeHeaderCorruption(t *testing.T) { + t.Parallel() + cases := []struct { + name string + mutate func(raw []byte) []byte + }{ + { + name: "encState + envelope version byte corrupted", + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask // clear encState bits → cleartext + // envelope version sits at the start of the body, after + // the 9-byte value header. + raw[valueHeaderSize] = 0x07 // arbitrary non-0x01 + return raw + }, + }, + { + name: "encState + envelope flag byte corrupted", + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask + raw[valueHeaderSize+1] = 0xff // flag canonical = 0 + return raw + }, + }, + { + name: "encState + version AND flag corrupted", + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask + raw[valueHeaderSize] = 0x42 + raw[valueHeaderSize+1] = 0x99 + return raw + }, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + f := newEncryptedStoreFixture(t, 73) + ctx := context.Background() + const writeTS uint64 = 500001 + if err := f.mvcc.PutAt(ctx, []byte("hdr"), []byte("payload"), writeTS, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + f.tamperPebbleValue(t, []byte("hdr"), writeTS, tc.mutate) + _, err := f.mvcc.GetAt(ctx, []byte("hdr"), writeTS) + if !errors.Is(err, ErrEncryptedReadIntegrity) { + t.Fatalf("envelope header corruption rebadge should fail integrity, got %v", err) + } + }) + } +} + +// TestEncryption_RebadgeAttackCombinedHeaderFlips covers the PR742 +// codex P1 round-7 finding: a disk attacker who flips +// encryption_state AND simultaneously modifies tombstone or expireAt +// would otherwise bypass the rebadge guard because the trial-decrypt +// AAD reconstructed from the on-disk (tampered) header bytes no +// longer matches the encrypt-time AAD. The fix canonicalises +// tombstone to false in the trial AAD (encrypt path always writes +// tombstone=false) and enumerates expireAt candidates ({on-disk, 0}) +// to cover the common no-TTL case. +// +// Residual: encState + expireAt flip when the original expireAt was +// a non-zero value the attacker also rewrites. Stage 8's +// authenticated MVCC metadata bit closes that deterministically. +func TestEncryption_RebadgeAttackCombinedHeaderFlips(t *testing.T) { + t.Parallel() + cases := []struct { + name string + mutate func(raw []byte) []byte + writeTS uint64 + // origExpireAt is what we passed to PutAt; the trial guard + // must catch the attack regardless of what the attacker + // modifies on top of the encState flip. + origExpireAt uint64 + }{ + { + name: "encState + tombstone flipped (no TTL)", + writeTS: 400001, + origExpireAt: 0, + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask // clear bits 1-2 + raw[0] |= tombstoneMask // set bit 0 + return raw + }, + }, + { + name: "encState + expireAt rewritten to past (no TTL)", + writeTS: 400002, + origExpireAt: 0, + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask + // rewrite expireAt to a small past value + past := []byte{0x01, 0, 0, 0, 0, 0, 0, 0} + copy(raw[1:1+timestampSize], past) + return raw + }, + }, + { + name: "encState + tombstone + expireAt all flipped (no TTL)", + writeTS: 400003, + origExpireAt: 0, + mutate: func(raw []byte) []byte { + raw[0] &^= encStateMask + raw[0] |= tombstoneMask + future := []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f} + copy(raw[1:1+timestampSize], future) + return raw + }, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + f := newEncryptedStoreFixture(t, 67) + ctx := context.Background() + if err := f.mvcc.PutAt(ctx, []byte("combined"), []byte("payload"), tc.writeTS, tc.origExpireAt); err != nil { + t.Fatalf("PutAt: %v", err) + } + f.tamperPebbleValue(t, []byte("combined"), tc.writeTS, tc.mutate) + _, err := f.mvcc.GetAt(ctx, []byte("combined"), tc.writeTS) + if !errors.Is(err, ErrEncryptedReadIntegrity) { + t.Fatalf("combined-flip rebadge attempt should fail integrity, got %v", err) + } + }) + } +} + +// TestEncryption_RebadgeGuardAllowsLegitimateEnvelopeShapedCleartext +// is the PR742 codex P1 round-6 regression: round-5's "reject any +// envelope-parseable body" guard turned legitimate cleartext rows +// that coincidentally start with 0x01 and pass DecodeEnvelope's +// length / version / flag / nonce checks into deterministic read +// failures. Round-7 replaced the shape-only check with an actual +// AEAD trial decrypt, so a body that does not verify under any +// loaded DEK is allowed through as cleartext. +// +// We construct a cleartext payload whose bytes are envelope-shaped: +// 0x01 version, 0x00 flag, an arbitrary 4-byte key_id, 12 random +// nonce bytes, and 16+ bytes of "ciphertext+tag" filler. Pre-fix +// the read would error; post-fix the body is returned unchanged. +func TestEncryption_RebadgeGuardAllowsLegitimateEnvelopeShapedCleartext(t *testing.T) { + t.Parallel() + // Build the store with cipher wired but the active-key flag + // pointing at "no DEK" so PutAt writes cleartext. This models + // the §7.1 Phase 0 / pre-cutover window where legacy data + // coexists with a configured cipher. + var active bool + mvcc := newToggleableEncryptedStore(t, 71, &active) + ctx := context.Background() + + envelopeShaped := make([]byte, 64) + envelopeShaped[0] = 0x01 // EnvelopeVersionV1 + // flag stays 0; key_id = 0xCAFEBABE; nonce + body filler are arbitrary. + envelopeShaped[2] = 0xCA + envelopeShaped[3] = 0xFE + envelopeShaped[4] = 0xBA + envelopeShaped[5] = 0xBE + for i := 6; i < len(envelopeShaped); i++ { + envelopeShaped[i] = byte(i) + } + + if err := mvcc.PutAt(ctx, []byte("legit"), envelopeShaped, 100, 0); err != nil { + t.Fatalf("PutAt cleartext: %v", err) + } + // Activate encryption AFTER the cleartext write — read must + // still surface the original bytes verbatim. + active = true + got, err := mvcc.GetAt(ctx, []byte("legit"), 100) + if err != nil { + t.Fatalf("GetAt envelope-shaped cleartext: %v (round-5 regression)", err) + } + if !bytes.Equal(got, envelopeShaped) { + t.Fatalf("round-trip mismatch: got %x, want %x", got, envelopeShaped) + } +} + +// TestEncryption_EmptyValueExistsAt is the PR742 codex P1 round-4 +// regression for empty-plaintext semantics: AES-GCM Open returns a +// nil dst for zero-length plaintext, and the upstream ExistsAt +// distinguishes "key absent" from "key present with empty value" +// via val != nil. decryptForKey now normalizes the nil-on-empty +// case to []byte{} so a Put of an empty value followed by ExistsAt +// returns true. +func TestEncryption_EmptyValueExistsAt(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 37) + ctx := context.Background() + if err := f.mvcc.PutAt(ctx, []byte("empty"), []byte{}, 100, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + got, err := f.mvcc.GetAt(ctx, []byte("empty"), 100) + if err != nil { + t.Fatalf("GetAt: %v", err) + } + if got == nil { + t.Fatal("GetAt returned nil for an empty stored value (regresses ExistsAt)") + } + if len(got) != 0 { + t.Fatalf("GetAt returned %d bytes, want 0", len(got)) + } + exists, err := f.mvcc.ExistsAt(ctx, []byte("empty"), 100) + if err != nil { + t.Fatalf("ExistsAt: %v", err) + } + if !exists { + t.Fatal("ExistsAt returned false for a key with empty stored value") + } +} + +// TestEncryption_ReservedEncStateRejected is the §7.1 trip-wire test: +// a value-header byte carrying encryption_state=0b10 (or 0b11) is +// rejected by decodeValue with ErrEncryptedValueReservedState. An +// older binary must NOT silently treat a future-format encrypted +// entry as cleartext bytes. +func TestEncryption_ReservedEncStateRejected(t *testing.T) { + t.Parallel() + f := newEncryptedStoreFixture(t, 29) + ctx := context.Background() + if err := f.mvcc.PutAt(ctx, []byte("reserved"), []byte("body"), 100, 0); err != nil { + t.Fatalf("PutAt: %v", err) + } + f.tamperPebbleValue(t, []byte("reserved"), 100, func(raw []byte) []byte { + // Flip encryption_state from 0b01 to 0b10 (bits 1-2) without + // touching tombstone (bit 0) or reserved bits (3-7). + raw[0] = (raw[0] & 0b1111_1001) | 0b0000_0100 + return raw + }) + _, err := f.mvcc.GetAt(ctx, []byte("reserved"), 100) + if !errors.Is(err, ErrEncryptedValueReservedState) { + t.Fatalf("reserved encState should be rejected, got %v", err) + } +} + +// TestEncryption_HeaderEncodingPin pins the value-header packing so +// future refactors that re-pack the bits accidentally cannot land. +// The §4.1 contract is load-bearing for every persisted encrypted +// entry: tombstone in bit 0, encryption_state in bits 1-2, reserved +// elsewhere. +func TestEncryption_HeaderEncodingPin(t *testing.T) { + t.Parallel() + cases := []struct { + name string + tombstone bool + encState byte + want byte + }{ + {"cleartext live", false, encStateCleartext, 0b0000_0000}, + {"cleartext tombstone", true, encStateCleartext, 0b0000_0001}, + {"encrypted live", false, encStateEncrypted, 0b0000_0010}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + buf := encodeValue(nil, tc.tombstone, 0, tc.encState) + if buf[0] != tc.want { + t.Fatalf("flags byte = %#08b, want %#08b", buf[0], tc.want) + } + sv, err := decodeValue(buf) + if err != nil { + t.Fatalf("decodeValue: %v", err) + } + if sv.Tombstone != tc.tombstone || sv.EncState != tc.encState { + t.Fatalf("decode mismatch: got tomb=%v enc=%#x, want tomb=%v enc=%#x", + sv.Tombstone, sv.EncState, tc.tombstone, tc.encState) + } + }) + } +}