diff --git a/Cargo.lock b/Cargo.lock
index 334280fb331..9da05ac8543 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4145,6 +4145,7 @@ name = "ethrex-levm"
 version = "9.0.0"
 dependencies = [
  "bytes",
+ "dashmap",
  "derive_more 1.0.0",
  "ethrex-common",
  "ethrex-crypto",
diff --git a/cmd/ethrex/cli.rs b/cmd/ethrex/cli.rs
index 7f7647f786b..964f37c3e85 100644
--- a/cmd/ethrex/cli.rs
+++ b/cmd/ethrex/cli.rs
@@ -941,13 +941,12 @@ pub async fn import_blocks_bench(
                     _ => warn!("Failed to add block {number} with hash {hash:#x}"),
                 })?;
 
-            // TODO: replace this
-            // This sleep is because we have a background process writing to disk the last layer
-            // And until it's done we can't execute the new block
-            // Because this wants to compare against running a real node in terms of reported performance
-            // It takes less than 500ms, so this is good enough, but we should report the performance
-            // without taking into account that wait.
-            tokio::time::sleep(Duration::from_millis(500)).await;
+            // TODO: replace this with an explicit "wait for persistence idle" call.
+            // The previous block's Phase 2 (disk write of bottom-most diff layer) runs in
+            // a background thread; we sleep so its completion doesn't bleed into the next
+            // block's per-block timer. 100ms is comfortably above Phase 2 cost on SSD for
+            // small blocks; raise if metrics get noisy.
+            tokio::time::sleep(Duration::from_millis(100)).await;
         }
 
         // Make head canonical and label all special blocks correctly.
diff --git a/crates/blockchain/blockchain.rs b/crates/blockchain/blockchain.rs
index 078e12a87a9..711af26779c 100644
--- a/crates/blockchain/blockchain.rs
+++ b/crates/blockchain/blockchain.rs
@@ -837,36 +837,40 @@ impl Blockchain {
         const NUM_WORKERS: usize = 16;
         let parent_state_root = parent_header.state_root;
 
-        // === Stage A: Drain + accumulate all AccountUpdates ===
-        // BAL guarantees completeness, so we block until execution finishes.
-        let mut all_updates: FxHashMap<Address, AccountUpdate> = FxHashMap::default();
-        for updates in rx {
-            let current_length = queue_length.fetch_sub(1, Ordering::Acquire);
-            *max_queue_length = current_length.max(*max_queue_length);
-            for update in updates {
-                match all_updates.entry(update.address) {
-                    Entry::Vacant(e) => {
-                        e.insert(update);
-                    }
-                    Entry::Occupied(mut e) => {
-                        e.get_mut().merge(update);
-                    }
-                }
+        // === Stage A: receive the single BAL-derived batch ===
+        // execute_block_parallel calls bal_to_account_updates BEFORE the rayon tx
+        // loop and sends exactly one Vec<AccountUpdate>. Receiving once (instead of
+        // draining until channel close = exec end) lets Stage B's parallel storage
+        // roots overlap with parallel exec instead of serializing after it.
+        //
+        // BAL accounts are unique by address (one entry per touched address), so
+        // no merge step is needed — skip the FxHashMap detour entirely.
+        let updates: Vec<AccountUpdate> = match rx.recv() {
+            Ok(updates) => {
+                let current_length = queue_length.fetch_sub(1, Ordering::Acquire);
+                *max_queue_length = current_length.max(*max_queue_length);
+                updates
             }
-        }
+            Err(_) => {
+                // Channel closed without a message — execution failed before
+                // bal_to_account_updates ran. Return empty work so the exec
+                // error surfaces in execution_result rather than being masked.
+                Vec::new()
+            }
+        };
 
-        // Extract witness accumulator before consuming updates
+        // Witness accumulator (clone since we move `updates` into Stage B below).
         let accumulated_updates = if self.options.precompute_witnesses {
-            Some(all_updates.values().cloned().collect::<Vec<_>>())
+            Some(updates.clone())
         } else {
             None
         };
 
-        // Extract code updates and build work items with pre-hashed addresses
+        // Build work items with pre-hashed addresses + extract code updates.
         let mut code_updates: Vec<(H256, Code)> = Vec::new();
-        let mut accounts: Vec<(H256, AccountUpdate)> = Vec::with_capacity(all_updates.len());
-        for (addr, update) in all_updates {
-            let hashed = keccak(addr);
+        let mut accounts: Vec<(H256, AccountUpdate)> = Vec::with_capacity(updates.len());
+        for update in updates {
+            let hashed = keccak(update.address);
             if let Some(info) = &update.info
                 && let Some(code) = &update.code
             {
diff --git a/crates/vm/backends/levm/mod.rs b/crates/vm/backends/levm/mod.rs
index 9ed80d15e00..3f67478910a 100644
--- a/crates/vm/backends/levm/mod.rs
+++ b/crates/vm/backends/levm/mod.rs
@@ -355,8 +355,20 @@ impl LEVM {
                     EvmError::Transaction(format!("Couldn't recover addresses with error: {error}"))
                 })?;
 
+        // Threshold below which the BAL parallel-execution path's constants
+        // (validation index, system_seed clone, per-tx db setup, rayon scheduling,
+        // bal_to_account_updates) outweigh any exec-side parallelism gain. Reth uses
+        // 5 for the same reason. Below threshold we fall through to sequential exec
+        // which produces a BAL during execution; blockchain.rs hash-compares it
+        // against the header BAL (validate_block_access_list_hash) — same correctness
+        // guarantees, no parallel constants.
+        const BAL_PARALLEL_TX_THRESHOLD: usize = 5;
+
         // When BAL is provided (Amsterdam+ validation path): use parallel execution
-        if let Some(bal) = header_bal {
+        // for blocks large enough to amortize the parallel path's constant overhead.
+        if let Some(bal) = header_bal
+            && block.body.transactions.len() >= BAL_PARALLEL_TX_THRESHOLD
+        {
             // Validate header BAL structural properties before execution.
             // This catches index-out-of-bounds early, before wasting execution time.
             // Note: size cap validation is deferred until after transaction processing
@@ -404,6 +416,7 @@ impl LEVM {
                             bal,
                             last_tx_idx,
                             &validation_index.accounts_by_min_index,
+                            None,
                         )
                         .is_ok()
                             && let VMType::L1 = vm_type
@@ -426,6 +439,7 @@ impl LEVM {
                 bal,
                 last_tx_idx,
                 &validation_index.accounts_by_min_index,
+                None,
             )?;
 
             // Order must match geth: requests (system calls) BEFORE withdrawals.
@@ -748,31 +762,9 @@ impl LEVM {
                 continue;
             }
 
-            // Load pre-state for unchanged fields (cache hit after prefetch)
-            let prestate = store
-                .get_account_state(addr)
-                .map_err(|e| EvmError::Custom(format!("bal_to_account_updates: {e}")))?;
-
-            // Final balance: last entry (highest index) or prestate
-            let balance = acct_changes
-                .balance_changes
-                .last()
-                .map(|c| c.post_balance)
-                .unwrap_or(prestate.balance);
-
-            // Final nonce: last entry or prestate
-            let nonce = acct_changes
-                .nonce_changes
-                .last()
-                .map(|c| c.post_nonce)
-                .unwrap_or(prestate.nonce);
-
-            // Final code: last entry or prestate
-            let (code_hash, code) = if let Some(c) = acct_changes.code_changes.last() {
-                Self::code_from_bal(&c.new_code)
-            } else {
-                (prestate.code_hash, None)
-            };
+            let last_balance = acct_changes.balance_changes.last().map(|c| c.post_balance);
+            let last_nonce = acct_changes.nonce_changes.last().map(|c| c.post_nonce);
+            let last_code_change = acct_changes.code_changes.last();
 
             // Storage: per slot, last entry (highest index)
             let mut added_storage = FxHashMap::with_capacity_and_hasher(
@@ -786,6 +778,72 @@ impl LEVM {
                 }
             }
 
+            let has_info_changes =
+                last_balance.is_some() || last_nonce.is_some() || last_code_change.is_some();
+
+            // Fast path 1: storage-only update. No info changes, can't be a removal
+            // (BAL only records actual modifications, so prestate fields are unchanged →
+            // post non-empty unless prestate was already empty, in which case removed=false
+            // by definition). Skip prestate fetch entirely.
+            if !has_info_changes {
+                updates.push(AccountUpdate {
+                    address: addr,
+                    removed: false,
+                    info: None,
+                    code: None,
+                    added_storage,
+                    removed_storage: false,
+                });
+                continue;
+            }
+
+            // Compute would-be post-state info for the fast-path test.
+            let post_balance_zero = last_balance.is_some_and(|b| b.is_zero());
+            let post_nonce_zero = last_nonce.is_some_and(|n| n == 0);
+            let post_code_empty = last_code_change.is_some_and(|c| c.new_code.is_empty());
+            let post_could_be_empty = post_balance_zero && post_nonce_zero && post_code_empty;
+            let has_full_info_coverage =
+                last_balance.is_some() && last_nonce.is_some() && last_code_change.is_some();
+
+            // Fast path 2: BAL fully specifies post-state info AND post is non-empty.
+            // No prestate needed: removal is impossible (post non-empty), and post-state
+            // info comes entirely from BAL.
+            if has_full_info_coverage && !post_could_be_empty {
+                let (code_hash, code) = Self::code_from_bal(
+                    &last_code_change
+                        .expect("checked by has_full_info_coverage")
+                        .new_code,
+                );
+                let info = AccountInfo {
+                    code_hash,
+                    balance: last_balance.expect("checked by has_full_info_coverage"),
+                    nonce: last_nonce.expect("checked by has_full_info_coverage"),
+                };
+                updates.push(AccountUpdate {
+                    address: addr,
+                    removed: false,
+                    info: Some(info),
+                    code,
+                    added_storage,
+                    removed_storage: false,
+                });
+                continue;
+            }
+
+            // Slow path: partial info coverage OR post-state may be empty (removal check).
+            // Load pre-state for unchanged fields / removal detection (cache hit after prefetch).
+            let prestate = store
+                .get_account_state(addr)
+                .map_err(|e| EvmError::Custom(format!("bal_to_account_updates: {e}")))?;
+
+            let balance = last_balance.unwrap_or(prestate.balance);
+            let nonce = last_nonce.unwrap_or(prestate.nonce);
+            let (code_hash, code) = if let Some(c) = last_code_change {
+                Self::code_from_bal(&c.new_code)
+            } else {
+                (prestate.code_hash, None)
+            };
+
             // Detect account removal (EIP-161): post-state empty but pre-state existed
             let post_empty = balance.is_zero() && nonce == 0 && code_hash == *EMPTY_KECCACK_HASH;
             let pre_empty = prestate.balance.is_zero()
@@ -793,15 +851,9 @@ impl LEVM {
                 && prestate.code_hash == *EMPTY_KECCACK_HASH;
             let removed = post_empty && !pre_empty;
 
-            let balance_changed = acct_changes
-                .balance_changes
-                .last()
-                .is_some_and(|c| c.post_balance != prestate.balance);
-            let nonce_changed = acct_changes
-                .nonce_changes
-                .last()
-                .is_some_and(|c| c.post_nonce != prestate.nonce);
-            let code_changed = acct_changes.code_changes.last().is_some();
+            let balance_changed = last_balance.is_some_and(|b| b != prestate.balance);
+            let nonce_changed = last_nonce.is_some_and(|n| n != prestate.nonce);
+            let code_changed = last_code_change.is_some();
             let acc_info_updated = balance_changed || nonce_changed || code_changed;
 
             if !removed && !acc_info_updated && added_storage.is_empty() {
@@ -818,7 +870,7 @@ impl LEVM {
                 None
             };
 
-            let update = AccountUpdate {
+            updates.push(AccountUpdate {
                 address: addr,
                 removed,
                 info,
@@ -831,8 +883,7 @@ impl LEVM {
                 // so `added_storage` already contains those zeroed entries and
                 // the trie update is correct without setting removed_storage.
                 removed_storage: false,
-            };
-            updates.push(update);
+            });
         }
 
         Ok(updates)
@@ -853,6 +904,7 @@ impl LEVM {
         bal: &BlockAccessList,
         max_idx: u32,
         accounts_by_min_index: &[(u32, usize)],
+        code_cache: Option<&[Vec<(H256, Option<Code>)>]>,
     ) -> Result<(), EvmError> {
         // Only visit accounts whose minimum change index <= max_idx.
         let end = accounts_by_min_index.partition_point(|(min_idx, _)| *min_idx <= max_idx);
@@ -885,11 +937,18 @@ impl LEVM {
             }
 
             // Compute code update before borrowing acc (borrow checker: can't access
-            // db.codes while acc holds a mutable borrow of db)
+            // db.codes while acc holds a mutable borrow of db).
+            // Cache hit avoids redundant keccak + jump-target scan when the same code
+            // change is replayed across multiple txs in the parallel loop.
             let code_update = if code_pos > 0 {
-                Some(Self::code_from_bal(
-                    &acct_changes.code_changes[code_pos - 1].new_code,
-                ))
+                let code_idx = code_pos - 1;
+                if let Some(cache) = code_cache {
+                    Some(cache[acct_idx][code_idx].clone())
+                } else {
+                    Some(Self::code_from_bal(
+                        &acct_changes.code_changes[code_idx].new_code,
+                    ))
+                }
             } else {
                 None
             };
@@ -1059,31 +1118,57 @@ impl LEVM {
                 .is_some_and(|a| a.storage.contains_key(key))
         });
 
-        // Pre-compute capacity hint for per-tx DBs from BAL account count.
-        let bal_account_count = bal.accounts().len();
+        // Pre-compute Code objects (hash + jump_targets) once per BAL code change.
+        // Without this, every tx that re-applies the same code change in seed_db_from_bal
+        // re-keccaks and re-scans the same bytecode. Stress blocks have 10-30 code changes
+        // and 200-500 txs, so the savings compound. Outer Vec indexed by BAL account index;
+        // inner indexed by code_change position. `Code` clone is cheap relative to recompute
+        // (Bytes is refcounted; only jump_targets Vec<u32> heap-clones).
+        let code_cache: Vec<Vec<(H256, Option<Code>)>> = bal
+            .accounts()
+            .iter()
+            .map(|ac| {
+                ac.code_changes
+                    .iter()
+                    .map(|c| Self::code_from_bal(&c.new_code))
+                    .collect()
+            })
+            .collect();
 
         // 2. Execute all txs in parallel (embarrassingly parallel, BAL-seeded).
-        //    BAL validation is deferred to after the gas limit check (step 3) so that
-        //    blocks exceeding gas limit produce GAS_USED_OVERFLOW before BAL mismatch.
+        //    Per-tx BAL validation runs INSIDE the closure (alongside exec) so the
+        //    serial post-loop only handles gas accounting and shared-set bookkeeping.
+        //    The `validation_error` field carries any per-tx BAL validation failure;
+        //    it is surfaced AFTER the gas-limit check so that GAS_USED_OVERFLOW takes
+        //    priority over BAL mismatch errors (the BAL is built assuming rejected
+        //    txs; the miner balance in the BAL won't match execution that ran all txs).
+        //
+        //    `current_state` and `codes` are dropped inside the closure after
+        //    validation runs, so they don't need to traverse the rayon boundary.
+        //    Per-tx reduction summaries (`destroyed_addresses`, `read_keys`) are
+        //    pre-computed for the post-loop's `unread_storage_reads` cleanup.
         type TxExecResult = (
             usize,
             TxType,
             ExecutionReport,
-            FxHashMap<Address, LevmAccount>,
-            FxHashMap<H256, ethrex_common::types::Code>,
             FxHashSet<Address>,   // accessed_accounts tracker (coarse)
-            Vec<Address>,         // shadow recorder touched_addresses (EIP-7928 exact)
-            Vec<(Address, U256)>, // shadow recorder storage_reads (EIP-7928 exact)
+            Vec<Address>,         // destroyed addresses (clear all reads for these)
+            Vec<(Address, H256)>, // read keys (specific entries to clear)
+            Option<EvmError>,     // deferred BAL validation error (gas check first)
         );
 
         let exec_results: Result<Vec<TxExecResult>, EvmError> = (0..n_txs)
             .into_par_iter()
             .map(|tx_idx| -> Result<_, EvmError> {
                 let (tx, sender) = &txs_with_sender[tx_idx];
+                // Per-tx capacity hint: most txs touch <10 accounts. Sizing to the full
+                // BAL account count (often 100s on stress blocks) inflates per-task allocator
+                // pressure × n_rayon_workers. Use a small constant; HashMap grows on demand.
+                const PER_TX_DB_CAPACITY: usize = 32;
                 let mut tx_db = GeneralizedDatabase::new_with_shared_base_and_capacity(
                     store.clone(),
                     system_seed.clone(),
-                    bal_account_count,
+                    PER_TX_DB_CAPACITY,
                 );
                 // Small capacity: parallel txs rarely nest >8 call frames, and
                 // over-allocating per-tx wastes memory across many rayon tasks.
@@ -1098,6 +1183,7 @@ impl LEVM {
                     bal,
                     u32::try_from(tx_idx).unwrap_or(u32::MAX),
                     &validation_index.accounts_by_min_index,
+                    Some(&code_cache),
                 )?;
 
                 // Enable accessed_accounts tracker (coarse) for `unaccessed_pure_accounts`
@@ -1139,28 +1225,103 @@ impl LEVM {
                     .take()
                     .map(|mut r| (r.take_touched_addresses(), r.take_storage_reads()))
                     .unwrap_or_default();
+
+                // Pre-compute per-tx summaries for the post-loop's unread_storage_reads
+                // cleanup, then drop current_state/codes (they don't cross the rayon
+                // boundary). Destroyed accounts clear ALL their reads; non-destroyed
+                // accounts clear only the slots they actually loaded.
+                let mut destroyed_addresses: Vec<Address> = Vec::new();
+                let mut read_keys: Vec<(Address, H256)> = Vec::new();
+                for (addr, acct) in &current_state {
+                    if matches!(
+                        acct.status,
+                        AccountStatus::Destroyed | AccountStatus::DestroyedModified
+                    ) {
+                        destroyed_addresses.push(*addr);
+                    } else {
+                        for key in acct.storage.keys() {
+                            read_keys.push((*addr, *key));
+                        }
+                    }
+                }
+
+                // Run BAL validation inside the closure. Errors are deferred to the
+                // serial post-loop so the gas-limit check can take priority.
+                let bal_idx = u32::try_from(tx_idx + 1).unwrap_or(u32::MAX);
+                let seed_idx = u32::try_from(tx_idx).unwrap_or(u32::MAX);
+                let validation_error: Option<EvmError> = (|| {
+                    Self::validate_tx_execution(
+                        bal_idx,
+                        seed_idx,
+                        &current_state,
+                        &codes,
+                        bal,
+                        validation_index,
+                        &system_seed,
+                        &store,
+                    )
+                    .map_err(|e| {
+                        EvmError::Custom(format!("BAL validation failed for tx {tx_idx}: {e}"))
+                    })?;
+
+                    // EIP-7928 (Group B): missing-account check.
+                    for addr in &shadow_touched {
+                        if !validation_index.addr_to_idx.contains_key(addr) {
+                            return Err(EvmError::Custom(format!(
+                                "BAL validation failed for tx {tx_idx}: account {addr:?} was \
+                                 accessed during execution but is missing from BAL"
+                            )));
+                        }
+                    }
+                    // EIP-7928 (Group B): missing storage_read check.
+                    for (addr, slot) in &shadow_reads {
+                        let Some(&bal_acct_idx) = validation_index.addr_to_idx.get(addr) else {
+                            // Already caught by the touched-address check above.
+                            continue;
+                        };
+                        let acct = &bal.accounts()[bal_acct_idx];
+                        let in_changes = acct
+                            .storage_changes
+                            .binary_search_by(|sc| sc.slot.cmp(slot))
+                            .is_ok();
+                        let in_reads = acct.storage_reads.contains(slot);
+                        if !in_changes && !in_reads {
+                            return Err(EvmError::Custom(format!(
+                                "BAL validation failed for tx {tx_idx}: storage slot {slot} of \
+                                 account {addr:?} was read during execution but is missing from \
+                                 BAL (no storage_changes or storage_reads entry)"
+                            )));
+                        }
+                    }
+                    Ok(())
+                })()
+                .err();
+
+                drop(current_state);
+                drop(codes);
+
                 Ok((
                     tx_idx,
                     tx.tx_type(),
                     report,
-                    current_state,
-                    codes,
                     tracked,
-                    shadow_touched,
-                    shadow_reads,
+                    destroyed_addresses,
+                    read_keys,
+                    validation_error,
                 ))
             })
             .collect();
 
         let mut exec_results = exec_results?;
 
-        // Sort so gas accounting and validation happen in tx order.
-        exec_results.sort_unstable_by_key(|(idx, _, _, _, _, _, _, _)| *idx);
+        // Sort so gas accounting and validation surfacing happen in tx order.
+        exec_results.sort_unstable_by_key(|(idx, _, _, _, _, _, _)| *idx);
 
-        // 3. Gas limit check — must happen BEFORE BAL validation so that blocks
-        //    exceeding the gas limit produce GAS_USED_OVERFLOW instead of a BAL
-        //    mismatch error (the BAL is built assuming rejected txs, so the miner
-        //    balance in the BAL won't match execution that ran all txs).
+        // 3. Gas limit check — must happen BEFORE surfacing any BAL validation
+        //    error so that blocks exceeding the gas limit produce
+        //    GAS_USED_OVERFLOW instead of a BAL mismatch error (the BAL is
+        //    built assuming rejected txs, so the miner balance in the BAL won't
+        //    match execution that ran all txs).
         //
         //    EIP-8037 PR #2703: also enforce the per-tx 2D inclusion check
         //    against running block totals. A tx whose worst-case regular or
@@ -1168,7 +1329,7 @@ impl LEVM {
         //    position invalidates the block with GAS_ALLOWANCE_EXCEEDED.
         let mut block_regular_gas_used = 0_u64;
         let mut block_state_gas_used = 0_u64;
-        for (tx_idx, _, report, _, _, _, _, _) in &exec_results {
+        for (tx_idx, _, report, _, _, _, _) in &exec_results {
             let (tx, _) = txs_with_sender
                 .get(*tx_idx)
                 .ok_or_else(|| EvmError::Custom(format!("tx index {tx_idx} out of bounds")))?;
@@ -1197,95 +1358,42 @@ impl LEVM {
             )));
         }
 
-        // 4. Per-tx BAL validation — now safe to run after gas limit is confirmed OK.
-        //    Also mark off storage_reads that appear in per-tx execution state.
-        for (tx_idx, _, _, current_state, codes, tracked_accounts, shadow_touched, shadow_reads) in
-            &exec_results
-        {
-            let bal_idx = u32::try_from(*tx_idx + 1).unwrap_or(u32::MAX);
-            let seed_idx = u32::try_from(*tx_idx).unwrap_or(u32::MAX);
-            Self::validate_tx_execution(
-                bal_idx,
-                seed_idx,
-                current_state,
-                codes,
-                bal,
-                validation_index,
-                &system_seed,
-                &store,
-            )
-            .map_err(|e| EvmError::Custom(format!("BAL validation failed for tx {tx_idx}: {e}")))?;
-
-            // Mark storage_reads that were actually loaded during this tx.
-            // storage_reads slots are NOT in storage_changes (conflict check ensures this),
-            // so they're not seeded. If a slot appears in the per-tx state's storage,
-            // the tx genuinely read it via SLOAD.
-            // Special case: selfdestruct clears storage from the final state, so reads
-            // that happened before destruction are no longer visible. For destroyed
-            // accounts, mark ALL their BAL storage_reads as satisfied.
-            if !unread_storage_reads.is_empty() {
-                for (addr, acct) in current_state {
-                    if matches!(
-                        acct.status,
-                        AccountStatus::Destroyed | AccountStatus::DestroyedModified
-                    ) {
-                        unread_storage_reads.retain(|&(a, _)| a != *addr);
-                    } else {
-                        for key in acct.storage.keys() {
-                            unread_storage_reads.remove(&(*addr, *key));
-                        }
-                    }
-                }
+        // 4. Surface earliest deferred BAL validation error (post gas check).
+        for (_, _, _, _, _, _, val_err) in &mut exec_results {
+            if let Some(err) = val_err.take() {
+                return Err(err);
             }
+        }
 
-            // Mark pure-access accounts that were accessed during this tx.
+        // 5. Apply per-tx summaries to the shared `unread_storage_reads` and
+        //    `unaccessed_pure_accounts` checklists. The per-tx closures
+        //    pre-computed these from `current_state` before dropping it.
+        if !unread_storage_reads.is_empty() {
+            for (_, _, _, _, destroyed, read_keys, _) in &exec_results {
+                for addr in destroyed {
+                    unread_storage_reads.retain(|&(a, _)| a != *addr);
+                }
+                for entry in read_keys {
+                    unread_storage_reads.remove(entry);
+                }
+            }
+        }
+        if !unaccessed_pure_accounts.is_empty() {
             // The coinbase is always accessed during fee finalization (geth's
             // readerTracker records it), even when the miner fee is zero and
             // ethrex skips the load_account call.
-            if !unaccessed_pure_accounts.is_empty() {
-                unaccessed_pure_accounts.remove(&header.coinbase);
+            unaccessed_pure_accounts.remove(&header.coinbase);
+            for (_, _, _, tracked_accounts, _, _, _) in &exec_results {
                 for addr in tracked_accounts {
                     unaccessed_pure_accounts.remove(addr);
                 }
             }
-
-            // EIP-7928 (Group B): missing-access detection using the shadow recorder.
-            // For each address the per-tx shadow recorder marked as touched, the header
-            // BAL must contain an entry for it. For each storage read, the header BAL
-            // must carry the slot either in storage_changes or storage_reads.
-            for addr in shadow_touched {
-                if !validation_index.addr_to_idx.contains_key(addr) {
-                    return Err(EvmError::Custom(format!(
-                        "BAL validation failed for tx {tx_idx}: account {addr:?} was \
-                         accessed during execution but is missing from BAL"
-                    )));
-                }
-            }
-            for (addr, slot) in shadow_reads {
-                let Some(&bal_acct_idx) = validation_index.addr_to_idx.get(addr) else {
-                    // Already caught by the touched-address check above.
-                    continue;
-                };
-                let acct = &bal.accounts()[bal_acct_idx];
-                let in_changes = acct
-                    .storage_changes
-                    .binary_search_by(|sc| sc.slot.cmp(slot))
-                    .is_ok();
-                let in_reads = acct.storage_reads.contains(slot);
-                if !in_changes && !in_reads {
-                    return Err(EvmError::Custom(format!(
-                        "BAL validation failed for tx {tx_idx}: storage slot {slot} of \
-                         account {addr:?} was read during execution but is missing from \
-                         BAL (no storage_changes or storage_reads entry)"
-                    )));
-                }
-            }
         }
 
-        // 5. Build receipts in tx order.
+        // 6. Build receipts in tx order.
         let mut receipts = Vec::with_capacity(n_txs);
         let mut cumulative_gas_used = 0_u64;
-        for (_, tx_type, report, _, _, _, _, _) in exec_results {
+        for (_, tx_type, report, _, _, _, _) in exec_results {
             cumulative_gas_used += report.gas_spent;
             let receipt = Receipt::new(
                 tx_type,
diff --git a/crates/vm/levm/Cargo.toml b/crates/vm/levm/Cargo.toml
index 834650b5c4b..d45f56acf8d 100644
--- a/crates/vm/levm/Cargo.toml
+++ b/crates/vm/levm/Cargo.toml
@@ -20,6 +20,7 @@ malachite = "0.6.1"
 strum = { version = "0.27.1", features = ["derive"] }
 rustc-hash.workspace = true
 rayon.workspace = true
+dashmap = "6.1"
 
 
 [features]
diff --git a/crates/vm/levm/src/db/mod.rs b/crates/vm/levm/src/db/mod.rs
index cafdd1cbef0..4463f789b26 100644
--- a/crates/vm/levm/src/db/mod.rs
+++ b/crates/vm/levm/src/db/mod.rs
@@ -1,18 +1,23 @@
 use crate::{errors::DatabaseError, precompiles::PrecompileCache};
+use dashmap::DashMap;
 use ethrex_common::{
     Address, H256, U256,
     types::{AccountState, ChainConfig, Code, CodeMetadata},
 };
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
-use rustc_hash::FxHashMap;
-use std::sync::{Arc, OnceLock, PoisonError, RwLock, RwLockReadGuard, RwLockWriteGuard};
+use rustc_hash::FxBuildHasher;
+use std::sync::{Arc, OnceLock};
 
 pub mod gen_db;
 
-// Type aliases for cache storage maps
-type AccountCache = FxHashMap<Address, AccountState>;
-type StorageCache = FxHashMap<(Address, H256), U256>;
-type CodeCache = FxHashMap<H256, Code>;
+// Type aliases for cache storage maps. Sharded concurrent maps replace the old
+// `RwLock<FxHashMap>` design — under the parallel BAL path, 16 rayon workers all
+// hammered the single account RwLock, contributing ~11% of CPU to lock
+// contention (perf measured 2026-04-28). DashMap shards by key hash so workers
+// touching different addresses don't contend.
+type AccountCache = DashMap<Address, AccountState, FxBuildHasher>;
+type StorageCache = DashMap<(Address, H256), U256, FxBuildHasher>;
+type CodeCache = DashMap<H256, Code, FxBuildHasher>;
 
 pub trait Database: Send + Sync {
     fn get_account_state(&self, address: Address) -> Result<AccountState, DatabaseError>;
@@ -53,11 +58,11 @@ pub trait Database: Send + Sync {
 pub struct CachingDatabase {
     inner: Arc<dyn Database>,
     /// Cached account states (balance, nonce, code_hash, storage_root)
-    accounts: RwLock<AccountCache>,
+    accounts: AccountCache,
     /// Cached storage values
-    storage: RwLock<StorageCache>,
+    storage: StorageCache,
     /// Cached contract code
-    code: RwLock<CodeCache>,
+    code: CodeCache,
     /// Shared precompile result cache (warmer populates, executor reuses)
     precompile_cache: PrecompileCache,
     /// Cached chain config (constant for the lifetime of this database)
@@ -68,9 +73,9 @@ impl CachingDatabase {
     pub fn new(inner: Arc<dyn Database>) -> Self {
         Self {
             inner,
-            accounts: RwLock::new(FxHashMap::default()),
-            storage: RwLock::new(FxHashMap::default()),
-            code: RwLock::new(FxHashMap::default()),
+            accounts: AccountCache::with_hasher(FxBuildHasher),
+            storage: StorageCache::with_hasher(FxBuildHasher),
+            code: CodeCache::with_hasher(FxBuildHasher),
             precompile_cache: PrecompileCache::new(),
             chain_config: OnceLock::new(),
         }
@@ -80,40 +85,12 @@ impl CachingDatabase {
     pub fn precompile_cache(&self) -> &PrecompileCache {
         &self.precompile_cache
     }
-
-    fn read_accounts(&self) -> Result<RwLockReadGuard<'_, AccountCache>, DatabaseError> {
-        self.accounts.read().map_err(poison_error_to_db_error)
-    }
-
-    fn write_accounts(&self) -> Result<RwLockWriteGuard<'_, AccountCache>, DatabaseError> {
-        self.accounts.write().map_err(poison_error_to_db_error)
-    }
-
-    fn read_storage(&self) -> Result<RwLockReadGuard<'_, StorageCache>, DatabaseError> {
-        self.storage.read().map_err(poison_error_to_db_error)
-    }
-
-    fn write_storage(&self) -> Result<RwLockWriteGuard<'_, StorageCache>, DatabaseError> {
-        self.storage.write().map_err(poison_error_to_db_error)
-    }
-
-    fn read_code(&self) -> Result<RwLockReadGuard<'_, CodeCache>, DatabaseError> {
-        self.code.read().map_err(poison_error_to_db_error)
-    }
-
-    fn write_code(&self) -> Result<RwLockWriteGuard<'_, CodeCache>, DatabaseError> {
-        self.code.write().map_err(poison_error_to_db_error)
-    }
-}
-
-fn poison_error_to_db_error<T>(err: PoisonError<T>) -> DatabaseError {
-    DatabaseError::Custom(format!("Cache lock poisoned: {err}"))
 }
 
 impl Database for CachingDatabase {
     fn get_account_state(&self, address: Address) -> Result<AccountState, DatabaseError> {
-        // Check cache first
-        if let Some(state) = self.read_accounts()?.get(&address).copied() {
+        // Check cache first (per-shard lock; non-contended for distinct addresses)
+        if let Some(state) = self.accounts.get(&address).map(|r| *r) {
             return Ok(state);
         }
 
@@ -121,14 +98,14 @@ impl Database for CachingDatabase {
         let state = self.inner.get_account_state(address)?;
 
         // Populate cache (AccountState is Copy, no clone needed)
-        self.write_accounts()?.insert(address, state);
+        self.accounts.insert(address, state);
 
         Ok(state)
     }
 
     fn get_storage_value(&self, address: Address, key: H256) -> Result<U256, DatabaseError> {
         // Check cache first
-        if let Some(value) = self.read_storage()?.get(&(address, key)).copied() {
+        if let Some(value) = self.storage.get(&(address, key)).map(|r| *r) {
             return Ok(value);
         }
 
@@ -136,7 +113,7 @@ impl Database for CachingDatabase {
         let value = self.inner.get_storage_value(address, key)?;
 
         // Populate cache (U256 is Copy, no clone needed)
-        self.write_storage()?.insert((address, key), value);
+        self.storage.insert((address, key), value);
 
         Ok(value)
     }
@@ -159,7 +136,7 @@ impl Database for CachingDatabase {
 
     fn get_account_code(&self, code_hash: H256) -> Result<Code, DatabaseError> {
         // Check cache first
-        if let Some(code) = self.read_code()?.get(&code_hash).cloned() {
+        if let Some(code) = self.code.get(&code_hash).map(|r| r.clone()) {
             return Ok(code);
         }
 
@@ -167,7 +144,7 @@ impl Database for CachingDatabase {
         let code = self.inner.get_account_code(code_hash)?;
 
         // Populate cache (Code contains Bytes which is ref-counted, clone is cheap)
-        self.write_code()?.insert(code_hash, code.clone());
+        self.code.insert(code_hash, code.clone());
 
         Ok(code)
     }
@@ -184,32 +161,23 @@ impl Database for CachingDatabase {
     }
 
     fn prefetch_accounts(&self, addresses: &[Address]) -> Result<(), DatabaseError> {
-        // Fetch from inner in parallel (no lock contention), then single write-lock to populate cache.
-        let fetched: Vec<(Address, AccountState)> = addresses
+        // Fetch from inner in parallel; insert into sharded cache without serializing
+        // through a single write lock — DashMap inserts touch only the matching shard.
+        addresses
             .par_iter()
-            .map(|&addr| self.inner.get_account_state(addr).map(|s| (addr, s)))
-            .collect::<Result<_, _>>()?;
-        let mut cache = self.write_accounts()?;
-        for (addr, state) in fetched {
-            cache.entry(addr).or_insert(state);
-        }
-        Ok(())
+            .try_for_each(|&addr| -> Result<(), DatabaseError> {
+                let state = self.inner.get_account_state(addr)?;
+                self.accounts.entry(addr).or_insert(state);
+                Ok(())
+            })
     }
 
     fn prefetch_storage(&self, keys: &[(Address, H256)]) -> Result<(), DatabaseError> {
-        // Fetch from inner in parallel (no lock contention), then single write-lock to populate cache.
-        let fetched: Vec<((Address, H256), U256)> = keys
-            .par_iter()
-            .map(|&(addr, key)| {
-                self.inner
-                    .get_storage_value(addr, key)
-                    .map(|v| ((addr, key), v))
+        keys.par_iter()
+            .try_for_each(|&(addr, key)| -> Result<(), DatabaseError> {
+                let value = self.inner.get_storage_value(addr, key)?;
+                self.storage.entry((addr, key)).or_insert(value);
+                Ok(())
             })
-            .collect::<Result<_, _>>()?;
-        let mut cache = self.write_storage()?;
-        for (key, value) in fetched {
-            cache.entry(key).or_insert(value);
-        }
-        Ok(())
     }
 }