From 9b47f66140c5deaf5bd98fcb7f9457bbcc0a4f9c Mon Sep 17 00:00:00 2001
From: haraschax <6804392+haraschax@users.noreply.github.com>
Date: Thu, 21 May 2026 21:07:23 +0000
Subject: [PATCH 1/6] Compile modeld warps with symbolic camera sizes

---
 selfdrive/modeld/SConscript           |  12 +--
 selfdrive/modeld/compile_dm_warp.py   |  43 ++++----
 selfdrive/modeld/compile_modeld.py    | 146 ++++++++++++++++++--------
 selfdrive/modeld/dmonitoringmodeld.py |  21 +++-
 selfdrive/modeld/modeld.py            |  25 +++--
 5 files changed, 169 insertions(+), 78 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index eba951611697be..27ab5f9dfad77c 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -121,12 +121,12 @@ lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files + script_file
 
 dm_w, dm_h = DM_INPUT_SIZE
 compile_dm_warp_script = [File(f"{modeld_dir}/compile_dm_warp.py")]
-for cam_w, cam_h in CAMERA_CONFIGS:
-  dm_pkl_path = File(f"models/dm_warp_{cam_w}x{cam_h}_tinygrad.pkl").abspath
-  cmd = (f'{tg_flags} {mac_brew_string} python3 {modeld_dir}/compile_dm_warp.py '
-         f'--camera-resolution {cam_w}x{cam_h} --warp-to {dm_w}x{dm_h} '
-         f'--output {dm_pkl_path}')
-  lenv.Command(dm_pkl_path, tinygrad_files + compile_dm_warp_script + compile_modeld_script + [tg_devices_node], cmd)
+dm_pkl_path = File("models/dm_warp_tinygrad.pkl").abspath
+camera_res_args = ' '.join(f'{cw}x{ch}' for cw, ch in CAMERA_CONFIGS)
+cmd = (f'{tg_flags} {mac_brew_string} python3 {modeld_dir}/compile_dm_warp.py '
+       f'--camera-resolutions {camera_res_args} --warp-to {dm_w}x{dm_h} '
+       f'--output {dm_pkl_path}')
+lenv.Command(dm_pkl_path, tinygrad_files + compile_dm_warp_script + compile_modeld_script + [Value(camera_res_args), tg_devices_node], cmd)
 
 def tg_compile(flags, model_name):
   pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"'
diff --git a/selfdrive/modeld/compile_dm_warp.py b/selfdrive/modeld/compile_dm_warp.py
index 548990ee15a7d9..b308efc1e75cd9 100755
--- a/selfdrive/modeld/compile_dm_warp.py
+++ b/selfdrive/modeld/compile_dm_warp.py
@@ -8,49 +8,54 @@
 from tinygrad.engine.jit import TinyJit
 
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
-from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, warp_perspective_tinygrad, _parse_size
+from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, bind_camera_vars, make_camera_vars, warp_perspective_tinygrad, _parse_size
 
 
-def make_warp_dm(nv12: NV12Frame, dm_w, dm_h):
-  cam_w, cam_h, stride, _, _, _ = nv12
-  stride_pad = stride - cam_w
-
-  def warp_dm(input_frame, M_inv):
+def make_warp_dm(dm_w, dm_h):
+  def warp_dm(input_frame, M_inv, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset):
     M_inv = M_inv.to(Device.DEFAULT).realize()
-    return warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv,
-                                     (dm_w, dm_h), (cam_h, cam_w), stride_pad, border_fill_val=0).reshape(-1, dm_h * dm_w)
+    return warp_perspective_tinygrad(input_frame, M_inv, (dm_w, dm_h),
+                                     (cam_h, cam_w), stride, border_fill_val=0).reshape(-1, dm_h * dm_w)
   return warp_dm
 
 
-def compile_dm_warp(nv12: NV12Frame, dm_w, dm_h, pkl_path):
-  print(f"Compiling DM warp for {nv12.width}x{nv12.height} -> {dm_w}x{dm_h}...")
+def compile_dm_warp(camera_configs: list[NV12Frame], dm_w, dm_h, pkl_path):
+  print(f"Compiling DM warp for {len(camera_configs)} camera sizes -> {dm_w}x{dm_h}...")
 
-  warp_dm_jit = TinyJit(make_warp_dm(nv12, dm_w, dm_h), prune=True)
+  camera_vars, max_frame_size = make_camera_vars(camera_configs)
+  warp_dm_jit = TinyJit(make_warp_dm(dm_w, dm_h), prune=True)
 
   for i in range(10):
-    frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize()
+    nv12 = camera_configs[i % len(camera_configs)]
+    frame = Tensor.randint(max_frame_size, low=0, high=256, dtype='uint8').realize()
     M_inv = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY')
     Device.default.synchronize()
     st = time.perf_counter()
-    warp_dm_jit(frame, M_inv).realize()
+    warp_dm_jit(frame, M_inv, **bind_camera_vars(camera_vars, nv12)).realize()
     mt = time.perf_counter()
     Device.default.synchronize()
     et = time.perf_counter()
-    print(f"  [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
+    print(f"  [{i+1}/10] {nv12.width}x{nv12.height} enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
   with open(pkl_path, "wb") as f:
-    pickle.dump(warp_dm_jit, f)
+    pickle.dump({
+      'warp': warp_dm_jit,
+      'camera_configs': {nv12[:2]: nv12 for nv12 in camera_configs},
+      'max_frame_size': max_frame_size,
+    }, f)
   print(f"  Saved to {pkl_path}")
 
 
 if __name__ == "__main__":
   p = argparse.ArgumentParser()
-  p.add_argument('--camera-resolution', type=_parse_size, required=True, help='camera resolution WxH')
+  p.add_argument('--camera-resolution', '--camera-resolutions', dest='camera_resolutions', type=_parse_size, nargs='+', required=True,
+                 help='camera resolutions WxH (one or more); --camera-resolution is kept as an alias')
   p.add_argument('--warp-to', type=_parse_size, required=True, help='DM input WxH')
   p.add_argument('--output', required=True)
   args = p.parse_args()
 
-  cam_w, cam_h = args.camera_resolution
-  nv12 = NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h))
+  # argparse already rejects a missing or empty list (required=True, nargs='+'), so no assert is needed here;
+  # an assert would be stripped under `python -O` and would surface as a traceback instead of a usage error.
+  camera_configs = [NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h)) for cam_w, cam_h in args.camera_resolutions]
   dm_w, dm_h = args.warp_to
-  compile_dm_warp(nv12, dm_w, dm_h, args.output)
+  compile_dm_warp(camera_configs, dm_w, dm_h, args.output)
diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index f919d1da23a0b7..883018f4799e4f 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -26,12 +26,12 @@ def fetch_fw(path, name, sha256):
 _patch_tinygrad_fetch_fw()
 
 from tinygrad.tensor import Tensor
+from tinygrad import Variable
 from tinygrad.helpers import Context
 from tinygrad.device import Device
 from tinygrad.engine.jit import TinyJit
 
 from openpilot.common.file_chunker import read_file_chunked
-from openpilot.system.hardware.hw import Paths
 
 
 NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size'])
@@ -41,8 +41,66 @@ def fetch_fw(path, name, sha256):
 
 WARP_DEV = os.getenv('WARP_DEV')
 
+_ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = None
+
+
+def _optimize_local_size_or_skip(call, prg):
+  try:
+    return _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE(call, prg)
+  except AssertionError as e:
+    if str(e) != "all optimize_local_size exec failed":
+      raise
+    from dataclasses import replace
+    preferred = (32, 16, 1)
+    local_size = tuple(next(x for x in range(min(preferred[i] if i < len(preferred) else 1, g), 0, -1) if g % x == 0)
+                       for i, g in enumerate(prg.arg.global_size))
+    new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size))
+    return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:]))
+
+
+def _patch_tinygrad_local_size_optimizer():
+  global _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE
+  from tinygrad.engine import realize
+  from tinygrad.uop.ops import Ops, PatternMatcher, UPat
+
+  _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = realize.optimize_local_size
+  realize.pm_optimize_local_size = PatternMatcher([
+    (UPat(Ops.CALL, src=(UPat(Ops.PROGRAM, name="prg"),), name="call", allow_any_len=True), _optimize_local_size_or_skip),
+  ])
+
+
+_patch_tinygrad_local_size_optimizer()
+
+
+def make_camera_vars(camera_configs: list[NV12Frame]):
+  max_cam_w = max(nv12.width for nv12 in camera_configs)
+  max_cam_h = max(nv12.height for nv12 in camera_configs)
+  max_stride = max(nv12.stride for nv12 in camera_configs)
+  max_uv_offset = max(nv12.stride * nv12.y_height for nv12 in camera_configs)
+  max_frame_size = max(nv12.size for nv12 in camera_configs)
+  return {
+    'cam_w': Variable('cam_w', 1, max_cam_w),
+    'cam_h': Variable('cam_h', 1, max_cam_h),
+    'chroma_w': Variable('chroma_w', 1, max_cam_w // 2),
+    'chroma_h': Variable('chroma_h', 1, max_cam_h // 2),
+    'stride': Variable('stride', 1, max_stride),
+    'uv_offset': Variable('uv_offset', 1, max_uv_offset),
+  }, max_frame_size
+
+
+def bind_camera_vars(camera_vars, nv12: NV12Frame):
+  values = {
+    'cam_w': nv12.width,
+    'cam_h': nv12.height,
+    'chroma_w': nv12.width // 2,
+    'chroma_h': nv12.height // 2,
+    'stride': nv12.stride,
+    'uv_offset': nv12.stride * nv12.y_height,
+  }
+  return {k: v.bind(values[k]) for k, v in camera_vars.items()}
+
 
-def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad, border_fill_val=None):
+def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride, src_offset=0, x_step=1, channel=0, border_fill_val=None):
   w_dst, h_dst = dst_shape
   h_src, w_src = src_shape
 
@@ -61,7 +119,7 @@ def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad,
   y_round = Tensor.round(src_y)
   x_nn_clipped = x_round.clip(0, w_src - 1).cast('int')
   y_nn_clipped = y_round.clip(0, h_src - 1).cast('int')
-  idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped
+  idx = y_nn_clipped * stride + x_nn_clipped * x_step + src_offset + channel
   sampled = src_flat[idx]
 
   if border_fill_val is None:
@@ -84,26 +142,18 @@ def frames_to_tensor(frames):
   return in_img1
 
 
-def make_frame_prepare(nv12: NV12Frame, model_w, model_h):
-  cam_w, cam_h, stride, y_height, uv_height, _ = nv12
-  uv_offset = stride * y_height
-  stride_pad = stride - cam_w
-
-  def frame_prepare_tinygrad(input_frame, M_inv):
+def make_frame_prepare(model_w, model_h):
+  def frame_prepare_tinygrad(input_frame, M_inv, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset):
     # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling
     M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]], device=WARP_DEV)
-    # deinterleave NV12 UV plane (UVUV... -> separate U, V)
-    uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride)
     with Context(SPLIT_REDUCEOP=0):
-      y = warp_perspective_tinygrad(input_frame[:cam_h*stride],
-                                    M_inv, (model_w, model_h),
-                                    (cam_h, cam_w), stride_pad).realize()
-      u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(),
-                                    M_inv_uv, (model_w//2, model_h//2),
-                                    (cam_h//2, cam_w//2), 0).realize()
-      v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(),
-                                    M_inv_uv, (model_w//2, model_h//2),
-                                    (cam_h//2, cam_w//2), 0).realize()
+      y = warp_perspective_tinygrad(input_frame, M_inv, (model_w, model_h),
+                                    (cam_h, cam_w), stride).realize()
+      # Gather directly from interleaved NV12 UV memory so symbolic widths avoid step=2 slicing.
+      u = warp_perspective_tinygrad(input_frame, M_inv_uv, (model_w//2, model_h//2),
+                                    (chroma_h, chroma_w), stride, uv_offset, x_step=2, channel=0).realize()
+      v = warp_perspective_tinygrad(input_frame, M_inv_uv, (model_w//2, model_h//2),
+                                    (chroma_h, chroma_w), stride, uv_offset, x_step=2, channel=1).realize()
     yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w))
     tensor = frames_to_tensor(yuv)
     return tensor
@@ -148,21 +198,22 @@ def sample_desire(buf, frame_skip):
   return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0)
 
 
-def make_run_policy(vision_runner, policy_runner, nv12: NV12Frame, model_w, model_h,
+def make_run_policy(vision_runner, policy_runner, model_w, model_h,
                     vision_features_slice, frame_skip, prepare_only=False):
-  frame_prepare = make_frame_prepare(nv12, model_w, model_h)
+  frame_prepare = make_frame_prepare(model_w, model_h)
   sample_skip_fn = partial(sample_skip, frame_skip=frame_skip)
   sample_desire_fn = partial(sample_desire, frame_skip=frame_skip)
 
-  def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame):
+  def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm,
+                 frame, big_frame, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset):
     tfm = tfm.to(WARP_DEV)
     big_tfm = big_tfm.to(WARP_DEV)
     desire = desire.to(Device.DEFAULT)
     traffic_convention = traffic_convention.to(Device.DEFAULT)
     Tensor.realize(tfm, big_tfm, desire, traffic_convention)
 
-    warped_frame = frame_prepare(frame, tfm).unsqueeze(0).to(Device.DEFAULT)
-    warped_big_frame = frame_prepare(big_frame, big_tfm).unsqueeze(0).to(Device.DEFAULT)
+    warped_frame = frame_prepare(frame, tfm, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset).unsqueeze(0).to(Device.DEFAULT)
+    warped_big_frame = frame_prepare(big_frame, big_tfm, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset).unsqueeze(0).to(Device.DEFAULT)
     img = shift_and_sample(img_q, warped_frame, sample_skip_fn)
     big_img = shift_and_sample(big_img_q, warped_big_frame, sample_skip_fn)
 
@@ -182,21 +233,24 @@ def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, t
   return run_policy
 
 
-def compile_modeld(nv12: NV12Frame, model_w, model_h, prepare_only, frame_skip,
+def compile_modeld(camera_configs: list[NV12Frame], model_w, model_h, prepare_only, frame_skip,
                    vision_runner, policy_runner, vision_metadata, policy_metadata):
-  print(f"Compiling combined policy JIT for {nv12.width}x{nv12.height} (prepare_only={prepare_only})...")
+  print(f"Compiling combined policy JIT for {len(camera_configs)} camera sizes (prepare_only={prepare_only})...")
 
   vision_features_slice = vision_metadata['output_slices']['hidden_state']
   vision_input_shapes = vision_metadata['input_shapes']
   policy_input_shapes = policy_metadata['input_shapes']
 
-  _run = make_run_policy(vision_runner, policy_runner, nv12, model_w, model_h,
+  camera_vars, max_frame_size = make_camera_vars(camera_configs)
+  max_nv12 = max(camera_configs, key=lambda n: n.size)
+
+  _run = make_run_policy(vision_runner, policy_runner, model_w, model_h,
                          vision_features_slice, frame_skip, prepare_only)
   run_policy_jit = TinyJit(_run, prune=True)
 
   SEED = 42
 
-  def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_match=True):
+  def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_match=True, camera_config=None):
     input_queues, npy = make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip, Device.DEFAULT)
     np.random.seed(seed)
     Tensor.manual_seed(seed)
@@ -205,17 +259,19 @@ def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_matc
     n_runs = 1 if testing else 3
 
     for i in range(n_runs):
-      frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize()
-      big_frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize()
+      nv12 = camera_config or camera_configs[0]
+      camera_args = bind_camera_vars(camera_vars, nv12)
+      frame = Tensor.randint(max_frame_size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize()
+      big_frame = Tensor.randint(max_frame_size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize()
       for v in npy.values():
         v[:] = np.random.randn(*v.shape).astype(v.dtype)
       Device.default.synchronize()
       st = time.perf_counter()
-      outs = fn(**input_queues, frame=frame, big_frame=big_frame)
+      outs = fn(**input_queues, frame=frame, big_frame=big_frame, **camera_args)
       mt = time.perf_counter()
       Device.default.synchronize()
       et = time.perf_counter()
-      print(f"  [{i+1}/{n_runs}] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
+      print(f"  [{i+1}/{n_runs}] {nv12.width}x{nv12.height} enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms")
 
       if i == 0:
         val = [np.copy(v.numpy()) for v in outs]
@@ -236,6 +292,11 @@ def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_matc
   run_policy_jit = pickle.loads(pickle.dumps(run_policy_jit))
   random_inputs_run_fn(run_policy_jit, SEED, test_val, test_buffers, expect_match=True)
   random_inputs_run_fn(run_policy_jit, SEED+1, test_val, test_buffers, expect_match=False)
+  for i, nv12 in enumerate(camera_configs[1:]):
+    print(f'symbolic replay {nv12.width}x{nv12.height}')
+    random_inputs_run_fn(run_policy_jit, SEED+2+i, camera_config=nv12)
+  run_policy_jit.max_frame_size = max_frame_size
+  run_policy_jit.max_camera_size = (max_nv12.width, max_nv12.height)
   return run_policy_jit
 
 
@@ -245,6 +306,8 @@ def _parse_size(s):
 
 
 def read_file_chunked_to_shm(path):
+  from openpilot.system.hardware.hw import Paths
+
   shm_path = os.path.join(Paths.shm_path(), os.path.basename(path))
   atexit.register(lambda: os.path.exists(shm_path) and os.remove(shm_path))
   with open(shm_path, 'wb') as f:
@@ -274,14 +337,15 @@ def read_file_chunked_to_shm(path):
   out['metadata']['vision'] = make_metadata_dict(vision_path)
   out['metadata']['policy'] = make_metadata_dict(policy_path)
 
-  for cam_w, cam_h in args.camera_resolutions:
-    nv12 = NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h))
-    model_w, model_h = args.model_size
-    out[(cam_w,cam_h)] = {
-      name: compile_modeld(nv12, model_w, model_h, prepare_only, args.frame_skip,
-                           vision_runner, policy_runner, out['metadata']['vision'], out['metadata']['policy'])
-      for name, prepare_only in [('warp_enqueue', True), ('run_policy', False)]
-    }
+  camera_configs = [NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h)) for cam_w, cam_h in args.camera_resolutions]
+  model_w, model_h = args.model_size
+  out['camera_configs'] = {nv12[:2]: nv12 for nv12 in camera_configs}
+  out['max_frame_size'] = max(nv12.size for nv12 in camera_configs)
+  out['symbolic'] = {
+    name: compile_modeld(camera_configs, model_w, model_h, prepare_only, args.frame_skip,
+                         vision_runner, policy_runner, out['metadata']['vision'], out['metadata']['policy'])
+    for name, prepare_only in [('warp_enqueue', True), ('run_policy', False)]
+  }
 
   with open(args.output, "wb") as f:
     pickle.dump(out, f)
diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py
index eaf423e7beaa09..918a1e9f50f223 100755
--- a/selfdrive/modeld/dmonitoringmodeld.py
+++ b/selfdrive/modeld/dmonitoringmodeld.py
@@ -16,6 +16,7 @@
 from openpilot.system.camerad.cameras.nv12_info import get_nv12_info
 from openpilot.common.file_chunker import read_file_chunked
 from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid, safe_exp
+from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, bind_camera_vars, make_camera_vars
 
 PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
@@ -44,8 +45,20 @@ def __init__(self, cam_w: int, cam_h: int):
     self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
     self._blob_cache : dict[int, Tensor] = {}
     self.model_run = pickle.loads(read_file_chunked(str(MODEL_PKL_PATH)))
-    with open(MODELS_DIR / f'dm_warp_{cam_w}x{cam_h}_tinygrad.pkl', "rb") as f:
-      self.image_warp = pickle.load(f)
+    self.nv12 = NV12Frame(cam_w, cam_h, *self.frame_buf_params)
+    dm_warp_path = MODELS_DIR / 'dm_warp_tinygrad.pkl'
+    if dm_warp_path.is_file():
+      with open(dm_warp_path, "rb") as f:
+        dm_warp = pickle.load(f)
+      self.image_warp = dm_warp['warp']
+      self.max_frame_size = dm_warp['max_frame_size']
+      self.camera_vars, _ = make_camera_vars(list(dm_warp['camera_configs'].values()))
+      self.camera_args = bind_camera_vars(self.camera_vars, self.nv12)
+    else:
+      with open(MODELS_DIR / f'dm_warp_{cam_w}x{cam_h}_tinygrad.pkl', "rb") as f:
+        self.image_warp = pickle.load(f)
+      self.max_frame_size = self.frame_buf_params[3]
+      self.camera_args = {}
 
   def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple[np.ndarray, float]:
     self.numpy_inputs['calib'][0,:] = calib
@@ -55,10 +68,10 @@ def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple
     ptr = np.frombuffer(buf.data, dtype=np.uint8).ctypes.data
     # There is a ringbuffer of imgs, just cache tensors pointing to all of them
     if ptr not in self._blob_cache:
-      self._blob_cache[ptr] = Tensor.from_blob(ptr, (self.frame_buf_params[3],), dtype='uint8', device=self.DEV)
+      self._blob_cache[ptr] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.DEV)
 
     self.warp_inputs_np['transform'][:] = transform[:]
-    self.tensor_inputs['input_img'] = self.image_warp(self._blob_cache[ptr], self.warp_inputs['transform'])
+    self.tensor_inputs['input_img'] = self.image_warp(self._blob_cache[ptr], self.warp_inputs['transform'], **self.camera_args)
 
     output = self.model_run(**self.tensor_inputs).numpy().flatten()
 
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index 0c13b322df3b69..f185d605c4f438 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -21,6 +21,7 @@
 from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan
 from openpilot.selfdrive.modeld.parse_model_outputs import Parser
 from openpilot.selfdrive.modeld.compile_modeld import make_input_queues
+from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, bind_camera_vars, make_camera_vars
 from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState
 from openpilot.common.file_chunker import read_file_chunked, get_manifest_path
 from openpilot.selfdrive.modeld.constants import ModelConstants, Plan
@@ -93,12 +94,21 @@ def __init__(self, cam_w: int, cam_h: int, usbgpu: bool):
     self._blob_cache : dict[int, Tensor] = {}
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')}
-    self.run_policy = jits[(cam_w,cam_h)]['run_policy']
-    self.warp_enqueue = jits[(cam_w,cam_h)]['warp_enqueue']
+    self.nv12 = NV12Frame(cam_w, cam_h, *self.frame_buf_params['img'])
+    self.max_frame_size = jits.get('max_frame_size', self.nv12.size)
+    self.camera_vars, _ = make_camera_vars(list(jits.get('camera_configs', {self.nv12[:2]: self.nv12}).values()))
+    if 'symbolic' in jits:
+      self.run_policy = jits['symbolic']['run_policy']
+      self.warp_enqueue = jits['symbolic']['warp_enqueue']
+    else:
+      self.run_policy = jits[(cam_w,cam_h)]['run_policy']
+      self.warp_enqueue = jits[(cam_w,cam_h)]['warp_enqueue']
+    self.camera_args = bind_camera_vars(self.camera_vars, self.nv12) if 'symbolic' in jits else {}
     self.warp_enqueue(
       **self.input_queues,
-      frame=Tensor(np.zeros(self.frame_buf_params['img'][3], dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(),
-      big_frame=Tensor(np.zeros(self.frame_buf_params['big_img'][3], dtype=np.uint8), device=self.WARP_DEV).contiguous().realize())
+      frame=Tensor(np.zeros(self.max_frame_size, dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(),
+      big_frame=Tensor(np.zeros(self.max_frame_size, dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(),
+      **self.camera_args)
 
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
@@ -108,11 +118,10 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
                 inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
     for key in bufs.keys():
       ptr = np.frombuffer(bufs[key].data, dtype=np.uint8).ctypes.data
-      yuv_size = self.frame_buf_params[key][3]
       # There is a ringbuffer of imgs, just cache tensors pointing to all of them
       cache_key = (key, ptr)
       if cache_key not in self._blob_cache:
-        self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8', device=self.WARP_DEV)
+        self._blob_cache[cache_key] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.WARP_DEV)
       self.full_frames[key] = self._blob_cache[cache_key]
 
     # Model decides when action is completed, so desire input is just a pulse triggered on rising edge
@@ -124,11 +133,11 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     self.npy['big_tfm'][:,:] = transforms['big_img'][:,:]
 
     if prepare_only:
-      self.warp_enqueue(**self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'])
+      self.warp_enqueue(**self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'], **self.camera_args)
       return None
 
     vision_output, policy_output = self.run_policy(
-      **self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img']
+      **self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'], **self.camera_args
     )
 
     vision_output = vision_output.numpy().flatten()

From 3a4218a3f1502fd90e24528ad9e5634c1176a1be Mon Sep 17 00:00:00 2001
From: haraschax <6804392+haraschax@users.noreply.github.com>
Date: Thu, 21 May 2026 17:00:55 -0700
Subject: [PATCH 2/6] modeld: satisfy zip strict lint

---
 selfdrive/modeld/compile_modeld.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 883018f4799e4f..4667725796be7e 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -54,7 +54,7 @@ def _optimize_local_size_or_skip(call, prg):
     preferred = (32, 16, 1)
     local_size = tuple(next(x for x in range(min(preferred[i] if i < len(preferred) else 1, g), 0, -1) if g % x == 0)
                        for i, g in enumerate(prg.arg.global_size))
-    new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size))
+    new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size, strict=True))
     return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:]))
 
 

From e34a84d5fc54bb87a0626c31d16d1a1bc402cf6b Mon Sep 17 00:00:00 2001
From: haraschax <6804392+haraschax@users.noreply.github.com>
Date: Thu, 21 May 2026 17:10:05 -0700
Subject: [PATCH 3/6] modeld: pad symbolic camera buffers

---
 selfdrive/modeld/dmonitoringmodeld.py | 12 +++++++++++-
 selfdrive/modeld/modeld.py            | 12 +++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py
index 918a1e9f50f223..75f1dc2222a87f 100755
--- a/selfdrive/modeld/dmonitoringmodeld.py
+++ b/selfdrive/modeld/dmonitoringmodeld.py
@@ -60,6 +60,16 @@ def __init__(self, cam_w: int, cam_h: int):
       self.max_frame_size = self.frame_buf_params[3]
       self.camera_args = {}
 
+  def frame_from_blob(self, ptr: int) -> Tensor:
+    yuv_size = self.frame_buf_params[3]
+    if yuv_size > self.max_frame_size:
+      raise RuntimeError(f"driver frame size {yuv_size} exceeds compiled max frame size {self.max_frame_size}")
+    # NOTE(review): cat() below presumably copies rather than views ptr, and run() caches the result per ptr -- confirm padded frames cannot go stale
+    frame = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8', device=self.DEV)
+    if yuv_size < self.max_frame_size:
+      frame = frame.cat(Tensor.zeros(self.max_frame_size - yuv_size, dtype='uint8', device=self.DEV))
+    return frame
+
   def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple[np.ndarray, float]:
     self.numpy_inputs['calib'][0,:] = calib
 
@@ -68,7 +78,7 @@ def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple
     ptr = np.frombuffer(buf.data, dtype=np.uint8).ctypes.data
     # There is a ringbuffer of imgs, just cache tensors pointing to all of them
     if ptr not in self._blob_cache:
-      self._blob_cache[ptr] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.DEV)
+      self._blob_cache[ptr] = self.frame_from_blob(ptr)
 
     self.warp_inputs_np['transform'][:] = transform[:]
     self.tensor_inputs['input_img'] = self.image_warp(self._blob_cache[ptr], self.warp_inputs['transform'], **self.camera_args)
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
index f185d605c4f438..8015ad4cfc4847 100755
--- a/selfdrive/modeld/modeld.py
+++ b/selfdrive/modeld/modeld.py
@@ -110,6 +110,16 @@ def __init__(self, cam_w: int, cam_h: int, usbgpu: bool):
       big_frame=Tensor(np.zeros(self.max_frame_size, dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(),
       **self.camera_args)
 
+  def frame_from_blob(self, key: str, ptr: int) -> Tensor:
+    yuv_size = self.frame_buf_params[key][3]
+    if yuv_size > self.max_frame_size:
+      raise RuntimeError(f"{key} frame size {yuv_size} exceeds compiled max frame size {self.max_frame_size}")
+    # NOTE(review): cat() below presumably copies rather than views ptr, and run() caches the result per (key, ptr) -- confirm padded frames cannot go stale
+    frame = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8', device=self.WARP_DEV)
+    if yuv_size < self.max_frame_size:
+      frame = frame.cat(Tensor.zeros(self.max_frame_size - yuv_size, dtype='uint8', device=self.WARP_DEV))
+    return frame
+
   def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
     parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
     return parsed_model_outputs
@@ -121,7 +131,7 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
       # There is a ringbuffer of imgs, just cache tensors pointing to all of them
       cache_key = (key, ptr)
       if cache_key not in self._blob_cache:
-        self._blob_cache[cache_key] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.WARP_DEV)
+        self._blob_cache[cache_key] = self.frame_from_blob(key, ptr)
       self.full_frames[key] = self._blob_cache[cache_key]
 
     # Model decides when action is completed, so desire input is just a pulse triggered on rising edge

From 55ba1d21b2fef16e72dee44021e6d474716e4cec Mon Sep 17 00:00:00 2001
From: haraschax <6804392+haraschax@users.noreply.github.com>
Date: Thu, 21 May 2026 17:19:04 -0700
Subject: [PATCH 4/6] modeld: stabilize camera config pickle type

---
 selfdrive/modeld/camera_frame.py   | 4 ++++
 selfdrive/modeld/compile_modeld.py | 5 ++---
 2 files changed, 6 insertions(+), 3 deletions(-)
 create mode 100644 selfdrive/modeld/camera_frame.py

diff --git a/selfdrive/modeld/camera_frame.py b/selfdrive/modeld/camera_frame.py
new file mode 100644
index 00000000000000..ad0e610f6f43a2
--- /dev/null
+++ b/selfdrive/modeld/camera_frame.py
@@ -0,0 +1,4 @@
+from collections import namedtuple
+
+
+NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size'])
diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index 4667725796be7e..d33709a16ada9d 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -5,7 +5,7 @@
 import pickle
 import time
 from functools import partial
-from collections import namedtuple, defaultdict
+from collections import defaultdict
 
 import numpy as np
 
@@ -32,10 +32,9 @@ def fetch_fw(path, name, sha256):
 from tinygrad.engine.jit import TinyJit
 
 from openpilot.common.file_chunker import read_file_chunked
+from openpilot.selfdrive.modeld.camera_frame import NV12Frame
 
 
-NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size'])
-
 UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32)
 UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX)
 

From ac0d868c465a398efadc1625cfa6c9381e5164fd Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Fri, 22 May 2026 16:17:06 -0700
Subject: [PATCH 5/6] small fixes: drop NOLOCALS and local-size optimizer patch

---
 selfdrive/modeld/SConscript        |  4 +---
 selfdrive/modeld/compile_modeld.py | 30 ------------------------------
 2 files changed, 1 insertion(+), 33 deletions(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 27ab5f9dfad77c..89742fd0f113f7 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -19,8 +19,6 @@ def get_camera_configs():
     "tizi": (_ar_ox_fisheye.width, _ar_ox_fisheye.height),
     "mici": (_os_fisheye.width, _os_fisheye.height),
   }
-  if release or PC or 'CI' in os.environ:
-    return set(DEVICE_RESOLUTIONS.values())
   return [DEVICE_RESOLUTIONS[HARDWARE.get_device_type()]]
 
 CAMERA_CONFIGS = get_camera_configs()
@@ -48,7 +46,7 @@ if 'CUDA' in available:
   tg_flags = f'DEV={tg_backend}'
 elif 'QCOM' in available:
   tg_backend = 'QCOM'
-  tg_flags = f'DEV={tg_backend} IMAGE=1 FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1'
+  tg_flags = f'DEV={tg_backend} IMAGE=1 FLOAT16=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1'
 else:
   tg_backend = 'CPU'
   tg_flags = f'DEV=CPU' if arch == 'Darwin' else 'DEV=CPU:LLVM'
diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
index d33709a16ada9d..03155b003c56b5 100755
--- a/selfdrive/modeld/compile_modeld.py
+++ b/selfdrive/modeld/compile_modeld.py
@@ -40,36 +40,6 @@ def fetch_fw(path, name, sha256):
 
 WARP_DEV = os.getenv('WARP_DEV')
 
-_ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = None
-
-
-def _optimize_local_size_or_skip(call, prg):
-  try:
-    return _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE(call, prg)
-  except AssertionError as e:
-    if str(e) != "all optimize_local_size exec failed":
-      raise
-    from dataclasses import replace
-    preferred = (32, 16, 1)
-    local_size = tuple(next(x for x in range(min(preferred[i] if i < len(preferred) else 1, g), 0, -1) if g % x == 0)
-                       for i, g in enumerate(prg.arg.global_size))
-    new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size, strict=True))
-    return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:]))
-
-
-def _patch_tinygrad_local_size_optimizer():
-  global _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE
-  from tinygrad.engine import realize
-  from tinygrad.uop.ops import Ops, PatternMatcher, UPat
-
-  _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = realize.optimize_local_size
-  realize.pm_optimize_local_size = PatternMatcher([
-    (UPat(Ops.CALL, src=(UPat(Ops.PROGRAM, name="prg"),), name="call", allow_any_len=True), _optimize_local_size_or_skip),
-  ])
-
-
-_patch_tinygrad_local_size_optimizer()
-
 
 def make_camera_vars(camera_configs: list[NV12Frame]):
   max_cam_w = max(nv12.width for nv12 in camera_configs)

From f080742b3421b4d05b849f9bd782ec41a26823ab Mon Sep 17 00:00:00 2001
From: Bruce Wayne <harald.the.engineer@gmail.com>
Date: Fri, 22 May 2026 16:25:42 -0700
Subject: [PATCH 6/6] fix: return all device resolutions from get_camera_configs

---
 selfdrive/modeld/SConscript | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
index 89742fd0f113f7..2a913795eb3874 100644
--- a/selfdrive/modeld/SConscript
+++ b/selfdrive/modeld/SConscript
@@ -19,7 +19,8 @@ def get_camera_configs():
     "tizi": (_ar_ox_fisheye.width, _ar_ox_fisheye.height),
     "mici": (_os_fisheye.width, _os_fisheye.height),
   }
-  return [DEVICE_RESOLUTIONS[HARDWARE.get_device_type()]]
+  return set(DEVICE_RESOLUTIONS.values())
+
 
 CAMERA_CONFIGS = get_camera_configs()