From 9b47f66140c5deaf5bd98fcb7f9457bbcc0a4f9c Mon Sep 17 00:00:00 2001 From: haraschax <6804392+haraschax@users.noreply.github.com> Date: Thu, 21 May 2026 21:07:23 +0000 Subject: [PATCH 1/6] Compile modeld warps with symbolic camera sizes --- selfdrive/modeld/SConscript | 12 +-- selfdrive/modeld/compile_dm_warp.py | 43 ++++---- selfdrive/modeld/compile_modeld.py | 146 ++++++++++++++++++-------- selfdrive/modeld/dmonitoringmodeld.py | 21 +++- selfdrive/modeld/modeld.py | 25 +++-- 5 files changed, 169 insertions(+), 78 deletions(-) diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index eba951611697be..27ab5f9dfad77c 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -121,12 +121,12 @@ lenv.Command(fn + "_metadata.pkl", [fn + ".onnx"] + tinygrad_files + script_file dm_w, dm_h = DM_INPUT_SIZE compile_dm_warp_script = [File(f"{modeld_dir}/compile_dm_warp.py")] -for cam_w, cam_h in CAMERA_CONFIGS: - dm_pkl_path = File(f"models/dm_warp_{cam_w}x{cam_h}_tinygrad.pkl").abspath - cmd = (f'{tg_flags} {mac_brew_string} python3 {modeld_dir}/compile_dm_warp.py ' - f'--camera-resolution {cam_w}x{cam_h} --warp-to {dm_w}x{dm_h} ' - f'--output {dm_pkl_path}') - lenv.Command(dm_pkl_path, tinygrad_files + compile_dm_warp_script + compile_modeld_script + [tg_devices_node], cmd) +dm_pkl_path = File("models/dm_warp_tinygrad.pkl").abspath +camera_res_args = ' '.join(f'{cw}x{ch}' for cw, ch in CAMERA_CONFIGS) +cmd = (f'{tg_flags} {mac_brew_string} python3 {modeld_dir}/compile_dm_warp.py ' + f'--camera-resolutions {camera_res_args} --warp-to {dm_w}x{dm_h} ' + f'--output {dm_pkl_path}') +lenv.Command(dm_pkl_path, tinygrad_files + compile_dm_warp_script + compile_modeld_script + [Value(camera_res_args), tg_devices_node], cmd) def tg_compile(flags, model_name): pythonpath_string = 'PYTHONPATH="${PYTHONPATH}:' + env.Dir("#tinygrad_repo").abspath + '"' diff --git a/selfdrive/modeld/compile_dm_warp.py b/selfdrive/modeld/compile_dm_warp.py index 548990ee15a7d9..b308efc1e75cd9 100755 --- a/selfdrive/modeld/compile_dm_warp.py +++ b/selfdrive/modeld/compile_dm_warp.py @@ -8,49 +8,54 @@ from tinygrad.engine.jit import TinyJit from openpilot.system.camerad.cameras.nv12_info import get_nv12_info -from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, warp_perspective_tinygrad, _parse_size +from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, bind_camera_vars, make_camera_vars, warp_perspective_tinygrad, _parse_size -def make_warp_dm(nv12: NV12Frame, dm_w, dm_h): - cam_w, cam_h, stride, _, _, _ = nv12 - stride_pad = stride - cam_w - - def warp_dm(input_frame, M_inv): +def make_warp_dm(dm_w, dm_h): + def warp_dm(input_frame, M_inv, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset): M_inv = M_inv.to(Device.DEFAULT).realize() - return warp_perspective_tinygrad(input_frame[:cam_h*stride], M_inv, - (dm_w, dm_h), (cam_h, cam_w), stride_pad, border_fill_val=0).reshape(-1, dm_h * dm_w) + return warp_perspective_tinygrad(input_frame, M_inv, (dm_w, dm_h), + (cam_h, cam_w), stride, border_fill_val=0).reshape(-1, dm_h * dm_w) return warp_dm -def compile_dm_warp(nv12: NV12Frame, dm_w, dm_h, pkl_path): - print(f"Compiling DM warp for {nv12.width}x{nv12.height} -> {dm_w}x{dm_h}...") +def compile_dm_warp(camera_configs: list[NV12Frame], dm_w, dm_h, pkl_path): + print(f"Compiling DM warp for {len(camera_configs)} camera sizes -> {dm_w}x{dm_h}...") - warp_dm_jit = TinyJit(make_warp_dm(nv12, dm_w, dm_h), prune=True) + camera_vars, max_frame_size = make_camera_vars(camera_configs) + warp_dm_jit = TinyJit(make_warp_dm(dm_w, dm_h), prune=True) for i in range(10): - frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8').realize() + nv12 = camera_configs[i % len(camera_configs)] + frame = Tensor.randint(max_frame_size, low=0, high=256, dtype='uint8').realize() M_inv = Tensor(Tensor.randn(3, 3).mul(8).realize().numpy(), device='NPY') Device.default.synchronize() st = time.perf_counter() - warp_dm_jit(frame, M_inv).realize() + warp_dm_jit(frame, M_inv, **bind_camera_vars(camera_vars, nv12)).realize() mt = time.perf_counter() Device.default.synchronize() et = time.perf_counter() - print(f" [{i+1}/10] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms") + print(f" [{i+1}/10] {nv12.width}x{nv12.height} enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms") with open(pkl_path, "wb") as f: - pickle.dump(warp_dm_jit, f) + pickle.dump({ + 'warp': warp_dm_jit, + 'camera_configs': {nv12[:2]: nv12 for nv12 in camera_configs}, + 'max_frame_size': max_frame_size, + }, f) print(f" Saved to {pkl_path}") if __name__ == "__main__": p = argparse.ArgumentParser() - p.add_argument('--camera-resolution', type=_parse_size, required=True, help='camera resolution WxH') + p.add_argument('--camera-resolution', type=_parse_size, help='camera resolution WxH') + p.add_argument('--camera-resolutions', type=_parse_size, nargs='+', help='camera resolutions WxH (one or more)') p.add_argument('--warp-to', type=_parse_size, required=True, help='DM input WxH') p.add_argument('--output', required=True) args = p.parse_args() - cam_w, cam_h = args.camera_resolution - nv12 = NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h)) + camera_resolutions = args.camera_resolutions or ([args.camera_resolution] if args.camera_resolution else None) + assert camera_resolutions is not None, "one of --camera-resolution or --camera-resolutions is required" + camera_configs = [NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h)) for cam_w, cam_h in camera_resolutions] dm_w, dm_h = args.warp_to - compile_dm_warp(nv12, dm_w, dm_h, args.output) + compile_dm_warp(camera_configs, dm_w, dm_h, args.output) diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py index f919d1da23a0b7..883018f4799e4f 100755 --- a/selfdrive/modeld/compile_modeld.py +++ b/selfdrive/modeld/compile_modeld.py @@ -26,12 +26,12 @@ def fetch_fw(path, name, sha256): _patch_tinygrad_fetch_fw() from tinygrad.tensor import Tensor +from tinygrad import Variable from tinygrad.helpers import Context from tinygrad.device import Device from tinygrad.engine.jit import TinyJit from openpilot.common.file_chunker import read_file_chunked -from openpilot.system.hardware.hw import Paths NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size']) @@ -41,8 +41,66 @@ def fetch_fw(path, name, sha256): WARP_DEV = os.getenv('WARP_DEV') +_ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = None + + +def _optimize_local_size_or_skip(call, prg): + try: + return _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE(call, prg) + except AssertionError as e: + if str(e) != "all optimize_local_size exec failed": + raise + from dataclasses import replace + preferred = (32, 16, 1) + local_size = tuple(next(x for x in range(min(preferred[i] if i < len(preferred) else 1, g), 0, -1) if g % x == 0) + for i, g in enumerate(prg.arg.global_size)) + new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size)) + return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:])) + + +def _patch_tinygrad_local_size_optimizer(): + global _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE + from tinygrad.engine import realize + from tinygrad.uop.ops import Ops, PatternMatcher, UPat + + _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = realize.optimize_local_size + realize.pm_optimize_local_size = PatternMatcher([ + (UPat(Ops.CALL, src=(UPat(Ops.PROGRAM, name="prg"),), name="call", allow_any_len=True), _optimize_local_size_or_skip), + ]) + + +_patch_tinygrad_local_size_optimizer() + + +def make_camera_vars(camera_configs: list[NV12Frame]): + max_cam_w = max(nv12.width for nv12 in camera_configs) + max_cam_h = max(nv12.height for nv12 in camera_configs) + max_stride = max(nv12.stride for nv12 in camera_configs) + max_uv_offset = max(nv12.stride * nv12.y_height for nv12 in camera_configs) + max_frame_size = max(nv12.size for nv12 in camera_configs) + return { + 'cam_w': Variable('cam_w', 1, max_cam_w), + 'cam_h': Variable('cam_h', 1, max_cam_h), + 'chroma_w': Variable('chroma_w', 1, max_cam_w // 2), + 'chroma_h': Variable('chroma_h', 1, max_cam_h // 2), + 'stride': Variable('stride', 1, max_stride), + 'uv_offset': Variable('uv_offset', 1, max_uv_offset), + }, max_frame_size + + +def bind_camera_vars(camera_vars, nv12: NV12Frame): + values = { + 'cam_w': nv12.width, + 'cam_h': nv12.height, + 'chroma_w': nv12.width // 2, + 'chroma_h': nv12.height // 2, + 'stride': nv12.stride, + 'uv_offset': nv12.stride * nv12.y_height, + } + return {k: v.bind(values[k]) for k, v in camera_vars.items()} + -def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad, border_fill_val=None): +def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride, src_offset=0, x_step=1, channel=0, border_fill_val=None): w_dst, h_dst = dst_shape h_src, w_src = src_shape @@ -61,7 +119,7 @@ def warp_perspective_tinygrad(src_flat, M_inv, dst_shape, src_shape, stride_pad, y_round = Tensor.round(src_y) x_nn_clipped = x_round.clip(0, w_src - 1).cast('int') y_nn_clipped = y_round.clip(0, h_src - 1).cast('int') - idx = y_nn_clipped * (w_src + stride_pad) + x_nn_clipped + idx = y_nn_clipped * stride + x_nn_clipped * x_step + src_offset + channel sampled = src_flat[idx] if border_fill_val is None: @@ -84,26 +142,18 @@ def frames_to_tensor(frames): return in_img1 -def make_frame_prepare(nv12: NV12Frame, model_w, model_h): - cam_w, cam_h, stride, y_height, uv_height, _ = nv12 - uv_offset = stride * y_height - stride_pad = stride - cam_w - - def frame_prepare_tinygrad(input_frame, M_inv): +def make_frame_prepare(model_w, model_h): + def frame_prepare_tinygrad(input_frame, M_inv, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset): # UV_SCALE @ M_inv @ UV_SCALE_INV simplifies to elementwise scaling M_inv_uv = M_inv * Tensor([[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [2.0, 2.0, 1.0]], device=WARP_DEV) - # deinterleave NV12 UV plane (UVUV... -> separate U, V) - uv = input_frame[uv_offset:uv_offset + uv_height * stride].reshape(uv_height, stride) with Context(SPLIT_REDUCEOP=0): - y = warp_perspective_tinygrad(input_frame[:cam_h*stride], - M_inv, (model_w, model_h), - (cam_h, cam_w), stride_pad).realize() - u = warp_perspective_tinygrad(uv[:cam_h//2, :cam_w:2].flatten(), - M_inv_uv, (model_w//2, model_h//2), - (cam_h//2, cam_w//2), 0).realize() - v = warp_perspective_tinygrad(uv[:cam_h//2, 1:cam_w:2].flatten(), - M_inv_uv, (model_w//2, model_h//2), - (cam_h//2, cam_w//2), 0).realize() + y = warp_perspective_tinygrad(input_frame, M_inv, (model_w, model_h), + (cam_h, cam_w), stride).realize() + # Gather directly from interleaved NV12 UV memory so symbolic widths avoid step=2 slicing. + u = warp_perspective_tinygrad(input_frame, M_inv_uv, (model_w//2, model_h//2), + (chroma_h, chroma_w), stride, uv_offset, x_step=2, channel=0).realize() + v = warp_perspective_tinygrad(input_frame, M_inv_uv, (model_w//2, model_h//2), + (chroma_h, chroma_w), stride, uv_offset, x_step=2, channel=1).realize() yuv = y.cat(u).cat(v).reshape((model_h * 3 // 2, model_w)) tensor = frames_to_tensor(yuv) return tensor @@ -148,21 +198,22 @@ def sample_desire(buf, frame_skip): return buf.reshape(-1, frame_skip, *buf.shape[1:]).max(1).flatten(0, 1).unsqueeze(0) -def make_run_policy(vision_runner, policy_runner, nv12: NV12Frame, model_w, model_h, +def make_run_policy(vision_runner, policy_runner, model_w, model_h, vision_features_slice, frame_skip, prepare_only=False): - frame_prepare = make_frame_prepare(nv12, model_w, model_h) + frame_prepare = make_frame_prepare(model_w, model_h) sample_skip_fn = partial(sample_skip, frame_skip=frame_skip) sample_desire_fn = partial(sample_desire, frame_skip=frame_skip) - def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, frame, big_frame): + def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, tfm, big_tfm, + frame, big_frame, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset): tfm = tfm.to(WARP_DEV) big_tfm = big_tfm.to(WARP_DEV) desire = desire.to(Device.DEFAULT) traffic_convention = traffic_convention.to(Device.DEFAULT) Tensor.realize(tfm, big_tfm, desire, traffic_convention) - warped_frame = frame_prepare(frame, tfm).unsqueeze(0).to(Device.DEFAULT) - warped_big_frame = frame_prepare(big_frame, big_tfm).unsqueeze(0).to(Device.DEFAULT) + warped_frame = frame_prepare(frame, tfm, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset).unsqueeze(0).to(Device.DEFAULT) + warped_big_frame = frame_prepare(big_frame, big_tfm, cam_w, cam_h, chroma_w, chroma_h, stride, uv_offset).unsqueeze(0).to(Device.DEFAULT) img = shift_and_sample(img_q, warped_frame, sample_skip_fn) big_img = shift_and_sample(big_img_q, warped_big_frame, sample_skip_fn) @@ -182,21 +233,24 @@ def run_policy(img_q, big_img_q, feat_q, desire_q, desire, traffic_convention, t return run_policy -def compile_modeld(nv12: NV12Frame, model_w, model_h, prepare_only, frame_skip, +def compile_modeld(camera_configs: list[NV12Frame], model_w, model_h, prepare_only, frame_skip, vision_runner, policy_runner, vision_metadata, policy_metadata): - print(f"Compiling combined policy JIT for {nv12.width}x{nv12.height} (prepare_only={prepare_only})...") + print(f"Compiling combined policy JIT for {len(camera_configs)} camera sizes (prepare_only={prepare_only})...") vision_features_slice = vision_metadata['output_slices']['hidden_state'] vision_input_shapes = vision_metadata['input_shapes'] policy_input_shapes = policy_metadata['input_shapes'] - _run = make_run_policy(vision_runner, policy_runner, nv12, model_w, model_h, + camera_vars, max_frame_size = make_camera_vars(camera_configs) + max_nv12 = max(camera_configs, key=lambda n: n.size) + + _run = make_run_policy(vision_runner, policy_runner, model_w, model_h, vision_features_slice, frame_skip, prepare_only) run_policy_jit = TinyJit(_run, prune=True) SEED = 42 - def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_match=True): + def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_match=True, camera_config=None): input_queues, npy = make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip, Device.DEFAULT) np.random.seed(seed) Tensor.manual_seed(seed) @@ -205,17 +259,19 @@ def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_matc n_runs = 1 if testing else 3 for i in range(n_runs): - frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize() - big_frame = Tensor.randint(nv12.size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize() + nv12 = camera_config or camera_configs[0] + camera_args = bind_camera_vars(camera_vars, nv12) + frame = Tensor.randint(max_frame_size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize() + big_frame = Tensor.randint(max_frame_size, low=0, high=256, dtype='uint8', device=WARP_DEV).realize() for v in npy.values(): v[:] = np.random.randn(*v.shape).astype(v.dtype) Device.default.synchronize() st = time.perf_counter() - outs = fn(**input_queues, frame=frame, big_frame=big_frame) + outs = fn(**input_queues, frame=frame, big_frame=big_frame, **camera_args) mt = time.perf_counter() Device.default.synchronize() et = time.perf_counter() - print(f" [{i+1}/{n_runs}] enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms") + print(f" [{i+1}/{n_runs}] {nv12.width}x{nv12.height} enqueue {(mt-st)*1e3:6.2f} ms -- total {(et-st)*1e3:6.2f} ms") if i == 0: val = [np.copy(v.numpy()) for v in outs] @@ -236,6 +292,11 @@ def random_inputs_run_fn(fn, seed, test_val=None, test_buffers=None, expect_matc run_policy_jit = pickle.loads(pickle.dumps(run_policy_jit)) random_inputs_run_fn(run_policy_jit, SEED, test_val, test_buffers, expect_match=True) random_inputs_run_fn(run_policy_jit, SEED+1, test_val, test_buffers, expect_match=False) + for i, nv12 in enumerate(camera_configs[1:]): + print(f'symbolic replay {nv12.width}x{nv12.height}') + random_inputs_run_fn(run_policy_jit, SEED+2+i, camera_config=nv12) + run_policy_jit.max_frame_size = max_frame_size + run_policy_jit.max_camera_size = (max_nv12.width, max_nv12.height) return run_policy_jit @@ -245,6 +306,8 @@ def _parse_size(s): def read_file_chunked_to_shm(path): + from openpilot.system.hardware.hw import Paths + shm_path = os.path.join(Paths.shm_path(), os.path.basename(path)) atexit.register(lambda: os.path.exists(shm_path) and os.remove(shm_path)) with open(shm_path, 'wb') as f: @@ -274,14 +337,15 @@ def read_file_chunked_to_shm(path): out['metadata']['vision'] = make_metadata_dict(vision_path) out['metadata']['policy'] = make_metadata_dict(policy_path) - for cam_w, cam_h in args.camera_resolutions: - nv12 = NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h)) - model_w, model_h = args.model_size - out[(cam_w,cam_h)] = { - name: compile_modeld(nv12, model_w, model_h, prepare_only, args.frame_skip, - vision_runner, policy_runner, out['metadata']['vision'], out['metadata']['policy']) - for name, prepare_only in [('warp_enqueue', True), ('run_policy', False)] - } + camera_configs = [NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h)) for cam_w, cam_h in args.camera_resolutions] + model_w, model_h = args.model_size + out['camera_configs'] = {nv12[:2]: nv12 for nv12 in camera_configs} + out['max_frame_size'] = max(nv12.size for nv12 in camera_configs) + out['symbolic'] = { + name: compile_modeld(camera_configs, model_w, model_h, prepare_only, args.frame_skip, + vision_runner, policy_runner, out['metadata']['vision'], out['metadata']['policy']) + for name, prepare_only in [('warp_enqueue', True), ('run_policy', False)] + } with open(args.output, "wb") as f: pickle.dump(out, f) diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py index eaf423e7beaa09..918a1e9f50f223 100755 --- a/selfdrive/modeld/dmonitoringmodeld.py +++ b/selfdrive/modeld/dmonitoringmodeld.py @@ -16,6 +16,7 @@ from openpilot.system.camerad.cameras.nv12_info import get_nv12_info from openpilot.common.file_chunker import read_file_chunked from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid, safe_exp +from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, bind_camera_vars, make_camera_vars PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld" SEND_RAW_PRED = os.getenv('SEND_RAW_PRED') @@ -44,8 +45,20 @@ def __init__(self, cam_w: int, cam_h: int): self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()} self._blob_cache : dict[int, Tensor] = {} self.model_run = pickle.loads(read_file_chunked(str(MODEL_PKL_PATH))) - with open(MODELS_DIR / f'dm_warp_{cam_w}x{cam_h}_tinygrad.pkl', "rb") as f: - self.image_warp = pickle.load(f) + self.nv12 = NV12Frame(cam_w, cam_h, *self.frame_buf_params) + dm_warp_path = MODELS_DIR / 'dm_warp_tinygrad.pkl' + if dm_warp_path.is_file(): + with open(dm_warp_path, "rb") as f: + dm_warp = pickle.load(f) + self.image_warp = dm_warp['warp'] + self.max_frame_size = dm_warp['max_frame_size'] + self.camera_vars, _ = make_camera_vars(list(dm_warp['camera_configs'].values())) + self.camera_args = bind_camera_vars(self.camera_vars, self.nv12) + else: + with open(MODELS_DIR / f'dm_warp_{cam_w}x{cam_h}_tinygrad.pkl', "rb") as f: + self.image_warp = pickle.load(f) + self.max_frame_size = self.frame_buf_params[3] + self.camera_args = {} def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple[np.ndarray, float]: self.numpy_inputs['calib'][0,:] = calib @@ -55,10 +68,10 @@ def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple ptr = np.frombuffer(buf.data, dtype=np.uint8).ctypes.data # There is a ringbuffer of imgs, just cache tensors pointing to all of them if ptr not in self._blob_cache: - self._blob_cache[ptr] = Tensor.from_blob(ptr, (self.frame_buf_params[3],), dtype='uint8', device=self.DEV) + self._blob_cache[ptr] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.DEV) self.warp_inputs_np['transform'][:] = transform[:] - self.tensor_inputs['input_img'] = self.image_warp(self._blob_cache[ptr], self.warp_inputs['transform']) + self.tensor_inputs['input_img'] = self.image_warp(self._blob_cache[ptr], self.warp_inputs['transform'], **self.camera_args) output = self.model_run(**self.tensor_inputs).numpy().flatten() diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py index 0c13b322df3b69..f185d605c4f438 100755 --- a/selfdrive/modeld/modeld.py +++ b/selfdrive/modeld/modeld.py @@ -21,6 +21,7 @@ from openpilot.selfdrive.controls.lib.drive_helpers import get_accel_from_plan, smooth_value, get_curvature_from_plan from openpilot.selfdrive.modeld.parse_model_outputs import Parser from openpilot.selfdrive.modeld.compile_modeld import make_input_queues +from openpilot.selfdrive.modeld.compile_modeld import NV12Frame, bind_camera_vars, make_camera_vars from openpilot.selfdrive.modeld.fill_model_msg import fill_model_msg, fill_pose_msg, PublishState from openpilot.common.file_chunker import read_file_chunked, get_manifest_path from openpilot.selfdrive.modeld.constants import ModelConstants, Plan @@ -93,12 +94,21 @@ def __init__(self, cam_w: int, cam_h: int, usbgpu: bool): self._blob_cache : dict[int, Tensor] = {} self.parser = Parser() self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')} - self.run_policy = jits[(cam_w,cam_h)]['run_policy'] - self.warp_enqueue = jits[(cam_w,cam_h)]['warp_enqueue'] + self.nv12 = NV12Frame(cam_w, cam_h, *self.frame_buf_params['img']) + self.max_frame_size = jits.get('max_frame_size', self.nv12.size) + self.camera_vars, _ = make_camera_vars(list(jits.get('camera_configs', {self.nv12[:2]: self.nv12}).values())) + if 'symbolic' in jits: + self.run_policy = jits['symbolic']['run_policy'] + self.warp_enqueue = jits['symbolic']['warp_enqueue'] + else: + self.run_policy = jits[(cam_w,cam_h)]['run_policy'] + self.warp_enqueue = jits[(cam_w,cam_h)]['warp_enqueue'] + self.camera_args = bind_camera_vars(self.camera_vars, self.nv12) if 'symbolic' in jits else {} self.warp_enqueue( **self.input_queues, - frame=Tensor(np.zeros(self.frame_buf_params['img'][3], dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(), - big_frame=Tensor(np.zeros(self.frame_buf_params['big_img'][3], dtype=np.uint8), device=self.WARP_DEV).contiguous().realize()) + frame=Tensor(np.zeros(self.max_frame_size, dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(), + big_frame=Tensor(np.zeros(self.max_frame_size, dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(), + **self.camera_args) def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]: parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()} @@ -108,11 +118,10 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray], inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None: for key in bufs.keys(): ptr = np.frombuffer(bufs[key].data, dtype=np.uint8).ctypes.data - yuv_size = self.frame_buf_params[key][3] # There is a ringbuffer of imgs, just cache tensors pointing to all of them cache_key = (key, ptr) if cache_key not in self._blob_cache: - self._blob_cache[cache_key] = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8', device=self.WARP_DEV) + self._blob_cache[cache_key] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.WARP_DEV) self.full_frames[key] = self._blob_cache[cache_key] # Model decides when action is completed, so desire input is just a pulse triggered on rising edge @@ -124,11 +133,11 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray], self.npy['big_tfm'][:,:] = transforms['big_img'][:,:] if prepare_only: - self.warp_enqueue(**self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img']) + self.warp_enqueue(**self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'], **self.camera_args) return None vision_output, policy_output = self.run_policy( - **self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'] + **self.input_queues, frame=self.full_frames['img'], big_frame=self.full_frames['big_img'], **self.camera_args ) vision_output = vision_output.numpy().flatten() From 3a4218a3f1502fd90e24528ad9e5634c1176a1be Mon Sep 17 00:00:00 2001 From: haraschax <6804392+haraschax@users.noreply.github.com> Date: Thu, 21 May 2026 17:00:55 -0700 Subject: [PATCH 2/6] modeld: satisfy zip strict lint --- selfdrive/modeld/compile_modeld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py index 883018f4799e4f..4667725796be7e 100755 --- a/selfdrive/modeld/compile_modeld.py +++ b/selfdrive/modeld/compile_modeld.py @@ -54,7 +54,7 @@ def _optimize_local_size_or_skip(call, prg): preferred = (32, 16, 1) local_size = tuple(next(x for x in range(min(preferred[i] if i < len(preferred) else 1, g), 0, -1) if g % x == 0) for i, g in enumerate(prg.arg.global_size)) - new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size)) + new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size, strict=True)) return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:])) From e34a84d5fc54bb87a0626c31d16d1a1bc402cf6b Mon Sep 17 00:00:00 2001 From: haraschax <6804392+haraschax@users.noreply.github.com> Date: Thu, 21 May 2026 17:10:05 -0700 Subject: [PATCH 3/6] modeld: pad symbolic camera buffers --- selfdrive/modeld/dmonitoringmodeld.py | 12 +++++++++++- selfdrive/modeld/modeld.py | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py index 918a1e9f50f223..75f1dc2222a87f 100755 --- a/selfdrive/modeld/dmonitoringmodeld.py +++ b/selfdrive/modeld/dmonitoringmodeld.py @@ -60,6 +60,16 @@ def __init__(self, cam_w: int, cam_h: int): self.max_frame_size = self.frame_buf_params[3] self.camera_args = {} + def frame_from_blob(self, ptr: int) -> Tensor: + yuv_size = self.frame_buf_params[3] + if yuv_size > self.max_frame_size: + raise RuntimeError(f"driver frame size {yuv_size} exceeds compiled max frame size {self.max_frame_size}") + + frame = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8', device=self.DEV) + if yuv_size < self.max_frame_size: + frame = frame.cat(Tensor.zeros(self.max_frame_size - yuv_size, dtype='uint8', device=self.DEV)) + return frame + def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple[np.ndarray, float]: self.numpy_inputs['calib'][0,:] = calib @@ -68,7 +78,7 @@ def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple ptr = np.frombuffer(buf.data, dtype=np.uint8).ctypes.data # There is a ringbuffer of imgs, just cache tensors pointing to all of them if ptr not in self._blob_cache: - self._blob_cache[ptr] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.DEV) + self._blob_cache[ptr] = self.frame_from_blob(ptr) self.warp_inputs_np['transform'][:] = transform[:] self.tensor_inputs['input_img'] = self.image_warp(self._blob_cache[ptr], self.warp_inputs['transform'], **self.camera_args) diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py index f185d605c4f438..8015ad4cfc4847 100755 --- a/selfdrive/modeld/modeld.py +++ b/selfdrive/modeld/modeld.py @@ -110,6 +110,16 @@ def __init__(self, cam_w: int, cam_h: int, usbgpu: bool): big_frame=Tensor(np.zeros(self.max_frame_size, dtype=np.uint8), device=self.WARP_DEV).contiguous().realize(), **self.camera_args) + def frame_from_blob(self, key: str, ptr: int) -> Tensor: + yuv_size = self.frame_buf_params[key][3] + if yuv_size > self.max_frame_size: + raise RuntimeError(f"{key} frame size {yuv_size} exceeds compiled max frame size {self.max_frame_size}") + + frame = Tensor.from_blob(ptr, (yuv_size,), dtype='uint8', device=self.WARP_DEV) + if yuv_size < self.max_frame_size: + frame = frame.cat(Tensor.zeros(self.max_frame_size - yuv_size, dtype='uint8', device=self.WARP_DEV)) + return frame + def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]: parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()} return parsed_model_outputs @@ -121,7 +131,7 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray], # There is a ringbuffer of imgs, just cache tensors pointing to all of them cache_key = (key, ptr) if cache_key not in self._blob_cache: - self._blob_cache[cache_key] = Tensor.from_blob(ptr, (self.max_frame_size,), dtype='uint8', device=self.WARP_DEV) + self._blob_cache[cache_key] = self.frame_from_blob(key, ptr) self.full_frames[key] = self._blob_cache[cache_key] # Model decides when action is completed, so desire input is just a pulse triggered on rising edge From 55ba1d21b2fef16e72dee44021e6d474716e4cec Mon Sep 17 00:00:00 2001 From: haraschax <6804392+haraschax@users.noreply.github.com> Date: Thu, 21 May 2026 17:19:04 -0700 Subject: [PATCH 4/6] modeld: stabilize camera config pickle type --- selfdrive/modeld/camera_frame.py | 4 ++++ selfdrive/modeld/compile_modeld.py | 5 ++--- 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 selfdrive/modeld/camera_frame.py diff --git a/selfdrive/modeld/camera_frame.py b/selfdrive/modeld/camera_frame.py new file mode 100644 index 00000000000000..ad0e610f6f43a2 --- /dev/null +++ b/selfdrive/modeld/camera_frame.py @@ -0,0 +1,4 @@ +from collections import namedtuple + + +NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size']) diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py index 4667725796be7e..d33709a16ada9d 100755 --- a/selfdrive/modeld/compile_modeld.py +++ b/selfdrive/modeld/compile_modeld.py @@ -5,7 +5,7 @@ import pickle import time from functools import partial -from collections import namedtuple, defaultdict +from collections import defaultdict import numpy as np @@ -32,10 +32,9 @@ def fetch_fw(path, name, sha256): from tinygrad.engine.jit import TinyJit from openpilot.common.file_chunker import read_file_chunked +from openpilot.selfdrive.modeld.camera_frame import NV12Frame -NV12Frame = namedtuple("NV12Frame", ['width', 'height', 'stride', 'y_height', 'uv_height', 'size']) - UV_SCALE_MATRIX = np.array([[0.5, 0, 0], [0, 0.5, 0], [0, 0, 1]], dtype=np.float32) UV_SCALE_MATRIX_INV = np.linalg.inv(UV_SCALE_MATRIX) From ac0d868c465a398efadc1625cfa6c9381e5164fd Mon Sep 17 00:00:00 2001 From: Bruce Wayne Date: Fri, 22 May 2026 16:17:06 -0700 Subject: [PATCH 5/6] small fixes --- selfdrive/modeld/SConscript | 4 +--- selfdrive/modeld/compile_modeld.py | 30 ------------------------------ 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 27ab5f9dfad77c..89742fd0f113f7 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -19,8 +19,6 @@ def get_camera_configs(): "tizi": (_ar_ox_fisheye.width, _ar_ox_fisheye.height), "mici": (_os_fisheye.width, _os_fisheye.height), } - if release or PC or 'CI' in os.environ: - return set(DEVICE_RESOLUTIONS.values()) return [DEVICE_RESOLUTIONS[HARDWARE.get_device_type()]] CAMERA_CONFIGS = get_camera_configs() @@ -48,7 +46,7 @@ if 'CUDA' in available: tg_flags = f'DEV={tg_backend}' elif 'QCOM' in available: tg_backend = 'QCOM' - tg_flags = f'DEV={tg_backend} IMAGE=1 FLOAT16=1 NOLOCALS=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1' + tg_flags = f'DEV={tg_backend} IMAGE=1 FLOAT16=1 JIT_BATCH_SIZE=0 OPENPILOT_HACKS=1' else: tg_backend = 'CPU' tg_flags = f'DEV=CPU' if arch == 'Darwin' else 'DEV=CPU:LLVM' diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py index d33709a16ada9d..03155b003c56b5 100755 --- a/selfdrive/modeld/compile_modeld.py +++ b/selfdrive/modeld/compile_modeld.py @@ -40,36 +40,6 @@ def fetch_fw(path, name, sha256): WARP_DEV = os.getenv('WARP_DEV') -_ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = None - - -def _optimize_local_size_or_skip(call, prg): - try: - return _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE(call, prg) - except AssertionError as e: - if str(e) != "all optimize_local_size exec failed": - raise - from dataclasses import replace - preferred = (32, 16, 1) - local_size = tuple(next(x for x in range(min(preferred[i] if i < len(preferred) else 1, g), 0, -1) if g % x == 0) - for i, g in enumerate(prg.arg.global_size)) - new_global = tuple(g//l if g % l == 0 else g/l for g, l in zip(prg.arg.global_size, local_size, strict=True)) - return call.replace(src=(prg.replace(arg=replace(prg.arg, global_size=new_global, local_size=local_size)), *call.src[1:])) - - -def _patch_tinygrad_local_size_optimizer(): - global _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE - from tinygrad.engine import realize - from tinygrad.uop.ops import Ops, PatternMatcher, UPat - - _ORIG_TINYGRAD_OPTIMIZE_LOCAL_SIZE = realize.optimize_local_size - realize.pm_optimize_local_size = PatternMatcher([ - (UPat(Ops.CALL, src=(UPat(Ops.PROGRAM, name="prg"),), name="call", allow_any_len=True), _optimize_local_size_or_skip), - ]) - - -_patch_tinygrad_local_size_optimizer() - def make_camera_vars(camera_configs: list[NV12Frame]): max_cam_w = max(nv12.width for nv12 in camera_configs) From f080742b3421b4d05b849f9bd782ec41a26823ab Mon Sep 17 00:00:00 2001 From: Bruce Wayne Date: Fri, 22 May 2026 16:25:42 -0700 Subject: [PATCH 6/6] fix --- selfdrive/modeld/SConscript | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript index 89742fd0f113f7..2a913795eb3874 100644 --- a/selfdrive/modeld/SConscript +++ b/selfdrive/modeld/SConscript @@ -19,7 +19,8 @@ def get_camera_configs(): "tizi": (_ar_ox_fisheye.width, _ar_ox_fisheye.height), "mici": (_os_fisheye.width, _os_fisheye.height), } - return [DEVICE_RESOLUTIONS[HARDWARE.get_device_type()]] + return set(DEVICE_RESOLUTIONS.values()) + CAMERA_CONFIGS = get_camera_configs()