commaai · haraschax · Jun 3, 2026
diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
@@ -87,15 +87,12 @@ frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
 for usbgpu in [False, True] if USBGPU else [False]:
   target_pkl_path = File(modeld_pkl_path(usbgpu)).abspath
   file_prefix, cmd_flags = ('big_', usbgpu_tg_flags) if usbgpu else ('', tg_flags)
-  driving_onnx_deps = [p for m in [f'{file_prefix}driving_vision', f'{file_prefix}driving_on_policy', f'{file_prefix}driving_off_policy']
-                       for p in get_existing_chunks(File(f"models/{m}.onnx").abspath)]
+  driving_onnx_deps = get_existing_chunks(File(f"models/{file_prefix}driving.onnx").abspath)
   camera_res_args = ' '.join(f'{cw}x{ch}' for cw, ch in CAMERA_CONFIGS)
   cmd = (f'{cmd_flags} {mac_brew_string} python3 {modeld_dir}/compile_modeld.py '
         f'--model-size {model_w}x{model_h} '
         f'--camera-resolutions {camera_res_args} '
-        f'--vision-onnx {File(f"models/{file_prefix}driving_vision.onnx").abspath} '
-        f'--off-policy-onnx {File(f"models/{file_prefix}driving_off_policy.onnx").abspath} '
-        f'--on-policy-onnx {File(f"models/{file_prefix}driving_on_policy.onnx").abspath} '
+        f'--driving-onnx {File(f"models/{file_prefix}driving.onnx").abspath} '
         f'--output {target_pkl_path} --frame-skip {frame_skip}')
   onnx_sizes_sum = sum(os.path.getsize(f) for f in driving_onnx_deps)
   chunk_targets = get_chunk_targets(target_pkl_path, estimate_pickle_max_size(onnx_sizes_sum))

diff --git a/selfdrive/modeld/compile_modeld.py b/selfdrive/modeld/compile_modeld.py
@@ -113,8 +113,8 @@ def frame_prepare_tinygrad(input_frame, M_inv):
   return frame_prepare_tinygrad
 
 
-def make_warp_input_queues(vision_input_shapes, frame_skip, device):
-  img = vision_input_shapes['img']  # (1, 12, 128, 256)
+def make_warp_input_queues(model_input_shapes, frame_skip, device):
+  img = model_input_shapes['img']  # (1, 12, 128, 256)
   n_frames = img[1] // 6
   img_buf_shape = (frame_skip * (n_frames - 1) + 1, 6, img[2], img[3])
 
@@ -130,14 +130,13 @@ def make_warp_input_queues(vision_input_shapes, frame_skip, device):
   return input_queues, npy
 
 
-def make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip, device):
-  input_queues, npy = make_warp_input_queues(vision_input_shapes, frame_skip, device)
+def make_input_queues(model_input_shapes, frame_skip, device):
+  input_queues, npy = make_warp_input_queues(model_input_shapes, frame_skip, device)
 
-  fb = policy_input_shapes['features_buffer']  # (1, 25, 512)
-  dp = policy_input_shapes['desire_pulse']  # (1, 25, 8)
-  tc = policy_input_shapes['traffic_convention']  # (1, 2)
-  #TODO action_t is hardcoded to match tc for future compatibility
-  at = tc
+  fb = model_input_shapes['features_buffer']  # full model-run feature queue, time-major
+  dp = model_input_shapes['desire_pulse']  # sampled temporal desire input, (1, T, 8)
+  tc = model_input_shapes['traffic_convention']  # (1, 2)
+  at = model_input_shapes['action_t']  # (1, 2)
 
   policy_npy = {
     'desire': np.zeros(dp[2], dtype=np.float32),
@@ -146,7 +145,7 @@ def make_input_queues(vision_input_shapes, policy_input_shapes, frame_skip, devi
   }
   npy.update(policy_npy)
   input_queues.update({
-    'feat_q': Tensor(np.zeros((frame_skip * (fb[1] - 1) + 1, fb[0], fb[2]), dtype=np.float32), device=device).contiguous().realize(),
+    'feat_q': Tensor(np.zeros(fb, dtype=np.float32), device=device).contiguous().realize(),
     'desire_q': Tensor(np.zeros((frame_skip * dp[1], dp[0], dp[2]), dtype=np.float32), device=device).contiguous().realize(),
     **{k: Tensor(v, device='NPY').realize() for k, v in policy_npy.items()},
   })
@@ -183,31 +182,28 @@ def warp_enqueue(img_q, big_img_q, tfm, big_tfm, frame, big_frame):
   return warp_enqueue
 
 
-def make_run_policy(model_runners, model_metadata, frame_skip):
+def make_run_policy(model_runner, vision_features_slice, frame_skip):
   sample_desire_fn = partial(sample_desire, frame_skip=frame_skip)
-  sample_skip_fn = partial(sample_skip, frame_skip=frame_skip)
-  vision_features_slice = model_metadata['vision']['output_slices']['hidden_state']
 
   def run_policy(img, big_img, feat_q, desire_q, desire, traffic_convention, action_t):
     desire = desire.to(Device.DEFAULT)
     traffic_convention = traffic_convention.to(Device.DEFAULT)
     action_t = action_t.to(Device.DEFAULT)
     Tensor.realize(desire, traffic_convention, action_t)
     desire_buf = shift_and_sample(desire_q, desire.reshape(1, 1, -1), sample_desire_fn)
-    vision_out = next(iter(model_runners['vision']({'img': img, 'big_img': big_img}).values())).cast('float32')
-
-    new_feat = vision_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
-    feat_buf = shift_and_sample(feat_q, new_feat, sample_skip_fn)
 
     inputs = {
-      'features_buffer': feat_buf,
+      'img': img,
+      'big_img': big_img,
+      'features_buffer': feat_q,
       'desire_pulse': desire_buf,
       'traffic_convention': traffic_convention,
       'action_t': action_t,
     }
-    on_policy_out = next(iter(model_runners['on_policy'](inputs).values())).cast('float32')
-    off_policy_out = next(iter(model_runners['off_policy'](inputs).values())).cast('float32')
-    return vision_out, on_policy_out, off_policy_out
+    model_out = next(iter(model_runner(inputs).values())).cast('float32')
+    new_feat = model_out[:, vision_features_slice].reshape(1, -1).unsqueeze(0)
+    feat_q.assign(feat_q[1:].cat(new_feat, dim=0).contiguous()).realize()
+    return model_out,
   return run_policy
 
 
@@ -277,38 +273,31 @@ def read_file_chunked_to_shm(path):
   p.add_argument('--model-size', type=_parse_size, required=True, help='model input WxH')
   p.add_argument('--camera-resolutions', type=_parse_size, nargs='+', required=True,
                  help='camera resolutions WxH (one or more)')
-  p.add_argument('--vision-onnx', required=True)
-  p.add_argument('--off-policy-onnx', required=True)
-  p.add_argument('--on-policy-onnx', required=True)
+  p.add_argument('--driving-onnx', required=True)
   p.add_argument('--output', required=True)
   p.add_argument('--frame-skip', type=int, required=True)
   args = p.parse_args()
 
-  model_paths = {
-    'vision': read_file_chunked_to_shm(args.vision_onnx),
-    'off_policy': read_file_chunked_to_shm(args.off_policy_onnx),
-    'on_policy': read_file_chunked_to_shm(args.on_policy_onnx),
-  }
+  model_path = read_file_chunked_to_shm(args.driving_onnx)
   model_w, model_h = args.model_size
 
-  model_runners = {name: OnnxRunner(path) for name, path in model_paths.items()}
-  out = {'metadata': {name: make_metadata_dict(path) for name, path in model_paths.items()}}
-
-  assert out['metadata']['off_policy']['input_shapes'] == out['metadata']['on_policy']['input_shapes']
+  model_runner = OnnxRunner(model_path)
+  model_metadata = make_metadata_dict(model_path)
+  model_input_shapes = model_metadata['input_shapes']
+  out = {'metadata': {'driving': model_metadata}}
 
-  run_policy_jit = TinyJit(make_run_policy(model_runners, out['metadata'], args.frame_skip), prune=True)
+  run_policy_jit = TinyJit(make_run_policy(model_runner, model_metadata['output_slices']['hidden_state'], args.frame_skip), prune=True)
 
-  make_policy_queues = partial(make_input_queues, out['metadata']['vision']['input_shapes'],
-                               out['metadata']['on_policy']['input_shapes'], args.frame_skip)
-  make_random_model_inputs = partial(make_random_images, keys=['img', 'big_img'], shape=out['metadata']['vision']['input_shapes']['img'])
+  make_policy_queues = partial(make_input_queues, model_input_shapes, args.frame_skip)
+  make_random_model_inputs = partial(make_random_images, keys=['img', 'big_img'], shape=model_input_shapes['img'])
   out['run_policy'] = compile_jit(run_policy_jit, make_random_model_inputs, POLICY_INPUTS,
                                   make_policy_queues)
 
   for cam_w, cam_h in args.camera_resolutions:
     nv12 = NV12Frame(cam_w, cam_h, *get_nv12_info(cam_w, cam_h))
     make_random_warp_inputs = partial(make_random_images, keys=['frame', 'big_frame'], shape=nv12.size, device=WARP_DEV)
     warp_enqueue = TinyJit(make_warp(nv12, model_w, model_h, args.frame_skip), prune=True)
-    make_warp_queues = partial(make_warp_input_queues, out['metadata']['vision']['input_shapes'], args.frame_skip)
+    make_warp_queues = partial(make_warp_input_queues, model_input_shapes, args.frame_skip)
     out[(cam_w,cam_h)] = compile_jit(warp_enqueue, make_random_warp_inputs, WARP_INPUTS, make_warp_queues)
 
   with open(args.output, "wb") as f:

diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
@@ -79,33 +79,22 @@ def __init__(self, cam_w: int, cam_h: int, usbgpu: bool):
     input_devices = get_tg_input_devices(PROCESS_NAME, usbgpu)
     self.WARP_DEV, self.QUEUE_DEV = input_devices['WARP_DEV'], input_devices['QUEUE_DEV']
     jits = pickle.loads(read_file_chunked(modeld_pkl_path(usbgpu)))
-    vision_metadata = jits['metadata']['vision']
-    self.vision_input_shapes = vision_metadata['input_shapes']
-    self.vision_input_names = list(self.vision_input_shapes.keys())
-    self.vision_output_slices = vision_metadata['output_slices']
-
-    off_policy_metadata = jits['metadata']['off_policy']
-    self.off_policy_output_slices = off_policy_metadata['output_slices']
-
-    policy_metadata = jits['metadata']['on_policy']
-    self.policy_input_shapes = policy_metadata['input_shapes']
-    self.policy_output_slices = policy_metadata['output_slices']
+    model_metadata = jits['metadata']['driving']
+    model_input_shapes = model_metadata['input_shapes']
+    self.vision_input_names = [name for name in model_input_shapes if 'img' in name]
+    self.output_slices = model_metadata['output_slices']
 
     self.prev_desire = np.zeros(ModelConstants.DESIRE_LEN, dtype=np.float32)
 
     self.frame_skip = ModelConstants.MODEL_RUN_FREQ // ModelConstants.MODEL_CONTEXT_FREQ
-    self.input_queues, self.npy = make_input_queues(self.vision_input_shapes, self.policy_input_shapes, self.frame_skip, device=self.QUEUE_DEV)
+    self.input_queues, self.npy = make_input_queues(model_input_shapes, self.frame_skip, device=self.QUEUE_DEV)
     self.full_frames: dict[str, Tensor] = {}
     self._blob_cache: dict[int, Tensor] = {}
     self.parser = Parser()
     self.frame_buf_params = {k: get_nv12_info(cam_w, cam_h) for k in ('img', 'big_img')}
     self.run_policy = jits['run_policy']
     self.warp_enqueue = jits[(cam_w,cam_h)]
 
-  def slice_outputs(self, model_outputs: np.ndarray, output_slices: dict[str, slice]) -> dict[str, np.ndarray]:
-    parsed_model_outputs = {k: model_outputs[np.newaxis, v] for k,v in output_slices.items()}
-    return parsed_model_outputs
-
   def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
           inputs: dict[str, np.ndarray], prepare_only: bool) -> dict[str, np.ndarray] | None:
     for key in bufs.keys():
@@ -131,20 +120,19 @@ def run(self, bufs: dict[str, VisionBuf], transforms: dict[str, np.ndarray],
     if prepare_only:
       return None
 
-    vision_output, on_policy_output, off_policy_output = self.run_policy(
+    model_output, = self.run_policy(
       **{k: self.input_queues[k] for k in POLICY_INPUTS if k in self.input_queues}, img=img, big_img=big_img
     )
 
-    vision_output = vision_output.numpy().flatten()
-    off_policy_output = off_policy_output.numpy().flatten()
-    on_policy_output = on_policy_output.numpy().flatten()
-    vision_outputs_dict = self.parser.parse_vision_outputs(self.slice_outputs(vision_output, self.vision_output_slices))
-    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(self.slice_outputs(off_policy_output, self.off_policy_output_slices))
-    policy_outputs_dict = self.parser.parse_policy_outputs(self.slice_outputs(on_policy_output, self.policy_output_slices))
+    model_output = model_output.numpy().flatten()
+    parsed_outputs = {k: model_output[np.newaxis, v] for k,v in self.output_slices.items()}
+    vision_outputs_dict = self.parser.parse_vision_outputs(parsed_outputs)
+    off_policy_outputs_dict = self.parser.parse_off_policy_outputs(parsed_outputs)
+    policy_outputs_dict = self.parser.parse_policy_outputs(parsed_outputs)
     combined_outputs_dict = {**vision_outputs_dict, **off_policy_outputs_dict, **policy_outputs_dict}
 
     if SEND_RAW_PRED:
-      combined_outputs_dict['raw_pred'] = np.concatenate([vision_output.copy(), on_policy_output.copy(), off_policy_output.copy()])
+      combined_outputs_dict['raw_pred'] = model_output.copy()
     return combined_outputs_dict
 
 

diff --git a/selfdrive/modeld/models/README.md b/selfdrive/modeld/models/README.md
@@ -1,8 +1,10 @@
 ## Neural networks in openpilot
 To view the architecture of the ONNX networks, you can use [netron](https://netron.app/)
 
-## Driving Model (vision model + temporal policy model)
-### Vision inputs (Full size: 799906 x float32)
+## Driving Model
+`driving.onnx` and `big_driving.onnx` each contain the vision model, off-policy temporal policy, and on-policy temporal policy in one graph.
+
+### Inputs
 * **image stream**
   * Two consecutive images (256 * 512 * 3 in RGB) recorded at 20 Hz : 393216 = 2 * 6 * 128 * 256
     * Each 256 * 512 image is represented in YUV420 with 6 channels : 6 * 128 * 256
@@ -15,20 +17,17 @@ To view the architecture of the ONNX networks, you can use [netron](https://netr
       * Channels 0,1,2,3 represent the full-res Y channel and are represented in numpy as Y[::2, ::2], Y[::2, 1::2], Y[1::2, ::2], and Y[1::2, 1::2]
       * Channel 4 represents the half-res U channel
       * Channel 5 represents the half-res V channel
-### Policy inputs
 * **desire**
   * one-hot encoded buffer to command model to execute certain actions, bit needs to be sent for the past 5 seconds (at 20FPS) : 100 * 8
 * **traffic convention**
   * one-hot encoded vector to tell model whether traffic is right-hand or left-hand traffic : 2
-* **lateral control params**
-  * speed and steering delay for predicting the desired curvature: 2
-* **previous desired curvatures**
-  * vector of previously predicted desired curvatures: 100 * 1
+* **action t**
+  * lateral and longitudinal action times: 2
 * **feature buffer**
-  * a buffer of intermediate features including the current feature to form a 5 seconds temporal context (at 20FPS) : 100 * 512
+  * a full-rate queue of previous intermediate vision features; the graph appends the current feature and samples the temporal policy context
 
 
-### Driving Model output format (Full size: XXX x float32)
+### Output
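-Refer to **slice_outputs** and **parse_vision_outputs/parse_policy_outputs** in modeld.
+Refer to **output_slices** and **parse_vision_outputs/parse_off_policy_outputs/parse_policy_outputs** in modeld.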
 
 

diff --git a/selfdrive/modeld/models/big_driving.onnx b/selfdrive/modeld/models/big_driving.onnx
diff --git a/selfdrive/modeld/models/big_driving_off_policy.onnx b/selfdrive/modeld/models/big_driving_off_policy.onnx
diff --git a/selfdrive/modeld/models/big_driving_on_policy.onnx b/selfdrive/modeld/models/big_driving_on_policy.onnx
diff --git a/selfdrive/modeld/models/big_driving_vision.onnx b/selfdrive/modeld/models/big_driving_vision.onnx
diff --git a/selfdrive/modeld/models/driving.onnx b/selfdrive/modeld/models/driving.onnx
diff --git a/selfdrive/modeld/models/driving_off_policy.onnx b/selfdrive/modeld/models/driving_off_policy.onnx
diff --git a/selfdrive/modeld/models/driving_on_policy.onnx b/selfdrive/modeld/models/driving_on_policy.onnx
diff --git a/selfdrive/modeld/models/driving_vision.onnx b/selfdrive/modeld/models/driving_vision.onnx