aws-neuron · Kelvin-Ng · Mar 13, 2026 · Mar 23, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/spike/ASYNC_API.md b/spike/ASYNC_API.md
diff --git a/spike/src/include/nrt_wrapper.h b/spike/src/include/nrt_wrapper.h
@@ -10,6 +10,89 @@
 #include <cstdint>
 #include <string>
 
+extern "C" {
+
+// Underlying tensor and model declarations copied from NRT.
+// This is a temporary hack for implementing nonblocking/async operations,
+// which need some underlying info that NRT does not expose.
+// These copies can be removed once explicit async in NRT is ready and stable.
+
+#define DX_CACHE_ALIGNED __attribute__((aligned(64))) // 64-byte alignment for the annotated member (GCC/Clang attribute syntax)
+
+typedef enum nrt_tensor_mem_type { // kind of memory behind an nrt_tensor_storage
+  NRT_TENSOR_MEM_TYPE_INVALID = 0,
+  NRT_TENSOR_MEM_TYPE_MALLOC, // malloc'ed memory (nrt_tensor_storage::vmem)
+  NRT_TENSOR_MEM_TYPE_DMA, // dmem-backed memory (nrt_tensor_storage::dmem)
+  NRT_TENSOR_MEM_TYPE_FAKE, // NOTE(review): meaning not documented in this copy; check NRT
+} nrt_tensor_mem_type_t;
+
+// Backing memory (host or device) used by a tensor. The memory is ref
+// counted and can be shared among multiple tensors.
+// NOTE(review): copied from NRT; presumably must match NRT's layout exactly.
+typedef struct nrt_tensor_storage {
+  uint32_t hbm_idx;
+  size_t allocated_size;
+  nrt_tensor_mem_type_t type; // memory kind; see the union members below
+  union {
+    void *dmem; // dmem associated with addr, for tensor type
+                // NRT_TENSOR_MEM_TYPE_DMA
+    uint8_t
+        *vmem; // malloc'ed memory, for tensor type NRT_TENSOR_MEM_TYPE_MALLOC
+  };
+  volatile uint64_t ref_count DX_CACHE_ALIGNED; // NOTE(review): presumably the ref count of this shared memory; confirm
+  bool mem_owned_by_tensor;
+
+  pthread_mutex_t tensor_op_cv_lock; // Lock for async exec. Used with
+                                     // `tensor_op_cv` to block the thread while
+                                     // there are still pending execs. NRT's note: if this
+                                     // is NULL we are not in async exec mode (NOTE(review): it is a by-value mutex here).
+  pthread_cond_t tensor_op_cv;       // condition variable used together with tensor_op_cv_lock
+  volatile uint64_t pending_exec_count_read
+      DX_CACHE_ALIGNED; // count of pending execs that read this location
+  volatile uint64_t pending_exec_count_write
+      DX_CACHE_ALIGNED; // count of pending execs that write to this location
+  int32_t vtpb_idx;     // same as vcore->vtpb_idx but -1 if no vcore for tensor
+                        // (used for trace api)
+} nrt_tensor_storage_t;
+
+typedef struct nrt_tensor { // copy of NRT's tensor record: a named view (offset + size) into shared storage
+  char *name;                // optional name
+  nrt_tensor_storage_t *sto; // the actual memory represented by the tensor
+  // Don't access the fields below directly; use helper functions to ensure
+  // correctness. They allow a tensor to represent a slice of the memory
+  // pointed to by "sto".
+  size_t _offset; // offset within the storage
+  size_t _size;   // tensor size
+  void *extra;    // used to store any metadata needed by the runtime
+
+  volatile uint64_t ref_count
+      DX_CACHE_ALIGNED; // refcount for tensor. The tensor can only be freed
+                        // when this is 0. It is incremented by
+                        // `tensor_get_reference` and decremented by
+                        // `tensor_free`. Tensor will automatically be freed in
+                        // `tensor_free` once ref_count is zero.
+  volatile uint64_t output_completion_count
+      DX_CACHE_ALIGNED; // used to track the completion count of an output
+                        // tensor. 0 means not complete; 1 and above means the
+                        // number of completions
+} nrt_tensor_t;
+
+typedef struct H_NN { // handle wrapping a single model id; element type of nrt_model::h_nn
+  uint32_t id; // kmgr model id
+} H_NN;
+
+struct nrt_model { // copy of NRT's model record; h_nn holds one entry per loaded instance
+  uint32_t start_vnc;      // VirtualNeuronCore start index
+  uint32_t vnc_count;      // number of VirtualNeuronCore(s) requested
+  uint32_t instance_index; // instance index which will execute on the next call
+                           // to nrt_execute
+  uint32_t instance_count; // number of loaded instances
+  uint32_t gid;            // global id, for debug
+  char name[256];
+  H_NN h_nn[]; // kmgr model id (instance_count entries). Flexible array member: must stay the last member.
+};
+}
+
 namespace spike {
 
 // RAII wrapper for NRT runtime
@@ -26,6 +109,7 @@ class NrtRuntime {
   NrtRuntime &operator=(NrtRuntime &&) = default;
 
   static uint32_t get_visible_nc_count();
+  static uint32_t get_total_nc_count(); // NOTE(review): presumably counts all NeuronCores, not only the visible ones; confirm
 
 private:
   bool initialized_;

diff --git a/spike/src/include/spike.h b/spike/src/include/spike.h
@@ -4,11 +4,24 @@
 #include "model.h"
 #include "nrt_wrapper.h"
 #include "tensor.h"
+#include "tensor_set.h"
+
+#include <nrt/nrt_async.h>
+
+#include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
+
+#include <array>
+#include <cstdint>
+#include <deque>
 #include <memory>
 #include <optional>
 #include <unordered_map>
+#include <variant>
 #include <vector>
 
+namespace nb = nanobind;
+
 namespace spike {
 
 // Tensor metadata structure
@@ -24,6 +37,104 @@ struct ModelTensorInfo {
   std::unordered_map<std::string, TensorMetadata> outputs;
 };
 
+// Prepared-batch records. These only hold the arguments to be used when
+// `_batched_start` is called; at start time we just submit one nrta_* request
+// per entry and group them under a single cmd_id.
+struct PreparedTensorWrite { // arguments for one write of a prepared batch, used when `_batched_start` is called
+  std::shared_ptr<NrtTensor> tensor; // tensor to write into
+  const void *data; // source pointer; NOTE(review): presumably points into data_obj's buffer; confirm
+  size_t size; // NOTE(review): presumably the number of bytes to write; confirm
+  size_t offset; // NOTE(review): presumably the offset within `tensor`; confirm
+  std::variant<nb::bytes, nb::ndarray<>> data_obj; // Python object stored with the entry between _prepare and _start
+};
+
+struct PreparedTensorRead { // arguments for one read of a prepared batch, used when `_batched_start` is called
+  std::shared_ptr<const NrtTensor> tensor; // tensor to read from
+  size_t offset; // NOTE(review): presumably the offset within `tensor`; confirm
+  size_t size; // NOTE(review): presumably the number of bytes to read; confirm
+  void *data; // destination pointer; NOTE(review): presumably points into data_obj's buffer; confirm
+  std::variant<nb::bytes, nb::ndarray<>> data_obj; // Python object stored with the entry between _prepare and _start
+};
+
+// Pending nonblocking operations. Each cmd_id we hand back to Python maps to
+// one PendingOp. The id and wait_seq are common to every kind of op, so they
+// live directly on PendingOp; the per-kind payload is the variant below. The
+// batched variants hold N sub-requests submitted back-to-back on the same
+// (lnc, xu, queue=0); their seq numbers are consecutive, so wait_seq stores
+// the last one and a completed wait_seq implies every prior sub-request is
+// complete too.
+
+// The nrta_* APIs store the op's completion status through the `ret` pointer
+// *after* submission, so `ret` must outlive the call. Callers therefore enqueue
+// the PendingOp first and pass a pointer into the deque-resident copy (see
+// enqueue_pending); std::deque never relocates existing elements on push_back,
+// so that address stays valid until the op is harvested.
+struct PendingTensorWrite { // payload of a pending single (non-batched) tensor write
+  NRT_STATUS ret; // completion status slot; the nrta_* call stores into it through a pointer
+  std::shared_ptr<NrtTensor> tensor; // tensor being written
+  // Anchors the Python-owned source buffer until the op completes. Empty for
+  // the raw-pointer overload (caller manages the buffer lifetime).
+  std::optional<std::variant<nb::bytes, nb::ndarray<>>> data_obj;
+};
+
+struct PendingTensorRead { // payload of a pending single (non-batched) tensor read
+  NRT_STATUS ret; // completion status slot; the nrta_* call stores into it through a pointer
+  std::shared_ptr<const NrtTensor> tensor; // tensor being read
+  // The destination buffer; also returned to Python via
+  // NonBlockTensorReadResult.data.
+  std::variant<nb::bytes, nb::ndarray<>> data_obj;
+};
+
+struct PendingTensorWriteBatched { // payload of a pending batched write (several sub-requests under one cmd_id)
+  // Lifetime anchors live in tensor_write_batched_prepared_[batch_id], which
+  // persists until close() (or a future explicit-release API) so the same
+  // prepared batch can be _start'd many times.
+  uint64_t batch_id; // key into tensor_write_batched_prepared_
+  std::vector<NRT_STATUS> rets; // NOTE(review): presumably one completion status per sub-request; confirm
+};
+
+struct PendingTensorReadBatched { // payload of a pending batched read (several sub-requests under one cmd_id)
+  uint64_t batch_id; // NOTE(review): presumably a key into tensor_read_batched_prepared_; confirm
+  std::vector<NRT_STATUS> rets; // NOTE(review): presumably one completion status per sub-request; confirm
+};
+
+struct PendingExecute { // payload of a pending nonblocking model execution
+  NRT_STATUS ret; // completion status slot; the nrta_* call stores into it through a pointer
+  std::shared_ptr<NrtModel> model; // model being executed (shared ownership held while pending)
+  std::shared_ptr<const NrtTensorSet> input_set; // inputs (shared ownership held while pending)
+  std::shared_ptr<NrtTensorSet> output_set; // outputs (shared ownership held while pending)
+};
+
+struct PendingOp { // one in-flight nonblocking operation, queued per (lnc, xu) channel
+  uint64_t id; // cmd_id handed back to Python
+  nrta_seq_t wait_seq; // sequence number whose completion finishes this op; for batched ops, the last sub-request's
+  std::variant<PendingTensorWrite, PendingTensorRead,
+               PendingTensorWriteBatched, PendingTensorReadBatched,
+               PendingExecute>
+      op; // per-kind payload
+};
+
+// NonBlock result structures (exposed to Python)
+struct NonBlockTensorReadResult { // result of a finished nonblocking tensor read
+  uint64_t id; // NOTE(review): presumably the cmd_id returned when the op was submitted; confirm
+  std::variant<nb::bytes, nb::ndarray<>> data; // the destination buffer of the read
+  std::optional<std::variant<SpikeError, NrtError>> err; // NOTE(review): presumably empty on success; confirm
+};
+
+struct NonBlockTensorWriteResult { // result of a finished nonblocking tensor write
+  uint64_t id; // NOTE(review): presumably the cmd_id returned when the op was submitted; confirm
+  std::optional<std::variant<SpikeError, NrtError>> err; // NOTE(review): presumably empty on success; confirm
+};
+
+struct NonBlockExecResult { // result of a finished nonblocking execution
+  uint64_t id; // NOTE(review): presumably the cmd_id returned when the op was submitted; confirm
+  std::optional<std::variant<SpikeError, NrtError>> err; // NOTE(review): presumably empty on success; confirm
+};
+
+typedef std::variant<NonBlockTensorReadResult, NonBlockTensorWriteResult,
+                     NonBlockExecResult>
+    NonBlockResult; // any one of the result kinds; the value type of try_poll()'s optional return
+
 // Main Spike class - Python interface
 class Spike {
 public:
@@ -72,17 +183,128 @@ class Spike {
                std::optional<std::string> ntff_name = std::nullopt,
                bool save_trace = false);
 
+  // Nonblocking operations. NOTE(review): the uint64_t results are presumably the cmd_ids later reported by try_poll(); confirm.
+  uint64_t tensor_write_nonblock(std::shared_ptr<NrtTensor> tensor,
+                                  nb::bytes data_obj, size_t offset = 0); // source is a Python bytes object
+  uint64_t tensor_write_nonblock(std::shared_ptr<NrtTensor> tensor,
+                                  nb::ndarray<> data_obj,
+                                  size_t offset = 0); // source is an ndarray
+  uint64_t tensor_write_nonblock(std::shared_ptr<NrtTensor> tensor,
+                                  const void *data, size_t size,
+                                  size_t offset); // raw-pointer source; the caller manages the buffer lifetime
+
+  uint64_t tensor_read_nonblock(std::shared_ptr<const NrtTensor> tensor,
+                                 size_t offset = 0, size_t size = 0); // no caller-provided destination; NOTE(review): meaning of size == 0 is not visible here
+  uint64_t tensor_read_nonblock(std::shared_ptr<const NrtTensor> tensor,
+                                 nb::ndarray<> dest, size_t offset = 0,
+                                 size_t size = 0); // caller-provided destination ndarray; NOTE(review): meaning of size == 0 is not visible here
+
+  uint64_t tensor_write_nonblock_batched_prepare( // records the batch arguments for a later _start call
+      std::vector<std::shared_ptr<NrtTensor>> tensors,
+      std::vector<nb::ndarray<>> data_objs,
+      std::optional<std::vector<size_t>> offsets); // NOTE(review): presumably returns the batch_id accepted by _start; confirm
+  uint64_t tensor_write_nonblock_batched_start(uint64_t batch_id); // submits the prepared batch; a batch can be started many times
+
+  uint64_t tensor_read_nonblock_batched_prepare( // records the batch arguments for a later _start call
+      std::vector<std::shared_ptr<const NrtTensor>> tensors,
+      std::vector<nb::ndarray<>> dests,
+      std::optional<std::vector<size_t>> offsets,
+      std::optional<std::vector<size_t>> sizes); // NOTE(review): presumably returns the batch_id accepted by _start; confirm
+  uint64_t tensor_read_nonblock_batched_start(uint64_t batch_id); // submits the prepared batch identified by batch_id
+
+  uint64_t
+  execute_nonblock(std::shared_ptr<NrtModel> model,
+                   std::shared_ptr<const NrtTensorSet> input_set,
+                   std::shared_ptr<NrtTensorSet> output_set,
+                   std::optional<std::string> ntff_name = std::nullopt,
+                   bool save_trace = false); // NOTE(review): result is presumably the cmd_id later reported by try_poll(); confirm
+
+  std::optional<NonBlockResult> try_poll(); // yields at most one result per call; NOTE(review): presumably nullopt when nothing is ready; confirm
+
+  NrtTensorSet create_tensor_set( // public overload: tensors are passed as shared_ptr values in a string-keyed map
+      const std::unordered_map<std::string, std::shared_ptr<const NrtTensor>>
+          &tensor_map); // NOTE(review): keys are presumably tensor names; confirm
+
+  // Wrap existing NRT objects (for interop with external code). NOTE(review): ownership of the raw pointers is not visible here; confirm.
+  NrtModel wrap_model(nrt_model_t *ptr);
+  NrtTensor wrap_tensor(nrt_tensor_t *ptr);
+  NrtTensorSet wrap_tensor_set(nrt_tensor_set_t *ptr);
+
   // Model introspection
   ModelTensorInfo get_tensor_info(NrtModel &model);
 
 private:
   int verbose_level_;
   std::unique_ptr<NrtRuntime> runtime_;
 
+  // Nonblock state
+  uint64_t next_non_block_id_ = 0; // NOTE(review): presumably the next cmd_id to hand out; confirm
+  uint64_t next_batch_id_ = 0; // NOTE(review): presumably the next prepared-batch id to hand out; confirm
+
+  // One pending-op queue per (lnc, xu) channel. Each queue is FIFO by
+  // submission order, and within a queue nrta_seq_t values are monotonically
+  // increasing, so a channel only needs its front op's wait_seq checked
+  // against nrta_get_sequence's latest-completed seq for that channel.
+  static constexpr uint32_t MAX_LNC = 128; // fixed upper bound on lnc indices; sizes the per-channel arrays
+  static constexpr uint32_t NUM_CHANNELS = MAX_LNC * NRTA_XU_TYPE_NUM; // total number of (lnc, xu) channels
+  std::array<std::array<std::deque<PendingOp>, NRTA_XU_TYPE_NUM>, MAX_LNC>
+      xu_queues_; // indexed as xu_queues_[lnc][xu]
+
+  // epoll-based completion multiplexing. Each (lnc, xu) channel that has ever
+  // had work gets one eventfd, registered with the runtime via
+  // nrta_event_register_xu_completion() and added to a single epoll instance.
+  // The runtime signals a channel's eventfd whenever that XU completes any
+  // sequence, so try_poll() can ask epoll which channels made progress instead
+  // of probing all MAX_LNC * NRTA_XU_TYPE_NUM channels every call.
+  int epoll_fd_ = -1; // -1 until the epoll instance is lazily created
+  // Per-channel eventfd, -1 until the channel is first registered.
+  std::array<int, NUM_CHANNELS> channel_event_fds_; // NOTE(review): no default initializer here; the -1 fill must come from a constructor not shown
+
+  // Channels whose front op may be ready to harvest. Populated by epoll_wait
+  // (each signaled eventfd) and by try_poll itself (a channel stays queued
+  // after a successful harvest, since one eventfd signal can cover several
+  // completed ops). Removed once the front is found not-yet-complete or the
+  // queue empties.
+  //
+  // A FIFO queue (rather than an ordered set) gives round-robin fairness: every
+  // try_poll() pops the front channel, and a channel that still has pollable
+  // work is pushed to the back, so a continuously-busy low-index channel can't
+  // starve the others. scan_channel_queued_ tracks membership in O(1) so a
+  // channel is never enqueued twice.
+  std::deque<uint32_t> scan_channels_; // NOTE(review): elements are presumably channel_index() values; confirm
+  std::array<bool, NUM_CHANNELS> scan_channel_queued_; // NOTE(review): no default initializer here; must be cleared by a constructor not shown
+
+  static constexpr uint32_t channel_index(uint32_t lnc, uint32_t xu) { // flattens (lnc, xu) into a single index
+    return lnc * NRTA_XU_TYPE_NUM + xu; // below NUM_CHANNELS when lnc < MAX_LNC and xu < NRTA_XU_TYPE_NUM; no range check here
+  }
+
+  // Lazily creates the epoll instance and the channel's eventfd, registering
+  // the latter with the runtime and the epoll set. Idempotent per channel.
+  void ensure_channel_registered(uint32_t lnc, nrta_xu_t xu); // NOTE(review): how failures are reported is not visible from this declaration
+
+  // Prepared batches (data kept alive between _prepare and _start).
+  std::unordered_map<uint64_t, std::vector<PreparedTensorWrite>>
+      tensor_write_batched_prepared_; // batch_id -> the entries of that write batch
+
+  std::unordered_map<uint64_t, std::vector<PreparedTensorRead>>
+      tensor_read_batched_prepared_; // NOTE(review): presumably also keyed by batch_id; confirm
+
+  // Appends op to the (lnc, xu) queue and returns a reference to the enqueued
+  // element. std::deque never relocates existing elements on push_back, so the
+  // returned reference (and pointers into its ret field) stays valid until the
+  // op is popped in try_poll. Scalar callers enqueue *before* submitting so the
+  // nrta_* call can write completion status straight into the queued op.
+  PendingOp &enqueue_pending(uint32_t lnc, nrta_xu_t xu, PendingOp op); // op is taken by value; the result refers to the queued copy
+
   // Helper methods
-  NrtTensorSet create_tensor_sets(
+  NrtTensorSet create_tensor_set( // private overload of the public create_tensor_set; takes a map of NrtTensor references
       const std::unordered_map<std::string, NrtTensor &> &tensor_map);
   std::string dtype_to_string(nrt_dtype_t dtype);
+
+  static std::optional<std::variant<SpikeError, NrtError>>
+  ret_to_err(NRT_STATUS ret); // converts one status into an optional error; NOTE(review): presumably empty on success; confirm
+  static std::optional<std::variant<SpikeError, NrtError>>
+  rets_to_err(const std::vector<NRT_STATUS> &rets); // batch variant; NOTE(review): which failing status gets reported is not visible here
 };
 
 } // namespace spike

diff --git a/spike/src/include/tensor.h b/spike/src/include/tensor.h
@@ -16,6 +16,9 @@ class NrtTensor {
   // This constructor creates an NrtTensor that owns the tensor
   NrtTensor(nrt_tensor_placement_t placement, uint32_t core_id, size_t size,
             const std::string &name, const Spike *spike);
+  // This constructor creates an NrtTensor that references an existing tensor
+  NrtTensor(nrt_tensor_t *ptr, uint32_t core_id, uint64_t size,
+            const std::string &name, const Spike *spike); // NOTE(review): `size` is uint64_t here but size_t in the owning constructor above; confirm intent
   NrtTensor(const NrtTensor &source, size_t offset, size_t size,
             const std::string &name);
   ~NrtTensor();