diff --git a/NN/API/Models/Gpt2.lean b/NN/API/Models/Gpt2.lean index 558f5da..36b8d28 100644 --- a/NN/API/Models/Gpt2.lean +++ b/NN/API/Models/Gpt2.lean @@ -180,6 +180,75 @@ def causalTransformerTokenScalarModuleDef (cfg : CausalOneHotConfig) causalTransformerTokenScalarModuleDefWithMode .train cfg body tokens targets (reduction := reduction) +abbrev causalTokenLmInputShape (cfg : CausalOneHotConfig) : Shape := + .dim (cfg.batch * cfg.seqLen) .scalar + +/-- +Scalar loss for causal language modeling with per-step float-encoded token ids as inputs. + +`xTokens` and `yTokens` are flattened `(batch * seqLen)` float vectors holding integer token ids. +This matches PyTorch `nn.Embedding` + `F.cross_entropy` training and avoids rebuilding the module +each step (Adam state stays on one session). +-/ +def causalTransformerTokenLmScalarModuleDefWithMode + (mode : _root_.Runtime.Autograd.TorchLean.NN.Mode) + (cfg : CausalOneHotConfig) + (body : nn.Sequential (causalEmbeddingShape cfg) (causalOneHotShape cfg)) + (initParams : _root_.Runtime.Autograd.Torch.TList Float + ((.dim cfg.vocab (.dim cfg.dModel .scalar)) :: paramShapes body)) + (reduction : TorchLean.Loss.Reduction := .mean) : + TorchLean.Module.ScalarModuleDef + ((.dim cfg.vocab (.dim cfg.dModel .scalar)) :: paramShapes body) + [causalTokenLmInputShape cfg, causalTokenLmInputShape cfg] := + { initParams := initParams + initRequiresGrad := + List.replicate (((.dim cfg.vocab (.dim cfg.dModel .scalar)) :: paramShapes body).length) true + loss := fun {α} => by + intro _ _ + exact fun {m} _ _ => + _root_.Runtime.Autograd.Torch.CurriedRef.curry + (Ref := _root_.Runtime.Autograd.TorchLean.NN.Seq.RefT (m := m) (α := α)) + (ss := ((.dim cfg.vocab (.dim cfg.dModel .scalar)) :: paramShapes body) ++ + [causalTokenLmInputShape cfg, causalTokenLmInputShape cfg]) + (β := m (_root_.Runtime.Autograd.TorchLean.NN.Seq.RefT (m := m) (α := α) + Spec.Shape.scalar)) + (fun args => do + let (ps, ins) := + _root_.Runtime.Autograd.Torch.RefList.split + (Ref := _root_.Runtime.Autograd.TorchLean.NN.Seq.RefT (m := m) (α := α)) + (ss₁ := (.dim cfg.vocab (.dim cfg.dModel .scalar)) :: paramShapes body) + (ss₂ := [causalTokenLmInputShape cfg, causalTokenLmInputShape cfg]) args + let .cons xFloat (.cons yFloat .nil) := ins + let .cons tokenEmbedding bodyParams := ps + let tokens ← _root_.Runtime.Autograd.TorchLean.F.floatVecToNatTensor (m := m) (α := α) + (k := cfg.batch * cfg.seqLen) xFloat + let targets ← _root_.Runtime.Autograd.TorchLean.F.floatVecToNatTensor (m := m) (α := α) + (k := cfg.batch * cfg.seqLen) yFloat + let x ← _root_.Runtime.Autograd.TorchLean.F.embeddingBatchSeqNat (m := m) (α := α) + (vocab := cfg.vocab) (dim := cfg.dModel) (batch := cfg.batch) + (seqLen := cfg.seqLen) tokenEmbedding tokens + let logits ← _root_.Runtime.Autograd.TorchLean.NN.Seq.evalParams + (model := body) (α := α) (m := m) mode bodyParams x + let logitsRows ← _root_.Runtime.Autograd.Torch.reshape (m := m) (α := α) + (s₁ := .dim cfg.batch (.dim cfg.seqLen (.dim cfg.vocab .scalar))) + (s₂ := .dim (cfg.batch * cfg.seqLen) (.dim cfg.vocab .scalar)) + logits (by + simp [_root_.Spec.Shape.size, Nat.mul_assoc]) + _root_.Runtime.Autograd.TorchLean.Loss.crossEntropyRowsNat (m := m) (α := α) + (rows := cfg.batch * cfg.seqLen) (classes := cfg.vocab) + logitsRows targets (reduction := reduction)) } + +/-- Training-mode wrapper for float-encoded token-id causal language modeling. -/ +def causalTransformerTokenLmScalarModuleDef (cfg : CausalOneHotConfig) + (body : nn.Sequential (causalEmbeddingShape cfg) (causalOneHotShape cfg)) + (initParams : _root_.Runtime.Autograd.Torch.TList Float + ((.dim cfg.vocab (.dim cfg.dModel .scalar)) :: paramShapes body)) + (reduction : TorchLean.Loss.Reduction := .mean) : + TorchLean.Module.ScalarModuleDef + ((.dim cfg.vocab (.dim cfg.dModel .scalar)) :: paramShapes body) + [causalTokenLmInputShape cfg, causalTokenLmInputShape cfg] := + causalTransformerTokenLmScalarModuleDefWithMode .train cfg body initParams (reduction := reduction) + end models end nn diff --git a/NN/API/Public/Facade/Data/Text.lean b/NN/API/Public/Facade/Data/Text.lean index 529481f..722a361 100644 --- a/NN/API/Public/Facade/Data/Text.lean +++ b/NN/API/Public/Facade/Data/Text.lean @@ -85,6 +85,47 @@ def causalLmOneHotSampleRowsFromTokenArray text.Corpus.randomBatchTokenWindows tokens batch seqLen seed step (padId := padId) causalLmOneHotSampleRows (α := α) batch seqLen vocab idsAt (padId := padId) +/-- +Flatten one `(seqLen + 1)` token window into causal-LM `(x, y)` id lists. +-/ +def causalLmTokenRows (seqLen : Nat) (window : List Nat) (padId : Nat := 0) : + List Nat × List Nat := + let x := (List.range seqLen).map (fun i => window.getD i padId) + let y := (List.range seqLen).map (fun i => window.getD (i + 1) padId) + (x, y) + +/-- +Build a float tensor of integer token ids from a flat `List Nat` of length `batch * seqLen`. +-/ +def causalLmTokenFloatVec {α : Type} [Runtime.SemanticScalar α] [Runtime.Scalar α] + (batch seqLen : Nat) (tokens : List Nat) : + Tensor.T α (.dim (batch * seqLen) .scalar) := + let xF : _root_.Spec.Tensor Float (.dim (batch * seqLen) .scalar) := + _root_.Spec.Tensor.dim (fun i : Fin (batch * seqLen) => + _root_.Spec.Tensor.scalar (Float.ofNat (tokens.getD i.val 0))) + Tensor.castFloat Runtime.ofFloat xF + +/-- +Build a batched token-id causal-language-model sample from an array-backed corpus. + +Token ids are passed as float inputs so the training loop can swap windows each step without +re-instantiating the scalar module. +-/ +def causalLmTokenSampleRowsFromTokenArray + {α : Type} [Runtime.SemanticScalar α] [Runtime.Scalar α] + (batch seqLen : Nat) (tokens : Array Nat) (seed step : Nat) (padId : Nat := 0) : + SupervisedSample α (.dim (batch * seqLen) .scalar) (.dim (batch * seqLen) .scalar) := + let idsAt := + text.Corpus.randomBatchTokenWindows tokens batch seqLen seed step (padId := padId) + let (xList, yList) := + (List.finRange batch).foldl (fun (acc : List Nat × List Nat) bi => + let (xs, ys) := acc + let (xRow, yRow) := causalLmTokenRows seqLen (idsAt bi) (padId := padId) + (xs ++ xRow, ys ++ yRow)) ([], []) + NN.API.Sample.mk + (causalLmTokenFloatVec (α := α) batch seqLen xList) + (causalLmTokenFloatVec (α := α) batch seqLen yList) + /-- Build one unbatched one-hot causal-language-model sample directly from a token list. diff --git a/NN/API/Runtime/Module.lean b/NN/API/Runtime/Module.lean index 29468e1..d80bbd4 100644 --- a/NN/API/Runtime/Module.lean +++ b/NN/API/Runtime/Module.lean @@ -189,6 +189,8 @@ Default dtype policy: - If the user does not specify `--dtype` / `--float32-mode` and `--cuda` is present, default to `dtype=float` (CUDA eager supports `Float` upload/download). - Otherwise default to `dtype=float32` (executable IEEE-754 float32 semantics). + +When `--cuda` is selected, also enable the CUDA fast path (`--fast-kernels`) by default. -/ def parseAndStripWithDefaultDType (args : List String) (defaultDType : DType) : Except String (ExecConfig × List String) := do @@ -214,7 +216,7 @@ def parseAndStripWithDefaultDType (args : List String) (defaultDType : DType) : dtype := dtype, backend := backend, useGpu := useGpu, - fastKernels := fastKernels, + fastKernels := fastKernels || useGpu, fastGpuMatmulPrecision := fastGpuMatmulPrecision }, rest) diff --git a/NN/Runtime/Autograd/Torch/Core/Functional.lean b/NN/Runtime/Autograd/Torch/Core/Functional.lean index 52073c5..7ff7faf 100644 --- a/NN/Runtime/Autograd/Torch/Core/Functional.lean +++ b/NN/Runtime/Autograd/Torch/Core/Functional.lean @@ -207,6 +207,14 @@ class Ops (m : Type → Type) (α : Type) [Context α] [DecidableEq Shape] where gatherRowsNat {rows cols k : Nat} : Ref (.dim rows (.dim cols .scalar)) → Tensor Nat (.dim k .scalar) → m (Ref (.dim k (.dim cols .scalar))) + /-- + Read a float vector of integer-valued token ids and return a `Tensor Nat` index vector. + + This is non-differentiable: gradients do not flow back into the float input. Language-model + benchmarks pass token ids as float inputs so each step can supply a fresh window without + re-instantiating the module. + -/ + floatVecToNatTensor {k : Nat} : Ref (.dim k .scalar) → m (Tensor Nat (.dim k .scalar)) scatterAddVec {n : Nat} : Ref (.dim n .scalar) → Ref Shape.scalar → Fin n → m (Ref (.dim n .scalar)) scatterAddRow {rows cols : Nat} : @@ -457,6 +465,11 @@ def gatherRowsNat {rows cols k : Nat} (x : Ref (m := m) (α := α) (.dim rows (.dim cols .scalar))) (idx : Tensor Nat (.dim k .scalar)) : m (Ref (m := m) (α := α) (.dim k (.dim cols .scalar))) := Ops.gatherRowsNat (m := m) (α := α) (rows := rows) (cols := cols) (k := k) x idx +/-- Convert a float vector of integer token ids to `Tensor Nat` (non-differentiable). -/ +def floatVecToNatTensor {k : Nat} + (x : Ref (m := m) (α := α) (.dim k .scalar)) : + m (Tensor Nat (.dim k .scalar)) := + Ops.floatVecToNatTensor (m := m) (α := α) (k := k) x /-- Re-export of `Ops.scatter_add_vec`. -/ def scatterAddVec {n : Nat} (x : Ref (m := m) (α := α) (.dim n .scalar)) (v : Ref (m := m) (α := α) Shape.scalar) (i : Fin n) diff --git a/NN/Runtime/Autograd/Torch/Core/Ops.lean b/NN/Runtime/Autograd/Torch/Core/Ops.lean index 6d85a19..a7554b5 100644 --- a/NN/Runtime/Autograd/Torch/Core/Ops.lean +++ b/NN/Runtime/Autograd/Torch/Core/Ops.lean @@ -681,6 +681,24 @@ def gatherRowsNat {α : Type} (s : EagerSession α) [Add α] [Zero α] [Decidabl pure (some { id := id }) dispatchCudaOpt (α := α) s "gather_rows_nat" cpu cuda +/-- +Read a float input vector and return the corresponding `Tensor Nat` index vector. + +Non-differentiable: used by token-id language-model losses that accept float-encoded ids as inputs. +-/ +def floatVecToNatTensor {α : Type} (s : EagerSession α) [CudaBridge.TensorConv α] [DecidableEq Shape] + {k : Nat} (x : TensorRef α (.dim k .scalar)) : IO (Tensor Nat (.dim k .scalar)) := do + let v ← getValue (α := α) s (sh := .dim k .scalar) x + match v with + | .dim f => + let ns ← (List.finRange k).mapM (fun i => do + match f i with + | .scalar fl => do + let ff ← CudaBridge.TensorConv.toFloat (α := α) fl + pure (UInt64.toNat (Float.floor ff).toUInt64)) + pure <| + Tensor.dim (fun i => Tensor.scalar (ns.getD i.val 0)) + /-- Gather `k` scalars using indices stored in the nat-environment (`NatVecRef`). -/ def gatherVecRef {α : Type} (s : EagerSession α) [Add α] [Zero α] [DecidableEq Shape] {n k : Nat} (x : TensorRef α (.dim n .scalar)) (idx : NatVecRef k) : diff --git a/NN/Runtime/Autograd/Torch/Core/Trainer.lean b/NN/Runtime/Autograd/Torch/Core/Trainer.lean index baf7efd..ff32e95 100644 --- a/NN/Runtime/Autograd/Torch/Core/Trainer.lean +++ b/NN/Runtime/Autograd/Torch/Core/Trainer.lean @@ -81,6 +81,8 @@ instance {α : Type} [Context α] [Internal.CudaBridge.TensorConv α] [Decidable Internal.EagerSession.gatherVecNat (α := α) sess (n := n) (k := k) x idx gatherRowsNat := fun {rows cols k} x idx => fun sess => Internal.EagerSession.gatherRowsNat (α := α) sess (rows := rows) (cols := cols) (k := k) x idx + floatVecToNatTensor := fun {k} x => fun sess => + Internal.EagerSession.floatVecToNatTensor (α := α) sess (k := k) x scatterAddVec := fun {n} x v i => fun sess => Internal.EagerSession.scatterAddVec (α := α) sess (n := n) x v i scatterAddRow := fun {rows cols} x v i => fun sess => @@ -245,6 +247,8 @@ instance {α : Type} [Context α] [DecidableEq Shape] {Γ : List Shape} : gatherRowsNat := fun {rows cols k} x idx => Runtime.Autograd.Compiled.GraphM.gatherRowsNat (α := α) (Γ := Γ) (rows := rows) (cols := cols) (k := k) x idx + floatVecToNatTensor := fun {_k} _x => + throw "compiled GraphM: floatVecToNatTensor requires eager backend (dynamic token ids)" scatterAddVec := fun {n} x v i => Runtime.Autograd.Compiled.GraphM.scatterAddVec (α := α) (Γ := Γ) (n := n) x v i scatterAddRow := fun {rows cols} x v i => diff --git a/NN/Runtime/Autograd/Torch/Core/Types.lean b/NN/Runtime/Autograd/Torch/Core/Types.lean index 80afb3f..b6ed325 100644 --- a/NN/Runtime/Autograd/Torch/Core/Types.lean +++ b/NN/Runtime/Autograd/Torch/Core/Types.lean @@ -109,6 +109,12 @@ structure Options where -/ fastGpuMatmulPrecision : Runtime.Autograd.FastKernels.GpuMatmulPrecision := .fp32 /-- + Skip autograd during backward (inference-style evaluation). + + Used by `eval1NoGrad` and similar helpers that run forward without recording gradients. + -/ + noGrad : Bool := false + /-- Eager execution on CUDA. When `true` and `backend = .eager`, the eager session uses the CUDA tape diff --git a/NN/Runtime/Autograd/TorchLean/Functional/Core.lean b/NN/Runtime/Autograd/TorchLean/Functional/Core.lean index 17359b5..3c97e29 100644 --- a/NN/Runtime/Autograd/TorchLean/Functional/Core.lean +++ b/NN/Runtime/Autograd/TorchLean/Functional/Core.lean @@ -220,6 +220,13 @@ def embeddingBatchSeqNat {α : Type} [Context α] [DecidableEq Shape] gathered (by simp [Shape.size, Nat.mul_assoc]) +/-- Read float-encoded token ids as a `Tensor Nat` index vector (non-differentiable). -/ +def floatVecToNatTensor {α : Type} [Context α] [DecidableEq Shape] + {m : Type → Type} [Monad m] [Ops (m := m) (α := α)] {k : Nat} + (x : RefTy (m := m) (α := α) (.dim k .scalar)) : + m (Tensor Nat (.dim k .scalar)) := + _root_.Runtime.Autograd.Torch.floatVecToNatTensor (m := m) (α := α) (k := k) x + /-! ## Reductions -/ /-- diff --git a/NN/Runtime/Autograd/TorchLean/NN/Seq.lean b/NN/Runtime/Autograd/TorchLean/NN/Seq.lean index 5b8ff8d..ceaa1b5 100644 --- a/NN/Runtime/Autograd/TorchLean/NN/Seq.lean +++ b/NN/Runtime/Autograd/TorchLean/NN/Seq.lean @@ -172,6 +172,7 @@ def programWithMode {σ τ : Shape} (mode : Mode) (model : Seq σ τ) [_root_.Runtime.Autograd.Torch.Internal.CudaBridge.TensorConv α] (params : _root_.Runtime.Autograd.Torch.ParamList α (paramShapes model)) (x : Spec.Tensor α σ) : IO (Spec.Tensor α τ) := do + let opts := { opts with noGrad := true } let sess ← _root_.Runtime.Autograd.Torch.Internal.EagerSession.new (α := α) opts sess.resetTape let outRef ← (do diff --git a/NN/Verification/TorchLean/Compile.lean b/NN/Verification/TorchLean/Compile.lean index 09e242c..4477e5f 100644 --- a/NN/Verification/TorchLean/Compile.lean +++ b/NN/Verification/TorchLean/Compile.lean @@ -311,6 +311,8 @@ instance {α : Type} [Context α] [DecidableEq Shape] : fail (α := α) "TorchLean→IR: gather is outside the verifier IR fragment" gatherRowsNat := fun {_rows _cols _k} _x _idx => fail (α := α) "TorchLean→IR: gather is outside the verifier IR fragment" + floatVecToNatTensor := fun {_k} _x => + fail (α := α) "TorchLean→IR: float_vec_to_nat_tensor is outside the verifier IR fragment" scatterAddVec := fun {_n} _x _val _i => fail (α := α) "TorchLean→IR: scatter is outside the verifier IR fragment" diff --git a/NN/Verification/TorchLean/SpecEval.lean b/NN/Verification/TorchLean/SpecEval.lean index 7b72c5e..b24a22a 100644 --- a/NN/Verification/TorchLean/SpecEval.lean +++ b/NN/Verification/TorchLean/SpecEval.lean @@ -91,6 +91,8 @@ instance {α : Type} [Context α] [DecidableEq Shape] : Runtime.Autograd.Torch.O "TorchLeanSpecEval: gather_vec_nat not supported in spec backend" gatherRowsNat := fun {_rows _cols _k} _x _idx => throw "TorchLeanSpecEval: gather_rows_nat not supported in spec backend" + floatVecToNatTensor := fun {_k} _x => throw + "TorchLeanSpecEval: float_vec_to_nat_tensor not supported in spec backend" scatterAddVec := fun {_n} _x _val _i => throw "TorchLeanSpecEval: scatter_add_vec not supported in spec backend" scatterAddRow := fun {_rows _cols} _x _row _i => throw diff --git a/README.md b/README.md index 2ce2cd6..8a3c763 100644 --- a/README.md +++ b/README.md @@ -20,13 +20,14 @@ lake exe torchlean mlp --steps 10 --dtype float --backend eager # Optional CUDA run, if the CUDA toolkit and an NVIDIA GPU are available: lake build -K cuda=true -lake exe -K cuda=true torchlean mlp --cuda --fast-kernels --steps 1000 +lake exe -K cuda=true torchlean mlp --cuda --steps 1000 ``` The first MLP command uses the executable IEEE-style Float32 path. The second uses Lean's builtin `Float` runtime path. The CUDA command uses the native GPU runtime path and checks that the CUDA backend is available; it is not a trusted -proof boundary. +proof boundary. Handwritten fused CUDA kernels are opt-in performance paths via +`--fast-kernels`; keep them off unless you are explicitly testing that backend. TorchLean is pinned by `lean-toolchain` and currently builds with `leanprover/lean4:v4.31.0`. diff --git a/csrc/cuda/common/torchlean_cuda_common.h b/csrc/cuda/common/torchlean_cuda_common.h index 4a61d67..73dffcf 100644 --- a/csrc/cuda/common/torchlean_cuda_common.h +++ b/csrc/cuda/common/torchlean_cuda_common.h @@ -18,6 +18,14 @@ static inline void checkCuda(cudaError_t e, const char* msg) { } } +static inline void torchlean_cuda_clear_pending_error() { + (void)cudaGetLastError(); +} + +static inline void torchlean_cuda_check_launch(const char* msg) { + checkCuda(cudaGetLastError(), msg); +} + static inline void torchlean_cuda_free_checked(void** ptr, const char* msg) { if (ptr && *ptr) { checkCuda(cudaFree(*ptr), msg); diff --git a/csrc/cuda/tensor/torchlean_cuda_tensor.cu b/csrc/cuda/tensor/torchlean_cuda_tensor.cu index 9d5f70f..8243166 100644 --- a/csrc/cuda/tensor/torchlean_cuda_tensor.cu +++ b/csrc/cuda/tensor/torchlean_cuda_tensor.cu @@ -891,8 +891,9 @@ extern "C" LEAN_EXPORT lean_obj_res torchlean_cuda_buffer_to_float_array(b_lean_ } \ dim3 blocks = torchlean_blocks_for(b->size); \ dim3 threads = dim3(kBlockSize); \ + torchlean_cuda_clear_pending_error(); \ KERNEL<<>>(b->data, out->data, b->size, (float)c); \ - checkCuda(cudaGetLastError(), "cuda " LABEL " kernel launch failed"); \ + torchlean_cuda_check_launch("cuda " LABEL " kernel launch failed"); \ return torchlean_cuda_buffer_box(out); \ } @@ -1114,13 +1115,15 @@ extern "C" LEAN_EXPORT lean_obj_res torchlean_cuda_buffer_reduce_mean(b_lean_obj "cudaMemcpy reduceMean init failed"); dim3 blocks = torchlean_blocks_for(b->size); dim3 threads = dim3(kBlockSize); + torchlean_cuda_clear_pending_error(); torchlean_reduce_sum_f32<<>>(b->data, out->data, b->size); - checkCuda(cudaGetLastError(), "cuda reduceMean reduce kernel launch failed"); + torchlean_cuda_check_launch("cuda reduceMean reduce kernel launch failed"); } float scale = 1.0f / (float)b->size; + torchlean_cuda_clear_pending_error(); torchlean_scale1_f32<<>>(out->data, scale); - checkCuda(cudaGetLastError(), "cuda reduceMean scale kernel launch failed"); + torchlean_cuda_check_launch("cuda reduceMean scale kernel launch failed"); return torchlean_cuda_buffer_box(out); }