19 changes: 17 additions & 2 deletions python/CuTeDSL/cutlass/utils/layout.py
@@ -55,19 +55,34 @@ def is_m_major_c(self):
@staticmethod
def from_tensor(tensor: cute.Tensor) -> "LayoutEnum":
    ret = None

    # `leading_dim` is the index of the stride-1 mode. Broadcast tensors
    # (e.g. a 1D bias expanded with a stride-0 mode) report a nested tuple
    # here; its first entry identifies the contiguous mode, so it decides
    # the major-ness exactly like the scalar case below.
    leading_dim = tensor.leading_dim
    if isinstance(leading_dim, tuple):
        leading_dim = leading_dim[0]

    if leading_dim == 1:
        ret = LayoutEnum.ROW_MAJOR
    elif leading_dim == 0:
        ret = LayoutEnum.COL_MAJOR
    else:
        raise ValueError(f"Invalid leading dimension: {tensor.leading_dim}")

    return ret

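A quick sketch of the layout this change is meant to classify (illustrative only, not part of the patch). A 1D bias expanded to an (L, M, N) view carries a stride-0 broadcast mode; the stride arithmetic below is plain PyTorch, while the tuple form of `leading_dim` for such tensors is this patch's premise rather than documented CuTe DSL behavior.

import torch

L, M, N = 1, 128, 128
bias = torch.empty(N, dtype=torch.float16)

# unsqueeze twice: shape (1, 1, N), strides (N, N, 1); expand() then zeroes
# the stride of the broadcast M mode, so no data is materialized
bias_3d = bias.unsqueeze(0).unsqueeze(0).expand(L, M, N)
assert bias_3d.stride() == (N, 0, 1)

# The stride-1 mode is still the innermost N mode, so under the mapping
# above such a view classifies as ROW_MAJOR, the same as a dense
# row-major (M, N) tile.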
89 changes: 89 additions & 0 deletions test/examples/CuTeDSL/sm_100a/epilogue/test_1d_epilogue_efc.py
@@ -0,0 +1,89 @@
# Reproduce command:
# CUTE_DSL_ARCH=sm_100a pytest test/examples/CuTeDSL/sm_100a/epilogue/test_1d_epilogue_efc.py -v
#
# Or run directly:
# CUTE_DSL_ARCH=sm_100a python test/examples/CuTeDSL/sm_100a/epilogue/test_1d_epilogue_efc.py

import sys
import os

import cutlass.cute as cute
import cutlass.torch as cutlass_torch
import cutlass.utils.layout as layout
import cuda.bindings.driver as cuda
import torch
import pytest
import cutlass

# To allow running directly without PYTHONPATH configured
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(script_dir, "..", "..", "..", "..", ".."))

examples_epilogue_path = os.path.join(project_root, "examples", "python", "CuTeDSL", "blackwell", "epilogue")
if examples_epilogue_path not in sys.path:
    sys.path.insert(0, examples_epilogue_path)

from common_dense_gemm_efc import DenseGemmEFC

def my_fused_epilogue(efc_config, Bias, D):
    # Load accumulator
    acc = efc_config.accum()
    # Load 1D bias (this is where the compiler breaks if the broadcast bias
    # layout is not properly supported)
    b = Bias.load()
    # Add them
    res = acc + b
    # Apply GELU
    res = efc_config.gelu(res)
    # Store to D
    D.store(res)

class EFCKernel(DenseGemmEFC):
    def __init__(self):
        super().__init__(
            acc_dtype=cutlass.Float32,
            epi_dtype=cutlass.Float32,
            use_2cta_instrs=True,
            mma_tiler_mn=(128, 128),
            cluster_shape_mn=(2, 1),
            epilogue_function_configuration=my_fused_epilogue,
        )

def test_1d_epilogue_efc_compilation():
    gemm = EFCKernel()

    # 1. Define typical shapes
    M, N, K, L = 128, 128, 128, 1

    a_pt = cutlass_torch.matrix(L, M, K, False, cutlass.Float16)
    b_pt = cutlass_torch.matrix(L, N, K, False, cutlass.Float16)
    d_pt = cutlass_torch.matrix(L, M, N, True, cutlass.Float16)

    # Create the cute tensors for A, B, and D (all standard, non-broadcast layouts)
    a_tensor, _ = cutlass_torch.cute_tensor_like(a_pt.cuda(), cutlass.Float16, is_dynamic_layout=True, assumed_align=16)
    b_tensor, _ = cutlass_torch.cute_tensor_like(b_pt.cuda(), cutlass.Float16, is_dynamic_layout=True, assumed_align=16)
    d_tensor, _ = cutlass_torch.cute_tensor_like(d_pt.cuda(), cutlass.Float16, is_dynamic_layout=True, assumed_align=16)

    # 2. Create the 1D bias vector and broadcast it to (L, M, N) without copying
    bias_pt = torch.empty(N, dtype=torch.float16, device='cuda')
    bias_pt_2d = bias_pt.unsqueeze(0).unsqueeze(0).expand(L, M, N)

    # unsqueeze twice gives strides (N, N, 1); expand() zeroes the stride of
    # the broadcast M mode, so the view is addressed as (N, 0, 1)
    assert bias_pt_2d.stride() == (N, 0, 1), f"Unexpected strides: {bias_pt_2d.stride()}"

    bias_tensor, _ = cutlass_torch.cute_tensor_like(bias_pt_2d, cutlass.Float16, is_dynamic_layout=True, assumed_align=16)

    max_active = cutlass.utils.HardwareInfo().get_max_active_clusters(2)
    stream = torch.cuda.current_stream()
    cu_stream = cuda.CUstream(stream.cuda_stream)

    # Compiling DenseGemmEFC with the broadcast 1D bias tensor must not crash
    try:
        compiled = gemm.compile(
            a_tensor, b_tensor, max_active, cu_stream,
            bias_tensor, d_tensor  # Pass Bias and D to the epilogue signature
        )
    except Exception as e:
        pytest.fail(f"Compiler crash detected when using 1D bias: {str(e)}")

if __name__ == "__main__":
    test_1d_epilogue_efc_compilation()
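For reference, the computation the fused epilogue encodes, left as a comment-level sketch: this is a compile-only test, and running the kernel would require the launch interface of DenseGemmEFC, which is deliberately not exercised here.

# Reference semantics of my_fused_epilogue (not executed by this test):
#     acc = A @ B^T            per batch l, following cutlass_torch.matrix conventions
#     D   = GELU(acc + bias)   bias of shape (N,), broadcast over M (and L)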