Intrinsics Reference

LittleKernel exposes hardware intrinsics through little_kernel.language (commonly imported as ll). Each intrinsic maps to one or more PTX instructions.

Common Intrinsics

Thread Indexing

ll.threadIdx_x()       # threadIdx.x
ll.threadIdx_y()       # threadIdx.y
ll.blockIdx_x()        # blockIdx.x
ll.blockIdx_y()        # blockIdx.y
ll.blockDim_x()        # blockDim.x
ll.get_lane_idx()      # %laneid
ll.get_warp_idx()      # threadIdx.x / 32

Synchronization

ll.syncthreads()       # __syncthreads()
ll.syncwarp()          # __syncwarp(0xFFFFFFFF)
ll.threadfence()       # __threadfence()

Memory

ll.get_smem_address(buf)     # cvta.shared.u32 (SMEM pointer)
ll.get_smem_address64(buf)   # cvta.shared.u64

Timing

ll.clock64()           # clock64() for latency measurement

SM90 (Hopper) Intrinsics

WGMMA (Warp Group MMA)

ll.wgmma_init_accum_64x64()      # Zero-init m64n64 FP32 accumulator
ll.wgmma_init_accum_64x128()     # Zero-init m64n128 FP32 accumulator
ll.wgmma_init_accum_64x256()     # Zero-init m64n256 FP32 accumulator
ll.wgmma_fence()                 # wgmma.fence.sync.aligned
ll.wgmma_commit()                # wgmma.commit_group.sync.aligned
ll.wgmma_wait(n)                 # wgmma.wait_group.sync.aligned N
ll.wgmma_m64n64k16_bf16(ad, bd)  # WGMMA instruction (descriptor-based)
ll.wgmma_m64n128k16_bf16(ad, bd)
ll.wgmma_m64n256k16_bf16(ad, bd)
ll.wgmma_store_d_64x64(C, ...)   # Store accumulator to global memory
ll.wgmma_store_d_64x128(C, ...)
ll.wgmma_store_d_64x256(C, ...)

TMA (Tensor Memory Accelerator)

# 2D TMA load (global -> shared)
ll.tma_load_2d(desc, bar, smem_addr, coord_x, coord_y)

# 2D TMA store (shared -> global)
ll.tma_store_2d(desc, smem_addr, coord_x, coord_y)
ll.tma_store_arrive()
ll.tma_store_wait()

# 1D bulk copy
ll.tma_copy_1d_g2s(smem_addr, gmem_addr, bar, size)

MBarrier

ll.mbarrier_init(bar, count)
ll.mbarrier_arrive_expect_tx(bar, tx_bytes)
ll.mbarrier_try_wait_parity(bar, parity)
ll.mbarrier_arrive(bar)
ll.mbarrier_invalidate(bar)

Cluster

ll.cluster_arrive()
ll.cluster_wait()
ll.cluster_sync()
ll.get_cluster_ctaid()

SM100 (Blackwell) Intrinsics

UMMA (Unified MMA / tcgen05)

ll.elect_one()                        # elect.sync (returns bool)
ll.tcgen05_alloc(num_cols)            # TMEM allocation
ll.tcgen05_dealloc(tmem, num_cols)    # TMEM deallocation
ll.tcgen05_ld_4x(tmem, smem_addr)    # Load 4 rows from SMEM to TMEM
ll.tcgen05_ld_8x(tmem, smem_addr)    # Load 8 rows from SMEM to TMEM
ll.tcgen05_fence_before()             # Fence before MMA
ll.tcgen05_fence_after()              # Fence after MMA
ll.umma_m256nNk16_bf16(idesc, tmem, ad, bd)  # UMMA MMA instruction
ll.umma_commit()                      # Commit UMMA group

TMA (SM100 / cta_group::2)

ll.tma_load_2d_cg2(desc, bar, smem_addr, x, y)  # TMA load with PEER_BIT_MASK
ll.tma_store_2d_sm100(desc, smem_addr, x, y)     # SM100 TMA store
ll.tma_store_commit()                              # SM100 TMA store commit

TMEM Operations

ll.tmem_store_bf16_row(tmem, D, row_off, N, ldm)  # TMEM -> global (BF16)
ll.tmem_epilogue_coalesced_4w(...)                # Optimized epilogue (requires 4 warps)

Cluster MBarrier (SM100)

ll.mbarrier_arrive_cluster(bar)
ll.mbarrier_arrive_expect_tx_cluster(bar, tx)

Utilities

ll.pack_bf16(a, b)        # Pack two BF16 into uint32
ll.st_shared_128(addr, v0, v1, v2, v3)  # 128-bit shared store
ll.uint_as_float(x)       # Reinterpret uint32 as float32
ll.__shfl_sync(mask, val, src_lane, width)  # Warp shuffle

Runtime Helpers

TMA Descriptor Creation

from little_kernel.runtime.tma_descriptor import create_tma_2d_descriptor

desc = create_tma_2d_descriptor(
    tensor,
    gmem_inner_dim=K,
    gmem_outer_dim=M,
    smem_inner_dim=BK,
    smem_outer_dim=BM,
    gmem_outer_stride=K,
    swizzle_mode=128,       # B128 swizzle
    oob_fill=0,
    l2_promotion=3,         # L2_256B
)