Triton-distributed
Getting Started
Installation
Tutorials
End-to-End Integration
Megakernel Implementations
Intra-Kernel Profiler User Guide
Build Triton-distributed
Running Tests
Primitives Provided by Triton-distributed
AutoTuner for Triton-distributed
Kernels & Layers
Kernels
Layers
Models
Python API
triton-dist.language
Triton-distributed Semantics
LittleKernel
LittleKernel
Advanced Topics
Performance of Triton-distributed on AMD GPUs
Download and fix NVSHMEM
End-to-End Demo for Triton-Distributed
How to pull upstream code
Examples
Tutorials
Triton-distributed Documents
Triton-distributed 自动调优器
Triton-distributed
Index
Index
_ | A | B | C | D | E | F | G | H | I | K | L | M | N | O | P | Q | R | S | T | U | V | W | Z
_
__add__() (vector method)
__getitem__() (vector method)
__init__() (vector method)
__mul__() (vector method)
__setitem__() (vector method)
__sub__() (vector method)
__syncthreads()
built-in function
_forward_pull_kernel()
built-in function
_forward_push_2d_kernel()
built-in function
_forward_push_2d_ll_kernel()
built-in function
_forward_push_2d_ll_multimem_kernel()
built-in function
_forward_push_3d_kernel()
built-in function
_forward_push_numa_2d_kernel()
built-in function
_forward_push_numa_2d_ll_kernel()
built-in function
A
ag_gemm()
built-in function
ag_gemm_intra_node()
built-in function
all_to_all_single_gemm()
built-in function
AllGatherMethod (built-in class)
alloc_profiler_buffer()
built-in function
aot_compile_spaces()
built-in function
apply_triton340_inductor_patch()
built-in function
atomic_add()
built-in function
atomic_cas()
built-in function
B
barrier()
built-in function
barrier_all()
built-in function
barrier_all_block()
built-in function
barrier_all_warp()
built-in function
bincount()
built-in function
broadcastmem_block()
built-in function
built-in function
__syncthreads()
_forward_pull_kernel()
_forward_push_2d_kernel()
_forward_push_2d_ll_kernel()
_forward_push_2d_ll_multimem_kernel()
_forward_push_3d_kernel()
_forward_push_numa_2d_kernel()
_forward_push_numa_2d_ll_kernel()
ag_gemm()
ag_gemm_intra_node()
all_to_all_single_gemm()
alloc_profiler_buffer()
aot_compile_spaces()
apply_triton340_inductor_patch()
atomic_add()
atomic_cas()
barrier()
barrier_all()
barrier_all_block()
barrier_all_warp()
bincount()
broadcastmem_block()
combine_kernel_v2()
consume_token()
cp_engine_producer_all_gather_inter_node()
cp_engine_producer_all_gather_intra_node()
create_ag_gemm_context()
create_ag_gemm_intra_node_context()
create_all_to_all_single_gemm_context()
create_ep_ll_a2a_ctx()
create_fast_allgather_context()
create_gemm_ar_context()
create_gemm_rs_context()
create_gemm_rs_intra_node_context()
create_ll_gemm_ar_context()
create_moe_rs_context()
create_sp_ag_attention_context_inter_node()
create_sp_ag_attention_context_intra_node()
create_ulysses_sp_pre_attn_comm_context()
dispatch_kernel_v2()
dist_print()
dump_c_code()
ep_combine_token_inplace()
ep_dispatch_token_inplace()
export_to_perfetto_trace()
extern_call()
extract()
fast_allgather()
fcollectmem_block()
fence()
finalize_distributed()
fused_sp_ag_attn_inter_node()
fused_sp_ag_attn_intra_node()
gemm_allreduce_op()
gemm_non_persistent()
gemm_persistent()
gemm_rs()
gemm_rs_intra_node()
get_ag_splits_and_recv_offset_for_dispatch()
get_ag_splits_and_recv_offset_for_dispatch_intra_node()
get_auto_all_gather_method()
get_bool_env()
get_current_gpu_clock_rate_in_khz()
get_device_max_shared_memory_size()
get_dispatch_send_reqs()
get_int_env()
get_max_gpu_clock_rate_in_khz()
get_numa_node()
get_triton_combine_kv_algo_info()
getmem()
getmem_block()
getmem_nbi()
getmem_nbi_block()
getmem_nbi_warp()
getmem_warp()
gqa_fwd_batch_decode()
gqa_fwd_batch_decode_aot()
gqa_fwd_batch_decode_intra_rank()
gqa_fwd_batch_decode_intra_rank_aot()
gqa_fwd_batch_decode_persistent()
gqa_fwd_batch_decode_persistent_aot()
has_fullmesh_nvlink()
has_tma()
init_nvshmem_by_torch_process_group()
init_rocshmem_by_torch_process_group()
init_seed()
initialize_distributed()
insert()
is_cuda()
is_hip()
is_nvshmem_multimem_supported()
is_shmem_initialized()
kernel_combine_token_intra_node()
kernel_dispatch_token_intra_node()
kernel_gqa_fwd_batch_decode_split_kv_persistent()
kernel_inter_rank_gqa_fwd_batch_decode_combine_kv()
kernel_name_suffix()
kernel_skipped_token_inplace_local_combine_intra_node()
kernel_skipped_token_local_dispatch_intra_node()
ld()
ld_vector()
low_latency_gemm_allreduce_op()
make_ast_source()
make_vector()
materialize_c_params()
mega_kernel_dispatch_token_moe_grouped_gemm()
mega_kernel_moe_grouped_gemm_combine_token()
my_pe()
n_pes()
notify()
num_ranks()
num_threads()
num_warps()
nvshmem_barrier_all_on_stream()
nvshmem_create_tensor()
nvshmem_create_tensors()
nvshmem_free_tensor_sync()
pack()
parse_to_tracks()
putmem()
putmem_block()
putmem_nbi()
putmem_nbi_block()
putmem_nbi_warp()
putmem_signal_block()
putmem_signal_nbi_block()
putmem_warp()
quiet()
rand_tensor()
rank()
remote_mc_ptr()
remote_ptr()
reset_profiler_buffer()
rocshmem_barrier_all_on_stream()
signal_op()
signal_wait_until()
simt_exec_region()
st()
st_vector()
supports_p2p_native_atomic()
symm_at()
sync_all()
team_my_pe()
team_n_pes()
threads_per_warp()
tid()
unpack()
wait()
zeros_vector()
C
combine() (EPAll2AllLayer method)
(EPLowLatencyAllToAllLayer method)
combine_kernel_v2()
built-in function
consume_token()
built-in function
cp_engine_producer_all_gather_inter_node()
built-in function
cp_engine_producer_all_gather_intra_node()
built-in function
create() (DispatchCombineContext class method)
create_ag_gemm_context()
built-in function
create_ag_gemm_intra_node_context()
built-in function
create_all_to_all_single_gemm_context()
built-in function
create_ep_ll_a2a_ctx()
built-in function
create_fast_allgather_context()
built-in function
create_gemm_ar_context()
built-in function
create_gemm_rs_context()
built-in function
create_gemm_rs_intra_node_context()
built-in function
create_ll_gemm_ar_context()
built-in function
create_moe_rs_context()
built-in function
create_sp_ag_attention_context_inter_node()
built-in function
create_sp_ag_attention_context_intra_node()
built-in function
create_ulysses_sp_pre_attn_comm_context()
built-in function
D
dispatch() (EPAll2AllLayer method)
(EPLowLatencyAllToAllLayer method)
dispatch_kernel_v2()
built-in function
DispatchCombineContext (built-in class)
DispatchMetaInfo (built-in class)
dist_print()
built-in function
dump_c_code()
built-in function
dump_combine_trace() (EPLowLatencyAllToAllLayer method)
dump_dispatch_trace() (EPLowLatencyAllToAllLayer method)
E
ep_barrier_all() (EPAll2AllLayer method)
ep_combine_token_inplace()
built-in function
ep_dispatch_token_inplace()
built-in function
EpAll2AllFusedOp (built-in class)
EPAll2AllLayer (built-in class)
EPAllToAllLayoutDesc (built-in class), [1]
EPConfig (built-in class)
EPLowLatencyAllToAllLayer (built-in class)
export_to_perfetto_trace()
built-in function
extern_call()
built-in function
extract()
built-in function
F
fast_allgather()
built-in function
fcollectmem_block()
built-in function
fence()
built-in function
finalize() (DispatchCombineContext method)
(EpAll2AllFusedOp method)
(EPAll2AllLayer method)
(EPLowLatencyAllToAllLayer method)
(LowlatencyDispatchContext method)
finalize_distributed()
built-in function
fused_sp_ag_attn_inter_node()
built-in function
fused_sp_ag_attn_intra_node()
built-in function
G
gemm_allreduce_op()
built-in function
gemm_non_persistent()
built-in function
gemm_persistent()
built-in function
gemm_rs()
built-in function
gemm_rs_intra_node()
built-in function
get_ag_splits_and_recv_offset_for_dispatch()
built-in function
get_ag_splits_and_recv_offset_for_dispatch_intra_node()
built-in function
get_auto_all_gather_method()
built-in function
get_bool_env()
built-in function
get_current_gpu_clock_rate_in_khz()
built-in function
get_device_max_shared_memory_size()
built-in function
get_dispatch_send_reqs()
built-in function
get_int_env()
built-in function
get_max_gpu_clock_rate_in_khz()
built-in function
get_numa_node()
built-in function
get_nvshmem_breakdown() (EpAll2AllFusedOp method)
get_nvshmem_size() (EpAll2AllFusedOp method)
get_nvshmem_size_gb() (EpAll2AllFusedOp method)
get_nvshmem_size_mb() (EpAll2AllFusedOp method)
get_triton_combine_kv_algo_info()
built-in function
getmem()
built-in function
getmem_block()
built-in function
getmem_nbi()
built-in function
getmem_nbi_block()
built-in function
getmem_nbi_warp()
built-in function
getmem_warp()
built-in function
gqa_fwd_batch_decode()
built-in function
gqa_fwd_batch_decode_aot()
built-in function
gqa_fwd_batch_decode_intra_rank()
built-in function
gqa_fwd_batch_decode_intra_rank_aot()
built-in function
gqa_fwd_batch_decode_persistent()
built-in function
gqa_fwd_batch_decode_persistent_aot()
built-in function
H
has_fullmesh_nvlink()
built-in function
has_tma()
built-in function
hidden (EPConfig attribute)
I
init_nvshmem_by_torch_process_group()
built-in function
init_output_buffer() (EpAll2AllFusedOp method)
init_rocshmem_by_torch_process_group()
built-in function
init_seed()
built-in function
initialize_distributed()
built-in function
insert()
built-in function
is_cuda()
built-in function
is_hip()
built-in function
is_intra_node (EPConfig property)
is_nvshmem_multimem_supported()
built-in function
is_shmem_initialized()
built-in function
K
kernel_combine_token_intra_node()
built-in function
kernel_dispatch_token_intra_node()
built-in function
kernel_gqa_fwd_batch_decode_split_kv_persistent()
built-in function
kernel_inter_rank_gqa_fwd_batch_decode_combine_kv()
built-in function
kernel_name_suffix()
built-in function
kernel_skipped_token_inplace_local_combine_intra_node()
built-in function
kernel_skipped_token_local_dispatch_intra_node()
built-in function
L
ld()
built-in function
ld_vector()
built-in function
local_world_size (EPConfig attribute)
low_latency_gemm_allreduce_op()
built-in function
LowlatencyCombineContext (built-in class)
LowlatencyDispatchContext (built-in class)
M
make_ast_source()
built-in function
make_vector()
built-in function
materialize_c_params()
built-in function
max_tokens (EPConfig attribute)
mega_dispatch_group_gemm() (EpAll2AllFusedOp method)
mega_group_gemm_combine() (EpAll2AllFusedOp method)
mega_kernel_dispatch_token_moe_grouped_gemm()
built-in function
mega_kernel_moe_grouped_gemm_combine_token()
built-in function
ModelBuilder (built-in class)
my_pe()
built-in function
N
n_pes()
built-in function
notify()
built-in function
num_dispatch_token_cur_rank (EPAllToAllLayoutDesc attribute), [1]
num_experts (EPConfig attribute)
num_experts_per_rank (EPConfig property)
num_input_tokens_per_rank (EPAllToAllLayoutDesc attribute), [1]
num_ranks()
built-in function
num_recv_tokens_per_rank (EPAllToAllLayoutDesc attribute)
num_threads()
built-in function
num_warps()
built-in function
nvshmem_barrier_all_on_stream()
built-in function
NVSHMEM_CMP_EQ (built-in variable)
NVSHMEM_CMP_GE (built-in variable)
NVSHMEM_CMP_GT (built-in variable)
NVSHMEM_CMP_LE (built-in variable)
NVSHMEM_CMP_LT (built-in variable)
NVSHMEM_CMP_NE (built-in variable)
nvshmem_create_tensor()
built-in function
nvshmem_create_tensors()
built-in function
nvshmem_free_tensor_sync()
built-in function
NVSHMEM_SIGNAL_ADD (built-in variable)
NVSHMEM_SIGNAL_DTYPE (built-in variable)
NVSHMEM_SIGNAL_SET (built-in variable)
NVSHMEM_TEAM_WORLD (built-in variable)
NVSHMEMX_TEAM_NODE (built-in variable)
O
offset_dtype (EPConfig attribute)
P
pack()
built-in function
parse_to_tracks()
built-in function
preprocess() (EpAll2AllFusedOp method)
Profiler (built-in class)
ProfilerBuffer (built-in class)
putmem()
built-in function
putmem_block()
built-in function
putmem_nbi()
built-in function
putmem_nbi_block()
built-in function
putmem_nbi_warp()
built-in function
putmem_signal_block()
built-in function
putmem_signal_nbi_block()
built-in function
putmem_warp()
built-in function
Q
quiet()
built-in function
R
rand_tensor()
built-in function
rank (EPConfig attribute)
rank()
built-in function
reallocate_dispatch_output_buf() (DispatchCombineContext method)
recast() (vector method)
recv_buf_offset_per_expert (EPAllToAllLayoutDesc attribute)
recv_buf_tokens_per_expert (EPAllToAllLayoutDesc attribute)
recv_token_source_count_and_start (DispatchMetaInfo attribute)
recv_token_source_indices (DispatchMetaInfo attribute)
remote_mc_ptr()
built-in function
remote_ptr()
built-in function
reset_profiler_buffer()
built-in function
reversed_token_scatter_idx (EPAllToAllLayoutDesc attribute)
rocshmem_barrier_all_on_stream()
built-in function
S
send_reqs_recv_tensor (EPAllToAllLayoutDesc attribute)
signal_op()
built-in function
signal_val (LowlatencyDispatchContext attribute)
signal_wait_until()
built-in function
simt_exec_region()
built-in function
skipped_token_mapping_indices (EPAllToAllLayoutDesc attribute)
skipped_token_topk_mapping_indices (EPAllToAllLayoutDesc attribute)
st()
built-in function
st_vector()
built-in function
supports_p2p_native_atomic()
built-in function
symm_at()
built-in function
sync() (EpAll2AllFusedOp method)
sync_all()
built-in function
T
team_my_pe()
built-in function
team_n_pes()
built-in function
threads_per_warp()
built-in function
tid()
built-in function
to() (vector method)
token_dst_scatter_idx (EPAllToAllLayoutDesc attribute), [1]
token_dtype (EPConfig attribute)
token_sort_indices (EPAllToAllLayoutDesc attribute)
topk (EPConfig attribute)
topk_indices_tensor (EPAllToAllLayoutDesc attribute), [1]
U
unpack()
built-in function
update_phase() (LowlatencyDispatchContext method)
V
vector (built-in class)
W
wait()
built-in function
weight_dtype (EPConfig attribute)
world_size (EPConfig attribute)
Z
zeros_vector()
built-in function