Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions cmake/DetectRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ function(detect_runtime_backend)
set(_ASCEND_ROOT "$ENV{ASCEND_HOME}")
elseif(DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(_ASCEND_ROOT "$ENV{ASCEND_TOOLKIT_HOME}")
elseif(DEFINED ENV{ASCEND_HOME_PATH})
set(_ASCEND_ROOT "$ENV{ASCEND_HOME_PATH}")
elseif(EXISTS "/usr/local/Ascend/ascend-toolkit/latest")
set(_ASCEND_ROOT "/usr/local/Ascend/ascend-toolkit/latest")
endif()
Expand All @@ -79,9 +81,85 @@ function(detect_runtime_backend)
)

if(_ASCEND_INCLUDE_DIR AND _ASCEND_LIBRARY)
# Optional FFTS support: locate libruntime and the FFTS Plus headers, then
# collect every include directory those headers may pull in. Nothing here is
# required; later code checks _ASCEND_RUNTIME_LIBRARY and
# _ASCEND_FFTS_INCLUDE_DIR before enabling FFTS.
#
# Per-architecture sub-trees are searched as "<processor>-linux". The tree
# matching the target processor is tried first; "aarch64-linux" stays in the
# list as the fallback that used to be hard-coded, so the search order on
# aarch64 targets is unchanged.
set(_ASCEND_ARCH_DIRS "")
if(CMAKE_SYSTEM_PROCESSOR)
  list(APPEND _ASCEND_ARCH_DIRS "${CMAKE_SYSTEM_PROCESSOR}-linux")
endif()
list(APPEND _ASCEND_ARCH_DIRS "aarch64-linux")
list(REMOVE_DUPLICATES _ASCEND_ARCH_DIRS)

# Architecture-independent locations under the detected root come first.
set(_ASCEND_RUNTIME_LIBRARY_DIRS
  "${_ASCEND_ROOT}/lib64"
  "${_ASCEND_ROOT}/lib"
  "${_ASCEND_ROOT}/runtime/lib64"
  "${_ASCEND_ROOT}/runtime/lib")
set(_ASCEND_FFTS_HEADER_DIRS
  "${_ASCEND_ROOT}/include"
  "${_ASCEND_ROOT}/pkg_inc"
  "${_ASCEND_ROOT}/pkg_inc/runtime")
set(_ASCEND_FFTS_EXTRA_INCLUDE_CANDIDATES
  "${_ASCEND_ROOT}/pkg_inc"
  "${_ASCEND_ROOT}/pkg_inc/toolchain"
  "${_ASCEND_ROOT}/pkg_inc/profiling")

# Then the per-architecture trees, first under the detected root and then
# under the fixed /usr/local/Ascend/cann install location.
foreach(_ASCEND_ARCH_DIR IN LISTS _ASCEND_ARCH_DIRS)
  foreach(_ASCEND_ARCH_PREFIX
      "${_ASCEND_ROOT}/${_ASCEND_ARCH_DIR}"
      "/usr/local/Ascend/cann/${_ASCEND_ARCH_DIR}")
    list(APPEND _ASCEND_RUNTIME_LIBRARY_DIRS
      "${_ASCEND_ARCH_PREFIX}/lib64"
      "${_ASCEND_ARCH_PREFIX}/lib")
    list(APPEND _ASCEND_FFTS_HEADER_DIRS
      "${_ASCEND_ARCH_PREFIX}/pkg_inc"
      "${_ASCEND_ARCH_PREFIX}/pkg_inc/runtime")
    list(APPEND _ASCEND_FFTS_EXTRA_INCLUDE_CANDIDATES
      "${_ASCEND_ARCH_PREFIX}/pkg_inc"
      "${_ASCEND_ARCH_PREFIX}/pkg_inc/toolchain"
      "${_ASCEND_ARCH_PREFIX}/pkg_inc/profiling")
  endforeach()
endforeach()

# NO_DEFAULT_PATH: only the directories listed above are searched, never the
# system default locations.
find_library(_ASCEND_RUNTIME_LIBRARY runtime
  PATHS ${_ASCEND_RUNTIME_LIBRARY_DIRS}
  NO_DEFAULT_PATH
)
find_path(_ASCEND_FFTS_INCLUDE_DIR
  NAMES runtime/rt_ffts_plus.h rt_external_ffts.h
  PATHS ${_ASCEND_FFTS_HEADER_DIRS}
  NO_DEFAULT_PATH
)

if(_ASCEND_FFTS_INCLUDE_DIR)
  set(_ASCEND_FFTS_INCLUDE_DIRS "${_ASCEND_FFTS_INCLUDE_DIR}")

  # When the header was found in a directory that itself ends in "/runtime",
  # also expose the parent directory as an include path.
  get_filename_component(_ASCEND_FFTS_INCLUDE_PARENT
    "${_ASCEND_FFTS_INCLUDE_DIR}" DIRECTORY)
  if(_ASCEND_FFTS_INCLUDE_DIR MATCHES "/runtime$"
      AND EXISTS "${_ASCEND_FFTS_INCLUDE_PARENT}")
    list(APPEND _ASCEND_FFTS_INCLUDE_DIRS "${_ASCEND_FFTS_INCLUDE_PARENT}")
  endif()

  # Add every candidate directory that actually exists on this machine.
  foreach(_ASCEND_EXTRA_INCLUDE_DIR IN LISTS _ASCEND_FFTS_EXTRA_INCLUDE_CANDIDATES)
    if(EXISTS "${_ASCEND_EXTRA_INCLUDE_DIR}")
      list(APPEND _ASCEND_FFTS_INCLUDE_DIRS "${_ASCEND_EXTRA_INCLUDE_DIR}")
    endif()
  endforeach()

  # Locate the profiling headers among the same candidates. Any hit is one of
  # the existing candidate directories, so it normally repeats an entry added
  # above; REMOVE_DUPLICATES below collapses the repeats.
  find_path(_ASCEND_PROF_COMMON_INCLUDE_DIR
    NAMES prof_common.h
    PATHS ${_ASCEND_FFTS_EXTRA_INCLUDE_CANDIDATES}
    NO_DEFAULT_PATH
  )
  if(_ASCEND_PROF_COMMON_INCLUDE_DIR)
    list(APPEND _ASCEND_FFTS_INCLUDE_DIRS
      "${_ASCEND_PROF_COMMON_INCLUDE_DIR}")
  endif()
  find_path(_ASCEND_PROF_API_INCLUDE_DIR
    NAMES prof_api.h toolchain/prof_api.h profiling/prof_api.h
    PATHS ${_ASCEND_FFTS_EXTRA_INCLUDE_CANDIDATES}
    NO_DEFAULT_PATH
  )
  if(_ASCEND_PROF_API_INCLUDE_DIR)
    list(APPEND _ASCEND_FFTS_INCLUDE_DIRS
      "${_ASCEND_PROF_API_INCLUDE_DIR}")
  endif()
  list(REMOVE_DUPLICATES _ASCEND_FFTS_INCLUDE_DIRS)
endif()

# Report what was detected in the configure log.
message(STATUS "Found Ascend runtime: ${_ASCEND_ROOT}")
message(STATUS " Include: ${_ASCEND_INCLUDE_DIR}")
message(STATUS " Library: ${_ASCEND_LIBRARY}")
# FFTS needs both libruntime and an FFTS header; report it as disabled when
# either one is missing.
if(NOT _ASCEND_RUNTIME_LIBRARY OR NOT _ASCEND_FFTS_INCLUDE_DIR)
  message(STATUS " FFTS support: disabled (missing FFTS header or libruntime)")
else()
  message(STATUS " FFTS Includes: ${_ASCEND_FFTS_INCLUDE_DIRS}")
  message(STATUS " Runtime Library: ${_ASCEND_RUNTIME_LIBRARY}")
endif()

# Create imported target
add_library(Ascend::Runtime INTERFACE IMPORTED GLOBAL)
Expand All @@ -92,6 +170,11 @@ function(detect_runtime_backend)
# Publish the detection result to the caller's scope.
set(RUNTIME_BACKEND "Ascend" PARENT_SCOPE)
set(ASCEND_FOUND TRUE PARENT_SCOPE)
set(ASCEND_ROOT "${_ASCEND_ROOT}" PARENT_SCOPE)
# The FFTS variables are exported only when both libruntime and an FFTS header
# directory were found; otherwise they are left unset in the caller's scope.
if(_ASCEND_RUNTIME_LIBRARY AND _ASCEND_FFTS_INCLUDE_DIR)
set(HAVE_ASCEND_FFTS_RUNTIME TRUE PARENT_SCOPE)
set(ASCEND_RUNTIME_LIBRARY "${_ASCEND_RUNTIME_LIBRARY}" PARENT_SCOPE)
set(ASCEND_FFTS_INCLUDE_DIRS "${_ASCEND_FFTS_INCLUDE_DIRS}" PARENT_SCOPE)
endif()

return()
endif()
Expand Down
176 changes: 176 additions & 0 deletions ffts_direct_h2d_io_num_odirect.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# FFTS Direct H2D io_num 与 O_DIRECT 说明

## 当前 direct H2D 实现

`ffts-direct-h2d` 是 Ascend copy benchmark 中的 direct FFTS SDMA H2D 路径。它不经过 CE staging buffer,而是直接把 `aclrtHostGetDevicePointer` 返回的 device-visible mapped host pointer 写入 FFTS SDMA descriptor 的 source,把 device buffer pointer 写入 destination。

当前 direct H2D 包含三个 case:

| Case 名称 | 源 buffer | 目标 buffer | 含义 |
| --- | --- | --- | --- |
| `all_host_to_all_device_ffts_direct_h2d` | 每个子进程各自分配一块 `aclrtMallocHost` host buffer,并注册为 mapped host memory | 每张卡一块 device buffer | all host to all device |
| `one_share_host_to_all_device_ffts_direct_h2d` | 父进程创建一块 POSIX shared memory,所有子进程 mmap 同一块源 buffer,并在各自进程里注册为 mapped host memory | 每张卡一块 device buffer | one shared host to all device |
| `all_odirect_host_to_all_device_ffts_direct_h2d` | 每个子进程分配一块 UCM O_DIRECT local buffer 形态的 anonymous mmap host buffer,并注册为 mapped + pinned host memory | 每张卡一块 device buffer | local direct-IO style host buffer to all device |

校验默认关闭。需要调试数据正确性时,设置 `COPY_FFTS_VALIDATE=1`,程序会初始化确定性的 host pattern,通过 FFTS SDMA 拷贝,再把 device buffer 读回 host 做比较。

## `-n` 与 `frags`

copy benchmark 原来已经有全局 `-n` 参数。为了兼容旧逻辑,direct H2D 现在分成两种模式:

| 模式 | 命令形式 | 含义 |
| --- | --- | --- |
| 兼容模式 | 不传 `--frags`、`-frags` 或 `-f` | `-n` 仍表示总 fragment 数,direct H2D 会把所有 fragment 合成一个 FFTS task 下发。这和旧行为一致。 |
| 多 task 模式 | 传 `--frags <count>`、`-frags <count>` 或 `-f <count>` | `-n` 表示 IO/task 数量,`frags` 表示每个 IO/task 内包含多少个 fragment。程序会分配 `-n * frags` 个 fragment,并在每次迭代里下发 `-n` 个 FFTS task。 |

示例:

```bash
# 旧行为:12800 个 fragment 合并为 1 个 FFTS task。
FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 4M -n 12800 -i 10 -d 8

# 新行为:100 个 IO/task,每个 task 里 128 个 fragment。
FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 4M -n 100 -frags 128 -i 10 -d 8

# 新行为:100 个 IO/task,每个 task 里 128 个 fragment,每张卡共 12800 个 fragment。
FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 32K -n 100 -frags 128 -i 10 -d 8
```

默认 benchmark 配置建议固定 lanes 为 8:

```bash
FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 4M -n 100 -frags 128 -i 10 -d 8

FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 32K -n 100 -frags 128 -i 10 -d 8

FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t one_share_host_to_all_device_ffts_direct_h2d -s 4M -n 100 -frags 128 -i 10 -d 8

FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t one_share_host_to_all_device_ffts_direct_h2d -s 32K -n 100 -frags 128 -i 10 -d 8

FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_odirect_host_to_all_device_ffts_direct_h2d -s 4M -n 100 -frags 128 -i 10 -d 8

FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_odirect_host_to_all_device_ffts_direct_h2d -s 32K -n 100 -frags 128 -i 10 -d 8
```

## 提交流程

direct H2D copy instance 在 `Prepare` 阶段会为每个 fragment 构造一个 copy spec:

```text
mapped host source pointer -> device destination pointer -> fragment size
```

兼容模式下,所有 copy spec 放进同一个 task group,只调用一次 FFTS dispatcher。多 task 模式下,copy spec 会按 `frags` 分组,每个分组单独调用一次 dispatcher。因此 `-n 100 -frags 128` 表示每次统计迭代、每张卡会调用 100 次 `rtFftsPlusTaskLaunchWithFlag`,每次 task 里包含 128 个 fragment。

输出结果里的 `Count` 按总 fragment 数统计,而不是按 task 数统计:

```text
单卡 Count = -n * frags
聚合 Count = -n * frags * device_count
```

## 多进程 submit 与 CPU 亲和性

所有 fork fan-out case 默认不主动绑核。子进程继承启动进程已有的 CPU affinity,因此如果需要限制 CPU 范围,建议在外部用 `taskset`、`numactl` 或调度系统统一控制。

`-n 100 -frags 128` 模式下,每个子进程每次统计迭代会调用 100 次 `rtFftsPlusTaskLaunchWithFlag`。如果 submit 耗时波动较大,应优先在外部控制进程 CPU affinity,再对比是否存在跨 NUMA 调度影响。

## UCM 开启 O_DIRECT 后的 host buffer 形态

UCM local CacheStore buffer 有两种 host 分配路径:

| 条件 | 分配路径 | 实际内存形态 |
| --- | --- | --- |
| `io_direct=false` | `MakeHostBuffer` | `aclrtMallocHost` |
| `io_direct=true` 且 shared buffer 关闭 | `MakeHostBuffer4DirectIo` | anonymous private `mmap`,先尝试 HugeTLB 或 gigantic HugeTLB,失败后 fallback 到 transparent hugepage advice,然后 `mlock`,再 `aclrtHostRegisterV2(MAPPED \| PINNED)` |

新增的 dev-sandbox case `all_odirect_host_to_all_device_ffts_direct_h2d` 对齐第二种形态。它会分配 anonymous mmap host memory,优先尝试 HugeTLB/gigantic HugeTLB,失败后使用 THP advice fallback,然后用 `ACL_HOST_REG_MAPPED | ACL_HOST_REG_PINNED` 注册,并通过 `aclrtHostGetDevicePointer` 拿到 FFTS descriptor 可用的 mapped pointer。

## shared memory + O_DIRECT 的实际形态

UCM 开启 shared memory 后,不会把 shared transfer buffer 切到 local `MakeHostBuffer4DirectIo` 路径。shared buffer 仍然是 POSIX shared memory:

```text
shm_open/ftruncate -> mmap(MAP_SHARED) -> page-aligned data area -> aclrtHostRegisterV2(MAPPED | PINNED)
```

PCStore 里 `ioDirect` 会保存在 reader 上,影响的是文件读写时是否用 `O_DIRECT` 打开文件;shared host buffer 本身仍是 POSIX shared memory,并注册为 mapped + pinned host memory。因此 dev-sandbox 当前已有的 `one_share_host_to_all_device_ffts_direct_h2d` 就对应 shared memory + O_DIRECT 的 host buffer 形态。

## runtime FFTS launch 接口用法

当前 dispatcher 会包含 `runtime/rt_ffts_plus.h` 或 `rt_external_ffts.h`,然后构造 FFTS Plus task descriptor:

| 字段 | 当前取值或含义 |
| --- | --- |
| `rtFftsPlusSqe_t::fftsType` | `RT_FFTS_PLUS_TYPE` |
| `rtFftsPlusSqe_t::totalContextNum` | 当前 task 里的 SDMA context 数量 |
| `rtFftsPlusSqe_t::readyContextNum` | 初始 ready 的 context 数量,由 `FFTS_MAX_READY_LANES` 控制,默认 8 |
| `rtFftsPlusSqe_t::preloadContextNum` | `readyContextNum` 和 128 的较小值 |
| `rtFftsPlusSqe_t::timeout` | 0 |
| `rtFftsPlusSqe_t::subType` | `0x5A`,当前作为 communication task 使用 |
| `rtFftsPlusTaskInfo_t::descBuf` | context descriptor array 的 host 地址 |
| `rtFftsPlusTaskInfo_t::descBufLen` | descriptor array 的字节长度 |
| `rtFftsPlusTaskInfo_t::descAddrType` | `RT_FFTS_PLUS_CTX_DESC_ADDR_TYPE_HOST` |
| `rtFftsPlusTaskInfo_t::argsHandleInfoNum` | 0 |
| `rtFftsPlusTaskInfo_t::argsHandleInfoPtr` | null |

每个 descriptor 是一个 `rtFftsPlusSdmaCtx_t`,通过公共的 128 字节 context 类型承载。当前 context 设置 `contextType = RT_CTX_TYPE_SDMA`,填充 source/destination 地址高低位,以及数据长度字段。

本 checkout 没有 runtime 头文件副本,因此本机无法枚举上游 runtime 支持的全部 FFTS task/context type。当前 dev-sandbox 和 UCM 可见代码只使用 `RT_FFTS_PLUS_TYPE` + `RT_CTX_TYPE_SDMA`。后续如果要扩展其他类型,应基于目标机 runtime 头文件里的其他 `RT_CTX_TYPE_*` layout 新增 dispatcher;如果 runtime 要求不同 task class,再调整 `subType`,同时保持当前 SDMA 路径不变。

## 修改点总结

- copy CLI 支持 `--frags`、`-frags` 和 `-f`。
- direct H2D 默认不传 `--frags`、`-frags` 或 `-f` 时仍保持旧行为:所有 fragment 只下发一个 FFTS task。
- 传 `--frags`、`-frags` 或 `-f` 后,direct H2D 会按 task 拆分,多次调用 `rtFftsPlusTaskLaunchWithFlag`。
- 校验默认关闭,继续保留 `COPY_FFTS_VALIDATE=1` 手动开启。
- 新增 `all_odirect_host_to_all_device_ffts_direct_h2d`,用于覆盖 UCM local O_DIRECT 风格 host memory。
- 明确 shared memory + O_DIRECT 仍对应 POSIX shared memory + mapped/pinned register,dev-sandbox 由 `one_share_host_to_all_device_ffts_direct_h2d` 覆盖。

## 测试方式

本地静态检查:

```bash
git diff --check
```

有 CMake 的环境可做编译检查:

```bash
cmake -B build
cmake --build build -j
```

Ascend 机器上的 runtime smoke:

```bash
# 默认关闭校验,兼容模式。
FFTS_MAX_READY_LANES=8 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 32K -n 8 -i 1 -d 1

# 手动开启校验。
FFTS_MAX_READY_LANES=8 COPY_FFTS_VALIDATE=1 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 32K -n 8 -i 1 -d 1

# 多 task 模式:100 个 task,每个 task 128 个 fragment。
FFTS_MAX_READY_LANES=8 COPY_FFTS_VALIDATE=1 \
./build/module/copy/copy -t all_host_to_all_device_ffts_direct_h2d -s 32K -n 100 -frags 128 -i 1 -d 1

# O_DIRECT local host buffer 形态。
FFTS_MAX_READY_LANES=8 COPY_FFTS_VALIDATE=1 \
./build/module/copy/copy -t all_odirect_host_to_all_device_ffts_direct_h2d -s 32K -n 100 -frags 128 -i 1 -d 1

# shared memory + O_DIRECT 对应的 shared host buffer 形态。
FFTS_MAX_READY_LANES=8 COPY_FFTS_VALIDATE=1 \
./build/module/copy/copy -t one_share_host_to_all_device_ffts_direct_h2d -s 32K -n 100 -frags 128 -i 1 -d 1
```
Loading