UCT(Unified Communication Transport)是一个传输层, 它屏蔽了各种硬件架构之间的差异, 并提供用于实现通信协议的底层 API。该层的主要目标是以最小的软件开销, 直接而高效地访问硬件网络资源。为此, UCT 依赖于底层驱动/接口, 例如 uGNI、Verbs、共享内存、ROCm、CUDA。此外, 该层还提供通信上下文管理(基于线程和应用程序级别, 如: ucs_async_context_create, uct_worker_create), 以及设备专用内存(包括加速器中的内存)的分配和管理机制。在通信 API 方面, UCT 定义了三类通信操作的接口: 立即发送(short, 短消息, 如: uct_ep_am_short)、缓冲区拷贝发送(bcopy, 如: uct_ep_am_bcopy)和零拷贝(zcopy, 如: uct_ep_am_zcopy)。short 操作针对可以就地提交并完成的小消息进行了优化; bcopy 操作针对通常经由中转缓冲区(bounce buffer)发送的中等大小消息进行了优化; zcopy 操作则提供内存到内存的零拷贝通信语义。
UCT对外API的头文件: src/uct/api/uct.h
PUT操作
uct_ep_put_short
uct_ep_put_bcopy
uct_ep_put_zcopy
GET操作
uct_ep_get_short
uct_ep_get_bcopy
uct_ep_get_zcopy
AM活动消息
uct_ep_am_short
uct_ep_am_short_iov
uct_ep_am_bcopy
uct_ep_am_zcopy
原子操作
uct_ep_atomic_cswap64
uct_ep_atomic_cswap32
uct_ep_atomic32_post
uct_ep_atomic64_post
uct_ep_atomic32_fetch
uct_ep_atomic64_fetch
TAG操作
uct_ep_tag_eager_short
uct_ep_tag_eager_bcopy
uct_ep_tag_eager_zcopy
uct_ep_tag_rndv_zcopy
uct_ep_tag_rndv_cancel
uct_ep_tag_rndv_request
uct_iface_tag_recv_zcopy
uct_iface_tag_recv_cancel
/* Handle type: a pointer to struct uct_md. */
typedef struct uct_md *uct_md_h;
/*
 * Memory domain object (excerpt).
 * Holds a pointer to an operations table (uct_md_ops_t) and a pointer to a
 * component (uct_component_t). NOTE(review): presumably the component is the
 * one that created this memory domain - confirm against the UCX sources.
 */
struct uct_md {
uct_md_ops_t *ops;
uct_component_t *component;
};
/*
 * Memory domain resource descriptor.
 * Carries only the memory domain name, stored in a fixed-size character
 * array of UCT_MD_NAME_MAX bytes.
 */
typedef struct uct_md_resource_desc {
char md_name[UCT_MD_NAME_MAX]; /**< Memory domain name */
} uct_md_resource_desc_t;
// A resource descriptor is an object that represents a network resource.
// It may represent a stand-alone communication resource (e.g. an HCA port or
// a network interface) or multiple resources (e.g. several network interfaces
// or communication ports). It may also represent a virtual communication
// resource defined over a single physical network interface.
//
// The descriptor identifies the resource by transport name and hardware
// device name, and additionally records the device type and a system device
// identifier.
typedef struct uct_tl_resource_desc {
char tl_name[UCT_TL_NAME_MAX]; /**< Transport name */
char dev_name[UCT_DEVICE_NAME_MAX]; /**< Hardware device name */
uct_device_type_t dev_type; /**< The device represented by this resource
(e.g. UCT_DEVICE_TYPE_NET for a network interface) */
ucs_sys_device_t sys_device; /**< The identifier associated with the device
bus_id as captured in ucs_sys_bus_id_t struct */
} uct_tl_resource_desc_t;
/* Handle type: a pointer to struct uct_worker. */
typedef struct uct_worker *uct_worker_h;
/*
 * Worker object (excerpt).
 * Embeds a callback queue (ucs_callbackq_t) by value as progress_q.
 * NOTE(review): presumably this queue holds the callbacks run when the
 * worker is progressed - confirm against the UCX sources.
 */
typedef struct uct_worker {
ucs_callbackq_t progress_q;
} uct_worker_t;
/*
 * Callback queue.
 * Consists of a fixed-size array of fast-path elements followed by an opaque
 * private area whose layout is intentionally hidden from API users.
 */
struct ucs_callbackq {
/**
* Array of fast-path element, the last is reserved as a sentinel to mark
* array end.
*/
ucs_callbackq_elem_t fast_elems[UCS_CALLBACKQ_FAST_COUNT + 1];
/**
* Private data, which we don't want to expose in API to avoid pulling
* more header files
*/
char priv[72];
};
/*
 * Communication interface object (excerpt).
 * Embeds its operations table (uct_iface_ops_t) by value, not by pointer.
 */
typedef struct uct_iface {
uct_iface_ops_t ops;
} uct_iface_t;
// Table of the operations supported by a network interface.
// Each member has a dedicated uct_*_func_t type and is named after the
// operation it stands for; members are grouped by category (put, get, active
// message, atomics, tagged operations, pending queue, synchronization,
// connection establishment, progress control, events, management).
// The member order is part of the struct layout and must not be rearranged.
typedef struct uct_iface_ops {
/* endpoint - put */
uct_ep_put_short_func_t ep_put_short;
uct_ep_put_bcopy_func_t ep_put_bcopy;
uct_ep_put_zcopy_func_t ep_put_zcopy;
/* endpoint - get */
uct_ep_get_short_func_t ep_get_short;
uct_ep_get_bcopy_func_t ep_get_bcopy;
uct_ep_get_zcopy_func_t ep_get_zcopy;
/* endpoint - active message */
uct_ep_am_short_func_t ep_am_short;
uct_ep_am_short_iov_func_t ep_am_short_iov;
uct_ep_am_bcopy_func_t ep_am_bcopy;
uct_ep_am_zcopy_func_t ep_am_zcopy;
/* endpoint - atomics */
uct_ep_atomic_cswap64_func_t ep_atomic_cswap64;
uct_ep_atomic_cswap32_func_t ep_atomic_cswap32;
uct_ep_atomic32_post_func_t ep_atomic32_post;
uct_ep_atomic64_post_func_t ep_atomic64_post;
uct_ep_atomic32_fetch_func_t ep_atomic32_fetch;
uct_ep_atomic64_fetch_func_t ep_atomic64_fetch;
/* endpoint - tagged operations */
uct_ep_tag_eager_short_func_t ep_tag_eager_short;
uct_ep_tag_eager_bcopy_func_t ep_tag_eager_bcopy;
uct_ep_tag_eager_zcopy_func_t ep_tag_eager_zcopy;
uct_ep_tag_rndv_zcopy_func_t ep_tag_rndv_zcopy;
uct_ep_tag_rndv_cancel_func_t ep_tag_rndv_cancel;
uct_ep_tag_rndv_request_func_t ep_tag_rndv_request;
/* interface - tagged operations */
uct_iface_tag_recv_zcopy_func_t iface_tag_recv_zcopy;
uct_iface_tag_recv_cancel_func_t iface_tag_recv_cancel;
/* endpoint - pending queue */
uct_ep_pending_add_func_t ep_pending_add;
uct_ep_pending_purge_func_t ep_pending_purge;
/* endpoint - synchronization */
uct_ep_flush_func_t ep_flush;
uct_ep_fence_func_t ep_fence;
uct_ep_check_func_t ep_check;
/* endpoint - connection establishment */
uct_ep_create_func_t ep_create;
uct_ep_connect_func_t ep_connect;
uct_ep_disconnect_func_t ep_disconnect;
uct_cm_ep_conn_notify_func_t cm_ep_conn_notify;
uct_ep_destroy_func_t ep_destroy;
uct_ep_get_address_func_t ep_get_address;
uct_ep_connect_to_ep_func_t ep_connect_to_ep;
uct_iface_accept_func_t iface_accept;
uct_iface_reject_func_t iface_reject;
/* interface - synchronization */
uct_iface_flush_func_t iface_flush;
uct_iface_fence_func_t iface_fence;
/* interface - progress control */
uct_iface_progress_enable_func_t iface_progress_enable;
uct_iface_progress_disable_func_t iface_progress_disable;
uct_iface_progress_func_t iface_progress;
/* interface - events */
uct_iface_event_fd_get_func_t iface_event_fd_get;
uct_iface_event_arm_func_t iface_event_arm;
/* interface - management */
uct_iface_close_func_t iface_close;
uct_iface_query_func_t iface_query;
/* interface - connection establishment */
uct_iface_get_device_address_func_t iface_get_device_address;
uct_iface_get_address_func_t iface_get_address;
uct_iface_is_reachable_func_t iface_is_reachable;
} uct_iface_ops_t;
对于在远端接口所属保护域内注册的任何虚拟地址, 都可以执行远程内存访问(RMA)。
/*
 * Endpoint object (excerpt).
 * Holds the handle of the interface it is associated with.
 */
typedef struct uct_ep {
uct_iface_h iface; // the interface this endpoint is associated with
} uct_ep_t;
有两种类型的完成:“本地完成”和“远程完成”。
可能无法立即本地完成的通信 API 如下所示:
ucs_status_t uct_OPERATION(... , uct_completion_t *comp)
例如(以下为 UCT 设计文档中的早期签名; 当前 src/uct/api/uct.h 中的 uct_ep_put_zcopy 已改为通过 uct_iov_t 数组和 iovcnt 传递缓冲区):
ucs_status_t uct_ep_put_zcopy(uct_ep_h ep, const void *buffer, size_t length,
uct_mem_h memh, uint64_t remote_addr,
uct_rkey_t rkey, uct_completion_t *comp)
使用示例:
status = api_call(..., &my_handle->comp);
if (likely(status == UCS_OK)) {
/* done */
} else if (status == UCS_INPROGRESS) {
/* started */
} else if (status == UCS_ERR_NO_RESOURCE) {
/* cannot be started now */
} else {
/* error */
}
排序: 回调由最底层接口触发。传输可能不保证本地有序(即 X 的完成并不意味着 0..X-1 已经本地完成)。因此, 上层/用户可能需要为每个分片(fragment)设置回调。此外, 还会提供单独的围栏(fence)操作。
- uct_iface_h: ……(此处原文前半句缺失)可以独立于不同的线程进行。
- uct_mem_h
- rkey_bundle_t: 其中包含 uct_rkey_t 类型的 rkey, 以及一个用于跟踪其资源使用情况的不透明指针。
- create_ep(iface) -> uct_ep_t: 本地操作。
- connect_ep_to_ep(uct_ep_t, remote_iface_addr, remote_ep_addr): 双方都需要调用它, 很可能是本地操作。
- connect_ep_to_iface(uct_ep_t, remote_iface_addr): 可选, 取决于传输能力; 属于单边操作, 只需一侧调用即可。

代码位置: examples/uct_hello_world.c
编译: cd examples; make && ./uct_hello_world
服务端执行(指定RDMA网口和零拷贝模式):
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -z
客户端执行(指定RDMA网口和零拷贝模式, 以及服务端IP):
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -n 172.17.29.63 -z
服务端日志(注意: 以下日志抓取时未加 -z 参数, 因此使用的是 uct_ep_am_short 短消息模式):
export UCX_LOG_LEVEL=debug
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs
[root@node63 ucx]# ./s_uct.sh
[1696660940.924550] [node63:3375441:0] debug.c:1155 UCX DEBUG using signal stack 0x7faadc17a000 size 141824
[1696660940.925589] [node63:3375441:0] init.c:121 UCX DEBUG /home/xb/project/ucx/src/ucs/.libs/libucs.so.0 loaded at 0x7faadbd15000
[1696660940.925611] [node63:3375441:0] init.c:122 UCX DEBUG cmd line: /home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs
[1696660940.925624] [node63:3375441:0] module.c:72 UCX DEBUG ucs library path: /home/xb/project/ucx/src/ucs/.libs/libucs.so.0
[1696660940.925629] [node63:3375441:0] module.c:280 UCX DEBUG loading modules for ucs
INFO: UCT_HELLO_WORLD AM function = uct_ep_am_short server = (null) port = 13337
[1696660940.925665] [node63:3375441:0] module.c:280 UCX DEBUG loading modules for uct
[1696660940.926871] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/net/ib98-0: PF sysfs path is '/sys/devices/pci0000:97/0000:97:04.0/0000:98:00.0'
[1696660940.926884] [node63:3375441:0] topo.c:240 UCX DEBUG added sys_dev 0 for bus id 98:00.0
[1696660940.926889] [node63:3375441:0] topo.c:475 UCX DEBUG ib98-0: bdf_name 0000:98:00.0 sys_dev 0
[1696660940.927070] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/net/ib17-0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660940.927076] [node63:3375441:0] topo.c:240 UCX DEBUG added sys_dev 1 for bus id 17:00.0
[1696660940.927081] [node63:3375441:0] topo.c:475 UCX DEBUG ib17-0: bdf_name 0000:17:00.0 sys_dev 1
[1696660940.927754] [node63:3375441:0] topo.c:787 UCX DEBUG /sys/class/net/lo: sysfs path undetected
[1696660940.927758] [node63:3375441:0] topo.c:479 UCX DEBUG lo: system device unknown
[1696660940.928680] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/net/ethA69-0: PF sysfs path is '/sys/devices/pci0000:68/0000:68:02.0/0000:69:00.0'
[1696660940.928685] [node63:3375441:0] topo.c:240 UCX DEBUG added sys_dev 2 for bus id 69:00.0
[1696660940.928689] [node63:3375441:0] topo.c:475 UCX DEBUG ethA69-0: bdf_name 0000:69:00.0 sys_dev 2
[1696660940.928787] [node63:3375441:0] module.c:280 UCX DEBUG loading modules for uct_ib
[1696660940.933874] [node63:3375441:0] topo.c:792 UCX DEBUG /sys/class/infiniband/mlx5_0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660940.933885] [node63:3375441:0] topo.c:475 UCX DEBUG mlx5_0: bdf_name 0000:17:00.0 sys_dev 1
[1696660940.933906] [node63:3375441:0] ib_device.c:487 UCX DEBUG mlx5_0: vendor_id 0x15b3 device_id 4117
[1696660940.934468] [node63:3375441:0] ib_mlx5dv_md.c:1264 UCX DEBUG mlx5_0: crossing_vhca_mkey is not supported
[1696660940.934661] [node63:3375441:0] ib_mlx5dv_md.c:880 UCX DEBUG mlx5_0: ODP is disabled because version 1 is not supported for DevX QP
[1696660940.934883] [node63:3375441:0] async.c:232 UCX DEBUG added async handler 0x172b320 [id=4 ref 1] ???() to hash
[1696660940.934952] [node63:3375441:0] async.c:494 UCX DEBUG listening to async event fd 4 events 0x1 mode thread_spinlock
[1696660940.934958] [node63:3375441:0] ib_device.c:586 UCX DEBUG initialized device 'mlx5_0' (InfiniBand channel adapter) with 1 ports
[1696660940.934968] [node63:3375441:0] ib_md.c:1115 UCX DEBUG mlx5_0: cuda GPUDirect RDMA is disabled
[1696660940.934974] [node63:3375441:0] ib_md.c:1115 UCX DEBUG mlx5_0: rocm GPUDirect RDMA is disabled
[1696660940.934985] [node63:3375441:0] ib_md.c:1140 UCX DEBUG mlx5_0: dmabuf is supported
[1696660940.934992] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool devx dbrec: align 64, maxelems 4294967295, elemsize 40
[1696660940.935245] [node63:3375441:0] ib_mlx5dv_md.c:1341 UCX DEBUG mlx5_0: opened DEVX md log_max_qp=17
[1696660940.935251] [node63:3375441:0] ib_md.c:1103 UCX DEBUG mlx5_0: relaxed order memory access is disabled
[1696660940.935710] [node63:3375441:0] ib_mlx5dv_md.c:1011 UCX DEBUG created indirect rkey 0x9f00 for remote flush
[1696660940.935715] [node63:3375441:0] ib_md.c:1054 UCX DEBUG mlx5_0: md open by 'uct_ib_mlx5_devx_md_ops' is successful
[1696660940.935750] [node63:3375441:0] ib_device.c:1052 UCX DEBUG no compatible IB ports found for flags 0xc4
[1696660940.935755] [node63:3375441:0] uct_md.c:97 UCX DEBUG failed to query dc_mlx5 resources: No such device
[1696660940.937373] [node63:3375441:0] ib_iface.c:927 UCX DEBUG using pkey[0] 0xffff on mlx5_0:1/RoCE
[1696660940.937429] [node63:3375441:0] ib_device.c:916 UCX DEBUG mlx5_0:1 using gid_index 3
[1696660940.938579] [node63:3375441:0] ib_iface.c:1453 UCX DEBUG created uct_ib_iface_t headroom_ofs 12 payload_ofs 16 hdr_ofs 15 data_sz 8256
[1696660940.938616] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool rc_recv_desc: align 64, maxelems 4294967295, elemsize 8279
[1696660940.938621] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool rc_send_desc: align 64, maxelems 4294967295, elemsize 8328
[1696660940.938705] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool send-ops-mpool: align 64, maxelems 4294967295, elemsize 56
[1696660940.939137] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool pending-ops: align 1, maxelems 4294967295, elemsize 64
[1696660940.939146] [node63:3375441:0] mpool.c:138 UCX DEBUG mpool rc_verbs_short_desc: align 64, maxelems 4294967295, elemsize 200
[1696660940.939623] [node63:3375441:0] ib_iface.c:1052 UCX DEBUG iface=0x1732010: created RC QP 0x1a917 on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660940.945048] [node63:3375441:0] mpool.c:282 UCX DEBUG mpool rc_recv_desc: allocated chunk 0x7faad6a00018 of 37748712 bytes with 4537 elements
Using rc_verbs/mlx5_0:1
Waiting for connection...
[1696660984.949828] [node63:3375441:0] ib_iface.c:1052 UCX DEBUG iface=0x1732010: created RC QP 0x1a91b on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660984.949850] [node63:3375441:0] rc_ep.c:165 UCX DEBUG created rc ep 0x172ce60
[1696660984.950046] [node63:3375441:0] ib_iface.c:809 UCX DEBUG iface 0x1732010: ah_attr dlid=49152 sl=0 port=1 src_path_bits=0 dgid=::ffff:172.17.29.63 flow_label=0xffffffff sgid_index=3 traffic_class=106
[1696660984.950392] [node63:3375441:0] rc_iface.c:934 UCX DEBUG connected rc qp 0x1a91b on mlx5_0:1/RoCE to lid 49152(+0) sl 0 remote_qp 0x1a91a mtu 1024 timer 18x7 rnr 13x7 rd_atom 16
----- UCT TEST SUCCESS ----
[callback] uct_ep_am_short sent ABCDEFGHIJKLMNO (16 bytes)
---------------------------
----- UCT TEST SUCCESS ----
[main] uct_ep_am_short sent ABCDEFGHIJKLMNO (16 bytes)
---------------------------
[1696660984.951144] [node63:3375441:0] rc_ep.c:185 UCX DEBUG destroy rc ep 0x172ce60
[1696660984.951206] [node63:3375441:a] ib_device.c:468 UCX DEBUG IB Async event on mlx5_0: SRQ-attached QP 0x1a91b was flushed
[1696660984.952967] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool rc_verbs_short_desc destroyed
[1696660984.953322] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool send-ops-mpool destroyed
[1696660984.953327] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool rc_send_desc destroyed
[1696660984.953663] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool rc_recv_desc destroyed
[1696660984.953668] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool pending-ops destroyed
[1696660984.954331] [node63:3375441:0] ib_mlx5dv_md.c:1399 UCX DEBUG mlx5_0: md=0x172d3f0 md->flags=0x3f01a3 flush_rkey=0x9f00
[1696660984.954898] [node63:3375441:0] mpool.c:194 UCX DEBUG mpool devx dbrec destroyed
[1696660984.954907] [node63:3375441:0] ib_device.c:605 UCX DEBUG destroying ib device mlx5_0
[1696660984.954915] [node63:3375441:0] async.c:157 UCX DEBUG removed async handler 0x172b320 [id=4 ref 1] ???() from hash
[1696660984.954919] [node63:3375441:0] async.c:547 UCX DEBUG removing async handler 0x172b320 [id=4 ref 1] ???()
[1696660984.954971] [node63:3375441:0] async.c:172 UCX DEBUG release async handler 0x172b320 [id=4 ref 0] ???()
You have mail in /var/spool/mail/root
[root@node63 ucx]#
客户端日志(注意: 以下日志抓取时未加 -z 参数, 因此使用的是 uct_ep_am_short 短消息模式):
export UCX_LOG_LEVEL=debug
/home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -n 172.17.29.63
[root@node63 ucx]# ./c_uct.sh
[1696660984.917046] [node63:3385663:0] debug.c:1155 UCX DEBUG using signal stack 0x7fc7c70b0000 size 141824
[1696660984.929624] [node63:3385663:0] init.c:121 UCX DEBUG /home/xb/project/ucx/src/ucs/.libs/libucs.so.0 loaded at 0x7fc7c6c4b000
[1696660984.929650] [node63:3385663:0] init.c:122 UCX DEBUG cmd line: /home/xb/project/ucx/examples/.libs/lt-uct_hello_world -d mlx5_0:1 -t rc_verbs -n 172.17.29.63
[1696660984.929662] [node63:3385663:0] module.c:72 UCX DEBUG ucs library path: /home/xb/project/ucx/src/ucs/.libs/libucs.so.0
[1696660984.929668] [node63:3385663:0] module.c:280 UCX DEBUG loading modules for ucs
INFO: UCT_HELLO_WORLD AM function = uct_ep_am_short server = 172.17.29.63 port = 13337
[1696660984.929709] [node63:3385663:0] module.c:280 UCX DEBUG loading modules for uct
[1696660984.930933] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/net/ib98-0: PF sysfs path is '/sys/devices/pci0000:97/0000:97:04.0/0000:98:00.0'
[1696660984.930947] [node63:3385663:0] topo.c:240 UCX DEBUG added sys_dev 0 for bus id 98:00.0
[1696660984.930951] [node63:3385663:0] topo.c:475 UCX DEBUG ib98-0: bdf_name 0000:98:00.0 sys_dev 0
[1696660984.931136] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/net/ib17-0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660984.931142] [node63:3385663:0] topo.c:240 UCX DEBUG added sys_dev 1 for bus id 17:00.0
[1696660984.931146] [node63:3385663:0] topo.c:475 UCX DEBUG ib17-0: bdf_name 0000:17:00.0 sys_dev 1
[1696660984.931853] [node63:3385663:0] topo.c:787 UCX DEBUG /sys/class/net/lo: sysfs path undetected
[1696660984.931857] [node63:3385663:0] topo.c:479 UCX DEBUG lo: system device unknown
[1696660984.932818] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/net/ethA69-0: PF sysfs path is '/sys/devices/pci0000:68/0000:68:02.0/0000:69:00.0'
[1696660984.932824] [node63:3385663:0] topo.c:240 UCX DEBUG added sys_dev 2 for bus id 69:00.0
[1696660984.932830] [node63:3385663:0] topo.c:475 UCX DEBUG ethA69-0: bdf_name 0000:69:00.0 sys_dev 2
[1696660984.932925] [node63:3385663:0] module.c:280 UCX DEBUG loading modules for uct_ib
[1696660984.937340] [node63:3385663:0] topo.c:792 UCX DEBUG /sys/class/infiniband/mlx5_0: PF sysfs path is '/sys/devices/pci0000:15/0000:15:04.0/0000:17:00.0'
[1696660984.937350] [node63:3385663:0] topo.c:475 UCX DEBUG mlx5_0: bdf_name 0000:17:00.0 sys_dev 1
[1696660984.937372] [node63:3385663:0] ib_device.c:487 UCX DEBUG mlx5_0: vendor_id 0x15b3 device_id 4117
[1696660984.937974] [node63:3385663:0] ib_mlx5dv_md.c:1264 UCX DEBUG mlx5_0: crossing_vhca_mkey is not supported
[1696660984.938164] [node63:3385663:0] ib_mlx5dv_md.c:880 UCX DEBUG mlx5_0: ODP is disabled because version 1 is not supported for DevX QP
[1696660984.938384] [node63:3385663:0] async.c:232 UCX DEBUG added async handler 0xa1d320 [id=4 ref 1] ???() to hash
[1696660984.938455] [node63:3385663:0] async.c:494 UCX DEBUG listening to async event fd 4 events 0x1 mode thread_spinlock
[1696660984.938462] [node63:3385663:0] ib_device.c:586 UCX DEBUG initialized device 'mlx5_0' (InfiniBand channel adapter) with 1 ports
[1696660984.938472] [node63:3385663:0] ib_md.c:1115 UCX DEBUG mlx5_0: cuda GPUDirect RDMA is disabled
[1696660984.938478] [node63:3385663:0] ib_md.c:1115 UCX DEBUG mlx5_0: rocm GPUDirect RDMA is disabled
[1696660984.938490] [node63:3385663:0] ib_md.c:1140 UCX DEBUG mlx5_0: dmabuf is supported
[1696660984.938497] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool devx dbrec: align 64, maxelems 4294967295, elemsize 40
[1696660984.938739] [node63:3385663:0] ib_mlx5dv_md.c:1341 UCX DEBUG mlx5_0: opened DEVX md log_max_qp=17
[1696660984.938744] [node63:3385663:0] ib_md.c:1103 UCX DEBUG mlx5_0: relaxed order memory access is disabled
[1696660984.939190] [node63:3385663:0] ib_mlx5dv_md.c:1011 UCX DEBUG created indirect rkey 0xae00 for remote flush
[1696660984.939194] [node63:3385663:0] ib_md.c:1054 UCX DEBUG mlx5_0: md open by 'uct_ib_mlx5_devx_md_ops' is successful
[1696660984.939228] [node63:3385663:0] ib_device.c:1052 UCX DEBUG no compatible IB ports found for flags 0xc4
[1696660984.939233] [node63:3385663:0] uct_md.c:97 UCX DEBUG failed to query dc_mlx5 resources: No such device
[1696660984.940832] [node63:3385663:0] ib_iface.c:927 UCX DEBUG using pkey[0] 0xffff on mlx5_0:1/RoCE
[1696660984.940891] [node63:3385663:0] ib_device.c:916 UCX DEBUG mlx5_0:1 using gid_index 3
[1696660984.941917] [node63:3385663:0] ib_iface.c:1453 UCX DEBUG created uct_ib_iface_t headroom_ofs 12 payload_ofs 16 hdr_ofs 15 data_sz 8256
[1696660984.941951] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool rc_recv_desc: align 64, maxelems 4294967295, elemsize 8279
[1696660984.941955] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool rc_send_desc: align 64, maxelems 4294967295, elemsize 8328
[1696660984.942041] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool send-ops-mpool: align 64, maxelems 4294967295, elemsize 56
[1696660984.942491] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool pending-ops: align 1, maxelems 4294967295, elemsize 64
[1696660984.942502] [node63:3385663:0] mpool.c:138 UCX DEBUG mpool rc_verbs_short_desc: align 64, maxelems 4294967295, elemsize 200
[1696660984.942986] [node63:3385663:0] ib_iface.c:1052 UCX DEBUG iface=0xa24010: created RC QP 0x1a919 on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660984.948402] [node63:3385663:0] mpool.c:282 UCX DEBUG mpool rc_recv_desc: allocated chunk 0x7fc7c1a00018 of 37748712 bytes with 4537 elements
Using rc_verbs/mlx5_0:1
[1696660984.949740] [node63:3385663:0] ib_iface.c:1052 UCX DEBUG iface=0xa24010: created RC QP 0x1a91a on mlx5_0:1 TX wr:409 sge:5 inl:124 resp:64 RX wr:0 sge:0 resp:64
[1696660984.949767] [node63:3385663:0] rc_ep.c:165 UCX DEBUG created rc ep 0xa1ee60
[1696660984.950046] [node63:3385663:0] ib_iface.c:809 UCX DEBUG iface 0xa24010: ah_attr dlid=49152 sl=0 port=1 src_path_bits=0 dgid=::ffff:172.17.29.63 flow_label=0xffffffff sgid_index=3 traffic_class=106
[1696660984.950441] [node63:3385663:0] rc_iface.c:934 UCX DEBUG connected rc qp 0x1a91a on mlx5_0:1/RoCE to lid 49152(+0) sl 0 remote_qp 0x1a91b mtu 1024 timer 18x7 rnr 13x7 rd_atom 16
[1696660984.951064] [node63:3385663:a] ib_device.c:468 UCX DEBUG IB Async event on mlx5_0: SRQ-attached QP 0x1a91a was flushed
[1696660984.951190] [node63:3385663:0] rc_ep.c:185 UCX DEBUG destroy rc ep 0xa1ee60
[1696660984.953127] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool rc_verbs_short_desc destroyed
[1696660984.953408] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool send-ops-mpool destroyed
[1696660984.953414] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool rc_send_desc destroyed
[1696660984.953740] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool rc_recv_desc destroyed
[1696660984.953748] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool pending-ops destroyed
[1696660984.954492] [node63:3385663:0] ib_mlx5dv_md.c:1399 UCX DEBUG mlx5_0: md=0xa1f3f0 md->flags=0x3f01a3 flush_rkey=0xae00
[1696660984.955001] [node63:3385663:0] mpool.c:194 UCX DEBUG mpool devx dbrec destroyed
[1696660984.955012] [node63:3385663:0] ib_device.c:605 UCX DEBUG destroying ib device mlx5_0
[1696660984.955021] [node63:3385663:0] async.c:157 UCX DEBUG removed async handler 0xa1d320 [id=4 ref 1] ???() from hash
[1696660984.955027] [node63:3385663:0] async.c:547 UCX DEBUG removing async handler 0xa1d320 [id=4 ref 1] ???()
[1696660984.955073] [node63:3385663:0] async.c:172 UCX DEBUG release async handler 0xa1d320 [id=4 ref 0] ???()
[root@node63 ucx]#
UCT设计: https://github.com/openucx/ucx/wiki/UCT-Design
UCT文档: https://openucx.readthedocs.io/en/master/ucx_features.html
UCX项目原版: https://github.com/openucx/ucx.git
晓兵笔记版: https://github.com/ssbandjl/ucx
博客: https://logread.cn | https://blog.csdn.net/ssbandjl | https://cloud.tencent.com/developer/user/5060293/articles
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。