diff --git a/gpudirect-rdma/nccl-rdma-installer-a4x.yaml b/gpudirect-rdma/nccl-rdma-installer-a4x.yaml index 4eea0eed5..adab1898c 100644 --- a/gpudirect-rdma/nccl-rdma-installer-a4x.yaml +++ b/gpudirect-rdma/nccl-rdma-installer-a4x.yaml @@ -62,7 +62,7 @@ spec: path: /home/kubernetes/bin/nvidia type: Directory initContainers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.0 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.2 name: nccl-rdma-installer resources: requests: diff --git a/gpudirect-rdma/nccl-rdma-installer-autopilot.yaml b/gpudirect-rdma/nccl-rdma-installer-autopilot.yaml index 2e23ade0e..0cbabb671 100644 --- a/gpudirect-rdma/nccl-rdma-installer-autopilot.yaml +++ b/gpudirect-rdma/nccl-rdma-installer-autopilot.yaml @@ -61,7 +61,7 @@ spec: hostPath: path: /home/kubernetes/bin/gib initContainers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.2 name: nccl-rdma-installer resources: requests: diff --git a/gpudirect-rdma/nccl-rdma-installer.yaml b/gpudirect-rdma/nccl-rdma-installer.yaml index 934440143..73324332d 100644 --- a/gpudirect-rdma/nccl-rdma-installer.yaml +++ b/gpudirect-rdma/nccl-rdma-installer.yaml @@ -55,7 +55,7 @@ spec: hostPath: path: /home/kubernetes/bin/gib initContainers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.2 name: nccl-rdma-installer resources: requests: diff --git a/gpudirect-rdma/nccl-test-a4-autopilot.yaml b/gpudirect-rdma/nccl-test-a4-autopilot.yaml index c9c5b7fb3..e65214cc5 100644 --- a/gpudirect-rdma/nccl-test-a4-autopilot.yaml +++ b/gpudirect-rdma/nccl-test-a4-autopilot.yaml @@ -66,7 +66,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: @@ -129,7 +129,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: diff --git a/gpudirect-rdma/nccl-test-a4.yaml b/gpudirect-rdma/nccl-test-a4.yaml index 1e7f6747b..4d46fc900 100644 --- a/gpudirect-rdma/nccl-test-a4.yaml +++ b/gpudirect-rdma/nccl-test-a4.yaml @@ -51,7 +51,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: @@ -111,7 +111,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: diff --git a/gpudirect-rdma/nccl-test-a4x-max-jobset.yaml b/gpudirect-rdma/nccl-test-a4x-max-jobset.yaml index e5adf8883..6be996d1b 100644 --- a/gpudirect-rdma/nccl-test-a4x-max-jobset.yaml +++ b/gpudirect-rdma/nccl-test-a4x-max-jobset.yaml @@ -78,7 +78,7 @@ spec: - name: nccl-test stdin: true tty: true - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-gib-a4x-max-arm64:v1.1.1 + image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.2 env: - name: MY_NODE_NAME valueFrom: diff --git a/gpudirect-rdma/nccl-test-a4x-max.yaml b/gpudirect-rdma/nccl-test-a4x-max.yaml index 37f8c3a33..cd76b1a96 100644 --- a/gpudirect-rdma/nccl-test-a4x-max.yaml +++ b/gpudirect-rdma/nccl-test-a4x-max.yaml @@ -75,7 +75,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-gib-a4x-max-arm64:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.2 securityContext: capabilities: add: ["IPC_LOCK"] @@ -143,7 +143,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-gib-a4x-max-arm64:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.2 securityContext: capabilities: add: ["IPC_LOCK"] diff --git a/gpudirect-rdma/nccl-test-autopilot.yaml b/gpudirect-rdma/nccl-test-autopilot.yaml index 992b26b57..4060546d9 100644 --- a/gpudirect-rdma/nccl-test-autopilot.yaml +++ b/gpudirect-rdma/nccl-test-autopilot.yaml @@ -66,7 +66,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: @@ -129,7 +129,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: diff --git a/gpudirect-rdma/nccl-test-imex-a4x-max.yaml b/gpudirect-rdma/nccl-test-imex-a4x-max.yaml index 951e9404d..34395b0a4 100644 --- a/gpudirect-rdma/nccl-test-imex-a4x-max.yaml +++ b/gpudirect-rdma/nccl-test-imex-a4x-max.yaml @@ -57,7 +57,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-gib-a4x-max-arm64:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.2 name: test resources: requests: @@ -115,7 +115,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-gib-a4x-max-arm64:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-arm64:v1.1.2 name: test resources: requests: diff --git a/gpudirect-rdma/nccl-test-imex-a4x.yaml b/gpudirect-rdma/nccl-test-imex-a4x.yaml index c45232a90..ac31dcad8 100644 --- a/gpudirect-rdma/nccl-test-imex-a4x.yaml +++ b/gpudirect-rdma/nccl-test-imex-a4x.yaml @@ -70,7 +70,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.1.0 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.1.2 name: test resources: requests: @@ -142,7 +142,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.1.0 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.1.2 name: test resources: requests: diff --git a/gpudirect-rdma/nccl-test-managed-rdma.yaml b/gpudirect-rdma/nccl-test-managed-rdma.yaml index aca08e699..9c90e4d2a 100644 --- a/gpudirect-rdma/nccl-test-managed-rdma.yaml +++ b/gpudirect-rdma/nccl-test-managed-rdma.yaml @@ -66,7 +66,7 @@ spec: - name: rdma resourceClaimTemplateName: nccl-test-all-mrdma containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.0 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: @@ -115,7 +115,7 @@ spec: - name: rdma resourceClaimTemplateName: nccl-test-all-mrdma containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.0 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: diff --git a/gpudirect-rdma/nccl-test.yaml b/gpudirect-rdma/nccl-test.yaml index b1616bac1..f708be628 100644 --- a/gpudirect-rdma/nccl-test.yaml +++ b/gpudirect-rdma/nccl-test.yaml @@ -63,7 +63,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: @@ -121,7 +121,7 @@ spec: medium: "Memory" sizeLimit: 250Gi containers: - - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.1 + - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic:v1.1.2 name: test resources: requests: