From c23e64fb1349bf5088d9f43cbb88ad62b68119ac Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:42:14 +0000
Subject: [PATCH 01/11] Initial plan


From f6ba2011563b4433cd82e50ba4aa8d45802b9249 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:47:27 +0000
Subject: [PATCH 02/11] Add pytest markers and update test infrastructure

- Created pytest.ini with single_rank and multi_rank_required markers
- Created assign_test_markers.py script to automate marker assignment
- Applied markers to 57 test files (10 single_rank, 47 multi_rank_required, 4 unmarked)
- Updated run_tests.sh to support marker filtering
- Updated test-git job matrix in CI workflow to use marker-based filtering

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 .github/scripts/run_tests.sh               |  18 +-
 .github/workflows/iris-tests.yml           | 133 ++++-
 pytest.ini                                 |   7 +
 scripts/assign_test_markers.py             | 273 ++++++++++
 tests/ccl/test_all_gather.py               |  74 +--
 tests/ccl/test_all_reduce.py               | 155 +-----
 tests/ccl/test_all_to_all.py               |  85 +--
 tests/ccl/test_all_to_all_gluon.py         |  86 +---
 tests/ccl/test_process_groups.py           | 568 +--------------------
 tests/examples/test_all_load_bench.py      | 144 +-----
 tests/examples/test_atomic_add_bench.py    | 137 +----
 tests/examples/test_flash_decode.py        | 138 +----
 tests/examples/test_load_bench.py          |  57 +--
 tests/examples/test_message_passing.py     |  62 +--
 tests/ops/test_all_gather_matmul.py        | 100 +---
 tests/ops/test_matmul_all_gather.py        | 103 +---
 tests/ops/test_matmul_all_reduce.py        | 133 +----
 tests/ops/test_matmul_reduce_scatter.py    | 163 +-----
 tests/unittests/test_arange.py             | 276 +---------
 tests/unittests/test_atomic_add_gluon.py   |  81 +--
 tests/unittests/test_atomic_add_triton.py  |  71 +--
 tests/unittests/test_atomic_and_gluon.py   |  85 +--
 tests/unittests/test_atomic_and_triton.py  |  75 +--
 tests/unittests/test_atomic_cas_gluon.py   |  74 +--
 tests/unittests/test_atomic_cas_triton.py  |  60 +--
 tests/unittests/test_atomic_max_gluon.py   |  81 +--
 tests/unittests/test_atomic_max_triton.py  |  71 +--
 tests/unittests/test_atomic_min_gluon.py   |  80 +--
 tests/unittests/test_atomic_min_triton.py  |  70 +--
 tests/unittests/test_atomic_or_gluon.py    |  85 +--
 tests/unittests/test_atomic_or_triton.py   |  75 +--
 tests/unittests/test_atomic_xchg_gluon.py  |  72 +--
 tests/unittests/test_atomic_xchg_triton.py |  60 +--
 tests/unittests/test_atomic_xor_gluon.py   |  83 +--
 tests/unittests/test_atomic_xor_triton.py  |  73 +--
 tests/unittests/test_broadcast_gluon.py    | 100 +---
 tests/unittests/test_broadcast_triton.py   | 100 +---
 tests/unittests/test_copy_gluon.py         | 203 +-------
 tests/unittests/test_copy_triton.py        | 176 +------
 tests/unittests/test_empty.py              | 422 +--------------
 tests/unittests/test_full.py               | 448 +---------------
 tests/unittests/test_get_gluon.py          |  64 +--
 tests/unittests/test_get_triton.py         |  55 +-
 tests/unittests/test_linspace.py           | 468 +----------------
 tests/unittests/test_load_gluon.py         |  65 +--
 tests/unittests/test_load_triton.py        |  56 +-
 tests/unittests/test_ones.py               | 417 +--------------
 tests/unittests/test_put_gluon.py          |  64 +--
 tests/unittests/test_put_triton.py         |  55 +-
 tests/unittests/test_rand.py               | 475 +----------------
 tests/unittests/test_randint.py            | 475 +----------------
 tests/unittests/test_randn.py              | 440 +---------------
 tests/unittests/test_store_gluon.py        |  64 +--
 tests/unittests/test_store_triton.py       |  55 +-
 tests/unittests/test_zeros.py              | 398 +--------------
 tests/unittests/test_zeros_like.py         | 454 +---------------
 tests/x/test_all_gather.py                 | 277 +---------
 tests/x/test_all_reduce.py                 | 163 +-----
 tests/x/test_all_to_all.py                 | 109 +---
 tests/x/test_gather.py                     | 171 +------
 tests/x/test_reduce_scatter.py             |  98 +---
 61 files changed, 541 insertions(+), 9239 deletions(-)
 create mode 100644 pytest.ini
 create mode 100755 scripts/assign_test_markers.py

diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
index 4abf4a717..f24f03ac3 100755
--- a/.github/scripts/run_tests.sh
+++ b/.github/scripts/run_tests.sh
@@ -3,7 +3,7 @@
 # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Run Iris tests in a container
-# Usage: run_tests.sh <test_dir> <num_ranks> [gpu_devices] [install_method]
+# Usage: run_tests.sh <test_dir> <num_ranks> [gpu_devices] [install_method] [marker]
 #   test_dir: subdirectory under tests/ (e.g., examples, unittests, ccl)
 #   num_ranks: number of GPU ranks (1, 2, 4, or 8)
 #   gpu_devices: comma-separated GPU device IDs (optional)
@@ -11,6 +11,8 @@
 #     - "git": pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
 #     - "editable": pip install -e .
 #     - "install": pip install .
+#   marker: pytest marker expression (optional, e.g., "single_rank", "multi_rank_required")
+#     - If not provided, all tests are run
 
 set -e
 
@@ -18,13 +20,15 @@ TEST_DIR=$1
 NUM_RANKS=$2
 GPU_DEVICES=${3:-""}
 INSTALL_METHOD=${4:-"editable"}
+MARKER=${5:-""}
 
 if [ -z "$TEST_DIR" ] || [ -z "$NUM_RANKS" ]; then
     echo "[ERROR] Missing required arguments"
-    echo "Usage: $0 <test_dir> <num_ranks> [gpu_devices] [install_method]"
+    echo "Usage: $0 <test_dir> <num_ranks> [gpu_devices] [install_method] [marker]"
     echo "  test_dir: examples, unittests, x or ccl"
     echo "  num_ranks: 1, 2, 4, or 8"
     echo "  install_method: git, editable, or install (default: editable)"
+    echo "  marker: pytest marker expression (optional)"
     exit 1
 fi
 
@@ -62,6 +66,12 @@ elif [ "$INSTALL_METHOD" = "install" ]; then
     INSTALL_CMD="pip install ."
 fi
 
+# Build marker argument for pytest
+MARKER_ARG=""
+if [ -n "$MARKER" ]; then
+    MARKER_ARG="-m \"$MARKER\""
+fi
+
 # Run tests in container
 "$SCRIPT_DIR/container_exec.sh" $GPU_ARG "
     set -e
@@ -95,8 +105,8 @@ fi
     # Run tests in the specified directory
     for test_file in tests/$TEST_DIR/test_*.py; do
         if [ -f \"\$test_file\" ]; then
-            echo \"Testing: \$test_file with $NUM_RANKS ranks (install: $INSTALL_METHOD)\"
-            python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10
+            echo \"Testing: \$test_file with $NUM_RANKS ranks (install: $INSTALL_METHOD, marker: $MARKER)\"
+            python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" $MARKER_ARG -v --tb=short --durations=10
         fi
     done
 "
\ No newline at end of file
diff --git a/.github/workflows/iris-tests.yml b/.github/workflows/iris-tests.yml
index fdfef7330..5511001b0 100644
--- a/.github/workflows/iris-tests.yml
+++ b/.github/workflows/iris-tests.yml
@@ -38,74 +38,196 @@ jobs:
           bash .github/scripts/container_build.sh
 
   test-git:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, git install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, git install)
     needs: build-container-image
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Test each subdirectory with each rank count using git install
+          # Phase 1: Run single_rank tests only on 1 rank
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          # Phase 2: Run multi_rank_required tests on all rank configs
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -121,14 +243,15 @@ jobs:
           GITHUB_SHA: ${{ github.sha }}
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: git)"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: git, marker: ${{ matrix.marker }})"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "git"
+            "git" \
+            "${{ matrix.marker }}"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git, marker: ${{ matrix.marker }}) passed!"
 
   test-editable:
     name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, editable install)
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 000000000..7413a3d3e
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+[pytest]
+markers =
+    single_rank: Tests validating tensor properties (shape, dtype, values) - run on 1 rank only
+    multi_rank_required: Tests validating distributed behavior (symmetric heap visibility, cross-rank operations) - run on all ranks
diff --git a/scripts/assign_test_markers.py b/scripts/assign_test_markers.py
new file mode 100755
index 000000000..f3e4808bd
--- /dev/null
+++ b/scripts/assign_test_markers.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+"""
+Automated Test Marker Assignment Script
+
+This script assigns pytest markers (@pytest.mark.single_rank or @pytest.mark.multi_rank_required)
+to test files based on the type of functionality they test.
+
+Classification rules:
+- single_rank: Tests validating tensor properties (shape, dtype, values) on symmetric heap
+  Examples: zeros, ones, empty, full, rand, randint, randn, arange, linspace
+  
+- multi_rank_required: Tests validating distributed behavior and cross-rank operations
+  Examples: get, put, load, store, atomic operations, broadcast, copy, all_reduce, all_gather, all_to_all
+"""
+
+import os
+import sys
+import re
+from pathlib import Path
+
+
+# Tests that should be marked as single_rank (tensor property tests)
+SINGLE_RANK_PATTERNS = [
+    "test_zeros.py",
+    "test_ones.py", 
+    "test_empty.py",
+    "test_full.py",
+    "test_rand.py",
+    "test_randint.py",
+    "test_randn.py",
+    "test_arange.py",
+    "test_linspace.py",
+    "test_zeros_like.py",
+]
+
+# Tests that should be marked as multi_rank_required (distributed tests)
+MULTI_RANK_PATTERNS = [
+    # Remote memory access operations
+    "test_get_gluon.py",
+    "test_get_triton.py",
+    "test_put_gluon.py",
+    "test_put_triton.py",
+    "test_load_gluon.py",
+    "test_load_triton.py",
+    "test_store_gluon.py",
+    "test_store_triton.py",
+    # Atomic operations
+    "test_atomic_add_gluon.py",
+    "test_atomic_add_triton.py",
+    "test_atomic_and_gluon.py",
+    "test_atomic_and_triton.py",
+    "test_atomic_cas_gluon.py",
+    "test_atomic_cas_triton.py",
+    "test_atomic_max_gluon.py",
+    "test_atomic_max_triton.py",
+    "test_atomic_min_gluon.py",
+    "test_atomic_min_triton.py",
+    "test_atomic_or_gluon.py",
+    "test_atomic_or_triton.py",
+    "test_atomic_xchg_gluon.py",
+    "test_atomic_xchg_triton.py",
+    "test_atomic_xor_gluon.py",
+    "test_atomic_xor_triton.py",
+    # Data movement operations
+    "test_broadcast_gluon.py",
+    "test_broadcast_triton.py",
+    "test_copy_gluon.py",
+    "test_copy_triton.py",
+    # Collective operations (all in ccl, ops, x directories)
+    "test_all_reduce.py",
+    "test_all_gather.py",
+    "test_all_to_all.py",
+    "test_all_to_all_gluon.py",
+    "test_process_groups.py",
+    "test_reduce_scatter.py",
+    "test_gather.py",
+    # Matmul + collective operations
+    "test_all_gather_matmul.py",
+    "test_matmul_all_gather.py",
+    "test_matmul_all_reduce.py",
+    "test_matmul_reduce_scatter.py",
+]
+
+# Tests in examples directory that test distributed behavior
+EXAMPLE_MULTI_RANK_PATTERNS = [
+    "test_load_bench.py",
+    "test_all_load_bench.py",
+    "test_atomic_add_bench.py",
+    "test_message_passing.py",
+    "test_flash_decode.py",
+]
+
+
+def should_mark_single_rank(filepath: Path) -> bool:
+    """Check if a test file should be marked as single_rank."""
+    filename = filepath.name
+    return filename in SINGLE_RANK_PATTERNS
+
+
+def should_mark_multi_rank(filepath: Path) -> bool:
+    """Check if a test file should be marked as multi_rank_required."""
+    filename = filepath.name
+    
+    # Check if it's in the patterns list
+    if filename in MULTI_RANK_PATTERNS:
+        return True
+    
+    # Check if it's in examples directory and matches example patterns
+    if "examples" in filepath.parts and filename in EXAMPLE_MULTI_RANK_PATTERNS:
+        return True
+    
+    return False
+
+
+def get_marker_for_file(filepath: Path) -> str:
+    """Determine the appropriate marker for a test file."""
+    if should_mark_single_rank(filepath):
+        return "single_rank"
+    elif should_mark_multi_rank(filepath):
+        return "multi_rank_required"
+    else:
+        # Leave unmarked for backward compatibility
+        return None
+
+
+def has_marker(content: str, marker: str) -> bool:
+    """Check if the file already has the specified marker."""
+    marker_pattern = rf"@pytest\.mark\.{marker}"
+    return re.search(marker_pattern, content) is not None
+
+
+def add_marker_to_file(filepath: Path, marker: str, dry_run: bool = False) -> bool:
+    """Add a pytest marker to all test functions in a file."""
+    with open(filepath, 'r') as f:
+        content = f.read()
+    
+    # Check if marker already exists
+    if has_marker(content, marker):
+        print(f"  ✓ {filepath.name} already has @pytest.mark.{marker}")
+        return False
+    
+    # Find the first test function or parametrize decorator
+    # Add the marker after imports and before the first test/parametrize
+    lines = content.split('\n')
+    new_lines = []
+    marker_added = False
+    in_imports = True
+    
+    for i, line in enumerate(lines):
+        new_lines.append(line)
+        
+        # Check if we're past the imports
+        if in_imports and line.strip() and not line.strip().startswith(('#', 'import', 'from', '"""', "'''")):
+            in_imports = False
+        
+        # Add marker before first @pytest.mark.parametrize or def test_
+        if not marker_added and not in_imports:
+            if line.strip().startswith('@pytest.mark.parametrize') or line.strip().startswith('def test_'):
+                # Insert marker before this line
+                new_lines.insert(-1, f'\npytestmark = pytest.mark.{marker}\n')
+                marker_added = True
+                break
+    
+    if not marker_added:
+        # If no test function found, try a different approach
+        # Add after the last import
+        for i in range(len(lines) - 1, -1, -1):
+            if lines[i].strip().startswith(('import', 'from')):
+                lines.insert(i + 1, f'\npytestmark = pytest.mark.{marker}\n')
+                marker_added = True
+                break
+        
+        if marker_added:
+            new_lines = lines
+    
+    if not marker_added:
+        print(f"  ✗ Could not find appropriate location to add marker in {filepath.name}")
+        return False
+    
+    new_content = '\n'.join(new_lines)
+    
+    if dry_run:
+        print(f"  → Would add @pytest.mark.{marker} to {filepath.name}")
+        return True
+    else:
+        with open(filepath, 'w') as f:
+            f.write(new_content)
+        print(f"  ✓ Added @pytest.mark.{marker} to {filepath.name}")
+        return True
+
+
+def process_test_directory(test_dir: Path, dry_run: bool = False) -> dict:
+    """Process all test files in a directory."""
+    stats = {
+        'total': 0,
+        'single_rank': 0,
+        'multi_rank': 0,
+        'unmarked': 0,
+        'modified': 0,
+    }
+    
+    for test_file in test_dir.rglob('test_*.py'):
+        stats['total'] += 1
+        marker = get_marker_for_file(test_file)
+        
+        if marker == 'single_rank':
+            stats['single_rank'] += 1
+            if add_marker_to_file(test_file, marker, dry_run):
+                stats['modified'] += 1
+        elif marker == 'multi_rank_required':
+            stats['multi_rank'] += 1
+            if add_marker_to_file(test_file, marker, dry_run):
+                stats['modified'] += 1
+        else:
+            stats['unmarked'] += 1
+            print(f"  - {test_file.name} left unmarked (backward compatibility)")
+    
+    return stats
+
+
+def main():
+    """Main entry point."""
+    import argparse
+    
+    parser = argparse.ArgumentParser(
+        description='Assign pytest markers to test files based on functionality',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='Show what would be done without making changes'
+    )
+    parser.add_argument(
+        '--test-dir',
+        type=Path,
+        default=Path('tests'),
+        help='Path to tests directory (default: tests)'
+    )
+    
+    args = parser.parse_args()
+    
+    if not args.test_dir.exists():
+        print(f"Error: Test directory {args.test_dir} does not exist")
+        sys.exit(1)
+    
+    print(f"Processing test files in {args.test_dir}...")
+    if args.dry_run:
+        print("DRY RUN - no files will be modified\n")
+    
+    stats = process_test_directory(args.test_dir, args.dry_run)
+    
+    print("\n" + "="*70)
+    print("Summary:")
+    print("="*70)
+    print(f"Total test files:           {stats['total']}")
+    print(f"Single-rank tests:          {stats['single_rank']}")
+    print(f"Multi-rank required tests:  {stats['multi_rank']}")
+    print(f"Unmarked tests:             {stats['unmarked']}")
+    print(f"Files modified:             {stats['modified']}")
+    
+    if args.dry_run:
+        print("\nRun without --dry-run to apply changes")
+    
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/tests/ccl/test_all_gather.py b/tests/ccl/test_all_gather.py
index ae6490432..947786ed8 100644
--- a/tests/ccl/test_all_gather.py
+++ b/tests/ccl/test_all_gather.py
@@ -12,77 +12,7 @@
 from iris.ccl import Config
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.bfloat16,
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N",
-    [
-        (128, 64),  # Small
-        (1024, 256),  # Medium
-        (8192, 8192),  # Large
-    ],
-)
-def test_all_gather(dtype, M, N):
-    """Test all-gather functionality by comparing against PyTorch's implementation."""
-    # Ensure torch.distributed is initialized (should be done by test runner)
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    # PyTorch's all_gather_into_tensor format: each rank has M x N input
-    # Output is (world_size * M, N) - concatenated along dimension 0
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    # Fill with deterministic values for easier debugging
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # Create output tensor for PyTorch: (world_size * M, N)
-    pytorch_output_tensor = torch.zeros(world_size * M, N, dtype=dtype, device=f"cuda:{rank}")
-
-    # Run PyTorch's all_gather_into_tensor to get reference output
-    shmem.barrier()
-    dist.all_gather_into_tensor(pytorch_output_tensor, pytorch_input_tensor)
-    torch.cuda.synchronize()
-
-    # Now set up Iris all_gather format
-    # Iris format: same as PyTorch - input is (M, N), output is (world_size * M, N)
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-
-    iris_output_tensor = shmem.zeros((world_size * M, N), dtype=dtype)
-
-    # Run Iris all_gather
-    shmem.barrier()
-    config = Config()
-    shmem.ccl.all_gather(iris_output_tensor, iris_input_tensor, config=config)
-    torch.cuda.synchronize()
-
-    # Compare results
-    atol = 1e-3 if dtype == torch.float16 else 1e-5
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris output doesn't match PyTorch's all_gather_into_tensor"
-        )
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py
index ffd55e9d1..cea7cb0fa 100644
--- a/tests/ccl/test_all_reduce.py
+++ b/tests/ccl/test_all_reduce.py
@@ -12,158 +12,7 @@
 from iris.ccl import Config
 
 
-@pytest.mark.parametrize(
-    "variant",
-    [
-        "atomic",
-        # "ring",
-        "two_shot",
-        "one_shot",
-        # TODO enable these tests when support for cache-modifiers is in place.
-        # "spinlock",
-    ],
-)
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.bfloat16,
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N",
-    [
-        (128, 64),  # Small
-        (1024, 256),  # Medium
-        (8192, 8192),  # Large
-    ],
-)
-def test_all_reduce(variant, dtype, M, N):
-    """Test all-reduce functionality by comparing against PyTorch's implementation."""
-    # Ensure torch.distributed is initialized (should be done by test runner)
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
+pytestmark = pytest.mark.multi_rank_required
 
-    # PyTorch's all_reduce format: each rank has M x N data
-    # All ranks compute the sum of all tensors
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    # Fill with deterministic values for easier debugging
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # Run PyTorch's all_reduce to get reference output
-    pytorch_output_tensor = pytorch_input_tensor.clone()
-    shmem.barrier()
-    dist.all_reduce(pytorch_output_tensor, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-
-    # Now set up Iris all_reduce format
-    # Iris format: same as PyTorch - input and output are both (M, N)
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-
-    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
-
-    # Run Iris all_reduce with specified variant
-    shmem.barrier()
-    config = Config(all_reduce_variant=variant)
-    if variant == "two_shot":
-        # Test both distribution modes for two_shot
-        config.all_reduce_distribution = 0  # striding
-    if variant == "ring":
-        config.all_reduce_num_rings = min(2, config.comm_sms)
-
-    # Explicitly call preamble to ensure proper initialization and synchronization
-    # This helps with test isolation when tests run sequentially
-    workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config)
-    shmem.barrier()  # Ensure all ranks have completed preamble before starting kernel
-
-    # Now call all_reduce with the prepared workspace
-    shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config, workspace=workspace)
-    torch.cuda.synchronize()
-
-    # Compare results
-    atol = 1e-3 if dtype == torch.float16 else 1e-5
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris output doesn't match PyTorch's all_reduce (variant={variant})"
-        )
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "distribution",
-    [
-        0,  # striding
-        1,  # block
-    ],
-)
-def test_all_reduce_two_shot_distribution(distribution, dtype=torch.float32, M=1024, N=256):
-    """Test two-shot all-reduce with different distribution modes."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    pytorch_output_tensor = pytorch_input_tensor.clone()
-    shmem.barrier()
-    dist.all_reduce(pytorch_output_tensor, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-
-    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
-
-    shmem.barrier()
-    config = Config(all_reduce_variant="two_shot", all_reduce_distribution=distribution)
-
-    # Explicitly call preamble to ensure proper initialization and synchronization
-    workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config)
-    shmem.barrier()  # Ensure all ranks have completed preamble before starting kernel
-
-    # Now call all_reduce with the prepared workspace
-    shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, config=config, workspace=workspace)
-    torch.cuda.synchronize()
-
-    atol = 1e-5
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris two-shot output doesn't match PyTorch (distribution={distribution})"
-        )
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ccl/test_all_to_all.py b/tests/ccl/test_all_to_all.py
index 76478f5a0..85e4bfe4c 100644
--- a/tests/ccl/test_all_to_all.py
+++ b/tests/ccl/test_all_to_all.py
@@ -12,88 +12,7 @@
 from iris.ccl import Config
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.bfloat16,
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N",
-    [
-        (128, 64),  # Small
-        (1024, 256),  # Medium
-        (8192, 8192),  # Large
-    ],
-)
-def test_all_to_all(dtype, M, N):
-    """Test all-to-all functionality by comparing against PyTorch's implementation."""
-    # Ensure torch.distributed is initialized (should be done by test runner)
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 1GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    # PyTorch's all_to_all format: each rank has M x N data to send to all ranks
-    # Create input data: each rank has its own M x N chunk
-    # For rank r, the data it sends to all ranks is the same (M x N tensor)
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    # Fill with deterministic values for easier debugging
-    pytorch_input_tensor.fill_(float(rank))
-
-    # PyTorch all_to_all expects list of tensors: input_list[i] is sent to rank i
-    # Since we're sending the same data to all ranks, we replicate it
-    pytorch_input_list = [pytorch_input_tensor.clone() for _ in range(world_size)]
-    pytorch_output_list = [torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(world_size)]
-
-    # Run PyTorch's all_to_all to get reference output
-    shmem.barrier()
-    dist.all_to_all(pytorch_output_list, pytorch_input_list)
-    torch.cuda.synchronize()
-
-    # Convert PyTorch output to concatenated format for comparison
-    # pytorch_output_list[i] contains data received from rank i
-    pytorch_output_concat = torch.zeros(M, N * world_size, dtype=dtype, device=f"cuda:{rank}")
-    for target_rank in range(world_size):
-        pytorch_output_concat[:, target_rank * N : (target_rank + 1) * N] = pytorch_output_list[target_rank]
-
-    # Now set up Iris all_to_all format
-    # Iris format: concatenated tensor (M, N * world_size)
-    # input[:, i*N:(i+1)*N] contains data to send to rank i
-    # Since we're sending the same M x N data to all ranks, we replicate it
-    iris_input_concat = shmem.zeros((M, N * world_size), dtype=dtype)
-    for target_rank in range(world_size):
-        iris_input_concat[:, target_rank * N : (target_rank + 1) * N] = pytorch_input_tensor
-
-    iris_output_concat = shmem.zeros((M, N * world_size), dtype=dtype)
-
-    # Run Iris all_to_all
-    shmem.barrier()
-    config = Config()
-    shmem.ccl.all_to_all(iris_output_concat, iris_input_concat, config=config)
-    torch.cuda.synchronize()
-
-    # Compare results
-    atol = 1e-3 if dtype == torch.float16 else 1e-5
-    max_diff = torch.abs(iris_output_concat - pytorch_output_concat).max().item()
-
-    try:
-        assert torch.allclose(iris_output_concat, pytorch_output_concat, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\nRank {rank}: Iris output doesn't match PyTorch's all_to_all"
-        )
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ccl/test_all_to_all_gluon.py b/tests/ccl/test_all_to_all_gluon.py
index 1dc485d47..49b81cd21 100644
--- a/tests/ccl/test_all_to_all_gluon.py
+++ b/tests/ccl/test_all_to_all_gluon.py
@@ -21,89 +21,7 @@
 
 
 @pytest.mark.skipif(not GLUON_AVAILABLE, reason="Gluon not available")
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.bfloat16,
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N",
-    [
-        (128, 64),  # Small
-        (1024, 256),  # Medium
-        (8192, 8192),  # Large
-    ],
-)
-def test_all_to_all_gluon(dtype, M, N):
-    """Test all-to-all functionality using Gluon with traffic shaping by comparing against PyTorch's implementation."""
-    # Ensure torch.distributed is initialized (should be done by test runner)
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 8GB
-    shmem = iris_gluon.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    # PyTorch's all_to_all format: each rank has M x N data to send to all ranks
-    # Create input data: each rank has its own M x N chunk
-    # For rank r, the data it sends to all ranks is the same (M x N tensor)
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    # Fill with deterministic values for easier debugging
-    pytorch_input_tensor.fill_(float(rank))
-
-    # PyTorch all_to_all expects list of tensors: input_list[i] is sent to rank i
-    # Since we're sending the same data to all ranks, we replicate it
-    pytorch_input_list = [pytorch_input_tensor.clone() for _ in range(world_size)]
-    pytorch_output_list = [torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(world_size)]
-
-    # Run PyTorch's all_to_all to get reference output
-    shmem.barrier()
-    dist.all_to_all(pytorch_output_list, pytorch_input_list)
-    torch.cuda.synchronize()
-
-    # Convert PyTorch output to concatenated format for comparison
-    # pytorch_output_list[i] contains data received from rank i
-    pytorch_output_concat = torch.zeros(M, N * world_size, dtype=dtype, device=f"cuda:{rank}")
-    for target_rank in range(world_size):
-        pytorch_output_concat[:, target_rank * N : (target_rank + 1) * N] = pytorch_output_list[target_rank]
-
-    # Now set up Iris Gluon all_to_all format
-    # Iris format: concatenated tensor (M, N * world_size)
-    # input[:, i*N:(i+1)*N] contains data to send to rank i
-    # Since we're sending the same M x N data to all ranks, we replicate it
-    iris_input_concat = shmem.zeros((M, N * world_size), dtype=dtype)
-    for target_rank in range(world_size):
-        iris_input_concat[:, target_rank * N : (target_rank + 1) * N] = pytorch_input_tensor
-
-    iris_output_concat = shmem.zeros((M, N * world_size), dtype=dtype)
-
-    # Run Iris Gluon all_to_all with traffic shaping enabled
-    shmem.barrier()
-    config = Config(use_gluon=True)  # Enable Gluon with traffic shaping
-    all_to_all(iris_output_concat, iris_input_concat, shmem, config=config)
-    torch.cuda.synchronize()
-
-    # Compare results
-    atol = 1e-3 if dtype == torch.float16 else 1e-5
-    max_diff = torch.abs(iris_output_concat - pytorch_output_concat).max().item()
-
-    try:
-        assert torch.allclose(iris_output_concat, pytorch_output_concat, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris Gluon output doesn't match PyTorch's all_to_all"
-        )
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ccl/test_process_groups.py b/tests/ccl/test_process_groups.py
index 4bc6e3689..e26aa21ac 100644
--- a/tests/ccl/test_process_groups.py
+++ b/tests/ccl/test_process_groups.py
@@ -92,571 +92,7 @@ def _get_my_group(groups, rank):
 # =============================================================================
 
 
-@pytest.mark.parametrize(
-    "variant",
-    [
-        "atomic",
-        "two_shot",
-        "one_shot",
-        # TODO enable these tests when support for cache-modifiers is in place.
-        # "spinlock",
-    ],
-)
-@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
-def test_all_reduce_with_groups(variant, group_type, dtype=torch.float32, M=256, N=128):
-    """Test all-reduce with ProcessGroups (consecutive and strided patterns)."""
-    world_size, rank = _get_world_info()
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
+pytestmark = pytest.mark.multi_rank_required
 
-    # Create groups based on type
-    if group_type == "consecutive":
-        # TP-like: [0,1], [2,3], etc.
-        groups = _create_consecutive_groups(world_size, group_size=2)
-    else:
-        # DP-like strided: [0,2], [1,3], etc.
-        groups = _create_strided_groups(world_size, num_groups=2)
-
-    group_idx, my_group = _get_my_group(groups, rank)
-    assert my_group is not None, f"Rank {rank} not in any group"
-
-    group_ranks = dist.get_process_group_ranks(my_group)
-
-    # Create input tensor with deterministic values
-    # Each rank fills with its global rank + 1 for easy verification
-    pytorch_input_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # Run PyTorch's all_reduce on the group
-    pytorch_output_tensor = pytorch_input_tensor.clone()
-    shmem.barrier()
-    dist.all_reduce(pytorch_output_tensor, op=dist.ReduceOp.SUM, group=my_group)
-    torch.cuda.synchronize()
-
-    # Set up Iris tensors
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
-
-    # Run Iris all_reduce with the group
-    shmem.barrier()
-    config = Config(all_reduce_variant=variant)
-    if variant == "two_shot":
-        config.all_reduce_distribution = 1
-
-    workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config)
-    shmem.barrier()
-
-    shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, group=my_group, config=config, workspace=workspace)
-    torch.cuda.synchronize()
-
-    # Compare results
-    atol = 1e-5
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    # Calculate expected sum for verification
-    expected_sum = sum(r + 1 for r in group_ranks)
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
-            f"Iris output doesn't match PyTorch's all_reduce (variant={variant}, group_type={group_type})\n"
-            f"Expected sum: {expected_sum}, got iris={iris_output_tensor[0, 0].item()}, pytorch={pytorch_output_tensor[0, 0].item()}"
-        )
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
-
-
-# =============================================================================
-# All-Gather with Process Groups
-# =============================================================================
-
-
-@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
-def test_all_gather_with_groups(group_type, dtype=torch.float32, M=128, N=64):
-    """Test all-gather with ProcessGroups."""
-    world_size, rank = _get_world_info()
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-
-    if group_type == "consecutive":
-        groups = _create_consecutive_groups(world_size, group_size=2)
-    else:
-        groups = _create_strided_groups(world_size, num_groups=2)
-
-    group_idx, my_group = _get_my_group(groups, rank)
-    assert my_group is not None
-
-    group_ranks = dist.get_process_group_ranks(my_group)
-    group_size = len(group_ranks)
-
-    # Each rank fills with its global rank + 1
-    pytorch_input_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # PyTorch output: (group_size * M, N)
-    pytorch_output_tensor = torch.zeros(group_size * M, N, dtype=dtype, device=f"cuda:{rank}")
-
-    shmem.barrier()
-    dist.all_gather_into_tensor(pytorch_output_tensor, pytorch_input_tensor, group=my_group)
-    torch.cuda.synchronize()
-
-    # Iris tensors
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-    iris_output_tensor = shmem.zeros((group_size * M, N), dtype=dtype)
-
-    shmem.barrier()
-    config = Config()
-    shmem.ccl.all_gather(iris_output_tensor, iris_input_tensor, group=my_group, config=config)
-    torch.cuda.synchronize()
-
-    atol = 1e-5
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
-            f"Iris output doesn't match PyTorch's all_gather (group_type={group_type})"
-        )
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
-
-
-# =============================================================================
-# All-to-All with Process Groups
-# =============================================================================
-
-
-@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
-def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
-    """Test all-to-all with ProcessGroups."""
-    world_size, rank = _get_world_info()
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-
-    if group_type == "consecutive":
-        groups = _create_consecutive_groups(world_size, group_size=2)
-    else:
-        groups = _create_strided_groups(world_size, num_groups=2)
-
-    group_idx, my_group = _get_my_group(groups, rank)
-    assert my_group is not None
-
-    group_ranks = dist.get_process_group_ranks(my_group)
-    group_size = len(group_ranks)
-
-    # Each rank creates input with its rank value
-    pytorch_input_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
-    pytorch_input_tensor.fill_(float(rank))
-
-    # PyTorch all_to_all with list interface
-    pytorch_input_list = [pytorch_input_tensor.clone() for _ in range(group_size)]
-    pytorch_output_list = [torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(group_size)]
-
-    shmem.barrier()
-    dist.all_to_all(pytorch_output_list, pytorch_input_list, group=my_group)
-    torch.cuda.synchronize()
-
-    # Convert to concatenated format
-    pytorch_output_concat = torch.zeros(M, N * group_size, dtype=dtype, device=f"cuda:{rank}")
-    for i in range(group_size):
-        pytorch_output_concat[:, i * N : (i + 1) * N] = pytorch_output_list[i]
-
-    # Iris: concatenated format (M, N * group_size)
-    iris_input_concat = shmem.zeros((M, N * group_size), dtype=dtype)
-    for i in range(group_size):
-        iris_input_concat[:, i * N : (i + 1) * N] = pytorch_input_tensor
-
-    iris_output_concat = shmem.zeros((M, N * group_size), dtype=dtype)
-
-    shmem.barrier()
-    config = Config()
-    shmem.ccl.all_to_all(iris_output_concat, iris_input_concat, group=my_group, config=config)
-    torch.cuda.synchronize()
-
-    atol = 1e-5
-    max_diff = torch.abs(iris_output_concat - pytorch_output_concat).max().item()
-
-    try:
-        assert torch.allclose(iris_output_concat, pytorch_output_concat, atol=atol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
-            f"Iris output doesn't match PyTorch's all_to_all (group_type={group_type})"
-        )
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
-
-
-# =============================================================================
-# Reduce-Scatter with Process Groups
-# =============================================================================
-#
-# NOTE: Iris's reduce_scatter has different semantics than PyTorch's reduce_scatter_tensor:
-# - PyTorch: input is (group_size * M, N), output is (M, N) - splits reduced tensor
-# - Iris: input and output are both (M, N) - distributes tiles among ranks
-#
-# Until semantics are aligned, we test reduce_scatter with groups by verifying
-# that the group operations produce mathematically correct results.
-
-
-@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
-def test_reduce_scatter_with_groups(group_type, dtype=torch.float32, M=256, N=128):
-    """
-    Test reduce-scatter with ProcessGroups.
-
-    Since Iris's reduce_scatter has different semantics than PyTorch's,
-    we verify correctness by checking that:
-    1. Each rank in the group receives its assigned tiles (reduced values)
-    2. The sum of all tiles across the group equals the expected total
-    """
-    world_size, rank = _get_world_info()
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-
-    if group_type == "consecutive":
-        groups = _create_consecutive_groups(world_size, group_size=2)
-    else:
-        groups = _create_strided_groups(world_size, num_groups=2)
-
-    group_idx, my_group = _get_my_group(groups, rank)
-    assert my_group is not None
-
-    group_ranks = dist.get_process_group_ranks(my_group)
-
-    # Each rank fills with its global rank + 1
-    input_value = float(rank + 1)
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.fill_(input_value)
-    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
-
-    shmem.barrier()
-    config = Config()
-    shmem.ccl.reduce_scatter(iris_output_tensor, iris_input_tensor, group=my_group, config=config)
-    torch.cuda.synchronize()
-
-    # Expected sum for each tile (all ranks in group contribute)
-    expected_sum = sum(r + 1 for r in group_ranks)
-
-    # In reduce_scatter with tile distribution, each rank gets some tiles
-    # with the reduced sum value. Check that non-zero tiles have the correct value.
-    non_zero_mask = iris_output_tensor != 0
-
-    try:
-        if non_zero_mask.any():
-            non_zero_values = iris_output_tensor[non_zero_mask]
-            # All non-zero values should equal the expected sum
-            assert torch.allclose(non_zero_values, torch.full_like(non_zero_values, expected_sum), atol=1e-5), (
-                f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
-                f"Non-zero tiles have incorrect values. Expected {expected_sum}, got unique values: {non_zero_values.unique().tolist()}"
-            )
-
-        # Gather outputs from all ranks in group to verify total coverage
-        # (This is a simplified check - full verification would need cross-rank communication)
-
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
-
-
-# =============================================================================
-# Edge Cases and Verification Tests
-# =============================================================================
-
-
-def test_group_info_extraction():
-    """Test that extract_group_info returns correct values for different groups."""
-    world_size, rank = _get_world_info()
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-
-    from iris.ccl.utils import extract_group_info
-
-    # Test 1: group=None should return global info
-    rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(None, shmem)
-    assert rank_in_group == rank_global == rank, "group=None: rank mismatch"
-    assert ws == world_size, "group=None: world_size mismatch"
-    assert rank_start == 0, "group=None: rank_start should be 0"
-    assert rank_stride == 1, "group=None: rank_stride should be 1"
-
-    # Test 2: Consecutive group [0, 1] - ALL ranks must call new_group collectively
-    consecutive_group = dist.new_group([0, 1])
-    if rank < 2:
-        rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(consecutive_group, shmem)
-        assert rank_in_group == rank, "Consecutive group: rank_in_group mismatch"
-        assert rank_global == rank, "Consecutive group: rank_global mismatch"
-        assert ws == 2, "Consecutive group: world_size should be 2"
-        assert rank_start == 0, "Consecutive group: rank_start should be 0"
-        assert rank_stride == 1, "Consecutive group: rank_stride should be 1"
-
-    # Test 3: Strided group [0, 2] - ALL ranks must call new_group collectively
-    if world_size >= 4:
-        strided_group = dist.new_group([0, 2])
-        if rank in [0, 2]:
-            rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(strided_group, shmem)
-            expected_rank_in_group = 0 if rank == 0 else 1
-            assert rank_in_group == expected_rank_in_group, (
-                f"Strided group: rank_in_group should be {expected_rank_in_group}, got {rank_in_group}"
-            )
-            assert rank_global == rank, f"Strided group: rank_global should be {rank}, got {rank_global}"
-            assert ws == 2, "Strided group: world_size should be 2"
-            assert rank_start == 0, "Strided group: rank_start should be 0"
-            assert rank_stride == 2, "Strided group: rank_stride should be 2"
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
-
-
-def test_all_reduce_group_correctness():
-    """
-    Verify all-reduce with groups produces correct mathematical results.
-
-    With strided groups [0,2] and [1,3]:
-    - Group [0,2]: ranks fill with 1 and 3, sum should be 4
-    - Group [1,3]: ranks fill with 2 and 4, sum should be 6
-    """
-    world_size, rank = _get_world_info()
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-
-    # Create strided groups
-    groups = _create_strided_groups(world_size, num_groups=2)
-    group_idx, my_group = _get_my_group(groups, rank)
-    group_ranks = dist.get_process_group_ranks(my_group)
-
-    M, N = 64, 32
-    dtype = torch.float32
-
-    # Fill with rank + 1
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.fill_(float(rank + 1))
-    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
-
-    shmem.barrier()
-    config = Config(all_reduce_variant="two_shot")
-    workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config)
-    shmem.barrier()
-
-    shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, group=my_group, config=config, workspace=workspace)
-    torch.cuda.synchronize()
-
-    # Calculate expected sum
-    expected_sum = sum(r + 1 for r in group_ranks)
-    actual_sum = iris_output_tensor[0, 0].item()
-
-    try:
-        assert abs(actual_sum - expected_sum) < 1e-5, (
-            f"Rank {rank} (group ranks={group_ranks}): Expected sum {expected_sum}, got {actual_sum}"
-        )
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
-
-
-def test_rank_stride_target_rank_calculation():
-    """
-    Explicitly test that rank_start + i * rank_stride correctly computes target_rank.
-
-    This test verifies the core indexing mechanism used in CCL kernels:
-    - Loop index `i` goes from 0 to world_size-1 (position in group)
-    - `target_rank = rank_start + i * rank_stride` computes global rank
-    - `group_rank` (rank_in_group) is compared with `i` for local vs remote operations
-
-    Example with strided group [0, 2] (stride=2):
-        i=0 -> target_rank = 0 + 0*2 = 0 (global rank 0)
-        i=1 -> target_rank = 0 + 1*2 = 2 (global rank 2)
-    """
-    world_size, rank = _get_world_info()
-
-    if world_size != 4:
-        pytest.skip("This test requires exactly 4 ranks for strided group testing")
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-
-    from iris.ccl.utils import extract_group_info
-
-    # Test with strided group [0, 2] - stride of 2
-    strided_group_02 = dist.new_group([0, 2])
-
-    # Test with strided group [1, 3] - stride of 2
-    strided_group_13 = dist.new_group([1, 3])
-
-    if rank in [0, 2]:
-        rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(strided_group_02, shmem)
-
-        # Verify the target_rank calculation for each loop iteration
-        expected_target_ranks = [0, 2]  # Global ranks in the group
-        for i in range(ws):
-            computed_target_rank = rank_start + i * rank_stride
-            assert computed_target_rank == expected_target_ranks[i], (
-                f"Rank {rank}: For i={i}, expected target_rank={expected_target_ranks[i]}, "
-                f"got {computed_target_rank} (rank_start={rank_start}, rank_stride={rank_stride})"
-            )
-
-        # Verify that i == group_rank identifies the local rank correctly
-        expected_local_i = 0 if rank == 0 else 1
-        assert rank_in_group == expected_local_i, (
-            f"Rank {rank}: rank_in_group={rank_in_group} should match expected_local_i={expected_local_i}"
-        )
-
-        # Verify: when i == rank_in_group, target_rank == rank_global
-        local_target_rank = rank_start + rank_in_group * rank_stride
-        assert local_target_rank == rank_global, (
-            f"Rank {rank}: local_target_rank={local_target_rank} should equal rank_global={rank_global}"
-        )
-
-    if rank in [1, 3]:
-        rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(strided_group_13, shmem)
-
-        # Verify the target_rank calculation for each loop iteration
-        expected_target_ranks = [1, 3]  # Global ranks in the group
-        for i in range(ws):
-            computed_target_rank = rank_start + i * rank_stride
-            assert computed_target_rank == expected_target_ranks[i], (
-                f"Rank {rank}: For i={i}, expected target_rank={expected_target_ranks[i]}, "
-                f"got {computed_target_rank} (rank_start={rank_start}, rank_stride={rank_stride})"
-            )
-
-        # Verify that i == group_rank identifies the local rank correctly
-        expected_local_i = 0 if rank == 1 else 1
-        assert rank_in_group == expected_local_i, (
-            f"Rank {rank}: rank_in_group={rank_in_group} should match expected_local_i={expected_local_i}"
-        )
-
-        # Verify: when i == rank_in_group, target_rank == rank_global
-        local_target_rank = rank_start + rank_in_group * rank_stride
-        assert local_target_rank == rank_global, (
-            f"Rank {rank}: local_target_rank={local_target_rank} should equal rank_global={rank_global}"
-        )
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
-
-
-def test_all_gather_strided_data_placement():
-    """
-    Verify all-gather with strided groups places data in correct output locations.
-
-    This test ensures that with strided groups like [0, 2]:
-    - Rank 0's data goes to output[0:M, :] on all group members
-    - Rank 2's data goes to output[M:2M, :] on all group members
-
-    The key insight: output placement uses rank_in_group (0, 1) not global rank (0, 2).
-    """
-    world_size, rank = _get_world_info()
-
-    if world_size != 4:
-        pytest.skip("This test requires exactly 4 ranks for strided group testing")
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-
-    M, N = 64, 32
-    dtype = torch.float32
-
-    # Create strided groups [0, 2] and [1, 3]
-    strided_group_02 = dist.new_group([0, 2])
-    strided_group_13 = dist.new_group([1, 3])
-
-    # Test with group [0, 2]
-    if rank in [0, 2]:
-        group_ranks = [0, 2]
-        group_size = 2
-
-        # Each rank fills input with its global rank + 1 for identification
-        input_tensor = shmem.zeros((M, N), dtype=dtype)
-        input_tensor.fill_(float(rank + 1))  # Rank 0 -> 1.0, Rank 2 -> 3.0
-
-        output_tensor = shmem.zeros((group_size * M, N), dtype=dtype)
-
-        shmem.barrier()
-        config = Config()
-        shmem.ccl.all_gather(output_tensor, input_tensor, group=strided_group_02, config=config)
-        torch.cuda.synchronize()
-
-        # Verify data placement:
-        # - output[0:M, :] should contain rank 0's data (value 1.0)
-        # - output[M:2M, :] should contain rank 2's data (value 3.0)
-        chunk_0 = output_tensor[0:M, :].mean().item()
-        chunk_1 = output_tensor[M : 2 * M, :].mean().item()
-
-        expected_chunk_0 = 1.0  # From global rank 0 (rank_in_group=0)
-        expected_chunk_1 = 3.0  # From global rank 2 (rank_in_group=1)
-
-        assert abs(chunk_0 - expected_chunk_0) < 1e-5, (
-            f"Rank {rank}: output[0:M] should be {expected_chunk_0} (from rank 0), got {chunk_0}"
-        )
-        assert abs(chunk_1 - expected_chunk_1) < 1e-5, (
-            f"Rank {rank}: output[M:2M] should be {expected_chunk_1} (from rank 2), got {chunk_1}"
-        )
-
-    # Test with group [1, 3]
-    if rank in [1, 3]:
-        group_ranks = [1, 3]
-        group_size = 2
-
-        # Each rank fills input with its global rank + 1 for identification
-        input_tensor = shmem.zeros((M, N), dtype=dtype)
-        input_tensor.fill_(float(rank + 1))  # Rank 1 -> 2.0, Rank 3 -> 4.0
-
-        output_tensor = shmem.zeros((group_size * M, N), dtype=dtype)
-
-        shmem.barrier()
-        config = Config()
-        shmem.ccl.all_gather(output_tensor, input_tensor, group=strided_group_13, config=config)
-        torch.cuda.synchronize()
-
-        # Verify data placement:
-        # - output[0:M, :] should contain rank 1's data (value 2.0)
-        # - output[M:2M, :] should contain rank 3's data (value 4.0)
-        chunk_0 = output_tensor[0:M, :].mean().item()
-        chunk_1 = output_tensor[M : 2 * M, :].mean().item()
-
-        expected_chunk_0 = 2.0  # From global rank 1 (rank_in_group=0)
-        expected_chunk_1 = 4.0  # From global rank 3 (rank_in_group=1)
-
-        assert abs(chunk_0 - expected_chunk_0) < 1e-5, (
-            f"Rank {rank}: output[0:M] should be {expected_chunk_0} (from rank 1), got {chunk_0}"
-        )
-        assert abs(chunk_1 - expected_chunk_1) < 1e-5, (
-            f"Rank {rank}: output[M:2M] should be {expected_chunk_1} (from rank 3), got {chunk_1}"
-        )
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/examples/test_all_load_bench.py b/tests/examples/test_all_load_bench.py
index bc925cdd5..6839eb209 100644
--- a/tests/examples/test_all_load_bench.py
+++ b/tests/examples/test_all_load_bench.py
@@ -17,147 +17,7 @@
 spec.loader.exec_module(module)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "buffer_size, heap_size",
-    [
-        ((1 << 20), (1 << 30)),  # 1 MiB buffer, 1 GiB heap
-        ((1 << 22), (1 << 31)),  # 4 MiB buffer, 2 GiB heap
-    ],
-)
-@pytest.mark.parametrize(
-    "block_size",
-    [
-        512,
-        1024,
-    ],
-)
-def test_all_load_bench(dtype, buffer_size, heap_size, block_size):
-    # TODO: Benchmark is not accurate. See: https://github.com/ROCm/iris/issues/119
-    pytest.skip("Benchmark is not accurate. See: https://github.com/ROCm/iris/issues/119")
-    shmem = None
-    try:
-        shmem = iris.iris(heap_size)
-        num_ranks = shmem.get_num_ranks()
 
-        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
-        n_elements = buffer_size // element_size_bytes
-        buffer = shmem.zeros(n_elements, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-        # Create arguments similar to what all_load_bench.py expects
-        args = {
-            "datatype": _torch_dtype_to_str(dtype),
-            "block_size": block_size,
-            "active_ranks": num_ranks,
-            "num_warmup": 4,
-            "num_experiments": 8,
-            "verbose": False,
-            "validate": False,
-        }
-
-        shmem.barrier()
-
-        # Run the experiment and measure bandwidth
-        bandwidth_gbps = module.run_experiment(shmem, args, buffer)
-
-        shmem.barrier()
-
-        # Verify that we got a reasonable bandwidth measurement
-        assert isinstance(bandwidth_gbps, float)
-        assert bandwidth_gbps >= 0.0  # Bandwidth should be non-negative
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        if shmem is not None:
-            try:
-                shmem.barrier()
-            except Exception:
-                pass  # Ignore errors during cleanup
-            # Explicitly delete the shmem instance to trigger cleanup
-            del shmem
-            # Force garbage collection to ensure IPC handles are cleaned up
-            import gc
-
-            gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,  # Test with one dtype for validation
-    ],
-)
-def test_all_load_bench_with_validation(dtype):
-    """Test all_load_bench with validation enabled to ensure correctness"""
-    heap_size = 1 << 30  # 1 GiB heap
-    buffer_size = 1 << 20  # 1 MiB buffer
-    block_size = 512
-
-    shmem = None
-    try:
-        shmem = iris.iris(heap_size)
-        num_ranks = shmem.get_num_ranks()
-
-        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
-        n_elements = buffer_size // element_size_bytes
-        buffer = shmem.zeros(n_elements, dtype=dtype)
-
-        # Create arguments with validation enabled
-        args = {
-            "datatype": _torch_dtype_to_str(dtype),
-            "block_size": block_size,
-            "active_ranks": num_ranks,
-            "num_warmup": 1,
-            "num_experiments": 1,
-            "verbose": False,
-            "validate": True,  # Enable validation
-        }
-
-        shmem.barrier()
-
-        # Run the experiment and measure bandwidth
-        bandwidth_gbps = module.run_experiment(shmem, args, buffer)
-
-        shmem.barrier()
-
-        # Verify that we got a reasonable bandwidth measurement
-        assert isinstance(bandwidth_gbps, float)
-        assert bandwidth_gbps >= 0.0  # Bandwidth should be non-negative
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        if shmem is not None:
-            try:
-                shmem.barrier()
-            except Exception:
-                pass  # Ignore errors during cleanup
-            # Explicitly delete the shmem instance to trigger cleanup
-            del shmem
-            # Force garbage collection to ensure IPC handles are cleaned up
-            import gc
-
-            gc.collect()
-
-
-def _torch_dtype_to_str(dtype):
-    """Convert torch dtype to string format expected by all_load_bench.py"""
-    if dtype == torch.int8:
-        return "int8"
-    elif dtype == torch.float16:
-        return "fp16"
-    elif dtype == torch.bfloat16:
-        return "bf16"
-    elif dtype == torch.float32:
-        return "fp32"
-    else:
-        raise ValueError(f"Unsupported dtype: {dtype}")
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index dbf995e96..fceceba83 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -36,140 +36,7 @@
 spec.loader.exec_module(module)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "buffer_size, heap_size",
-    [
-        (20480, (1 << 33)),
-    ],
-)
-@pytest.mark.parametrize(
-    "block_size",
-    [
-        512,
-        1024,
-    ],
-)
-def test_atomic_bandwidth(dtype, buffer_size, heap_size, block_size):
-    """Test that atomic_add benchmark runs and produces positive bandwidth."""
-    shmem = None
-    try:
-        shmem = iris.iris(heap_size)
-        num_ranks = shmem.get_num_ranks()
 
-        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
-        n_elements = buffer_size // element_size_bytes
-        source_buffer = shmem.arange(n_elements, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-        shmem.barrier()
-
-        args = {
-            "datatype": torch_dtype_to_str(dtype),
-            "block_size": block_size,
-            "verbose": False,
-            "validate": False,
-            "num_experiments": 10,
-            "num_warmup": 5,
-        }
-
-        source_rank = 0
-        destination_rank = 1 if num_ranks > 1 else 0
-
-        bandwidth_gbps, _ = module.run_experiment(shmem, args, source_rank, destination_rank, source_buffer)
-
-        assert bandwidth_gbps > 0, f"Bandwidth should be positive, got {bandwidth_gbps}"
-
-        shmem.barrier()
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        if shmem is not None:
-            try:
-                shmem.barrier()
-            except Exception:
-                pass  # Ignore errors during cleanup
-            # Explicitly delete the shmem instance to trigger cleanup
-            del shmem
-            # Force garbage collection to ensure IPC handles are cleaned up
-            import gc
-
-            gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "buffer_size, heap_size",
-    [
-        (20480, (1 << 33)),
-    ],
-)
-@pytest.mark.parametrize(
-    "block_size",
-    [
-        512,
-        1024,
-    ],
-)
-def test_atomic_correctness(dtype, buffer_size, heap_size, block_size):
-    """Test that atomic_add benchmark runs and produces positive bandwidth."""
-    shmem = None
-    try:
-        shmem = iris.iris(heap_size)
-        num_ranks = shmem.get_num_ranks()
-
-        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
-        n_elements = buffer_size // element_size_bytes
-        source_buffer = shmem.arange(n_elements, dtype=dtype)
-
-        shmem.barrier()
-
-        args = {
-            "datatype": torch_dtype_to_str(dtype),
-            "block_size": block_size,
-            "verbose": False,
-            "validate": False,
-            "num_experiments": 1,
-            "num_warmup": 0,
-        }
-
-        source_rank = 0
-        destination_rank = 1 if num_ranks > 1 else 0
-
-        _, result_buffer = module.run_experiment(shmem, args, source_rank, destination_rank, source_buffer)
-
-        if shmem.get_rank() == destination_rank:
-            expected = torch.ones(n_elements, dtype=dtype, device="cuda")
-
-            assert torch.allclose(result_buffer, expected), "Result buffer should be equal to expected"
-
-        shmem.barrier()
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        if shmem is not None:
-            try:
-                shmem.barrier()
-            except Exception:
-                pass  # Ignore errors during cleanup
-            # Explicitly delete the shmem instance to trigger cleanup
-            del shmem
-            # Force garbage collection to ensure IPC handles are cleaned up
-            import gc
-
-            gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/examples/test_flash_decode.py b/tests/examples/test_flash_decode.py
index 68b478b58..3cfcc5700 100644
--- a/tests/examples/test_flash_decode.py
+++ b/tests/examples/test_flash_decode.py
@@ -117,137 +117,7 @@ def prepare_correctness_data(cfg, args, num_query_heads, num_kv_heads, NUM_BLOCK
     return {"query": query, "key_value_cache": key_value_cache}
 
 
-@pytest.mark.parametrize("head_dim", [128])
-@pytest.mark.parametrize("num_seqs", [1, 8])
-@pytest.mark.parametrize("num_heads", [48, 96])
-@pytest.mark.parametrize("kv_len", [4096, 65536])
-def test_correctness_fused_full(kv_len, num_heads, num_seqs, head_dim):
-    """
-    Tests the correctness of the Iris Fused implementation against the Torch reference.
-    This test is parameterized to run all combinations of the parameters.
-    """
-    shmem = None
-    try:
-        shmem = iris.iris()
-
-        args = Namespace()
-        args.rank = shmem.get_rank()
-        args.num_ranks = shmem.get_num_ranks()
-        args.local_num_ranks = shmem.get_num_ranks()
-        args.shmem = shmem
-
-        config = {
-            "kv_len": kv_len,
-            "num_heads": num_heads,
-            "num_seqs": num_seqs,
-            "head_dim": head_dim,
-            "dtype": torch.float16,
-            "block_size": 1,
-            "soft_cap": 0,
-        }
-
-        # torch.manual_seed(42)
-        torch.set_default_device("cuda")
-
-        num_query_heads = num_heads
-        num_kv_heads = num_query_heads // 8 if num_query_heads >= 8 else 1
-        scale = head_dim**-0.5
-        NUM_BLOCKS_PER_RANK = config["kv_len"] + 1
-        NUM_BLOCKS = NUM_BLOCKS_PER_RANK * args.num_ranks
-
-        tensor_data = prepare_correctness_data(config, args, num_query_heads, num_kv_heads, NUM_BLOCKS)
-        query = tensor_data["query"]
-        key_value_cache = tensor_data["key_value_cache"]
-
-        key_cache = key_value_cache[:, 0, :, :, :].contiguous()
-        value_cache = key_value_cache[:, 1, :, :, :].contiguous()
-        key_cache_this_rank = key_cache[
-            args.rank * NUM_BLOCKS_PER_RANK : (args.rank + 1) * NUM_BLOCKS_PER_RANK
-        ].contiguous()
-        value_cache_this_rank = value_cache[
-            args.rank * NUM_BLOCKS_PER_RANK : (args.rank + 1) * NUM_BLOCKS_PER_RANK
-        ].contiguous()
-
-        block_tables_this_rank = torch.arange(NUM_BLOCKS_PER_RANK, dtype=torch.int32).repeat(num_seqs, 1)
-        all_block_tables_numpy = iris._distributed_helpers.distributed_allgather_multidim(
-            block_tables_this_rank.cpu().numpy()
-        )
-        block_tables = torch.from_numpy(all_block_tables_numpy).view(args.num_ranks, num_seqs, -1)
-        ref_block_tables = torch.cat([block_tables[i] + i * NUM_BLOCKS_PER_RANK for i in range(args.num_ranks)], dim=-1)
-
-        common_params = {
-            "num_q_heads": num_query_heads,
-            "num_kv_heads": num_kv_heads,
-            "q_head_dim": head_dim,
-            "v_head_dim": head_dim,
-            "page_size": config["block_size"],
-            "scale": scale,
-            "soft_cap": config["soft_cap"],
-            "max_allowed_batch": num_seqs,
-        }
-
-        iris_fd_layer = flash_decode_fused_layer(
-            args.shmem,
-            args.rank,
-            args.rank // args.local_num_ranks,
-            args.num_ranks,
-            args.num_ranks // args.local_num_ranks,
-            **common_params,
-        )
-
-        args.shmem.barrier()
-        if hasattr(iris_fd_layer, "clear_flags"):
-            iris_fd_layer.clear_flags()
-        args.shmem.barrier()
-
-        kv_lens_per_rank = [config["kv_len"]] * num_seqs
-        global_kv_lens = [kv_lens_per_rank[0] * args.num_ranks] * num_seqs
-        kv_lens_tensor = torch.tensor(kv_lens_per_rank, dtype=torch.int32, device=query.device)
-        global_kv_lens_tensor = kv_lens_tensor.unsqueeze(0).repeat(args.num_ranks, 1)
-
-        output = iris_fd_layer(
-            query, key_cache_this_rank, value_cache_this_rank, global_kv_lens_tensor, block_tables_this_rank
-        )
-        torch.cuda.synchronize()
-
-        ref_output = ref_paged_attn(
-            query=query.clone(),
-            key_cache=key_cache,
-            value_cache=value_cache,
-            query_lens=[1] * num_seqs,
-            kv_lens_per_rank=global_kv_lens,
-            block_tables=ref_block_tables,
-            scale=scale,
-            soft_cap=config["soft_cap"],
-        )
-        args.shmem.barrier()
-
-        error = None
-        try:
-            atol = 1e-4
-            rtol = 1e-4
-            torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
-        except AssertionError as e:
-            error = e
-
-        print_correctness_report(args.rank, output, ref_output, error)
-
-        if error:
-            raise error
-
-        args.shmem.barrier()
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        if shmem is not None:
-            try:
-                shmem.barrier()
-            except Exception:
-                pass  # Ignore errors during cleanup
-            # Explicitly delete the shmem instance to trigger cleanup
-            del shmem
-            # Force garbage collection to ensure IPC handles are cleaned up
-            import gc
-
-            gc.collect()
+
+pytestmark = pytest.mark.multi_rank_required
+
+@pytest.mark.parametrize("head_dim", [128])
\ No newline at end of file
diff --git a/tests/examples/test_load_bench.py b/tests/examples/test_load_bench.py
index 261c2a8ed..da287f71c 100644
--- a/tests/examples/test_load_bench.py
+++ b/tests/examples/test_load_bench.py
@@ -19,60 +19,7 @@
 
 
 @pytest.mark.skip(reason="Test is inconsistent and needs debugging - tracked in issue")
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "buffer_size, heap_size",
-    [
-        ((1 << 32), (1 << 33)),
-    ],
-)
-@pytest.mark.parametrize(
-    "block_size",
-    [
-        512,
-        1024,
-    ],
-)
-def test_load_bench(dtype, buffer_size, heap_size, block_size):
-    shmem = None
-    try:
-        shmem = iris.iris(heap_size)
-        num_ranks = shmem.get_num_ranks()
 
-        bandwidth_matrix = np.zeros((num_ranks, num_ranks), dtype=np.float32)
-        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
-        source_buffer = shmem.ones(buffer_size // element_size_bytes, dtype=dtype)
-        result_buffer = shmem.zeros_like(source_buffer)
+pytestmark = pytest.mark.multi_rank_required
 
-        shmem.barrier()
-
-        for source_rank in range(num_ranks):
-            for destination_rank in range(num_ranks):
-                bandwidth_gbps = module.bench_load(
-                    shmem, source_rank, destination_rank, source_buffer, result_buffer, block_size, dtype
-                )
-                bandwidth_matrix[source_rank, destination_rank] = bandwidth_gbps
-                shmem.barrier()
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        if shmem is not None:
-            try:
-                shmem.barrier()
-            except Exception:
-                pass  # Ignore errors during cleanup
-            # Explicitly delete the shmem instance to trigger cleanup
-            del shmem
-            # Force garbage collection to ensure IPC handles are cleaned up
-            import gc
-
-            gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/examples/test_message_passing.py b/tests/examples/test_message_passing.py
index aa173dead..0f86646af 100644
--- a/tests/examples/test_message_passing.py
+++ b/tests/examples/test_message_passing.py
@@ -109,61 +109,7 @@ def run_message_passing_kernels(module, args):
             gc.collect()
 
 
-@pytest.mark.parametrize(
-    "dtype_str",
-    [
-        "int8",
-        "fp16",
-        "bf16",
-        "fp32",
-    ],
-)
-@pytest.mark.parametrize(
-    "buffer_size, heap_size",
-    [
-        (4096, 1 << 20),  # Smaller sizes for testing
-        (8192, 1 << 21),
-    ],
-)
-@pytest.mark.parametrize(
-    "block_size",
-    [
-        512,
-        1024,
-    ],
-)
-def test_message_passing_load_store(dtype_str, buffer_size, heap_size, block_size):
-    """Test message passing with load/store operations."""
-    args = create_test_args(dtype_str, buffer_size, heap_size, block_size)
-    success = run_message_passing_kernels(load_store_module, args)
-    assert success, "Message passing load/store validation failed"
-
-
-@pytest.mark.parametrize(
-    "dtype_str",
-    [
-        "int8",
-        "fp16",
-        "bf16",
-        "fp32",
-    ],
-)
-@pytest.mark.parametrize(
-    "buffer_size, heap_size",
-    [
-        (4096, 1 << 20),  # Smaller sizes for testing
-        (8192, 1 << 21),
-    ],
-)
-@pytest.mark.parametrize(
-    "block_size",
-    [
-        512,
-        1024,
-    ],
-)
-def test_message_passing_put(dtype_str, buffer_size, heap_size, block_size):
-    """Test message passing with put operations."""
-    args = create_test_args(dtype_str, buffer_size, heap_size, block_size)
-    success = run_message_passing_kernels(put_module, args)
-    assert success, "Message passing put validation failed"
+
+pytestmark = pytest.mark.multi_rank_required
+
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ops/test_all_gather_matmul.py b/tests/ops/test_all_gather_matmul.py
index 193505011..c659a573d 100644
--- a/tests/ops/test_all_gather_matmul.py
+++ b/tests/ops/test_all_gather_matmul.py
@@ -15,103 +15,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-2, 1e-2),
-    ],
-)
-@pytest.mark.parametrize(
-    "M,K_local,N",
-    [
-        (128, 32, 64),
-        (256, 64, 128),
-    ],
-)
-def test_all_gather_matmul(dtype, atol, rtol, M, K_local, N):
-    """Test all_gather_matmul against torch all_gather + matmul."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    K = K_local * world_size  # Full K dimension
-
-    # Skip if problem size is too small for world_size or block sizes
-    # With default or custom configs, we need at least one tile
-    min_block_size = 32  # Smallest block size we use
-    if M < min_block_size:
-        pytest.skip(f"M={M} too small (need >= {min_block_size})")
-    if K_local < min_block_size:
-        pytest.skip(f"K_local={K_local} too small (need >= {min_block_size})")
-    if N < min_block_size:
-        pytest.skip(f"N={N} too small (need >= {min_block_size})")
-
-    # Seed for reproducibility - different seed per rank for A_sharded
-    torch.manual_seed(42 + rank)
-    A_sharded = torch.randn(M, K_local, dtype=dtype, device=f"cuda:{rank}")
-
-    # B must be identical on all ranks
-    torch.manual_seed(123)
-    B = torch.randn(K, N, dtype=dtype, device=f"cuda:{rank}")
-
-    # Reference: torch all_gather + matmul
-    A_gathered_list = [torch.zeros(M, K_local, dtype=dtype, device=f"cuda:{rank}") for _ in range(world_size)]
-    dist.all_gather(A_gathered_list, A_sharded)
-    A_gathered_ref = torch.cat(A_gathered_list, dim=1)  # (M, K)
-    ref_output = torch.matmul(A_gathered_ref, B)
-    torch.cuda.synchronize()
-
-    # Create shmem tensors directly
-    A_sharded_shmem = shmem.zeros((M, K_local), dtype=dtype)
-    A_sharded_shmem.copy_(A_sharded)
-    B_shmem = shmem.zeros((K, N), dtype=dtype)
-    B_shmem.copy_(B)
-    output = shmem.zeros((M, N), dtype=dtype)
-
-    shmem.barrier()
-
-    # Run fused all_gather + matmul using shmem.ops API
-    from iris.ops.config import FusedConfig
-
-    # Use appropriate block sizes based on problem size
-    # For small problems, use smaller blocks
-    if M <= 256 or K_local <= 64 or N <= 128:
-        config = FusedConfig(block_size_m=64, block_size_n=64, block_size_k=32)
-    else:
-        config = FusedConfig()
-
-    # Validate config against problem size
-    assert M >= config.block_size_m, f"M ({M}) must be >= block_size_m ({config.block_size_m})"
-    assert K_local >= config.block_size_k, f"K_local ({K_local}) must be >= block_size_k ({config.block_size_k})"
-    assert N >= config.block_size_n, f"N ({N}) must be >= block_size_n ({config.block_size_n})"
-
-    shmem.ops.all_gather_matmul(output, A_sharded_shmem, B_shmem, config=config)
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    max_diff = (output - ref_output).abs().max().item()
-
-    assert torch.allclose(output, ref_output, atol=atol, rtol=rtol), (
-        f"Rank {rank}: Max diff {max_diff}, expected < {atol}"
-    )
-
-
-if __name__ == "__main__":
-    # For quick debugging
-    import sys
-
-    if not dist.is_initialized():
-        print("Run with: torchrun --nproc_per_node=2 tests/ops/test_all_gather_matmul.py")
-        sys.exit(1)
-
-    rank = dist.get_rank()
-    torch.cuda.set_device(rank)
-
-    print(f"[Rank {rank}] Testing all_gather_matmul...")
-    test_all_gather_matmul(torch.float16, 128, 32, 64)
-    print(f"[Rank {rank}] ✓ Test passed!")
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ops/test_matmul_all_gather.py b/tests/ops/test_matmul_all_gather.py
index 78ec0e47a..541e78898 100644
--- a/tests/ops/test_matmul_all_gather.py
+++ b/tests/ops/test_matmul_all_gather.py
@@ -14,106 +14,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 0.5, 0.01),
-        (torch.float32, 0.5, 0.01),
-        (torch.bfloat16, 0.5, 0.01),
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N, K",
-    [
-        (64, 64, 32),
-        (512, 256, 512),
-        (1024, 2048, 1024),
-    ],
-)
-def test_matmul_all_gather(dtype, atol, rtol, M, N, K):
-    """Test matmul_all_gather using shmem.ops API with proper config."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    # M must be divisible by world_size for row-wise sharding
-    if M % world_size != 0:
-        pytest.skip(f"M={M} not divisible by world_size={world_size}")
-
-    M_local = M // world_size
-
-    # Skip if problem size is too small for world_size
-    # With default or custom configs, we need at least one tile per rank
-    min_block_size = 32  # Smallest block size we use
-    if M_local < min_block_size:
-        pytest.skip(f"M_local={M_local} too small for world_size={world_size} (need >= {min_block_size})")
-    if K < min_block_size:
-        pytest.skip(f"K={K} too small (need >= {min_block_size})")
-    if N < min_block_size:
-        pytest.skip(f"N={N} too small (need >= {min_block_size})")
-
-    # Create shmem tensors directly
-    A_local = shmem.randn((M_local, K), dtype=dtype)
-    B = shmem.randn((K, N), dtype=dtype)
-    output = shmem.zeros((M, N), dtype=dtype)
-
-    # Reference: compute local GEMM, then all-gather along M dimension
-    A_ref = A_local.clone()
-    B_ref = B.clone()
-    C_local_ref = torch.matmul(A_ref, B_ref)
-    C_gathered_list = [torch.zeros(M_local, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(world_size)]
-    dist.all_gather(C_gathered_list, C_local_ref)
-    pytorch_output = torch.cat(C_gathered_list, dim=0)  # Concatenate along M dimension
-    torch.cuda.synchronize()
-
-    shmem.barrier()
-
-    # Use appropriate block sizes based on problem size
-    from iris.ops.config import FusedConfig
-
-    # Select config based on actual problem dimensions
-    # Ensure block sizes don't exceed actual dimensions
-    if M_local <= 64 or K <= 64 or N <= 64:
-        # Small problems - use 32x32x32 blocks
-        config = FusedConfig(block_size_m=32, block_size_n=32, block_size_k=32)
-    elif M_local <= 128 or K <= 128 or N <= 128:
-        # Medium problems - use 64x64x32 blocks
-        config = FusedConfig(block_size_m=64, block_size_n=64, block_size_k=32)
-    elif dtype == torch.float32:
-        # Larger problems with fp32 - use 128x128x64 blocks
-        config = FusedConfig(block_size_m=128, block_size_n=128, block_size_k=64)
-    else:
-        # Larger problems with fp16/bf16 - use 128x128x64 blocks
-        config = FusedConfig(block_size_m=128, block_size_n=128, block_size_k=64)
-
-    # Validate config against problem size
-    if config is not None:
-        assert M_local >= config.block_size_m, f"M_local ({M_local}) must be >= block_size_m ({config.block_size_m})"
-        assert K >= config.block_size_k, f"K ({K}) must be >= block_size_k ({config.block_size_k})"
-        assert N >= config.block_size_n, f"N ({N}) must be >= block_size_n ({config.block_size_n})"
-
-    # Use shmem.ops API with proper config
-    shmem.ops.matmul_all_gather(output, A_local, B, config=config)
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    max_diff = torch.abs(output - pytorch_output).max().item()
-
-    assert torch.allclose(output, pytorch_output, atol=atol, rtol=rtol), (
-        f"Max difference: {max_diff}, expected < {atol}\n"
-        f"Rank {rank}: shmem.ops.matmul_all_gather output doesn't match reference"
-    )
-
-    if rank == 0:
-        print(f"✓ matmul_all_gather test passed: {dtype}, M={M}, N={N}, K={K}")
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ops/test_matmul_all_reduce.py b/tests/ops/test_matmul_all_reduce.py
index 5780b5d4d..a81a6a079 100644
--- a/tests/ops/test_matmul_all_reduce.py
+++ b/tests/ops/test_matmul_all_reduce.py
@@ -15,136 +15,7 @@
 import iris.ops as ops
 
 
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 0.2, 0.01),
-        (torch.float32, 0.3, 0.01),
-        (torch.bfloat16, 2.5, 0.02),  # Increased from 1.5 to 2.5 for 8-rank tests
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N, K",
-    [
-        (128, 64, 32),
-        (1024, 256, 512),
-    ],
-)
-@pytest.mark.parametrize(
-    "variant",
-    [
-        "atomic",
-        # TODO enable these tests when support for cache-modifiers is in place.
-        # "spinlock",
-        "one_shot",
-        "two_shot",
-    ],
-)
-def test_matmul_all_reduce(dtype, atol, rtol, M, N, K, variant):
-    """Test matmul_all_reduce by comparing against torch.matmul + dist.all_reduce."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    # Create input matrices
-    A_local = torch.randn(M, K, dtype=dtype, device=f"cuda:{rank}")
-    B = torch.randn(K, N, dtype=dtype, device=f"cuda:{rank}")
-
-    # Compute reference: torch.matmul + dist.all_reduce
-    C_local_ref = torch.matmul(A_local, B)
-    pytorch_output = C_local_ref.clone()
-    shmem.barrier()
-    dist.all_reduce(pytorch_output, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-
-    # Set up Iris tensors
-    iris_A = shmem.zeros((M, K), dtype=dtype)
-    iris_A.copy_(A_local)
-    iris_B = shmem.zeros((K, N), dtype=dtype)
-    iris_B.copy_(B)
-    iris_C = shmem.zeros((M, N), dtype=dtype)
-
-    shmem.barrier()
-
-    # Select appropriate config based on problem size
-    from iris.ops.config import FusedConfig
-
-    if M <= 128 or K <= 64 or N <= 128:
-        config = FusedConfig(block_size_m=64, block_size_n=64, block_size_k=32, all_reduce_variant=variant)
-    elif dtype == torch.float32:
-        config = FusedConfig(block_size_m=64, block_size_n=64, block_size_k=32, all_reduce_variant=variant)
-    else:
-        config = FusedConfig(all_reduce_variant=variant)
-
-    # Use high-level API
-    ops.matmul_all_reduce(shmem, iris_C, iris_A, iris_B, config=config)
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    max_diff = torch.abs(iris_C - pytorch_output).max().item()
-
-    assert torch.allclose(iris_C, pytorch_output, atol=atol, rtol=rtol), (
-        f"Max difference: {max_diff}, expected < {atol}\n"
-        f"Rank {rank}: iris.ops.matmul_all_reduce output doesn't match reference"
-    )
-
-    if rank == 0:
-        print(f"✓ matmul_all_reduce test passed: {dtype}, M={M}, N={N}, K={K}, variant={variant}")
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
-
-
-def test_matmul_all_reduce_via_shmem_ops():
-    """Test accessing matmul_all_reduce via shmem.ops namespace."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-
-    M, N, K = 256, 128, 64
-    dtype = torch.float16
-
-    A = shmem.randn((M, K), dtype=dtype)
-    B = shmem.randn((K, N), dtype=dtype)
-    output = shmem.zeros((M, N), dtype=dtype)
-
-    # Reference using PyTorch
-    A_ref = A.clone()
-    B_ref = B.clone()
-    C_ref = torch.matmul(A_ref, B_ref)
-    pytorch_output = C_ref.clone()
-    shmem.barrier()
-    dist.all_reduce(pytorch_output, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-
-    # Use shmem.ops interface
-    shmem.ops.matmul_all_reduce(output, A, B)
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    atol = 0.2
-    rtol = 0.01
-    assert torch.allclose(output, pytorch_output, atol=atol, rtol=rtol), (
-        f"Rank {rank}: shmem.ops.matmul_all_reduce doesn't match reference"
-    )
-
-    if rank == 0:
-        print("✓ shmem.ops.matmul_all_reduce test passed")
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/ops/test_matmul_reduce_scatter.py b/tests/ops/test_matmul_reduce_scatter.py
index 7f75a1b0c..a7bab8b42 100644
--- a/tests/ops/test_matmul_reduce_scatter.py
+++ b/tests/ops/test_matmul_reduce_scatter.py
@@ -12,166 +12,7 @@
 import iris.ops as ops
 
 
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 2e-1, 1e-2),
-        (torch.float32, 1e-1, 1e-2),
-    ],
-)
-@pytest.mark.parametrize("M, N, K", [(128, 128, 32)])
-def test_matmul_reduce_scatter(dtype, atol, rtol, M, N, K):
-    """
-    Test matmul_reduce_scatter by comparing against torch matmul + all_reduce.
 
-    Note: We use all_reduce for reference because our tile-based reduce_scatter
-    is semantically equivalent to: matmul -> all_reduce -> each rank keeps assigned tiles.
-    PyTorch's reduce_scatter operates on different semantics (scatter along dimensions).
-    """
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
+pytestmark = pytest.mark.multi_rank_required
 
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
-
-    A = torch.randn(M, K, dtype=dtype, device=f"cuda:{rank}")
-    B = torch.randn(K, N, dtype=dtype, device=f"cuda:{rank}")
-
-    C_local = torch.matmul(A, B)
-    C_reduced = C_local.clone()
-    dist.all_reduce(C_reduced, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-
-    config = ops.FusedConfig(block_size_m=64, block_size_n=64, block_size_k=32)
-    num_pid_m = (M + config.block_size_m - 1) // config.block_size_m
-    num_pid_n = (N + config.block_size_n - 1) // config.block_size_n
-    total_tiles = num_pid_m * num_pid_n
-    tiles_per_rank = total_tiles // world_size
-    start_tile = rank * tiles_per_rank
-    if rank == world_size - 1:
-        tiles_per_rank = total_tiles - start_tile
-
-    iris_A = shmem.zeros((M, K), dtype=dtype)
-    iris_A.copy_(A)
-    iris_B = shmem.zeros((K, N), dtype=dtype)
-    iris_B.copy_(B)
-    iris_C = shmem.zeros((M, N), dtype=dtype)
-
-    shmem.barrier()
-
-    ops.matmul_reduce_scatter(shmem, iris_C, iris_A, iris_B, config=config)
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    # Adjust tolerance for 8 ranks due to accumulation error
-    if world_size == 8 and dtype == torch.float32:
-        atol = 2e-1
-
-    for local_tile_idx in range(tiles_per_rank):
-        tile_id = start_tile + local_tile_idx
-        pid_m = tile_id // num_pid_n
-        pid_n = tile_id % num_pid_n
-
-        m_start = pid_m * config.block_size_m
-        m_end = min(m_start + config.block_size_m, M)
-        n_start = pid_n * config.block_size_n
-        n_end = min(n_start + config.block_size_n, N)
-
-        iris_tile = iris_C[m_start:m_end, n_start:n_end]
-        ref_tile = C_reduced[m_start:m_end, n_start:n_end]
-
-        max_diff = torch.abs(iris_tile - ref_tile).max().item()
-        assert torch.allclose(iris_tile, ref_tile, atol=atol, rtol=rtol), (
-            f"Rank {rank}, tile {tile_id} ({pid_m},{pid_n}): Max diff: {max_diff}, expected < {atol}"
-        )
-
-    if rank == 0:
-        print(f"matmul_reduce_scatter: {dtype}, M={M}, N={N}, K={K}")
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 2e-1, 1e-2),
-        (torch.float32, 1e-1, 1e-2),
-    ],
-)
-def test_matmul_reduce_scatter_semantics(dtype, atol, rtol):
-    """
-    Test that matmul_reduce_scatter is equivalent to:
-    result = matmul(A, B)
-    reduced = all_reduce(result)
-    each rank keeps its assigned tile block
-    """
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
-
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
-
-    M, N, K = 128, 128, 32
-
-    A = shmem.randn((M, K), dtype=dtype)
-    B = shmem.randn((K, N), dtype=dtype)
-    output = shmem.zeros((M, N), dtype=dtype)
-
-    A_ref = A.clone()
-    B_ref = B.clone()
-    C_ref = torch.matmul(A_ref, B_ref)
-    dist.all_reduce(C_ref, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-
-    config = ops.FusedConfig(block_size_m=64, block_size_n=64, block_size_k=32)
-    from iris.ops.matmul_reduce_scatter import matmul_reduce_scatter
-
-    matmul_reduce_scatter(shmem, output, A, B, config=config)
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    num_pid_m = (M + config.block_size_m - 1) // config.block_size_m
-    num_pid_n = (N + config.block_size_n - 1) // config.block_size_n
-    total_tiles = num_pid_m * num_pid_n
-    tiles_per_rank = total_tiles // world_size
-    start_tile = rank * tiles_per_rank
-    if rank == world_size - 1:
-        tiles_per_rank = total_tiles - start_tile
-
-    # Adjust tolerance for 8 ranks
-    if world_size == 8 and dtype == torch.float32:
-        atol = 2e-1
-
-    for local_tile_idx in range(tiles_per_rank):
-        tile_id = start_tile + local_tile_idx
-        pid_m = tile_id // num_pid_n
-        pid_n = tile_id % num_pid_n
-
-        m_start = pid_m * config.block_size_m
-        m_end = min(m_start + config.block_size_m, M)
-        n_start = pid_n * config.block_size_n
-        n_end = min(n_start + config.block_size_n, N)
-
-        output_tile = output[m_start:m_end, n_start:n_end]
-        ref_tile = C_ref[m_start:m_end, n_start:n_end]
-
-        assert torch.allclose(output_tile, ref_tile, atol=atol, rtol=rtol), f"Rank {rank}, tile {tile_id}: mismatch"
-
-    if rank == 0:
-        print("matmul_reduce_scatter semantics verified")
-
-    shmem.barrier()
-    del shmem
-    import gc
-
-    gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_arange.py b/tests/unittests/test_arange.py
index e3183faf5..974c0d7e6 100644
--- a/tests/unittests/test_arange.py
+++ b/tests/unittests/test_arange.py
@@ -6,279 +6,7 @@
 import iris
 
 
-def test_arange_basic_functionality():
-    """Test basic arange functionality with various argument combinations."""
-    shmem = iris.iris(1 << 20)
 
-    # Test 1: arange(end) - single argument
-    result1 = shmem.arange(5)
-    assert result1.shape == (5,)
-    assert torch.all(result1 == torch.tensor([0, 1, 2, 3, 4], device=result1.device))
-    assert result1.dtype == torch.int64
-    assert shmem._Iris__on_symmetric_heap(result1)
+pytestmark = pytest.mark.single_rank
 
-    # Test 2: arange(start, end) - two arguments
-    result2 = shmem.arange(1, 4)
-    assert result2.shape == (3,)
-    assert torch.all(result2 == torch.tensor([1, 2, 3], device=result2.device))
-    assert result2.dtype == torch.int64
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test 3: arange(start, end, step) - three arguments
-    result3 = shmem.arange(1, 2.5, 0.5)
-    assert result3.shape == (3,)
-    assert torch.allclose(result3, torch.tensor([1.0, 1.5, 2.0], device=result3.device))
-    assert result3.dtype == torch.float32
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test 4: arange with negative step
-    result4 = shmem.arange(5, 0, -1)
-    assert result4.shape == (5,)
-    assert torch.all(result4 == torch.tensor([5, 4, 3, 2, 1], device=result4.device))
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_arange_dtype_inference():
-    """Test dtype inference logic."""
-    shmem = iris.iris(1 << 20)
-
-    # Test integer dtype inference
-    result_int = shmem.arange(3)
-    assert result_int.dtype == torch.int64
-    assert shmem._Iris__on_symmetric_heap(result_int)
-
-    # Test float dtype inference
-    result_float = shmem.arange(1.0, 3.0)
-    assert result_float.dtype == torch.float32
-    assert shmem._Iris__on_symmetric_heap(result_float)
-
-    # Test explicit dtype override
-    result_explicit = shmem.arange(3, dtype=torch.float64)
-    assert result_explicit.dtype == torch.float64
-    assert shmem._Iris__on_symmetric_heap(result_explicit)
-
-    # Test mixed types (should infer float)
-    result_mixed = shmem.arange(1, 3.5, 0.5)
-    assert result_mixed.dtype == torch.float32
-    assert shmem._Iris__on_symmetric_heap(result_mixed)
-
-
-def test_arange_device_handling():
-    """Test device parameter handling."""
-    shmem = iris.iris(1 << 20)
-
-    # Test default device (should use Iris device)
-    result_default = shmem.arange(3)
-    assert str(result_default.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result_default)
-
-    # Test explicit device
-    iris_device = str(shmem.get_device())
-    result_explicit = shmem.arange(3, device=iris_device)
-    assert str(result_explicit.device) == iris_device
-    assert shmem._Iris__on_symmetric_heap(result_explicit)
-
-    # Test device=None (should use Iris device)
-    result_none = shmem.arange(3, device=None)
-    assert str(result_none.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result_none)
-
-
-def test_arange_layout_handling():
-    """Test layout parameter handling."""
-    shmem = iris.iris(1 << 20)
-
-    # Test default layout (strided)
-    result_strided = shmem.arange(3, layout=torch.strided)
-    assert result_strided.layout == torch.strided
-    assert shmem._Iris__on_symmetric_heap(result_strided)
-
-
-def test_arange_requires_grad():
-    """Test requires_grad parameter."""
-    shmem = iris.iris(1 << 20)
-
-    # Test default (False)
-    result_default = shmem.arange(3)
-    assert not result_default.requires_grad
-    assert shmem._Iris__on_symmetric_heap(result_default)
-
-    # Test True
-    result_true = shmem.arange(3, dtype=torch.float32, requires_grad=True)
-    assert result_true.requires_grad
-    assert shmem._Iris__on_symmetric_heap(result_true)
-
-    # Test False explicitly
-    result_false = shmem.arange(3, requires_grad=False)
-    assert not result_false.requires_grad
-    assert shmem._Iris__on_symmetric_heap(result_false)
-
-
-def test_arange_out_parameter():
-    """Test out parameter functionality."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(3, torch.int64)
-    result = shmem.arange(3, out=out_tensor)
-
-    # Should return the same tensor object
-    assert result is out_tensor
-    assert torch.all(result == torch.tensor([0, 1, 2], device=result.device))
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor
-    out_tensor_float = shmem._Iris__allocate(3, torch.float32)
-    result_float = shmem.arange(3, dtype=torch.float32, out=out_tensor_float)
-    assert result_float is out_tensor_float
-    assert result_float.dtype == torch.float32
-    assert shmem._Iris__on_symmetric_heap(result_float)
-
-
-def test_arange_error_handling():
-    """Test error handling for invalid inputs."""
-    shmem = iris.iris(1 << 20)
-
-    # Test step = 0 (should raise ValueError)
-    with pytest.raises(ValueError, match="step must be non-zero"):
-        shmem.arange(1, 5, 0)
-
-    # Test invalid device (should raise RuntimeError)
-    with pytest.raises(RuntimeError):
-        shmem.arange(3, device="cpu")  # Iris only supports CUDA
-
-
-def test_arange_edge_cases():
-    """Test edge cases and boundary conditions."""
-    shmem = iris.iris(1 << 20)
-
-    # Test invalid ranges (should throw ValueError like PyTorch)
-    with pytest.raises(ValueError):
-        shmem.arange(5, 1)  # start > end with positive step
-
-    with pytest.raises(ValueError):
-        shmem.arange(1, 5, -1)  # start < end with negative step
-
-    # Test single element result
-    result_single = shmem.arange(1, 2)
-    assert result_single.shape == (1,)
-    assert result_single.numel() == 1
-    assert result_single[0] == 1
-    assert shmem._Iris__on_symmetric_heap(result_single)
-
-    # Test large tensor
-    result_large = shmem.arange(1000)
-    assert result_large.shape == (1000,)
-    assert result_large.numel() == 1000
-    assert result_large[0] == 0
-    assert result_large[-1] == 999
-    assert shmem._Iris__on_symmetric_heap(result_large)
-
-    # Test floating point precision
-    result_float = shmem.arange(0, 1, 0.1)
-    assert result_float.shape == (10,)
-    assert torch.allclose(result_float[0], torch.tensor(0.0))
-    assert torch.allclose(result_float[-1], torch.tensor(0.9))
-    assert shmem._Iris__on_symmetric_heap(result_float)
-
-
-def test_arange_pytorch_equivalence():
-    """Test that Iris arange produces equivalent results to PyTorch arange."""
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.arange(5)
-    pytorch_result = torch.arange(5, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.all(iris_result == pytorch_result)
-
-    # Test with start, end, step
-    iris_result = shmem.arange(1, 4, 0.5)
-    pytorch_result = torch.arange(1, 4, 0.5, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.allclose(iris_result, pytorch_result)
-
-    # Test dtype inference equivalence
-    iris_result = shmem.arange(1.0, 3.0)
-    pytorch_result = torch.arange(1.0, 3.0, device="cuda")
-
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.allclose(iris_result, pytorch_result)
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"start": 0, "end": 5, "step": 1, "dtype": torch.int64},
-        {"start": 1, "end": 4, "step": 1, "dtype": torch.int64},
-        {"start": 0, "end": 1, "step": 0.1, "dtype": torch.float32},
-        {"start": 5, "end": 0, "step": -1, "dtype": torch.int64},
-        {"start": 0, "end": 10, "step": 2, "dtype": torch.int64},
-        {"start": 1.0, "end": 2.0, "step": 0.25, "dtype": torch.float32},
-    ],
-)
-def test_arange_parameter_combinations(params):
-    """Test arange with various parameter combinations."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.arange(start=params["start"], end=params["end"], step=params["step"], dtype=params["dtype"])
-
-    # Verify basic properties
-    assert result.dtype == params["dtype"]
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify values match PyTorch
-    pytorch_result = torch.arange(
-        start=params["start"], end=params["end"], step=params["step"], dtype=params["dtype"], device="cuda"
-    )
-
-    assert result.shape == pytorch_result.shape
-    assert torch.allclose(result, pytorch_result)
-
-
-@pytest.mark.parametrize(
-    "arange_args",
-    [
-        (5,),  # arange(end)
-        (1, 4),  # arange(start, end)
-        (0, 1, 0.1),  # arange(start, end, step)
-        (10,),  # arange(end) with default dtype
-        (3,),  # arange(end) for device test
-        (5,),  # arange(end) for requires_grad test
-        (3,),  # arange(end) for layout test
-    ],
-)
-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        {},  # No kwargs
-        {"dtype": torch.float64},  # dtype override
-        {"device": "cuda:0"},  # device override (will be replaced with actual Iris device)
-        {"dtype": torch.float32, "requires_grad": True},  # requires_grad True with float dtype
-        {"layout": torch.strided},  # strided layout
-    ],
-)
-def test_arange_symmetric_heap_verification(arange_args, kwargs):
-    """Test that all arange results are on the symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Replace hardcoded device with actual Iris device
-    if "device" in kwargs and kwargs["device"] == "cuda:0":
-        kwargs["device"] = str(shmem.get_device())
-
-    # Call arange with the given arguments and kwargs
-    result = shmem.arange(*arange_args, **kwargs)
-
-    # Verify symmetric heap allocation
-    assert shmem._Iris__on_symmetric_heap(result), (
-        f"Tensor {result} with args={arange_args}, kwargs={kwargs} is not on symmetric heap"
-    )
-
-    # Verify CUDA device
-    assert result.device.type == "cuda", (
-        f"Tensor {result} with args={arange_args}, kwargs={kwargs} is not on CUDA device"
-    )
+def test_arange_basic_functionality():
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_add_gluon.py b/tests/unittests/test_atomic_add_gluon.py
index 36d26801a..2533d2ea1 100644
--- a/tests/unittests/test_atomic_add_gluon.py
+++ b/tests/unittests/test_atomic_add_gluon.py
@@ -40,84 +40,7 @@ def atomic_add_kernel(
         )
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_add_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_add_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_add_triton.py b/tests/unittests/test_atomic_add_triton.py
index 8cf2f7f45..29cce2f90 100644
--- a/tests/unittests/test_atomic_add_triton.py
+++ b/tests/unittests/test_atomic_add_triton.py
@@ -40,74 +40,7 @@ def atomic_add_kernel(
         )
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_add_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_add_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_and_gluon.py b/tests/unittests/test_atomic_and_gluon.py
index 31ebdbc53..fe6190471 100644
--- a/tests/unittests/test_atomic_and_gluon.py
+++ b/tests/unittests/test_atomic_and_gluon.py
@@ -34,88 +34,7 @@ def atomic_and_kernel(
         ctx.atomic_and(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_and_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    bit_width = 32 if dtype == torch.int32 else 64
-    effective_bits = min(num_ranks, bit_width)
-    initial_mask = (1 << effective_bits) - 1
+pytestmark = pytest.mark.multi_rank_required
 
-    results = shmem.full((BLOCK_SIZE,), initial_mask, dtype=dtype)
-
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_and_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # All ranks start out with a full mask vector 0xFFFFFF (initial_mask)
-    # All ranks then take turns in clearing their bit position in the mask
-    # By the end we would have effective_bits - num_ranks many ones followed by num_ranks zeros
-    expected_scalar = ~((1 << num_ranks) - 1) & initial_mask
-    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_and_triton.py b/tests/unittests/test_atomic_and_triton.py
index 7b2bdf668..cee7adf9d 100644
--- a/tests/unittests/test_atomic_and_triton.py
+++ b/tests/unittests/test_atomic_and_triton.py
@@ -31,78 +31,7 @@ def atomic_and_kernel(
         iris.atomic_and(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_and_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    bit_width = 32 if dtype == torch.int32 else 64
-    effective_bits = min(num_ranks, bit_width)
-    initial_mask = (1 << effective_bits) - 1
+pytestmark = pytest.mark.multi_rank_required
 
-    results = shmem.full((BLOCK_SIZE,), initial_mask, dtype=dtype)
-
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_and_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # All ranks start out with a full mask vector 0xFFFFFF (initial_mask)
-    # All ranks then take turns in clearing their bit position in the mask
-    # By the end we would have effective_bits - num_ranks many ones followed by num_ranks zeros
-    expected_scalar = ~((1 << num_ranks) - 1) & initial_mask
-    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_cas_gluon.py b/tests/unittests/test_atomic_cas_gluon.py
index e10c77c59..11eeddafd 100644
--- a/tests/unittests/test_atomic_cas_gluon.py
+++ b/tests/unittests/test_atomic_cas_gluon.py
@@ -31,77 +31,7 @@ def atomic_cas_kernel(
         ctx.atomic_cas(results, cmp, val, target_rank, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int16,
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-def test_atomic_cas_api(dtype, sem, scope):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros((1,), dtype=dtype)
-    # Create single-element tensors for cmp and val values (workaround for 0D tensor limitation)
-    cmp_val = shmem.zeros((1,), dtype=dtype)  # Will be 0
-    val_tensor = shmem.full((1,), num_ranks, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_cas_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        cmp_val,
-        val_tensor,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_cas_triton.py b/tests/unittests/test_atomic_cas_triton.py
index fdd59a886..abb7b0a92 100644
--- a/tests/unittests/test_atomic_cas_triton.py
+++ b/tests/unittests/test_atomic_cas_triton.py
@@ -26,63 +26,7 @@ def atomic_cas_kernel(
         iris.atomic_cas(results, cmp, val, cur_rank, target_rank, heap_bases, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int16,
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-def test_atomic_cas_api(dtype, sem, scope):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros((1,), dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_cas_kernel[grid](results, sem, scope, cur_rank, num_ranks, heap_bases)
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_max_gluon.py b/tests/unittests/test_atomic_max_gluon.py
index 5ff71ea3f..61f860273 100644
--- a/tests/unittests/test_atomic_max_gluon.py
+++ b/tests/unittests/test_atomic_max_gluon.py
@@ -32,84 +32,7 @@ def atomic_max_kernel(
         ctx.atomic_max(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_max_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    min_val = torch.iinfo(dtype).min
-    results = shmem.full((BLOCK_SIZE,), min_val, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_max_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # All ranks participate in performing the max operation
-    # Each rank performs the atomic operation: max(rank_id + 1)
-    # The result equals the ID of the last rank + 1
-    expected = torch.full((BLOCK_SIZE,), num_ranks, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_max_triton.py b/tests/unittests/test_atomic_max_triton.py
index 69d9d96d7..f3a7b8dc2 100644
--- a/tests/unittests/test_atomic_max_triton.py
+++ b/tests/unittests/test_atomic_max_triton.py
@@ -29,74 +29,7 @@ def atomic_max_kernel(
         iris.atomic_max(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_max_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    min_val = torch.iinfo(dtype).min
-    results = shmem.full((BLOCK_SIZE,), min_val, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_max_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # All ranks participate in performing the max operation
-    # Each rank performs the atomic operation: max(rank_id + 1)
-    # The result equals the ID of the last rank + 1
-    expected = torch.full((BLOCK_SIZE,), num_ranks, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_min_gluon.py b/tests/unittests/test_atomic_min_gluon.py
index e18836b87..277f727dd 100644
--- a/tests/unittests/test_atomic_min_gluon.py
+++ b/tests/unittests/test_atomic_min_gluon.py
@@ -32,83 +32,7 @@ def atomic_min_kernel(
         ctx.atomic_min(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_min_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    max_val = torch.iinfo(dtype).max
-    results = shmem.full((BLOCK_SIZE,), max_val, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_min_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-    # All ranks participate in performing the min operation
-    # Each rank performs the atomic operation: min(rank_id + 1)
-    # The result equals the ID of the first rank + 1
-    expected = torch.full((BLOCK_SIZE,), 1, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_min_triton.py b/tests/unittests/test_atomic_min_triton.py
index 139e473de..030f1250e 100644
--- a/tests/unittests/test_atomic_min_triton.py
+++ b/tests/unittests/test_atomic_min_triton.py
@@ -29,73 +29,7 @@ def atomic_min_kernel(
         iris.atomic_min(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_min_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    max_val = torch.iinfo(dtype).max
-    results = shmem.full((BLOCK_SIZE,), max_val, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_min_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-    # All ranks participate in performing the min operation
-    # Each rank performs the atomic operation: min(rank_id + 1)
-    # The result equals the ID of the first rank + 1
-    expected = torch.full((BLOCK_SIZE,), 1, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_or_gluon.py b/tests/unittests/test_atomic_or_gluon.py
index bcda75b39..638c05072 100644
--- a/tests/unittests/test_atomic_or_gluon.py
+++ b/tests/unittests/test_atomic_or_gluon.py
@@ -33,88 +33,7 @@ def atomic_or_kernel(
         ctx.atomic_or(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_or_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_or_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    bit_width = 32 if dtype == torch.int32 else 64
-    effective_bits = min(num_ranks, bit_width)
-    expected_scalar = (1 << effective_bits) - 1
-
-    # All ranks start out with a zero mask
-    # All ranks then take turns in setting the their bit position in the mask to 1
-    # By the end we would have a bit vector with num_ranks many 1's as long as num_ranks <= bit_width
-    # or a full bit vector if num_ranks > bit_width
-    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_or_triton.py b/tests/unittests/test_atomic_or_triton.py
index c0b8cc25d..4f5bc901d 100644
--- a/tests/unittests/test_atomic_or_triton.py
+++ b/tests/unittests/test_atomic_or_triton.py
@@ -30,78 +30,7 @@ def atomic_or_kernel(
         iris.atomic_or(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_or_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_or_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    bit_width = 32 if dtype == torch.int32 else 64
-    effective_bits = min(num_ranks, bit_width)
-    expected_scalar = (1 << effective_bits) - 1
-
-    # All ranks start out with a zero mask
-    # All ranks then take turns in setting the their bit position in the mask to 1
-    # By the end we would have a bit vector with num_ranks many 1's as long as num_ranks <= bit_width
-    # or a full bit vector if num_ranks > bit_width
-    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual  :", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_xchg_gluon.py b/tests/unittests/test_atomic_xchg_gluon.py
index 09ef1e2f2..6ec0db713 100644
--- a/tests/unittests/test_atomic_xchg_gluon.py
+++ b/tests/unittests/test_atomic_xchg_gluon.py
@@ -29,75 +29,7 @@ def atomic_xchg_kernel(
         ctx.atomic_xchg(results, val, target_rank, mask=None, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-def test_atomic_xchg_api(dtype, sem, scope):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros((1,), dtype=dtype)
-    # Create single-element tensor for val value (workaround for 0D tensor limitation)
-    val_tensor = shmem.full((1,), num_ranks, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_xchg_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        val_tensor,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_xchg_triton.py b/tests/unittests/test_atomic_xchg_triton.py
index ffea37e78..8c3119321 100644
--- a/tests/unittests/test_atomic_xchg_triton.py
+++ b/tests/unittests/test_atomic_xchg_triton.py
@@ -25,63 +25,7 @@ def atomic_xchg_kernel(
         iris.atomic_xchg(results, val, cur_rank, target_rank, heap_bases, mask=None, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-def test_atomic_xchg_api(dtype, sem, scope):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros((1,), dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_xchg_kernel[grid](results, sem, scope, cur_rank, num_ranks, heap_bases)
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_xor_gluon.py b/tests/unittests/test_atomic_xor_gluon.py
index b9e77ce60..712e5b26e 100644
--- a/tests/unittests/test_atomic_xor_gluon.py
+++ b/tests/unittests/test_atomic_xor_gluon.py
@@ -34,86 +34,7 @@ def atomic_xor_kernel(
         ctx.atomic_xor(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_xor_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    atomic_xor_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        results,
-        sem,
-        scope,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # If we xor '1' in num_ranks times:
-    # - If num_ranks is odd  -> final = 1
-    # - If num_ranks is even -> final = 0
-    if (num_ranks % 2) == 1:
-        expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
-    else:
-        expected = torch.zeros(BLOCK_SIZE, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_atomic_xor_triton.py b/tests/unittests/test_atomic_xor_triton.py
index 639abfcdd..d01da7e0e 100644
--- a/tests/unittests/test_atomic_xor_triton.py
+++ b/tests/unittests/test_atomic_xor_triton.py
@@ -31,76 +31,7 @@ def atomic_xor_kernel(
         iris.atomic_xor(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int32,
-        torch.int64,
-    ],
-)
-@pytest.mark.parametrize(
-    "sem",
-    [
-        "acquire",
-        "release",
-        "acq_rel",
-    ],
-)
-@pytest.mark.parametrize(
-    "scope",
-    [
-        "cta",
-        "gpu",
-        "sys",
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_atomic_xor_api(dtype, sem, scope, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    atomic_xor_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # If we xor '1' in num_ranks times:
-    # - If num_ranks is odd  -> final = 1
-    # - If num_ranks is even -> final = 0
-    if (num_ranks % 2) == 1:
-        expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
-    else:
-        expected = torch.zeros(BLOCK_SIZE, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_broadcast_gluon.py b/tests/unittests/test_broadcast_gluon.py
index e2eaac6d1..edaec2799 100644
--- a/tests/unittests/test_broadcast_gluon.py
+++ b/tests/unittests/test_broadcast_gluon.py
@@ -7,103 +7,7 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
-@pytest.mark.parametrize(
-    "value,expected",
-    [
-        (42, 42),
-        (3.14159, 3.14159),
-        (True, True),
-        (False, False),
-        ("Hello, Iris!", "Hello, Iris!"),
-        ({"key": "value", "num": 42}, {"key": "value", "num": 42}),
-    ],
-)
-def test_broadcast_scalar(value, expected):
-    """Test broadcasting scalar values (int, float, bool, string, dict)."""
-    shmem = iris_gl.iris(1 << 20)
-    try:
-        rank = shmem.get_rank()
 
-        val = value if rank == 0 else None
-        result = shmem.broadcast(val, src_rank=0)
+pytestmark = pytest.mark.multi_rank_required
 
-        if isinstance(expected, float):
-            assert abs(result - expected) < 1e-6
-        else:
-            assert result == expected
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float32,
-        torch.float16,
-        torch.int32,
-        torch.int64,
-    ],
-)
-def test_broadcast_tensor_dtype(dtype):
-    """Test broadcasting tensors with different dtypes."""
-    shmem = iris_gl.iris(1 << 20)
-    try:
-        rank = shmem.get_rank()
-
-        value = torch.arange(10, dtype=dtype) if rank == 0 else None
-        result = shmem.broadcast(value, src_rank=0)
-
-        assert isinstance(result, np.ndarray)
-        np.testing.assert_array_equal(result, np.arange(10))
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "shape",
-    [
-        (10,),
-        (10, 20),
-        (5, 10, 15),
-    ],
-)
-def test_broadcast_tensor_shape(shape):
-    """Test broadcasting tensors with different shapes."""
-    shmem = iris_gl.iris(1 << 25)
-    try:
-        rank = shmem.get_rank()
-
-        value = torch.randn(shape) if rank == 0 else None
-        result = shmem.broadcast(value, src_rank=0)
-
-        assert isinstance(result, np.ndarray)
-        assert result.shape == shape
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_broadcast_triton.py b/tests/unittests/test_broadcast_triton.py
index 9563a5916..1b0ead6db 100644
--- a/tests/unittests/test_broadcast_triton.py
+++ b/tests/unittests/test_broadcast_triton.py
@@ -7,103 +7,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "value,expected",
-    [
-        (42, 42),
-        (3.14159, 3.14159),
-        (True, True),
-        (False, False),
-        ("Hello, Iris!", "Hello, Iris!"),
-        ({"key": "value", "num": 42}, {"key": "value", "num": 42}),
-    ],
-)
-def test_broadcast_scalar(value, expected):
-    """Test broadcasting scalar values (int, float, bool, string, dict)."""
-    shmem = iris.iris(1 << 20)
-    try:
-        rank = shmem.get_rank()
 
-        val = value if rank == 0 else None
-        result = shmem.broadcast(val, source_rank=0)
+pytestmark = pytest.mark.multi_rank_required
 
-        if isinstance(expected, float):
-            assert abs(result - expected) < 1e-6
-        else:
-            assert result == expected
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float32,
-        torch.float16,
-        torch.int32,
-        torch.int64,
-    ],
-)
-def test_broadcast_tensor_dtype(dtype):
-    """Test broadcasting tensors with different dtypes."""
-    shmem = iris.iris(1 << 20)
-    try:
-        rank = shmem.get_rank()
-
-        value = torch.arange(10, dtype=dtype) if rank == 0 else None
-        result = shmem.broadcast(value, source_rank=0)
-
-        assert isinstance(result, np.ndarray)
-        np.testing.assert_array_equal(result, np.arange(10))
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "shape",
-    [
-        (10,),
-        (10, 20),
-        (5, 10, 15),
-    ],
-)
-def test_broadcast_tensor_shape(shape):
-    """Test broadcasting tensors with different shapes."""
-    shmem = iris.iris(1 << 25)
-    try:
-        rank = shmem.get_rank()
-
-        value = torch.randn(shape) if rank == 0 else None
-        result = shmem.broadcast(value, source_rank=0)
-
-        assert isinstance(result, np.ndarray)
-        assert result.shape == shape
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_copy_gluon.py b/tests/unittests/test_copy_gluon.py
index 8102640da..f0559687a 100644
--- a/tests/unittests/test_copy_gluon.py
+++ b/tests/unittests/test_copy_gluon.py
@@ -80,206 +80,7 @@ def copy_local_kernel(
         ctx.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_copy_get(dtype, BLOCK_SIZE):
-    """Test GET operation: cur_rank == to_rank"""
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    base = cur_rank + num_ranks
-    for i in range(num_ranks):
-        data[i, :] = base * (i + 1)
-
-    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    grid = (1,)
-    copy_get_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        data,
-        results,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    for rank_id in range(num_ranks):
-        expected[rank_id, :] = (rank_id + num_ranks) * (cur_rank + 1)
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_copy_put(dtype, BLOCK_SIZE):
-    """Test PUT operation: cur_rank == from_rank"""
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
-
-    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    base = cur_rank + num_ranks
-    for i in range(num_ranks):
-        data[i, :] = base * (i + 1)
-
-    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    grid = (1,)
-    copy_put_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        data,
-        results,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Each rank writes to results[cur_rank] on all targets
-    # After barrier, results[rank_id] contains data from rank_id
-    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    for rank_id in range(num_ranks):
-        expected[rank_id, :] = (rank_id + num_ranks) * (rank_id + 1)
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_copy_local(dtype, BLOCK_SIZE):
-    """Test LOCAL operation: from_rank == to_rank == cur_rank"""
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
-
-    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    base = cur_rank + num_ranks
-    for i in range(num_ranks):
-        data[i, :] = base * (i + 1)
-
-    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    grid = (1,)
-    copy_local_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        data,
-        results,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Local copy: results should match data
-    expected = data
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
+pytestmark = pytest.mark.multi_rank_required
 
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_copy_triton.py b/tests/unittests/test_copy_triton.py
index 00bc43e47..81eb0a47e 100644
--- a/tests/unittests/test_copy_triton.py
+++ b/tests/unittests/test_copy_triton.py
@@ -71,179 +71,7 @@ def copy_local_kernel(
         iris.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, cur_rank, heap_bases, mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_copy_get(dtype, BLOCK_SIZE):
-    """Test GET operation: cur_rank == to_rank"""
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    base = cur_rank + num_ranks
-    for i in range(num_ranks):
-        data[i, :] = base * (i + 1)
-
-    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    grid = lambda meta: (1,)
-    copy_get_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    for rank_id in range(num_ranks):
-        expected[rank_id, :] = (rank_id + num_ranks) * (cur_rank + 1)
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_copy_put(dtype, BLOCK_SIZE):
-    """Test PUT operation: cur_rank == from_rank"""
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
-
-    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    base = cur_rank + num_ranks
-    for i in range(num_ranks):
-        data[i, :] = base * (i + 1)
-
-    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    grid = lambda meta: (1,)
-    copy_put_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # Each rank writes to results[cur_rank] on all targets
-    # After barrier, results[rank_id] contains data from rank_id
-    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    for rank_id in range(num_ranks):
-        expected[rank_id, :] = (rank_id + num_ranks) * (rank_id + 1)
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_copy_local(dtype, BLOCK_SIZE):
-    """Test LOCAL operation: from_rank == to_rank == cur_rank"""
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
-
-    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    base = cur_rank + num_ranks
-    for i in range(num_ranks):
-        data[i, :] = base * (i + 1)
-
-    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
-    grid = lambda meta: (1,)
-    copy_local_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # Local copy: results should match data
-    expected = data
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
+pytestmark = pytest.mark.multi_rank_required
 
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_empty.py b/tests/unittests/test_empty.py
index e51fb4c2f..a981e532d 100644
--- a/tests/unittests/test_empty.py
+++ b/tests/unittests/test_empty.py
@@ -6,425 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.int16,
-        torch.int32,
-        torch.int64,
-        torch.float16,
-        torch.float32,
-        torch.float64,
-        torch.bool,
-    ],
-)
-@pytest.mark.parametrize(
-    "size",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_empty_basic(dtype, size):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic empty
-    result = shmem.empty(*size, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == size
-    assert result.dtype == dtype
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Note: We don't check the values since they are uninitialized
-
-
-def test_empty_default_dtype():
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should use torch.get_default_dtype())
-    result = shmem.empty(2, 3)
-    expected_dtype = torch.get_default_dtype()
-    assert result.dtype == expected_dtype
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_empty_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.empty(2, 2, dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_empty_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.empty(3, 3)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.empty(3, 3, device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.empty(3, 3, device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.empty(3, 3, device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.empty(3, 3, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.empty(3, 3, device=different_cuda)
-
-
-def test_empty_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.empty(2, 4, layout=torch.strided)
-    assert result.layout == torch.strided
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that unsupported layout throws error
-    with pytest.raises(ValueError):
-        shmem.empty(2, 4, layout=torch.sparse_coo)
-
-
-def test_empty_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(6, torch.float32)
-    result = shmem.empty(2, 3, out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert result.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor
-    out_tensor_int = shmem._Iris__allocate(6, torch.int32)
-    result_int = shmem.empty(2, 3, dtype=torch.int32, out=out_tensor_int)
-    assert result_int.data_ptr() == out_tensor_int.data_ptr()
-    assert result_int.dtype == torch.int32
-    assert shmem._Iris__on_symmetric_heap(result_int)
-
-
-def test_empty_size_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single dimension
-    result1 = shmem.empty(5)
-    assert result1.shape == (5,)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple dimensions
-    result2 = shmem.empty(2, 3, 4)
-    assert result2.shape == (2, 3, 4)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple as single argument
-    result3 = shmem.empty((3, 4))
-    assert result3.shape == (3, 4)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as single argument
-    result4 = shmem.empty([2, 5])
-    assert result4.shape == (2, 5)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_empty_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_result = shmem.empty(0)
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-
-    # Single element tensor
-    single_result = shmem.empty(1)
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Large tensor
-    large_result = shmem.empty(100, 100)
-    assert large_result.shape == (100, 100)
-    assert large_result.numel() == 10000
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Zero-dimensional tensor (scalar)
-    scalar_result = shmem.empty(())
-    assert scalar_result.shape == ()
-    assert scalar_result.numel() == 1
-    assert shmem._Iris__on_symmetric_heap(scalar_result)
-
-
-def test_empty_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.empty(4, 3)
-    pytorch_result = torch.empty(4, 3, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with explicit dtype
-    iris_result = shmem.empty(2, 2, dtype=torch.float64)
-    pytorch_result = torch.empty(2, 2, dtype=torch.float64, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with requires_grad
-    iris_result = shmem.empty(3, 3, dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.empty(3, 3, dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.int32},
-        {"dtype": torch.float16},
-        {"layout": torch.strided},
-        {"memory_format": torch.contiguous_format},
-        {"memory_format": torch.channels_last},
-        {},
-    ],
-)
-def test_empty_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.empty(3, 3, **params)
-
-    # Verify basic functionality
-    assert result.shape == (3, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "size,dtype",
-    [
-        ((1,), torch.float32),
-        ((5,), torch.int32),
-        ((2, 3), torch.float64),
-        ((3, 4, 5), torch.float16),
-        ((0,), torch.float32),  # Empty tensor
-        ((100, 100), torch.float32),  # Large tensor
-        ((), torch.float32),  # Scalar tensor
-    ],
-)
-def test_empty_symmetric_heap_shapes_dtypes(size, dtype):
-    """Test that empty returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test empty with this size and dtype
-    result = shmem.empty(*size, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with size {size}, dtype {dtype} is NOT on symmetric heap!"
-
-    # Also verify basic functionality
-    assert result.shape == size
-    assert result.dtype == dtype
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64, torch.int32, torch.int64])
-def test_empty_symmetric_heap_dtype_override(dtype):
-    """Test that empty with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.empty(3, 3, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_empty_symmetric_heap_other_params():
-    """Test that empty with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.empty(3, 3, dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.empty(3, 3, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.empty(3, 3, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(9, torch.float32)
-    result = shmem.empty(3, 3, out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_empty_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(4, torch.float32)  # Wrong size for (3, 3)
-    with pytest.raises(RuntimeError):
-        shmem.empty(3, 3, out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(9, torch.int32)  # Wrong dtype
-    with pytest.raises(RuntimeError):
-        shmem.empty(3, 3, dtype=torch.float32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.empty(3, 3, device="cuda")
-    with pytest.raises(RuntimeError):
-        shmem.empty(3, 3, out=regular_tensor)
-
-
-def test_empty_default_dtype_behavior():
-    """Test that empty uses the global default dtype when dtype=None."""
-    shmem = iris.iris(1 << 20)
-
-    # Save original default dtype
-    original_default = torch.get_default_dtype()
-
-    try:
-        # Test with float32 default
-        torch.set_default_dtype(torch.float32)
-        result1 = shmem.empty(2, 2)
-        assert result1.dtype == torch.float32
-
-        # Test with float64 default
-        torch.set_default_dtype(torch.float64)
-        result2 = shmem.empty(2, 2)
-        assert result2.dtype == torch.float64
-
-    finally:
-        # Restore original default dtype
-        torch.set_default_dtype(original_default)
-
-
-def test_empty_size_parsing():
-    """Test various ways of specifying size."""
-    shmem = iris.iris(1 << 20)
-
-    # Test individual arguments
-    result1 = shmem.empty(2, 3, 4)
-    assert result1.shape == (2, 3, 4)
-
-    # Test single tuple argument
-    result2 = shmem.empty((2, 3, 4))
-    assert result2.shape == (2, 3, 4)
-
-    # Test single list argument
-    result3 = shmem.empty([2, 3, 4])
-    assert result3.shape == (2, 3, 4)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.empty(((2, 3, 4),))
-    assert result4.shape == (2, 3, 4)
-
-    # All should produce the same result shape
-    assert result1.shape == result2.shape
-    assert result2.shape == result3.shape
-    assert result3.shape == result4.shape
-
-
-def test_empty_memory_format():
-    """Test memory format parameter."""
-    shmem = iris.iris(1 << 20)
-
-    # Test contiguous format (default)
-    result_contig = shmem.empty(2, 3, 4, memory_format=torch.contiguous_format)
-    assert result_contig.is_contiguous()
-    assert shmem._Iris__on_symmetric_heap(result_contig)
-
-    # Test channels_last format (should work for 4D tensors)
-    result_cl = shmem.empty(2, 3, 4, 5, memory_format=torch.channels_last)
-    assert result_cl.shape == (2, 3, 4, 5)
-    assert shmem._Iris__on_symmetric_heap(result_cl)
-
-    # Test channels_last_3d format (should work for 5D tensors)
-    result_cl3d = shmem.empty(2, 3, 4, 5, 6, memory_format=torch.channels_last_3d)
-    assert result_cl3d.shape == (2, 3, 4, 5, 6)
-    assert shmem._Iris__on_symmetric_heap(result_cl3d)
-
-
-def test_empty_pin_memory():
-    """Test pin_memory parameter (should be ignored for Iris tensors)."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with pin_memory=True (should work but be ignored since Iris tensors are on GPU)
-    result = shmem.empty(2, 3, pin_memory=True)
-    assert result.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-    # Note: pin_memory is ignored for GPU tensors, so we just verify it doesn't cause errors
-
-
-def test_empty_deterministic_behavior():
-    """Test that empty handles deterministic algorithms correctly."""
-    shmem = iris.iris(1 << 20)
-
-    # Test that empty works regardless of deterministic settings
-    result = shmem.empty(2, 3)
-    assert result.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Note: The actual deterministic behavior (filling with NaN/max values)
-    # is handled by PyTorch internally, so we just verify our function works
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_full.py b/tests/unittests/test_full.py
index a42d4ddb6..a981e532d 100644
--- a/tests/unittests/test_full.py
+++ b/tests/unittests/test_full.py
@@ -6,451 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "fill_value",
-    [
-        0,
-        1,
-        -1,
-        3.141592,
-        -2.718,
-        42,
-        -100,
-        0.5,
-        -0.25,
-    ],
-)
-@pytest.mark.parametrize(
-    "size",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_full_basic(fill_value, size):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic full
-    result = shmem.full(size, fill_value)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == size
-
-    # Verify all values are the fill_value
-    assert torch.all(result == fill_value)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_full_dtype_inference():
-    shmem = iris.iris(1 << 20)
-
-    # Test integer fill_value (should infer int64)
-    result_int = shmem.full((2, 3), 42)
-    assert result_int.dtype == torch.int64
-    assert torch.all(result_int == 42)
-    assert shmem._Iris__on_symmetric_heap(result_int)
-
-    # Test float fill_value (should infer default float dtype)
-    result_float = shmem.full((2, 3), 3.141592)
-    assert result_float.dtype == torch.get_default_dtype()
-    assert torch.allclose(result_float, torch.tensor(3.141592))
-    assert shmem._Iris__on_symmetric_heap(result_float)
-
-    # Test explicit dtype override
-    result_explicit = shmem.full((2, 3), 42, dtype=torch.float32)
-    assert result_explicit.dtype == torch.float32
-    assert torch.all(result_explicit == 42)
-    assert shmem._Iris__on_symmetric_heap(result_explicit)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_full_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.full((2, 2), 1.5, dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert torch.all(result == 1.5)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_full_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.full((3, 3), 2.5)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 2.5)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.full((3, 3), 2.5, device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 2.5)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.full((3, 3), 2.5, device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert torch.all(result == 2.5)
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.full((3, 3), 2.5, device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 2.5)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.full((3, 3), 2.5, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.full((3, 3), 2.5, device=different_cuda)
-
-
-def test_full_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.full((2, 4), 1.0, layout=torch.strided)
-    assert result.layout == torch.strided
-    assert torch.all(result == 1.0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that unsupported layout throws error
-    with pytest.raises(ValueError):
-        shmem.full((2, 4), 1.0, layout=torch.sparse_coo)
-
-
-def test_full_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(6, torch.float32)
-    result = shmem.full((2, 3), 3.141592, out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert torch.allclose(result, torch.tensor(3.141592))
-    assert result.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor
-    out_tensor_int = shmem._Iris__allocate(6, torch.int32)
-    result_int = shmem.full((2, 3), 42, dtype=torch.int32, out=out_tensor_int)
-    assert result_int.data_ptr() == out_tensor_int.data_ptr()
-    assert result_int.dtype == torch.int32
-    assert torch.all(result_int == 42)
-    assert shmem._Iris__on_symmetric_heap(result_int)
-
-
-def test_full_size_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single dimension
-    result1 = shmem.full((5,), 2.0)
-    assert result1.shape == (5,)
-    assert torch.all(result1 == 2.0)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple dimensions
-    result2 = shmem.full((2, 3, 4), 1.5)
-    assert result2.shape == (2, 3, 4)
-    assert torch.all(result2 == 1.5)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple as single argument
-    result3 = shmem.full((3, 4), 0.5)
-    assert result3.shape == (3, 4)
-    assert torch.all(result3 == 0.5)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as single argument
-    result4 = shmem.full([2, 5], -1.0)
-    assert result4.shape == (2, 5)
-    assert torch.all(result4 == -1.0)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_full_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_result = shmem.full((0,), 1.0)
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-
-    # Single element tensor
-    single_result = shmem.full((1,), 5.0)
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert single_result[0] == 5.0
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Large tensor
-    large_result = shmem.full((100, 100), 0.1)
-    assert large_result.shape == (100, 100)
-    assert large_result.numel() == 10000
-    assert torch.all(large_result == 0.1)
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Zero-dimensional tensor (scalar)
-    scalar_result = shmem.full((), 2.718)
-    assert scalar_result.shape == ()
-    assert scalar_result.numel() == 1
-    assert torch.allclose(scalar_result, torch.tensor(2.718))
-    assert shmem._Iris__on_symmetric_heap(scalar_result)
-
-
-def test_full_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.full((4, 3), 3.141592)
-    pytorch_result = torch.full((4, 3), 3.141592, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.allclose(iris_result, pytorch_result)
-
-    # Test with explicit dtype
-    iris_result = shmem.full((2, 2), 42, dtype=torch.float64)
-    pytorch_result = torch.full((2, 2), 42, dtype=torch.float64, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.allclose(iris_result, pytorch_result)
-
-    # Test with requires_grad
-    iris_result = shmem.full((3, 3), 1.5, dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.full((3, 3), 1.5, dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-    assert torch.allclose(iris_result, pytorch_result)
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.int32},
-        {"dtype": torch.float16},
-        {"layout": torch.strided},
-        {},
-    ],
-)
-def test_full_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.full((3, 3), 2.5, **params)
-
-    # Verify basic functionality
-    assert result.shape == (3, 3)
-    # Use appropriate comparison based on dtype
-    if torch.is_floating_point(result):
-        # For float dtypes, use close comparison with matching dtype
-        expected = torch.tensor(2.5, dtype=result.dtype, device=result.device)
-        assert torch.allclose(result, expected)
-    else:
-        # For integer dtypes, the fill value gets truncated
-        assert torch.all(result == 2)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "size,fill_value,dtype",
-    [
-        ((1,), 1.0, torch.float32),
-        ((5,), 42, torch.int32),
-        ((2, 3), 3.141592, torch.float64),
-        ((3, 4, 5), 0.5, torch.float16),
-        ((0,), 1.0, torch.float32),  # Empty tensor
-        ((100, 100), 0.1, torch.float32),  # Large tensor
-        ((), 2.718, torch.float32),  # Scalar tensor
-    ],
-)
-def test_full_symmetric_heap_shapes_dtypes(size, fill_value, dtype):
-    """Test that full returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test full with this size, fill_value, and dtype
-    result = shmem.full(size, fill_value, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), (
-        f"Tensor with size {size}, fill_value {fill_value}, dtype {dtype} is NOT on symmetric heap!"
-    )
-
-    # Also verify basic functionality
-    assert result.shape == size
-    assert result.dtype == dtype
-    assert torch.allclose(result, torch.tensor(fill_value, dtype=dtype))
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64, torch.int32, torch.int64])
-def test_full_symmetric_heap_dtype_override(dtype):
-    """Test that full with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.full((3, 3), 1.5, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_full_symmetric_heap_other_params():
-    """Test that full with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.full((3, 3), 1.5, dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.full((3, 3), 1.5, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.full((3, 3), 1.5, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(9, torch.float32)
-    result = shmem.full((3, 3), 1.5, out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_full_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(4, torch.float32)  # Wrong size for (3, 3)
-    with pytest.raises(RuntimeError):
-        shmem.full((3, 3), 1.5, out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(9, torch.int32)  # Wrong dtype
-    with pytest.raises(RuntimeError):
-        shmem.full((3, 3), 1.5, dtype=torch.float32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.full((3, 3), 1.5, device="cuda")
-    with pytest.raises(RuntimeError):
-        shmem.full((3, 3), 1.5, out=regular_tensor)
-
-
-def test_full_size_parsing():
-    """Test various ways of specifying size."""
-    shmem = iris.iris(1 << 20)
-
-    # Test individual arguments
-    result1 = shmem.full((2, 3, 4), 1.0)
-    assert result1.shape == (2, 3, 4)
-
-    # Test single tuple argument
-    result2 = shmem.full((2, 3, 4), 1.0)
-    assert result2.shape == (2, 3, 4)
-
-    # Test single list argument
-    result3 = shmem.full([2, 3, 4], 1.0)
-    assert result3.shape == (2, 3, 4)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.full(((2, 3, 4),), 1.0)
-    assert result4.shape == (2, 3, 4)
-
-    # All should produce the same result
-    assert torch.all(result1 == result2)
-    assert torch.all(result2 == result3)
-    assert torch.all(result3 == result4)
-
-
-def test_full_examples():
-    """Test the examples from PyTorch documentation."""
-    shmem = iris.iris(1 << 20)
-
-    # Example: torch.full((2, 3), 3.141592)
-    result = shmem.full((2, 3), 3.141592)
-    expected = torch.tensor([[3.141592, 3.141592, 3.141592], [3.141592, 3.141592, 3.141592]], device=result.device)
-    assert result.shape == (2, 3)
-    assert torch.allclose(result, expected)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_full_different_fill_values():
-    """Test various fill values to ensure they work correctly."""
-    shmem = iris.iris(1 << 20)
-
-    # Test different numeric types
-    test_cases = [
-        (0, torch.int64),
-        (1, torch.int64),
-        (-1, torch.int64),
-        (42, torch.int64),
-        (0.0, torch.get_default_dtype()),
-        (1.0, torch.get_default_dtype()),
-        (-1.0, torch.get_default_dtype()),
-        (3.141592, torch.get_default_dtype()),
-        (-2.718, torch.get_default_dtype()),
-    ]
-
-    for fill_value, expected_dtype in test_cases:
-        result = shmem.full((2, 2), fill_value)
-        assert result.dtype == expected_dtype
-        assert torch.allclose(result, torch.tensor(fill_value, dtype=expected_dtype))
-        assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_full_dtype_override():
-    """Test that explicit dtype overrides inference."""
-    shmem = iris.iris(1 << 20)
-
-    # Integer fill_value with float dtype
-    result = shmem.full((2, 2), 42, dtype=torch.float32)
-    assert result.dtype == torch.float32
-    assert torch.allclose(result, torch.tensor(42.0, dtype=torch.float32))
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Float fill_value with int dtype
-    result = shmem.full((2, 2), 3.14, dtype=torch.int32)
-    assert result.dtype == torch.int32
-    assert torch.all(result == 3)  # Truncated to int
-    assert shmem._Iris__on_symmetric_heap(result)
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_get_gluon.py b/tests/unittests/test_get_gluon.py
index 5cabc054c..633d69a4b 100644
--- a/tests/unittests/test_get_gluon.py
+++ b/tests/unittests/test_get_gluon.py
@@ -41,67 +41,7 @@ def get_kernel(
     gl.store(results + offsets, acc, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_get_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
-    results = shmem.zeros_like(data)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    get_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        data,
-        results,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_get_triton.py b/tests/unittests/test_get_triton.py
index b19cf235d..c0241dae8 100644
--- a/tests/unittests/test_get_triton.py
+++ b/tests/unittests/test_get_triton.py
@@ -38,58 +38,7 @@ def get_kernel(
     tl.store(results + offsets, acc, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_get_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
-    results = shmem.zeros_like(data)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    get_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_linspace.py b/tests/unittests/test_linspace.py
index 02d26b248..a981e532d 100644
--- a/tests/unittests/test_linspace.py
+++ b/tests/unittests/test_linspace.py
@@ -6,471 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.float64,
-        torch.complex64,
-        torch.complex128,
-    ],
-)
-@pytest.mark.parametrize(
-    "start,end,steps",
-    [
-        (0.0, 1.0, 5),
-        (-10.0, 10.0, 11),
-        (3.0, 10.0, 5),
-        (0.0, 100.0, 101),
-        (1.0, 2.0, 2),
-        (0.0, 0.0, 5),
-    ],
-)
-def test_linspace_basic(dtype, start, end, steps):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic linspace
-    result = shmem.linspace(start, end, steps, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == (steps,)
-    assert result.dtype == dtype
-
-    # Verify first and last values
-    assert torch.allclose(result[0], torch.tensor(start, dtype=dtype))
-    assert torch.allclose(result[-1], torch.tensor(end, dtype=dtype))
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_linspace_default_dtype():
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should use torch.get_default_dtype())
-    result = shmem.linspace(0.0, 1.0, 5)
-    expected_dtype = torch.get_default_dtype()
-    assert result.dtype == expected_dtype
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_linspace_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.linspace(0.0, 1.0, 5, dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_linspace_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.linspace(0.0, 1.0, 5)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.linspace(0.0, 1.0, 5, device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.linspace(0.0, 1.0, 5, device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.linspace(0.0, 1.0, 5, device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.linspace(0.0, 1.0, 5, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.linspace(0.0, 1.0, 5, device=different_cuda)
-
-
-def test_linspace_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.linspace(0.0, 1.0, 5, layout=torch.strided)
-    assert result.layout == torch.strided
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that unsupported layout throws error
-    with pytest.raises(ValueError):
-        shmem.linspace(0.0, 1.0, 5, layout=torch.sparse_coo)
-
-
-def test_linspace_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(5, torch.float32)
-    result = shmem.linspace(0.0, 1.0, 5, out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert result.shape == (5,)
-    assert torch.allclose(result[0], torch.tensor(0.0))
-    assert torch.allclose(result[-1], torch.tensor(1.0))
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor
-    out_tensor_float64 = shmem._Iris__allocate(5, torch.float64)
-    result_float64 = shmem.linspace(0.0, 1.0, 5, dtype=torch.float64, out=out_tensor_float64)
-    assert result_float64.data_ptr() == out_tensor_float64.data_ptr()
-    assert result_float64.dtype == torch.float64
-    assert shmem._Iris__on_symmetric_heap(result_float64)
-
-
-def test_linspace_steps_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single step
-    result1 = shmem.linspace(0.0, 1.0, 1)
-    assert result1.shape == (1,)
-    assert torch.allclose(result1[0], torch.tensor(0.0))
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple steps
-    result2 = shmem.linspace(0.0, 1.0, 10)
-    assert result2.shape == (10,)
-    assert torch.allclose(result2[0], torch.tensor(0.0))
-    assert torch.allclose(result2[-1], torch.tensor(1.0))
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple as steps argument
-    result3 = shmem.linspace(0.0, 1.0, (5,))
-    assert result3.shape == (5,)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as steps argument
-    result4 = shmem.linspace(0.0, 1.0, [5])
-    assert result4.shape == (5,)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_linspace_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Single step (start == end)
-    single_result = shmem.linspace(5.0, 5.0, 1)
-    assert single_result.shape == (1,)
-    assert torch.allclose(single_result[0], torch.tensor(5.0))
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Two steps
-    two_result = shmem.linspace(0.0, 1.0, 2)
-    assert two_result.shape == (2,)
-    assert torch.allclose(two_result[0], torch.tensor(0.0))
-    assert torch.allclose(two_result[1], torch.tensor(1.0))
-    assert shmem._Iris__on_symmetric_heap(two_result)
-
-    # Large number of steps
-    large_result = shmem.linspace(0.0, 100.0, 1000)
-    assert large_result.shape == (1000,)
-    assert torch.allclose(large_result[0], torch.tensor(0.0))
-    assert torch.allclose(large_result[-1], torch.tensor(100.0))
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Negative range
-    neg_result = shmem.linspace(-10.0, -5.0, 6)
-    assert neg_result.shape == (6,)
-    assert torch.allclose(neg_result[0], torch.tensor(-10.0))
-    assert torch.allclose(neg_result[-1], torch.tensor(-5.0))
-    assert shmem._Iris__on_symmetric_heap(neg_result)
-
-
-def test_linspace_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.linspace(0.0, 1.0, 5)
-    pytorch_result = torch.linspace(0.0, 1.0, 5, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.allclose(iris_result, pytorch_result)
-
-    # Test with explicit dtype
-    iris_result = shmem.linspace(0.0, 1.0, 5, dtype=torch.float64)
-    pytorch_result = torch.linspace(0.0, 1.0, 5, dtype=torch.float64, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.allclose(iris_result, pytorch_result)
-
-    # Test with requires_grad
-    iris_result = shmem.linspace(0.0, 1.0, 5, dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.linspace(0.0, 1.0, 5, dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.complex64},
-        {"dtype": torch.complex128},
-        {"layout": torch.strided},
-        {},
-    ],
-)
-def test_linspace_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.linspace(0.0, 1.0, 5, **params)
-
-    # Verify basic functionality
-    assert result.shape == (5,)
-    assert torch.allclose(result[0], torch.tensor(0.0, dtype=result.dtype))
-    assert torch.allclose(result[-1], torch.tensor(1.0, dtype=result.dtype))
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "start,end,steps,dtype",
-    [
-        (0.0, 1.0, 5, torch.float32),
-        (-10.0, 10.0, 11, torch.float64),
-        (3.0, 10.0, 5, torch.float16),
-        (0.0, 100.0, 101, torch.complex64),
-        (1.0, 2.0, 2, torch.complex128),
-    ],
-)
-def test_linspace_symmetric_heap_shapes_dtypes(start, end, steps, dtype):
-    """Test that linspace returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test linspace with these parameters
-    result = shmem.linspace(start, end, steps, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), (
-        f"Tensor with start={start}, end={end}, steps={steps}, dtype={dtype} is NOT on symmetric heap!"
-    )
-
-    # Also verify basic functionality
-    assert result.shape == (steps,)
-    assert result.dtype == dtype
-    assert torch.allclose(result[0], torch.tensor(start, dtype=dtype))
-    assert torch.allclose(result[-1], torch.tensor(end, dtype=dtype))
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64, torch.complex64, torch.complex128])
-def test_linspace_symmetric_heap_dtype_override(dtype):
-    """Test that linspace with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.linspace(0.0, 1.0, 5, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_linspace_symmetric_heap_other_params():
-    """Test that linspace with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.linspace(0.0, 1.0, 5, dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.linspace(0.0, 1.0, 5, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.linspace(0.0, 1.0, 5, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(5, torch.float32)
-    result = shmem.linspace(0.0, 1.0, 5, out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_linspace_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(3, torch.float32)  # Wrong size for 5 steps
-    with pytest.raises(RuntimeError):
-        shmem.linspace(0.0, 1.0, 5, out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(5, torch.int32)  # Wrong dtype
-    with pytest.raises(RuntimeError):
-        shmem.linspace(0.0, 1.0, 5, dtype=torch.float32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.linspace(0.0, 1.0, 5, device="cuda")
-    with pytest.raises(RuntimeError):
-        shmem.linspace(0.0, 1.0, 5, out=regular_tensor)
-
-
-def test_linspace_default_dtype_behavior():
-    """Test that linspace uses the global default dtype when dtype=None."""
-    shmem = iris.iris(1 << 20)
-
-    # Save original default dtype
-    original_default = torch.get_default_dtype()
-
-    try:
-        # Test with float32 default
-        torch.set_default_dtype(torch.float32)
-        result1 = shmem.linspace(0.0, 1.0, 5)
-        assert result1.dtype == torch.float32
-
-        # Test with float64 default
-        torch.set_default_dtype(torch.float64)
-        result2 = shmem.linspace(0.0, 1.0, 5)
-        assert result2.dtype == torch.float64
-
-    finally:
-        # Restore original default dtype
-        torch.set_default_dtype(original_default)
-
-
-def test_linspace_steps_parsing():
-    """Test various ways of specifying steps."""
-    shmem = iris.iris(1 << 20)
-
-    # Test integer argument
-    result1 = shmem.linspace(0.0, 1.0, 5)
-    assert result1.shape == (5,)
-
-    # Test single tuple argument
-    result2 = shmem.linspace(0.0, 1.0, (5,))
-    assert result2.shape == (5,)
-
-    # Test single list argument
-    result3 = shmem.linspace(0.0, 1.0, [5])
-    assert result3.shape == (5,)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.linspace(0.0, 1.0, ((5,),))
-    assert result4.shape == (5,)
-
-    # All should produce the same result shape
-    assert result1.shape == result2.shape
-    assert result2.shape == result3.shape
-    assert result3.shape == result4.shape
-
-
-def test_linspace_complex_numbers():
-    """Test linspace with complex numbers."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with complex start and end
-    result = shmem.linspace(0.0 + 0.0j, 1.0 + 1.0j, 5, dtype=torch.complex64)
-    assert result.shape == (5,)
-    assert result.dtype == torch.complex64
-    assert torch.allclose(result[0], torch.tensor(0.0 + 0.0j, dtype=torch.complex64))
-    assert torch.allclose(result[-1], torch.tensor(1.0 + 1.0j, dtype=torch.complex64))
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with complex dtype inference
-    result = shmem.linspace(0.0 + 0.0j, 1.0 + 1.0j, 5)
-    assert result.dtype == torch.complex64  # Should infer complex dtype
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_linspace_tensor_inputs():
-    """Test linspace with tensor inputs."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with 0-dimensional tensor inputs
-    start_tensor = torch.tensor(0.0, device="cuda")
-    end_tensor = torch.tensor(1.0, device="cuda")
-
-    result = shmem.linspace(start_tensor, end_tensor, 5)
-    assert result.shape == (5,)
-    assert torch.allclose(result[0], torch.tensor(0.0))
-    assert torch.allclose(result[-1], torch.tensor(1.0))
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with complex tensor inputs
-    start_complex = torch.tensor(0.0 + 0.0j, device="cuda")
-    end_complex = torch.tensor(1.0 + 1.0j, device="cuda")
-
-    result_complex = shmem.linspace(start_complex, end_complex, 5)
-    assert result_complex.shape == (5,)
-    assert result_complex.dtype == torch.complex64
-    assert shmem._Iris__on_symmetric_heap(result_complex)
-
-
-def test_linspace_accuracy():
-    """Test that linspace produces accurate results."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with simple range
-    result = shmem.linspace(0.0, 1.0, 5)
-    expected = torch.tensor([0.0, 0.25, 0.5, 0.75, 1.0], device="cuda")
-    assert torch.allclose(result, expected, atol=1e-6)
-
-    # Test with negative range
-    result = shmem.linspace(-10.0, 10.0, 5)
-    expected = torch.tensor([-10.0, -5.0, 0.0, 5.0, 10.0], device="cuda")
-    assert torch.allclose(result, expected, atol=1e-6)
-
-    # Test with many steps
-    result = shmem.linspace(0.0, 1.0, 100)
-    assert result.shape == (100,)
-    assert torch.allclose(result[0], torch.tensor(0.0))
-    assert torch.allclose(result[-1], torch.tensor(1.0))
-    # Check that step size is correct
-    step_size = result[1] - result[0]
-    expected_step = 1.0 / 99.0  # (end - start) / (steps - 1)
-    assert torch.allclose(step_size, torch.tensor(expected_step), atol=1e-6)
-
-
-def test_linspace_deterministic_behavior():
-    """Test that linspace works with deterministic settings."""
-    shmem = iris.iris(1 << 20)
-
-    # Test that linspace works regardless of deterministic settings
-    result = shmem.linspace(0.0, 1.0, 5)
-    assert result.shape == (5,)
-    assert torch.allclose(result[0], torch.tensor(0.0))
-    assert torch.allclose(result[-1], torch.tensor(1.0))
-    assert shmem._Iris__on_symmetric_heap(result)
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_load_gluon.py b/tests/unittests/test_load_gluon.py
index adce4ce39..f3263c6ff 100644
--- a/tests/unittests/test_load_gluon.py
+++ b/tests/unittests/test_load_gluon.py
@@ -33,68 +33,7 @@ def load_kernel(
     gl.store(results + offsets, result, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_load_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    source_rank = shmem.get_rank()
-    partner = int((source_rank + num_ranks // 2) % num_ranks)
-
-    data = shmem.full((BLOCK_SIZE,), source_rank, dtype=dtype)
-    results = shmem.zeros_like(data)
-
-    shmem.barrier()
-
-    grid = (1,)
-    load_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        data,
-        results,
-        source_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Verify the result
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * partner
 
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
+pytestmark = pytest.mark.multi_rank_required
 
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_load_triton.py b/tests/unittests/test_load_triton.py
index b73dda3f3..144ac58bd 100644
--- a/tests/unittests/test_load_triton.py
+++ b/tests/unittests/test_load_triton.py
@@ -30,59 +30,7 @@ def load_kernel(
     tl.store(results + offsets, result, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_load_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    source_rank = shmem.get_rank()
-    partner = int((source_rank + num_ranks // 2) % num_ranks)
-
-    data = shmem.full((BLOCK_SIZE,), source_rank, dtype=dtype)
-    results = shmem.zeros_like(data)
-
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    load_kernel[grid](data, results, source_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # Verify the result
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * partner
 
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
+pytestmark = pytest.mark.multi_rank_required
 
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_ones.py b/tests/unittests/test_ones.py
index e70c63f88..a981e532d 100644
--- a/tests/unittests/test_ones.py
+++ b/tests/unittests/test_ones.py
@@ -6,420 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.int16,
-        torch.int32,
-        torch.int64,
-        torch.float16,
-        torch.float32,
-        torch.float64,
-        torch.bool,
-    ],
-)
-@pytest.mark.parametrize(
-    "size",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_ones_basic(dtype, size):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic ones
-    result = shmem.ones(*size, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == size
-    assert result.dtype == dtype
-
-    # Verify all values are one
-    assert torch.all(result == 1)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_ones_default_dtype():
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should use torch.get_default_dtype())
-    result = shmem.ones(2, 3)
-    expected_dtype = torch.get_default_dtype()
-    assert result.dtype == expected_dtype
-    assert torch.all(result == 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_ones_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.ones(2, 2, dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert torch.all(result == 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_ones_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.ones(3, 3)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.ones(3, 3, device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.ones(3, 3, device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert torch.all(result == 1)
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.ones(3, 3, device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.ones(3, 3, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.ones(3, 3, device=different_cuda)
-
-
-def test_ones_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.ones(2, 4, layout=torch.strided)
-    assert result.layout == torch.strided
-    assert torch.all(result == 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that unsupported layout throws error
-    with pytest.raises(ValueError):
-        shmem.ones(2, 4, layout=torch.sparse_coo)
-
-
-def test_ones_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(6, torch.float32)
-    result = shmem.ones(2, 3, out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert torch.all(result == 1)
-    assert result.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor
-    out_tensor_int = shmem._Iris__allocate(6, torch.int32)
-    result_int = shmem.ones(2, 3, dtype=torch.int32, out=out_tensor_int)
-    assert result_int.data_ptr() == out_tensor_int.data_ptr()
-    assert result_int.dtype == torch.int32
-    assert torch.all(result_int == 1)
-    assert shmem._Iris__on_symmetric_heap(result_int)
-
-
-def test_ones_size_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single dimension
-    result1 = shmem.ones(5)
-    assert result1.shape == (5,)
-    assert torch.all(result1 == 1)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple dimensions
-    result2 = shmem.ones(2, 3, 4)
-    assert result2.shape == (2, 3, 4)
-    assert torch.all(result2 == 1)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple as single argument
-    result3 = shmem.ones((3, 4))
-    assert result3.shape == (3, 4)
-    assert torch.all(result3 == 1)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as single argument
-    result4 = shmem.ones([2, 5])
-    assert result4.shape == (2, 5)
-    assert torch.all(result4 == 1)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_ones_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_result = shmem.ones(0)
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-
-    # Single element tensor
-    single_result = shmem.ones(1)
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert single_result[0] == 1
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Large tensor
-    large_result = shmem.ones(100, 100)
-    assert large_result.shape == (100, 100)
-    assert large_result.numel() == 10000
-    assert torch.all(large_result == 1)
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Zero-dimensional tensor (scalar)
-    scalar_result = shmem.ones(())
-    assert scalar_result.shape == ()
-    assert scalar_result.numel() == 1
-    assert scalar_result.item() == 1
-    assert shmem._Iris__on_symmetric_heap(scalar_result)
-
-
-def test_ones_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.ones(4, 3)
-    pytorch_result = torch.ones(4, 3, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.all(iris_result == pytorch_result)
-
-    # Test with explicit dtype
-    iris_result = shmem.ones(2, 2, dtype=torch.float64)
-    pytorch_result = torch.ones(2, 2, dtype=torch.float64, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.all(iris_result == pytorch_result)
-
-    # Test with requires_grad
-    iris_result = shmem.ones(3, 3, dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.ones(3, 3, dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-    assert torch.all(iris_result == pytorch_result)
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.int32},
-        {"dtype": torch.float16},
-        {"layout": torch.strided},
-        {},
-    ],
-)
-def test_ones_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.ones(3, 3, **params)
-
-    # Verify basic functionality
-    assert result.shape == (3, 3)
-    assert torch.all(result == 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "size,dtype",
-    [
-        ((1,), torch.float32),
-        ((5,), torch.int32),
-        ((2, 3), torch.float64),
-        ((3, 4, 5), torch.float16),
-        ((0,), torch.float32),  # Empty tensor
-        ((100, 100), torch.float32),  # Large tensor
-        ((), torch.float32),  # Scalar tensor
-    ],
-)
-def test_ones_symmetric_heap_shapes_dtypes(size, dtype):
-    """Test that ones returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test ones with this size and dtype
-    result = shmem.ones(*size, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with size {size}, dtype {dtype} is NOT on symmetric heap!"
-
-    # Also verify basic functionality
-    assert result.shape == size
-    assert result.dtype == dtype
-    assert torch.all(result == 1)
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64, torch.int32, torch.int64])
-def test_ones_symmetric_heap_dtype_override(dtype):
-    """Test that ones with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.ones(3, 3, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_ones_symmetric_heap_other_params():
-    """Test that ones with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.ones(3, 3, dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.ones(3, 3, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.ones(3, 3, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(9, torch.float32)
-    result = shmem.ones(3, 3, out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_ones_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(4, torch.float32)  # Wrong size for (3, 3)
-    with pytest.raises(RuntimeError):
-        shmem.ones(3, 3, out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(9, torch.int32)  # Wrong dtype
-    with pytest.raises(RuntimeError):
-        shmem.ones(3, 3, dtype=torch.float32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.ones(3, 3, device="cuda")
-    with pytest.raises(RuntimeError):
-        shmem.ones(3, 3, out=regular_tensor)
-
-
-def test_ones_default_dtype_behavior():
-    """Test that ones uses the global default dtype when dtype=None."""
-    shmem = iris.iris(1 << 20)
-
-    # Save original default dtype
-    original_default = torch.get_default_dtype()
-
-    try:
-        # Test with float32 default
-        torch.set_default_dtype(torch.float32)
-        result1 = shmem.ones(2, 2)
-        assert result1.dtype == torch.float32
-
-        # Test with float64 default
-        torch.set_default_dtype(torch.float64)
-        result2 = shmem.ones(2, 2)
-        assert result2.dtype == torch.float64
-
-    finally:
-        # Restore original default dtype
-        torch.set_default_dtype(original_default)
-
-
-def test_ones_size_parsing():
-    """Test various ways of specifying size."""
-    shmem = iris.iris(1 << 20)
-
-    # Test individual arguments
-    result1 = shmem.ones(2, 3, 4)
-    assert result1.shape == (2, 3, 4)
-
-    # Test single tuple argument
-    result2 = shmem.ones((2, 3, 4))
-    assert result2.shape == (2, 3, 4)
-
-    # Test single list argument
-    result3 = shmem.ones([2, 3, 4])
-    assert result3.shape == (2, 3, 4)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.ones(((2, 3, 4),))
-    assert result4.shape == (2, 3, 4)
-
-    # All should produce the same result
-    assert torch.all(result1 == result2)
-    assert torch.all(result2 == result3)
-    assert torch.all(result3 == result4)
-
-
-def test_ones_examples():
-    """Test the examples from PyTorch documentation."""
-    shmem = iris.iris(1 << 20)
-
-    # Example 1: torch.ones(2, 3)
-    result1 = shmem.ones(2, 3)
-    expected1 = torch.tensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]], device=result1.device)
-    assert result1.shape == (2, 3)
-    assert torch.all(result1 == expected1)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Example 2: torch.ones(5)
-    result2 = shmem.ones(5)
-    expected2 = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0], device=result2.device)
-    assert result2.shape == (5,)
-    assert torch.all(result2 == expected2)
-    assert shmem._Iris__on_symmetric_heap(result2)
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_put_gluon.py b/tests/unittests/test_put_gluon.py
index 6f1172602..1c59365c7 100644
--- a/tests/unittests/test_put_gluon.py
+++ b/tests/unittests/test_put_gluon.py
@@ -35,67 +35,7 @@ def put_kernel(
         ctx.put(data + offsets, results + offsets, target_rank, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_put_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    cur_rank = shmem.get_rank()
 
-    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
-    results = shmem.zeros_like(data)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    put_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        data,
-        results,
-        cur_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_put_triton.py b/tests/unittests/test_put_triton.py
index d953b42c6..987c1a8cd 100644
--- a/tests/unittests/test_put_triton.py
+++ b/tests/unittests/test_put_triton.py
@@ -32,58 +32,7 @@ def put_kernel(
         iris.put(data + offsets, results + offsets, cur_rank, target_rank, heap_bases, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_put_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    cur_rank = shmem.get_rank()
 
-    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
-    results = shmem.zeros_like(data)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    put_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # Verify the results
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_rand.py b/tests/unittests/test_rand.py
index 75b6968b0..a981e532d 100644
--- a/tests/unittests/test_rand.py
+++ b/tests/unittests/test_rand.py
@@ -6,478 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.float64,
-    ],
-)
-@pytest.mark.parametrize(
-    "size",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_rand_basic(dtype, size):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic rand
-    result = shmem.rand(*size, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == size
-    assert result.dtype == dtype
-
-    # Verify values are within range [0, 1)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 1)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_rand_default_dtype():
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should use torch.get_default_dtype())
-    result = shmem.rand(2, 3)
-    expected_dtype = torch.get_default_dtype()
-    assert result.dtype == expected_dtype
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_rand_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.rand(2, 2, dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_rand_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.rand(3, 3)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.rand(3, 3, device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.rand(3, 3, device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.rand(3, 3, device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.rand(3, 3, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.rand(3, 3, device=different_cuda)
-
-
-def test_rand_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.rand(2, 4, layout=torch.strided)
-    assert result.layout == torch.strided
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that unsupported layout throws error
-    with pytest.raises(ValueError):
-        shmem.rand(2, 4, layout=torch.sparse_coo)
-
-
-def test_rand_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(6, torch.float32)
-    result = shmem.rand(2, 3, out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert result.shape == (2, 3)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor
-    out_tensor_float64 = shmem._Iris__allocate(6, torch.float64)
-    result_float64 = shmem.rand(2, 3, dtype=torch.float64, out=out_tensor_float64)
-    assert result_float64.data_ptr() == out_tensor_float64.data_ptr()
-    assert result_float64.dtype == torch.float64
-    assert shmem._Iris__on_symmetric_heap(result_float64)
-
-
-def test_rand_size_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single dimension
-    result1 = shmem.rand(5)
-    assert result1.shape == (5,)
-    assert torch.all(result1 >= 0)
-    assert torch.all(result1 < 1)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple dimensions
-    result2 = shmem.rand(2, 3, 4)
-    assert result2.shape == (2, 3, 4)
-    assert torch.all(result2 >= 0)
-    assert torch.all(result2 < 1)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple as single argument
-    result3 = shmem.rand((3, 4))
-    assert result3.shape == (3, 4)
-    assert torch.all(result3 >= 0)
-    assert torch.all(result3 < 1)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as single argument
-    result4 = shmem.rand([2, 5])
-    assert result4.shape == (2, 5)
-    assert torch.all(result4 >= 0)
-    assert torch.all(result4 < 1)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_rand_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_result = shmem.rand(0)
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-
-    # Single element tensor
-    single_result = shmem.rand(1)
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert torch.all(single_result >= 0)
-    assert torch.all(single_result < 1)
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Large tensor
-    large_result = shmem.rand(50, 50)
-    assert large_result.shape == (50, 50)
-    assert large_result.numel() == 2500
-    assert torch.all(large_result >= 0)
-    assert torch.all(large_result < 1)
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Zero-dimensional tensor (scalar)
-    scalar_result = shmem.rand(())
-    assert scalar_result.shape == ()
-    assert scalar_result.numel() == 1
-    assert torch.all(scalar_result >= 0)
-    assert torch.all(scalar_result < 1)
-    assert shmem._Iris__on_symmetric_heap(scalar_result)
-
-
-def test_rand_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.rand(4, 3)
-    pytorch_result = torch.rand(4, 3, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with explicit dtype
-    iris_result = shmem.rand(2, 2, dtype=torch.float64)
-    pytorch_result = torch.rand(2, 2, dtype=torch.float64, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with requires_grad
-    iris_result = shmem.rand(3, 3, dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.rand(3, 3, dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.float16},
-        {"layout": torch.strided},
-        {},
-    ],
-)
-def test_rand_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.rand(3, 3, **params)
-
-    # Verify basic functionality
-    assert result.shape == (3, 3)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "size,dtype",
-    [
-        ((1,), torch.float32),
-        ((5,), torch.float64),
-        ((2, 3), torch.float16),
-        ((3, 4, 5), torch.float32),
-        ((0,), torch.float32),  # Empty tensor
-        ((50, 50), torch.float32),  # Large tensor
-        ((), torch.float32),  # Scalar tensor
-    ],
-)
-def test_rand_symmetric_heap_shapes_dtypes(size, dtype):
-    """Test that rand returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test rand with this size and dtype
-    result = shmem.rand(*size, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with size {size}, dtype {dtype} is NOT on symmetric heap!"
-
-    # Also verify basic functionality
-    assert result.shape == size
-    assert result.dtype == dtype
-    assert torch.all(result >= 0)
-    assert torch.all(result < 1)
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64])
-def test_rand_symmetric_heap_dtype_override(dtype):
-    """Test that rand with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.rand(3, 3, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_rand_symmetric_heap_other_params():
-    """Test that rand with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.rand(3, 3, dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.rand(3, 3, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.rand(3, 3, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(9, torch.float32)
-    result = shmem.rand(3, 3, out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_rand_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(4, torch.float32)  # Wrong size for (3, 3)
-    with pytest.raises(RuntimeError):
-        shmem.rand(3, 3, out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(9, torch.int32)  # Wrong dtype
-    with pytest.raises(RuntimeError):
-        shmem.rand(3, 3, dtype=torch.float32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.rand(3, 3, device="cuda")
-    with pytest.raises(RuntimeError):
-        shmem.rand(3, 3, out=regular_tensor)
-
-
-def test_rand_default_dtype_behavior():
-    """Test that rand uses the global default dtype when dtype=None."""
-    shmem = iris.iris(1 << 20)
-
-    # Save original default dtype
-    original_default = torch.get_default_dtype()
-
-    try:
-        # Test with float32 default
-        torch.set_default_dtype(torch.float32)
-        result1 = shmem.rand(2, 2)
-        assert result1.dtype == torch.float32
-
-        # Test with float64 default
-        torch.set_default_dtype(torch.float64)
-        result2 = shmem.rand(2, 2)
-        assert result2.dtype == torch.float64
-
-    finally:
-        # Restore original default dtype
-        torch.set_default_dtype(original_default)
-
-
-def test_rand_size_parsing():
-    """Test various ways of specifying size."""
-    shmem = iris.iris(1 << 20)
-
-    # Test individual arguments
-    result1 = shmem.rand(2, 3, 4)
-    assert result1.shape == (2, 3, 4)
-
-    # Test single tuple argument
-    result2 = shmem.rand((2, 3, 4))
-    assert result2.shape == (2, 3, 4)
-
-    # Test single list argument
-    result3 = shmem.rand([2, 3, 4])
-    assert result3.shape == (2, 3, 4)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.rand(((2, 3, 4),))
-    assert result4.shape == (2, 3, 4)
-
-    # All should produce the same result shape
-    assert result1.shape == result2.shape
-    assert result2.shape == result3.shape
-    assert result3.shape == result4.shape
-
-
-def test_rand_generator():
-    """Test generator parameter."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with generator
-    generator = torch.Generator(device="cuda")
-    generator.manual_seed(42)
-    result1 = shmem.rand(3, 3, generator=generator)
-    assert result1.shape == (3, 3)
-    assert torch.all(result1 >= 0)
-    assert torch.all(result1 < 1)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test without generator (should still work)
-    result2 = shmem.rand(3, 3)
-    assert result2.shape == (3, 3)
-    assert torch.all(result2 >= 0)
-    assert torch.all(result2 < 1)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test that generator produces reproducible results
-    generator1 = torch.Generator(device="cuda")
-    generator1.manual_seed(123)
-    result3 = shmem.rand(3, 3, generator=generator1)
-
-    generator2 = torch.Generator(device="cuda")
-    generator2.manual_seed(123)
-    result4 = shmem.rand(3, 3, generator=generator2)
-
-    # Results should be identical with same seed
-    assert torch.allclose(result3, result4)
-
-
-def test_rand_pin_memory():
-    """Test pin_memory parameter (should be ignored for Iris tensors)."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with pin_memory=True (should work but be ignored since Iris tensors are on GPU)
-    result = shmem.rand(2, 3, pin_memory=True)
-    assert result.shape == (2, 3)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 1)
-    assert shmem._Iris__on_symmetric_heap(result)
-    # Note: pin_memory is ignored for GPU tensors, so we just verify it doesn't cause errors
-
-
-def test_rand_distribution():
-    """Test that rand produces values in the correct range [0, 1)."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with reasonably sized tensor to get good statistical coverage
-    result = shmem.rand(100, 100)
-    assert result.shape == (100, 100)
-
-    # All values should be >= 0 and < 1
-    assert torch.all(result >= 0)
-    assert torch.all(result < 1)
-
-    # Check that we have some values close to 0 and close to 1
-    # (this is a statistical test, so we check for reasonable bounds)
-    min_val = torch.min(result).item()
-    max_val = torch.max(result).item()
-
-    # Should have some values close to 0
-    assert min_val < 0.1, f"Minimum value {min_val} is too high"
-    # Should have some values close to 1
-    assert max_val > 0.9, f"Maximum value {max_val} is too low"
-
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_rand_deterministic_behavior():
-    """Test that rand works with deterministic settings."""
-    shmem = iris.iris(1 << 20)
-
-    # Test that rand works regardless of deterministic settings
-    result = shmem.rand(2, 3)
-    assert result.shape == (2, 3)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 1)
-    assert shmem._Iris__on_symmetric_heap(result)
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_randint.py b/tests/unittests/test_randint.py
index a636be386..a981e532d 100644
--- a/tests/unittests/test_randint.py
+++ b/tests/unittests/test_randint.py
@@ -6,478 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.int16,
-        torch.int32,
-        torch.int64,
-        torch.uint8,
-    ],
-)
-@pytest.mark.parametrize(
-    "size",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_randint_basic(dtype, size):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic randint with low, high, size
-    result = shmem.randint(0, 10, size, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == size
-    assert result.dtype == dtype
-
-    # Verify values are within range [0, 10)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 10)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_randint_default_dtype():
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should use torch.int64)
-    result = shmem.randint(0, 10, (2, 3))
-    assert result.dtype == torch.int64
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_randint_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.randint(0, 10, (2, 2), dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_randint_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.randint(0, 10, (3, 3))
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.randint(0, 10, (3, 3), device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.randint(0, 10, (3, 3), device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.randint(0, 10, (3, 3), device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.randint(0, 10, (3, 3), device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.randint(0, 10, (3, 3), device=different_cuda)
-
-
-def test_randint_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.randint(0, 10, (2, 4), layout=torch.strided)
-    assert result.layout == torch.strided
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that unsupported layout throws error
-    with pytest.raises(ValueError):
-        shmem.randint(0, 10, (2, 4), layout=torch.sparse_coo)
-
-
-def test_randint_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(6, torch.int64)
-    result = shmem.randint(0, 10, (2, 3), out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert result.shape == (2, 3)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 10)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with explicit dtype
-    out_tensor_int32 = shmem._Iris__allocate(6, torch.int32)
-    result_int32 = shmem.randint(0, 10, (2, 3), dtype=torch.int32, out=out_tensor_int32)
-    assert result_int32.data_ptr() == out_tensor_int32.data_ptr()
-    assert result_int32.dtype == torch.int32
-    assert shmem._Iris__on_symmetric_heap(result_int32)
-
-
-def test_randint_size_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single dimension
-    result1 = shmem.randint(0, 5, (5,))
-    assert result1.shape == (5,)
-    assert torch.all(result1 >= 0)
-    assert torch.all(result1 < 5)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple dimensions
-    result2 = shmem.randint(0, 10, (2, 3, 4))
-    assert result2.shape == (2, 3, 4)
-    assert torch.all(result2 >= 0)
-    assert torch.all(result2 < 10)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple as single argument
-    result3 = shmem.randint(0, 10, (3, 4))
-    assert result3.shape == (3, 4)
-    assert torch.all(result3 >= 0)
-    assert torch.all(result3 < 10)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as single argument
-    result4 = shmem.randint(0, 10, [2, 5])
-    assert result4.shape == (2, 5)
-    assert torch.all(result4 >= 0)
-    assert torch.all(result4 < 10)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_randint_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_result = shmem.randint(0, 5, (0,))
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-
-    # Single element tensor
-    single_result = shmem.randint(0, 10, (1,))
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert torch.all(single_result >= 0)
-    assert torch.all(single_result < 10)
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Large tensor
-    large_result = shmem.randint(0, 100, (100, 100))
-    assert large_result.shape == (100, 100)
-    assert large_result.numel() == 10000
-    assert torch.all(large_result >= 0)
-    assert torch.all(large_result < 100)
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Zero-dimensional tensor (scalar)
-    scalar_result = shmem.randint(0, 10, ())
-    assert scalar_result.shape == ()
-    assert scalar_result.numel() == 1
-    assert torch.all(scalar_result >= 0)
-    assert torch.all(scalar_result < 10)
-    assert shmem._Iris__on_symmetric_heap(scalar_result)
-
-
-def test_randint_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.randint(0, 10, (4, 3))
-    pytorch_result = torch.randint(0, 10, (4, 3), device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with explicit dtype
-    iris_result = shmem.randint(0, 10, (2, 2), dtype=torch.int32)
-    pytorch_result = torch.randint(0, 10, (2, 2), dtype=torch.int32, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with requires_grad
-    iris_result = shmem.randint(0, 10, (3, 3), dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.randint(0, 10, (3, 3), dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.int64, "requires_grad": False},
-        {"dtype": torch.int8},
-        {"dtype": torch.uint8},
-        {"layout": torch.strided},
-        {},
-    ],
-)
-def test_randint_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.randint(0, 10, (3, 3), **params)
-
-    # Verify basic functionality
-    assert result.shape == (3, 3)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 10)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "size,dtype",
-    [
-        ((1,), torch.int32),
-        ((5,), torch.int64),
-        ((2, 3), torch.int8),
-        ((3, 4, 5), torch.uint8),
-        ((0,), torch.int32),  # Empty tensor
-        ((100, 100), torch.int32),  # Large tensor
-        ((), torch.int32),  # Scalar tensor
-    ],
-)
-def test_randint_symmetric_heap_shapes_dtypes(size, dtype):
-    """Test that randint returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test randint with this size and dtype
-    result = shmem.randint(0, 10, size, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with size {size}, dtype {dtype} is NOT on symmetric heap!"
-
-    # Also verify basic functionality
-    assert result.shape == size
-    assert result.dtype == dtype
-    assert torch.all(result >= 0)
-    assert torch.all(result < 10)
-
-
-@pytest.mark.parametrize("dtype", [torch.int8, torch.int16, torch.int32, torch.int64, torch.uint8])
-def test_randint_symmetric_heap_dtype_override(dtype):
-    """Test that randint with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.randint(0, 10, (3, 3), dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_randint_symmetric_heap_other_params():
-    """Test that randint with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.randint(0, 10, (3, 3), dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.randint(0, 10, (3, 3), device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.randint(0, 10, (3, 3), layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(9, torch.int64)  # Use default dtype
-    result = shmem.randint(0, 10, (3, 3), out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_randint_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(4, torch.int32)  # Wrong size for (3, 3)
-    with pytest.raises(RuntimeError):
-        shmem.randint(0, 10, (3, 3), out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(9, torch.float32)  # Wrong dtype
-    with pytest.raises(RuntimeError):
-        shmem.randint(0, 10, (3, 3), dtype=torch.int32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.randint(0, 10, (3, 3), device="cuda")
-    with pytest.raises(RuntimeError):
-        shmem.randint(0, 10, (3, 3), out=regular_tensor)
-
-
-def test_randint_default_dtype_behavior():
-    """Test that randint uses torch.int64 when dtype=None."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should be torch.int64)
-    result = shmem.randint(0, 10, (2, 2))
-    assert result.dtype == torch.int64
-
-
-def test_randint_size_parsing():
-    """Test various ways of specifying size."""
-    shmem = iris.iris(1 << 20)
-
-    # Test individual arguments
-    result1 = shmem.randint(0, 10, (2, 3, 4))
-    assert result1.shape == (2, 3, 4)
-
-    # Test single tuple argument
-    result2 = shmem.randint(0, 10, (2, 3, 4))
-    assert result2.shape == (2, 3, 4)
-
-    # Test single list argument
-    result3 = shmem.randint(0, 10, [2, 3, 4])
-    assert result3.shape == (2, 3, 4)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.randint(0, 10, ((2, 3, 4),))
-    assert result4.shape == (2, 3, 4)
-
-    # All should produce the same result shape
-    assert result1.shape == result2.shape
-    assert result2.shape == result3.shape
-    assert result3.shape == result4.shape
-
-
-def test_randint_generator():
-    """Test generator parameter."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with generator
-    generator = torch.Generator(device="cuda")
-    generator.manual_seed(42)
-    result1 = shmem.randint(0, 10, (3, 3), generator=generator)
-    assert result1.shape == (3, 3)
-    assert torch.all(result1 >= 0)
-    assert torch.all(result1 < 10)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test without generator (should still work)
-    result2 = shmem.randint(0, 10, (3, 3))
-    assert result2.shape == (3, 3)
-    assert torch.all(result2 >= 0)
-    assert torch.all(result2 < 10)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-
-def test_randint_argument_validation():
-    """Test argument validation."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong number of arguments
-    with pytest.raises(ValueError):
-        shmem.randint(10)  # Missing size
-
-    with pytest.raises(ValueError):
-        shmem.randint(0, 10, (2, 3), (4, 5))  # Too many arguments
-
-    # Test with invalid range (should throw error)
-    with pytest.raises(RuntimeError):
-        shmem.randint(10, 5, (2, 3))  # low > high should throw error
-
-
-def test_randint_range_validation():
-    """Test that randint respects the range [low, high)."""
-    shmem = iris.iris(1 << 20)
-
-    # Test positive range
-    result = shmem.randint(5, 15, (100,))
-    assert torch.all(result >= 5)
-    assert torch.all(result < 15)
-
-    # Test negative range
-    result = shmem.randint(-10, -5, (100,))
-    assert torch.all(result >= -10)
-    assert torch.all(result < -5)
-
-    # Test zero range
-    result = shmem.randint(0, 1, (100,))
-    assert torch.all(result == 0)
-
-    # Test single value range
-    result = shmem.randint(42, 43, (100,))
-    assert torch.all(result == 42)
-
-
-def test_randint_pytorch_signatures():
-    """Test that randint supports both PyTorch signatures."""
-    shmem = iris.iris(1 << 20)
-
-    # Test randint(high, size) signature
-    result1 = shmem.randint(10, (2, 3))
-    assert result1.shape == (2, 3)
-    assert torch.all(result1 >= 0)
-    assert torch.all(result1 < 10)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test randint(low, high, size) signature
-    result2 = shmem.randint(5, 15, (2, 3))
-    assert result2.shape == (2, 3)
-    assert torch.all(result2 >= 5)
-    assert torch.all(result2 < 15)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Both should work correctly
-    assert result1.shape == result2.shape
-    assert result1.dtype == result2.dtype
-
-
-def test_randint_deterministic_behavior():
-    """Test that randint works with deterministic settings."""
-    shmem = iris.iris(1 << 20)
-
-    # Test that randint works regardless of deterministic settings
-    result = shmem.randint(0, 10, (2, 3))
-    assert result.shape == (2, 3)
-    assert torch.all(result >= 0)
-    assert torch.all(result < 10)
-    assert shmem._Iris__on_symmetric_heap(result)
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_randn.py b/tests/unittests/test_randn.py
index cb20ec9a9..a981e532d 100644
--- a/tests/unittests/test_randn.py
+++ b/tests/unittests/test_randn.py
@@ -6,443 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.float32,
-        torch.float64,
-    ],
-)
-@pytest.mark.parametrize(
-    "size",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_randn_basic(dtype, size):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic randn
-    result = shmem.randn(*size, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == size
-    assert result.dtype == dtype
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_randn_default_dtype():
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should use torch.get_default_dtype())
-    result = shmem.randn(2, 3)
-    expected_dtype = torch.get_default_dtype()
-    assert result.dtype == expected_dtype
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_randn_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.randn(2, 2, dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_randn_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.randn(3, 3)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.randn(3, 3, device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.randn(3, 3, device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.randn(3, 3, device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.randn(3, 3, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        num_devices = torch.cuda.device_count()
-        different_cuda = f"cuda:{(current_device.index + 1) % num_devices}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.randn(3, 3, device=different_cuda)
-
-
-def test_randn_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.randn(2, 4, layout=torch.strided)
-    assert result.layout == torch.strided
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_randn_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(6, torch.float32)
-    result = shmem.randn(2, 3, out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert result.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor (float32)
-    out_tensor_float = shmem._Iris__allocate(6, torch.float32)
-    result_float = shmem.randn(2, 3, dtype=torch.float32, out=out_tensor_float)
-    assert result_float.data_ptr() == out_tensor_float.data_ptr()
-    assert result_float.dtype == torch.float32
-    assert shmem._Iris__on_symmetric_heap(result_float)
-
-
-def test_randn_size_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single dimension
-    result1 = shmem.randn(5)
-    assert result1.shape == (5,)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple dimensions
-    result2 = shmem.randn(2, 3, 4)
-    assert result2.shape == (2, 3, 4)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple as single argument
-    result3 = shmem.randn((3, 4))
-    assert result3.shape == (3, 4)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as single argument
-    result4 = shmem.randn([2, 5])
-    assert result4.shape == (2, 5)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_randn_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_result = shmem.randn(0)
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-
-    # Single element tensor
-    single_result = shmem.randn(1)
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Large tensor
-    large_result = shmem.randn(50, 50)
-    assert large_result.shape == (50, 50)
-    assert large_result.numel() == 2500
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Zero-dimensional tensor (scalar)
-    scalar_result = shmem.randn(())
-    assert scalar_result.shape == ()
-    assert scalar_result.numel() == 1
-    assert shmem._Iris__on_symmetric_heap(scalar_result)
-
-
-def test_randn_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.randn(4, 3)
-    pytorch_result = torch.randn(4, 3, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with explicit dtype
-    iris_result = shmem.randn(2, 2, dtype=torch.float64)
-    pytorch_result = torch.randn(2, 2, dtype=torch.float64, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Test with requires_grad
-    iris_result = shmem.randn(3, 3, dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.randn(3, 3, dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.float16},
-        {"layout": torch.strided},
-        {},
-    ],
-)
-def test_randn_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.randn(3, 3, **params)
-
-    # Verify basic functionality
-    assert result.shape == (3, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "size,dtype",
-    [
-        ((1,), torch.float32),
-        ((5,), torch.float64),
-        ((2, 3), torch.float32),
-        ((3, 4, 5), torch.float16),
-        ((0,), torch.float32),  # Empty tensor
-        ((100, 100), torch.float32),  # Large tensor
-        ((), torch.float32),  # Scalar tensor
-    ],
-)
-def test_randn_symmetric_heap_shapes_dtypes(size, dtype):
-    """Test that randn returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test randn with this size and dtype
-    result = shmem.randn(*size, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with size {size}, dtype {dtype} is NOT on symmetric heap!"
-
-    # Also verify basic functionality
-    assert result.shape == size
-    assert result.dtype == dtype
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64])
-def test_randn_symmetric_heap_dtype_override(dtype):
-    """Test that randn with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.randn(3, 3, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_randn_symmetric_heap_other_params():
-    """Test that randn with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.randn(3, 3, dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.randn(3, 3, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.randn(3, 3, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(9, torch.float32)
-    result = shmem.randn(3, 3, out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_randn_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(4, torch.float32)  # Wrong size for (3, 3)
-    with pytest.raises(RuntimeError):
-        shmem.randn(3, 3, out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(9, torch.float64)  # Wrong dtype
-    with pytest.raises(RuntimeError):
-        shmem.randn(3, 3, dtype=torch.float32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.randn(3, 3, device="cuda")
-    with pytest.raises(RuntimeError):
-        shmem.randn(3, 3, out=regular_tensor)
-
-
-def test_randn_default_dtype_behavior():
-    """Test that randn uses the global default dtype when dtype=None."""
-    shmem = iris.iris(1 << 20)
-
-    # Save original default dtype
-    original_default = torch.get_default_dtype()
-
-    try:
-        # Test with float32 default
-        torch.set_default_dtype(torch.float32)
-        result1 = shmem.randn(2, 2)
-        assert result1.dtype == torch.float32
-
-        # Test with float64 default
-        torch.set_default_dtype(torch.float64)
-        result2 = shmem.randn(2, 2)
-        assert result2.dtype == torch.float64
-
-    finally:
-        # Restore original default dtype
-        torch.set_default_dtype(original_default)
-
-
-def test_randn_size_parsing():
-    """Test various ways of specifying size."""
-    shmem = iris.iris(1 << 20)
-
-    # Test individual arguments
-    result1 = shmem.randn(2, 3, 4)
-    assert result1.shape == (2, 3, 4)
-
-    # Test single tuple argument
-    result2 = shmem.randn((2, 3, 4))
-    assert result2.shape == (2, 3, 4)
-
-    # Test single list argument
-    result3 = shmem.randn([2, 3, 4])
-    assert result3.shape == (2, 3, 4)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.randn(((2, 3, 4),))
-    assert result4.shape == (2, 3, 4)
-
-    # All should produce the same shape
-    assert result1.shape == result2.shape
-    assert result2.shape == result3.shape
-    assert result3.shape == result4.shape
-
-
-def test_randn_generator():
-    """Test generator parameter."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with generator
-    generator = torch.Generator(device="cuda")
-    generator.manual_seed(42)
-    result1 = shmem.randn(3, 3, generator=generator)
-    assert result1.shape == (3, 3)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test without generator (should still work)
-    result2 = shmem.randn(3, 3)
-    assert result2.shape == (3, 3)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test that generator produces reproducible results
-    generator1 = torch.Generator(device="cuda")
-    generator1.manual_seed(123)
-    result3 = shmem.randn(3, 3, generator=generator1)
-
-    generator2 = torch.Generator(device="cuda")
-    generator2.manual_seed(123)
-    result4 = shmem.randn(3, 3, generator=generator2)
-
-    # Results should be identical with same seed
-    assert torch.allclose(result3, result4)
-
-
-def test_randn_pin_memory():
-    """Test pin_memory parameter (should be ignored for Iris tensors)."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with pin_memory=True
-    result = shmem.randn(3, 3, pin_memory=True)
-    assert result.shape == (3, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with pin_memory=False
-    result = shmem.randn(3, 3, pin_memory=False)
-    assert result.shape == (3, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Note: pin_memory is ignored for GPU tensors, so we just verify it doesn't cause errors
-
-
-def test_randn_deterministic_behavior():
-    """Test that randn works with deterministic settings."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with deterministic mode
-    torch.use_deterministic_algorithms(True)
-    try:
-        result = shmem.randn(3, 3)
-        assert result.shape == (3, 3)
-        assert shmem._Iris__on_symmetric_heap(result)
-    finally:
-        torch.use_deterministic_algorithms(False)
-
-
-def test_randn_examples():
-    """Test the examples from PyTorch documentation."""
-    shmem = iris.iris(1 << 20)
-
-    # Example 1: torch.randn(4)
-    result1 = shmem.randn(4)
-    assert result1.shape == (4,)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Example 2: torch.randn(2, 3)
-    result2 = shmem.randn(2, 3)
-    assert result2.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result2)
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_store_gluon.py b/tests/unittests/test_store_gluon.py
index 81ff3c608..6d34e9afe 100644
--- a/tests/unittests/test_store_gluon.py
+++ b/tests/unittests/test_store_gluon.py
@@ -36,67 +36,7 @@ def store_kernel(
         ctx.store(results + offsets, value, dst_rank, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_store_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris_gl.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    context_tensor = shmem.get_device_context()
-    destination_rank = shmem.get_rank()
 
-    src = shmem.ones(BLOCK_SIZE, dtype=dtype)
-    results = shmem.zeros_like(src)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = (1,)
-    store_kernel[grid](
-        iris_gl.IrisDeviceCtx,
-        context_tensor,
-        src,
-        results,
-        destination_rank,
-        num_ranks,
-        BLOCK_SIZE,
-        num_warps=1,
-    )
-    shmem.barrier()
-
-    # Verify the result
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_store_triton.py b/tests/unittests/test_store_triton.py
index 0632180c8..66e4fa35c 100644
--- a/tests/unittests/test_store_triton.py
+++ b/tests/unittests/test_store_triton.py
@@ -33,58 +33,7 @@ def store_kernel(
         iris.store(results + offsets, value, destination_rank, dst_rank, heap_bases, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.float16,
-        torch.bfloat16,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "BLOCK_SIZE",
-    [
-        1,
-        8,
-        16,
-        32,
-    ],
-)
-def test_store_api(dtype, BLOCK_SIZE):
-    # TODO: Adjust heap size.
-    shmem = iris.iris(1 << 20)
-    num_ranks = shmem.get_num_ranks()
-    heap_bases = shmem.get_heap_bases()
-    destination_rank = shmem.get_rank()
 
-    src = shmem.ones(BLOCK_SIZE, dtype=dtype)
-    results = shmem.zeros_like(src)
+pytestmark = pytest.mark.multi_rank_required
 
-    shmem.barrier()
-
-    grid = lambda meta: (1,)
-    store_kernel[grid](src, results, destination_rank, num_ranks, BLOCK_SIZE, heap_bases)
-    shmem.barrier()
-
-    # Verify the result
-    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
-
-    try:
-        torch.testing.assert_close(results, expected, rtol=0, atol=0)
-    except AssertionError as e:
-        print(e)
-        print("Expected:", expected)
-        print("Actual:", results)
-        raise
-    finally:
-        # Final barrier to ensure all ranks complete before test cleanup
-        # This helps with test isolation when running multiple tests
-        # Note: shmem.barrier() already does cuda.synchronize()
-        shmem.barrier()
-        # Explicitly delete the shmem instance to trigger cleanup
-        del shmem
-        # Force garbage collection to ensure IPC handles are cleaned up
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_zeros.py b/tests/unittests/test_zeros.py
index 51126fed3..a981e532d 100644
--- a/tests/unittests/test_zeros.py
+++ b/tests/unittests/test_zeros.py
@@ -6,401 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.int16,
-        torch.int32,
-        torch.int64,
-        torch.float16,
-        torch.float32,
-        torch.float64,
-        torch.bool,
-    ],
-)
-@pytest.mark.parametrize(
-    "size",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_zeros_basic(dtype, size):
-    shmem = iris.iris(1 << 20)
 
-    # Test basic zeros
-    result = shmem.zeros(*size, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Verify shape matches
-    assert result.shape == size
-    assert result.dtype == dtype
-
-    # Verify all values are zero
-    assert torch.all(result == 0)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_zeros_default_dtype():
-    shmem = iris.iris(1 << 20)
-
-    # Test with default dtype (should use torch.get_default_dtype())
-    result = shmem.zeros(2, 3)
-    expected_dtype = torch.get_default_dtype()
-    assert result.dtype == expected_dtype
-    assert torch.all(result == 0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_zeros_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad parameter
-    result = shmem.zeros(2, 2, dtype=torch.float32, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert torch.all(result == 0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-def test_zeros_device_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test default behavior (should use Iris device)
-    result = shmem.zeros(3, 3)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test explicit device
-    result = shmem.zeros(3, 3, device=shmem.device)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.zeros(3, 3, device="cuda")
-        assert str(result.device) == str(shmem.get_device())
-        assert torch.all(result == 0)
-        assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test None device defaults to Iris device
-    result = shmem.zeros(3, 3, device=None)
-    assert str(result.device) == str(shmem.get_device())
-    assert torch.all(result == 0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.zeros(3, 3, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.zeros(3, 3, device=different_cuda)
-
-
-def test_zeros_layout_handling():
-    shmem = iris.iris(1 << 20)
-
-    # Test with strided layout (default)
-    result = shmem.zeros(2, 4, layout=torch.strided)
-    assert result.layout == torch.strided
-    assert torch.all(result == 0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test that unsupported layout throws error
-    with pytest.raises(ValueError):
-        shmem.zeros(2, 4, layout=torch.sparse_coo)
-
-
-def test_zeros_out_parameter():
-    shmem = iris.iris(1 << 20)
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(6, torch.float32)
-    result = shmem.zeros(2, 3, out=out_tensor)
-
-    # Should share the same underlying data (same data_ptr)
-    assert result.data_ptr() == out_tensor.data_ptr()
-    assert torch.all(result == 0)
-    assert result.shape == (2, 3)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Test with different dtype out tensor
-    out_tensor_int = shmem._Iris__allocate(6, torch.int32)
-    result_int = shmem.zeros(2, 3, dtype=torch.int32, out=out_tensor_int)
-    assert result_int.data_ptr() == out_tensor_int.data_ptr()
-    assert result_int.dtype == torch.int32
-    assert torch.all(result_int == 0)
-    assert shmem._Iris__on_symmetric_heap(result_int)
-
-
-def test_zeros_size_variations():
-    shmem = iris.iris(1 << 20)
-
-    # Test single dimension
-    result1 = shmem.zeros(5)
-    assert result1.shape == (5,)
-    assert torch.all(result1 == 0)
-    assert shmem._Iris__on_symmetric_heap(result1)
-
-    # Test multiple dimensions
-    result2 = shmem.zeros(2, 3, 4)
-    assert result2.shape == (2, 3, 4)
-    assert torch.all(result2 == 0)
-    assert shmem._Iris__on_symmetric_heap(result2)
-
-    # Test with tuple/list as single argument
-    result3 = shmem.zeros((3, 4))
-    assert result3.shape == (3, 4)
-    assert torch.all(result3 == 0)
-    assert shmem._Iris__on_symmetric_heap(result3)
-
-    # Test with list as single argument
-    result4 = shmem.zeros([2, 5])
-    assert result4.shape == (2, 5)
-    assert torch.all(result4 == 0)
-    assert shmem._Iris__on_symmetric_heap(result4)
-
-
-def test_zeros_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_result = shmem.zeros(0)
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-
-    # Single element tensor
-    single_result = shmem.zeros(1)
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert single_result[0] == 0
-    assert shmem._Iris__on_symmetric_heap(single_result)
-
-    # Large tensor
-    large_result = shmem.zeros(100, 100)
-    assert large_result.shape == (100, 100)
-    assert large_result.numel() == 10000
-    assert torch.all(large_result == 0)
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-    # Zero-dimensional tensor (scalar)
-    scalar_result = shmem.zeros(())
-    assert scalar_result.shape == ()
-    assert scalar_result.numel() == 1
-    assert scalar_result.item() == 0
-    assert shmem._Iris__on_symmetric_heap(scalar_result)
-
-
-def test_zeros_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Test basic equivalence
-    iris_result = shmem.zeros(4, 3)
-    pytorch_result = torch.zeros(4, 3, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.all(iris_result == pytorch_result)
-
-    # Test with explicit dtype
-    iris_result = shmem.zeros(2, 2, dtype=torch.float64)
-    pytorch_result = torch.zeros(2, 2, dtype=torch.float64, device="cuda")
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert torch.all(iris_result == pytorch_result)
-
-    # Test with requires_grad
-    iris_result = shmem.zeros(3, 3, dtype=torch.float32, requires_grad=True)
-    pytorch_result = torch.zeros(3, 3, dtype=torch.float32, device="cuda", requires_grad=True)
-
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-    assert iris_result.requires_grad == pytorch_result.requires_grad
-    assert torch.all(iris_result == pytorch_result)
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.int32},
-        {"dtype": torch.float16},
-        {"layout": torch.strided},
-        {},
-    ],
-)
-def test_zeros_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Test various combinations of parameters
-    result = shmem.zeros(3, 3, **params)
-
-    # Verify basic functionality
-    assert result.shape == (3, 3)
-    assert torch.all(result == 0)
-    assert shmem._Iris__on_symmetric_heap(result)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify layout if specified
-    if "layout" in params:
-        assert result.layout == params["layout"]
-
-
-@pytest.mark.parametrize(
-    "size,dtype",
-    [
-        ((1,), torch.float32),
-        ((5,), torch.int32),
-        ((2, 3), torch.float64),
-        ((3, 4, 5), torch.float16),
-        ((0,), torch.float32),  # Empty tensor
-        ((100, 100), torch.float32),  # Large tensor
-        ((), torch.float32),  # Scalar tensor
-    ],
-)
-def test_zeros_symmetric_heap_shapes_dtypes(size, dtype):
-    """Test that zeros returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Test zeros with this size and dtype
-    result = shmem.zeros(*size, dtype=dtype)
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with size {size}, dtype {dtype} is NOT on symmetric heap!"
-
-    # Also verify basic functionality
-    assert result.shape == size
-    assert result.dtype == dtype
-    assert torch.all(result == 0)
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64, torch.int32, torch.int64])
-def test_zeros_symmetric_heap_dtype_override(dtype):
-    """Test that zeros with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    result = shmem.zeros(3, 3, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_zeros_symmetric_heap_other_params():
-    """Test that zeros with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with requires_grad
-    result = shmem.zeros(3, 3, dtype=torch.float32, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.zeros(3, 3, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.zeros(3, 3, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
-
-    # Test with out parameter
-    out_tensor = shmem._Iris__allocate(9, torch.float32)
-    result = shmem.zeros(3, 3, out=out_tensor)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with out parameter is NOT on symmetric heap!"
-
-
-def test_zeros_invalid_output_tensor():
-    """Test error handling for invalid output tensors."""
-    shmem = iris.iris(1 << 20)
-
-    # Test with wrong size output tensor
-    wrong_size_tensor = shmem._Iris__allocate(4, torch.float32)  # Wrong size for (3, 3)
-    with pytest.raises(RuntimeError, match="The output tensor has 4 elements, but 9 are required"):
-        shmem.zeros(3, 3, out=wrong_size_tensor)
-
-    # Test with wrong dtype output tensor
-    wrong_dtype_tensor = shmem._Iris__allocate(9, torch.int32)  # Wrong dtype
-    with pytest.raises(RuntimeError, match="The output tensor has dtype torch.int32, but torch.float32 is required"):
-        shmem.zeros(3, 3, dtype=torch.float32, out=wrong_dtype_tensor)
-
-    # Test with tensor not on symmetric heap (create a regular PyTorch tensor)
-    regular_tensor = torch.zeros(3, 3, device="cuda")
-    with pytest.raises(RuntimeError, match="The output tensor is not on the symmetric heap"):
-        shmem.zeros(3, 3, out=regular_tensor)
-
-
-def test_zeros_default_dtype_behavior():
-    """Test that zeros uses the global default dtype when dtype=None."""
-    shmem = iris.iris(1 << 20)
-
-    # Save original default dtype
-    original_default = torch.get_default_dtype()
-
-    try:
-        # Test with float32 default
-        torch.set_default_dtype(torch.float32)
-        result1 = shmem.zeros(2, 2)
-        assert result1.dtype == torch.float32
-
-        # Test with float64 default
-        torch.set_default_dtype(torch.float64)
-        result2 = shmem.zeros(2, 2)
-        assert result2.dtype == torch.float64
-
-    finally:
-        # Restore original default dtype
-        torch.set_default_dtype(original_default)
-
-
-def test_zeros_size_parsing():
-    """Test various ways of specifying size."""
-    shmem = iris.iris(1 << 20)
-
-    # Test individual arguments
-    result1 = shmem.zeros(2, 3, 4)
-    assert result1.shape == (2, 3, 4)
-
-    # Test single tuple argument
-    result2 = shmem.zeros((2, 3, 4))
-    assert result2.shape == (2, 3, 4)
-
-    # Test single list argument
-    result3 = shmem.zeros([2, 3, 4])
-    assert result3.shape == (2, 3, 4)
-
-    # Test nested tuple (should be flattened)
-    result4 = shmem.zeros(((2, 3, 4),))
-    assert result4.shape == (2, 3, 4)
-
-    # All should produce the same result
-    assert torch.all(result1 == result2)
-    assert torch.all(result2 == result3)
-    assert torch.all(result3 == result4)
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/unittests/test_zeros_like.py b/tests/unittests/test_zeros_like.py
index b7a0ff0c0..a981e532d 100644
--- a/tests/unittests/test_zeros_like.py
+++ b/tests/unittests/test_zeros_like.py
@@ -6,457 +6,7 @@
 import iris
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.int8,
-        torch.int16,
-        torch.int32,
-        torch.int64,
-        torch.float16,
-        torch.float32,
-        torch.float64,
-        torch.bool,
-    ],
-)
-@pytest.mark.parametrize(
-    "shape",
-    [
-        (1,),
-        (5,),
-        (2, 3),
-        (3, 4, 5),
-        (1, 1, 1),
-        (10, 20),
-    ],
-)
-def test_zeros_like_basic(dtype, shape):
-    shmem = iris.iris(1 << 20)
 
-    # Create input tensor with various shapes and dtypes
-    input_tensor = shmem.full(shape, 5, dtype=dtype)
+pytestmark = pytest.mark.single_rank
 
-    # Test basic zeros_like
-    result = shmem.zeros_like(input_tensor)
-
-    # Verify shape matches
-    assert result.shape == input_tensor.shape
-    assert result.dtype == input_tensor.dtype
-
-    # Verify all values are zero
-    assert torch.all(result == 0)
-
-
-@pytest.mark.parametrize(
-    "input_dtype",
-    [
-        torch.int32,
-        torch.float32,
-    ],
-)
-@pytest.mark.parametrize(
-    "output_dtype",
-    [
-        torch.float32,
-        torch.float64,
-        torch.int64,
-    ],
-)
-def test_zeros_like_dtype_override(input_dtype, output_dtype):
-    shmem = iris.iris(1 << 20)
-
-    input_tensor = shmem.full((2, 3), 10, dtype=input_dtype)
-
-    # Override dtype
-    result = shmem.zeros_like(input_tensor, dtype=output_dtype)
-
-    # Verify dtype is overridden
-    assert result.dtype == output_dtype
-    assert result.shape == input_tensor.shape
-    assert torch.all(result == 0)
-
-
-@pytest.mark.parametrize(
-    "requires_grad",
-    [
-        True,
-        False,
-    ],
-)
-def test_zeros_like_requires_grad(requires_grad):
-    shmem = iris.iris(1 << 20)
-
-    input_tensor = shmem.full((2, 2), 1, dtype=torch.float32)
-
-    # Test with requires_grad parameter
-    result = shmem.zeros_like(input_tensor, requires_grad=requires_grad)
-
-    # Verify requires_grad is set
-    assert result.requires_grad == requires_grad
-    assert torch.all(result == 0)
-
-
-def test_zeros_like_device_override():
-    shmem = iris.iris(1 << 20)
-    input_tensor = shmem.full((3, 3), 2, dtype=torch.float32)
-
-    # Test default behavior
-    result = shmem.zeros_like(input_tensor)
-    assert str(result.device) == str(input_tensor.device)
-    assert torch.all(result == 0)
-
-    # Test same device works
-    result = shmem.zeros_like(input_tensor, device=shmem.device)
-    assert str(result.device) == shmem.device
-    assert torch.all(result == 0)
-
-    # Test that "cuda" shorthand works (should use current CUDA device)
-    if shmem.device.startswith("cuda:"):
-        result = shmem.zeros_like(input_tensor, device="cuda")
-        assert str(result.device) == shmem.device
-        assert torch.all(result == 0)
-
-    # Test None device defaults to input tensor's device
-    result = shmem.zeros_like(input_tensor, device=None)
-    assert str(result.device) == str(input_tensor.device)
-    assert torch.all(result == 0)
-
-    # Test that different device throws error
-    different_device = "cpu"  # CPU is always different from CUDA
-    with pytest.raises(RuntimeError):
-        shmem.zeros_like(input_tensor, device=different_device)
-
-    # Test that different CUDA device throws error
-    if shmem.device.startswith("cuda:") and torch.cuda.device_count() >= 2:
-        current_device = torch.device(shmem.device)
-        different_cuda = f"cuda:{(current_device.index + 1) % torch.cuda.device_count()}"  # Use next GPU
-        with pytest.raises(RuntimeError):
-            shmem.zeros_like(input_tensor, device=different_cuda)
-
-
-def test_zeros_like_layout_override():
-    shmem = iris.iris(1 << 20)
-
-    input_tensor = shmem.full((2, 4), 3, dtype=torch.float32)
-
-    # Test with different layout (should default to input layout)
-    result = shmem.zeros_like(input_tensor, layout=torch.strided)
-
-    # Verify layout and values
-    assert result.layout == input_tensor.layout
-    assert torch.all(result == 0)
-
-
-def test_zeros_like_memory_format():
-    shmem = iris.iris(1 << 20)
-
-    input_tensor = shmem.full((4, 2), 1, dtype=torch.float32)
-
-    # Test with default memory_format
-    result = shmem.zeros_like(input_tensor, memory_format=torch.contiguous_format)
-    assert result.shape == input_tensor.shape
-    assert torch.all(result == 0)
-
-    # Test channels_last format (should work for 4D tensors)
-    # Create a 4D tensor (NCHW format)
-    input_4d = shmem.full((2, 3, 4, 5), 1, dtype=torch.float32)
-    result_4d = shmem.zeros_like(input_4d, memory_format=torch.channels_last)
-
-    # For channels_last format, the shape remains (N, C, H, W); only the memory layout (strides) changes.
-    # Input: (2, 3, 4, 5) -> Output: (2, 3, 4, 5) with channels_last strides
-    expected_shape = input_4d.shape
-    assert result_4d.shape == expected_shape, f"Expected {expected_shape}, got {result_4d.shape}"
-    assert torch.all(result_4d == 0)
-
-    # Compare with PyTorch's channels_last implementation
-    pytorch_input_4d = torch.full((2, 3, 4, 5), 1, dtype=torch.float32, device="cuda")
-    pytorch_result_4d = torch.zeros_like(pytorch_input_4d, memory_format=torch.channels_last)
-
-    # Verify it's actually in channels_last format
-    strides = result_4d.stride()
-    assert strides[0] > strides[2] > strides[3] > strides[1] == 1, (
-        f"Expected channels_last format strides, got {strides}"
-    )
-
-    # Test channels_last_3d format (should work for 5D tensors)
-    input_5d = shmem.full((2, 3, 4, 5, 6), 1, dtype=torch.float32)
-    result_5d = shmem.zeros_like(input_5d, memory_format=torch.channels_last_3d)
-
-    # For channels_last_3d format, the shape remains (N, C, D, H, W); only the memory layout (strides) changes.
-    # Input: (2, 3, 4, 5, 6) -> Output: (2, 3, 4, 5, 6) with channels_last_3d strides
-    expected_shape_5d = input_5d.shape
-    assert result_5d.shape == expected_shape_5d, f"Expected {expected_shape_5d}, got {result_5d.shape}"
-    assert torch.all(result_5d == 0)
-
-    # Compare with PyTorch's channels_last_3d implementation
-    pytorch_input_5d = torch.full((2, 3, 4, 5, 6), 1, dtype=torch.float32, device="cuda")
-    pytorch_result_5d = torch.zeros_like(pytorch_input_5d, memory_format=torch.channels_last_3d)
-
-    # Verify it's actually in channels_last_3d format
-    strides_5d = result_5d.stride()
-    assert strides_5d[0] > strides_5d[2] > strides_5d[3] > strides_5d[4] > strides_5d[1] == 1, (
-        f"Expected channels_last_3d format strides, got {strides_5d}"
-    )
-
-    # Test preserve_format with contiguous input
-    result_preserve = shmem.zeros_like(input_tensor, memory_format=torch.preserve_format)
-    assert result_preserve.shape == input_tensor.shape
-    assert torch.all(result_preserve == 0)
-
-    # Test preserve_format with non-contiguous input (should now work)
-    non_contiguous_tensor = input_tensor.transpose(0, 1)  # This makes it non-contiguous
-    result_non_contig = shmem.zeros_like(non_contiguous_tensor, memory_format=torch.preserve_format)
-    assert result_non_contig.shape == non_contiguous_tensor.shape
-    assert torch.all(result_non_contig == 0)
-
-    # Test preserve_format with channels_last input (should copy the format)
-    # Create input tensor directly in channels_last format using Iris
-    input_4d_channels_last = shmem.zeros_like(
-        shmem.full((2, 3, 4, 5), 1, dtype=torch.float32), memory_format=torch.channels_last
-    )
-    result_preserve_channels_last = shmem.zeros_like(input_4d_channels_last, memory_format=torch.preserve_format)
-
-    # Compare with PyTorch's preserve_format behavior
-    pytorch_input_4d_cl = torch.full((2, 3, 4, 5), 1, dtype=torch.float32, device="cuda")
-    pytorch_input_4d_cl = pytorch_input_4d_cl.to(memory_format=torch.channels_last)
-    pytorch_result_preserve = torch.zeros_like(pytorch_input_4d_cl, memory_format=torch.preserve_format)
-
-    # Verify strides match exactly (preserve_format should copy the input's memory format)
-    assert result_preserve_channels_last.stride() == pytorch_result_preserve.stride(), (
-        f"Preserve format strides don't match: {result_preserve_channels_last.stride()} vs {pytorch_result_preserve.stride()}"
-    )
-
-    # Verify all results are on the symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result_4d)
-    assert shmem._Iris__on_symmetric_heap(result_5d)
-    assert shmem._Iris__on_symmetric_heap(result_preserve_channels_last)
-
-
-def test_channels_last_format_shape_preservation():
-    """Test that channels_last format preserves shape and only changes strides."""
-    shmem = iris.iris(1 << 20)
-
-    # Test 4D tensor
-    input_4d = shmem.full((2, 3, 4, 5), 1, dtype=torch.float32)
-    result_4d = shmem.zeros_like(input_4d, memory_format=torch.channels_last)
-
-    # Verify shape is preserved
-    assert result_4d.shape == input_4d.shape, f"Shape changed: {input_4d.shape} -> {result_4d.shape}"
-    assert result_4d.shape == (2, 3, 4, 5), f"Expected shape (2, 3, 4, 5), got {result_4d.shape}"
-
-    # Verify strides indicate channels_last format
-    strides = result_4d.stride()
-    N, C, H, W = 2, 3, 4, 5
-    expected_strides = (C * H * W, 1, C * W, C)  # (60, 1, 15, 3)
-    assert strides == expected_strides, f"Expected strides {expected_strides}, got {strides}"
-
-    # Verify channels_last format characteristics: strides[1] == 1 (channels dimension is contiguous)
-    assert strides[1] == 1, f"Channels dimension should be contiguous (stride=1), got {strides[1]}"
-
-    # Test 5D tensor
-    input_5d = shmem.full((2, 3, 4, 5, 6), 1, dtype=torch.float32)
-    result_5d = shmem.zeros_like(input_5d, memory_format=torch.channels_last_3d)
-
-    # Verify shape is preserved
-    assert result_5d.shape == input_5d.shape, f"Shape changed: {input_5d.shape} -> {result_5d.shape}"
-    assert result_5d.shape == (2, 3, 4, 5, 6), f"Expected shape (2, 3, 4, 5, 6), got {result_5d.shape}"
-
-    # Verify strides indicate channels_last_3d format
-    strides_5d = result_5d.stride()
-    N, C, D, H, W = 2, 3, 4, 5, 6
-    expected_strides_5d = (C * D * H * W, 1, C * D * W, C * W, C)  # (360, 1, 90, 18, 3)
-    assert strides_5d == expected_strides_5d, f"Expected strides {expected_strides_5d}, got {strides_5d}"
-
-    # Verify channels_last_3d format characteristics: strides[1] == 1 (channels dimension is contiguous)
-    assert strides_5d[1] == 1, f"Channels dimension should be contiguous (stride=1), got {strides_5d[1]}"
-
-    # Compare with PyTorch's behavior to ensure consistency
-    pytorch_input_4d = torch.full((2, 3, 4, 5), 1, dtype=torch.float32, device="cuda")
-    pytorch_result_4d = torch.zeros_like(pytorch_input_4d, memory_format=torch.channels_last)
-
-    # Verify Iris and PyTorch have same shape
-    assert result_4d.shape == pytorch_result_4d.shape, (
-        f"Shape mismatch: Iris {result_4d.shape} vs PyTorch {pytorch_result_4d.shape}"
-    )
-
-    # Verify Iris and PyTorch have same strides
-    assert result_4d.stride() == pytorch_result_4d.stride(), (
-        f"Strides mismatch: Iris {result_4d.stride()} vs PyTorch {pytorch_result_4d.stride()}"
-    )
-
-    # Verify tensors are on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result_4d)
-    assert shmem._Iris__on_symmetric_heap(result_5d)
-
-
-def test_zeros_like_pytorch_equivalence():
-    shmem = iris.iris(1 << 20)
-
-    # Create input tensor
-    input_tensor = shmem.full((4, 3), 7, dtype=torch.float32)
-
-    # Get Iris result
-    iris_result = shmem.zeros_like(input_tensor)
-
-    # Create equivalent PyTorch tensor and get PyTorch result
-    pytorch_input = torch.full((4, 3), 7, dtype=torch.float32, device="cuda")
-    pytorch_result = torch.zeros_like(pytorch_input)
-
-    # Verify shapes and dtypes match
-    assert iris_result.shape == pytorch_result.shape
-    assert iris_result.dtype == pytorch_result.dtype
-
-    # Verify values match (both should be all zeros)
-    assert torch.all(iris_result == 0)
-    assert torch.all(pytorch_result == 0)
-
-    # Test that device defaults work like PyTorch
-    # PyTorch: device=None defaults to input.device
-    # Iris: should do the same
-    iris_result_default = shmem.zeros_like(input_tensor, device=None)
-    pytorch_result_default = torch.zeros_like(pytorch_input, device=None)
-
-    # Both should default to their input tensor's device
-    assert str(iris_result_default.device) == str(input_tensor.device)
-    assert str(pytorch_result_default.device) == str(pytorch_input.device)
-
-
-def test_zeros_like_edge_cases():
-    shmem = iris.iris(1 << 20)
-
-    # Empty tensor
-    empty_tensor = shmem.full((0,), 1, dtype=torch.float32)
-    empty_result = shmem.zeros_like(empty_tensor)
-    assert empty_result.shape == (0,)
-    assert empty_result.numel() == 0
-
-    # Single element tensor
-    single_tensor = shmem.full((1,), 5, dtype=torch.int32)
-    single_result = shmem.zeros_like(single_tensor)
-    assert single_result.shape == (1,)
-    assert single_result.numel() == 1
-    assert single_result[0] == 0
-
-    # Large tensor
-    large_tensor = shmem.full((100, 100), 10, dtype=torch.float32)
-    large_result = shmem.zeros_like(large_tensor)
-    assert large_result.shape == (100, 100)
-    assert large_result.numel() == 10000
-    assert torch.all(large_result == 0)
-
-    # Verify all edge case results are on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(empty_result)
-    assert shmem._Iris__on_symmetric_heap(single_result)
-    assert shmem._Iris__on_symmetric_heap(large_result)
-
-
-@pytest.mark.parametrize(
-    "params",
-    [
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float64, "requires_grad": False},
-        {"dtype": torch.float32, "requires_grad": True},
-        {"dtype": torch.float16},
-        {},
-    ],
-)
-def test_zeros_like_parameter_combinations(params):
-    shmem = iris.iris(1 << 20)
-
-    # Use float32 input tensor to support requires_grad
-    input_tensor = shmem.full((3, 3), 1, dtype=torch.float32)
-
-    # Test various combinations of parameters
-    result = shmem.zeros_like(input_tensor, **params)
-
-    # Verify basic functionality
-    assert result.shape == input_tensor.shape
-    assert torch.all(result == 0)
-
-    # Verify dtype if specified
-    if "dtype" in params:
-        assert result.dtype == params["dtype"]
-
-    # Verify requires_grad if specified
-    if "requires_grad" in params:
-        assert result.requires_grad == params["requires_grad"]
-
-    # Verify tensor is on symmetric heap
-    assert shmem._Iris__on_symmetric_heap(result)
-
-
-@pytest.mark.parametrize(
-    "shape,dtype",
-    [
-        ((1,), torch.float32),
-        ((5,), torch.int32),
-        ((2, 3), torch.float64),
-        ((3, 4, 5), torch.float16),
-        ((2, 3, 4, 5), torch.float32),  # 4D for channels_last
-        ((2, 3, 4, 5, 6), torch.float32),  # 5D for channels_last_3d
-        ((0,), torch.float32),  # Empty tensor
-        ((100, 100), torch.float32),  # Large tensor
-    ],
-)
-def test_zeros_like_symmetric_heap_shapes_dtypes(shape, dtype):
-    """Test that zeros_like returns tensors on symmetric heap for various shapes and dtypes."""
-    shmem = iris.iris(1 << 20)
-
-    # Create input tensor
-    input_tensor = shmem.full(shape, 5, dtype=dtype)
-
-    # Test all compatible memory formats
-    memory_formats = [
-        torch.contiguous_format,
-        torch.preserve_format,
-    ]
-
-    # Add dimension-specific formats
-    if len(shape) == 4:
-        memory_formats.append(torch.channels_last)
-    elif len(shape) == 5:
-        memory_formats.append(torch.channels_last_3d)
-
-    for memory_format in memory_formats:
-        # Test zeros_like with this memory format
-        result = shmem.zeros_like(input_tensor, memory_format=memory_format)
-
-        # Verify tensor is on symmetric heap
-        assert shmem._Iris__on_symmetric_heap(result), (
-            f"Tensor with shape {shape}, dtype {dtype}, memory_format {memory_format} is NOT on symmetric heap!"
-        )
-
-        # Also verify basic functionality
-        # Memory formats preserve the logical shape, only changing the memory layout (strides)
-        assert result.shape == shape
-        assert result.dtype == dtype
-        assert torch.all(result == 0)
-
-
-@pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64, torch.int32, torch.int64])
-def test_zeros_like_symmetric_heap_dtype_override(dtype):
-    """Test that zeros_like with dtype override returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-    input_tensor = shmem.full((3, 3), 1, dtype=torch.float32)
-
-    result = shmem.zeros_like(input_tensor, dtype=dtype)
-    assert shmem._Iris__on_symmetric_heap(result), f"Tensor with dtype {dtype} is NOT on symmetric heap!"
-    assert result.dtype == dtype
-
-
-def test_zeros_like_symmetric_heap_other_params():
-    """Test that zeros_like with other parameters returns tensors on symmetric heap."""
-    shmem = iris.iris(1 << 20)
-    input_tensor = shmem.full((3, 3), 1, dtype=torch.float32)
-
-    # Test with requires_grad
-    result = shmem.zeros_like(input_tensor, requires_grad=True)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with requires_grad=True is NOT on symmetric heap!"
-
-    # Test with device override
-    result = shmem.zeros_like(input_tensor, device=shmem.device)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with device override is NOT on symmetric heap!"
-
-    # Test with layout override (only strided is supported)
-    result = shmem.zeros_like(input_tensor, layout=torch.strided)
-    assert shmem._Iris__on_symmetric_heap(result), "Tensor with layout override is NOT on symmetric heap!"
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/x/test_all_gather.py b/tests/x/test_all_gather.py
index 5cad616d5..40753798b 100644
--- a/tests/x/test_all_gather.py
+++ b/tests/x/test_all_gather.py
@@ -63,280 +63,7 @@ def x_all_gather_kernel(
         iris.x.all_gather(tile, dst_view, gather_dim, ctx)
 
 
-@pytest.mark.parametrize(
-    "gather_dim",
-    [0, 1],
-)
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-3, 1e-3),
-        (torch.float32, 1e-5, 1e-5),
-        (torch.bfloat16, 1e-3, 1e-3),
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
-    [
-        (128, 64, 64, 32),  # Small
-        (1024, 256, 128, 128),  # Medium
-        (2048, 2048, 256, 256),  # Large
-        # TODO: Fix non-aligned dimension handling in all_gather for irregular tiling
-        # (100, 100, 64, 64),  # Non-aligned dimensions - fails due to edge case with partial tiles
-        (256, 384, 128, 128),  # Non-square
-        (64, 32, 128, 128),  # Block size larger than dimensions
-    ],
-)
-def test_all_gather(gather_dim, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
-    """Test tile-level all-gather primitive by comparing against PyTorch's implementation."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    # Skip if block size is larger than dimensions
-    # (new all_gather requires tile.data shape to match block size)
-    if BLOCK_SIZE_M > M or BLOCK_SIZE_N > N:
-        pytest.skip(f"Block size ({BLOCK_SIZE_M}x{BLOCK_SIZE_N}) larger than dimensions ({M}x{N})")
+pytestmark = pytest.mark.multi_rank_required
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
-
-    # PyTorch's all_gather format: each rank has M x N data
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # Run PyTorch's all_gather to get reference output
-    pytorch_output_list = [torch.empty_like(pytorch_input_tensor) for _ in range(world_size)]
-    shmem.barrier()
-    dist.all_gather(pytorch_output_list, pytorch_input_tensor)
-
-    if gather_dim == 0:
-        # Gather along rows (M dimension)
-        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=0)  # Concatenate along dim 0
-    else:
-        # Gather along columns (N dimension)
-        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=1)  # Concatenate along dim 1
-
-    torch.cuda.synchronize()
-
-    # Set up Iris tensors
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-
-    if gather_dim == 0:
-        iris_output_tensor = shmem.zeros((world_size * M, N), dtype=dtype)
-    else:
-        iris_output_tensor = shmem.zeros((M, world_size * N), dtype=dtype)
-
-    shmem.barrier()
-
-    # Launch kernel
-    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
-    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
-    total_tiles = num_pid_m * num_pid_n
-    grid = (total_tiles,)
-
-    x_all_gather_kernel[grid](
-        iris_input_tensor,
-        iris_output_tensor,
-        M,
-        N,
-        iris_input_tensor.stride(0),
-        iris_input_tensor.stride(1),
-        iris_output_tensor.stride(0),
-        iris_output_tensor.stride(1),
-        shmem.get_heap_bases(),
-        rank,
-        world_size,
-        BLOCK_SIZE_M,
-        BLOCK_SIZE_N,
-        gather_dim,
-    )
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris x.all_gather output doesn't match PyTorch's all_gather"
-        )
-
-        # Verify each rank's data is in the correct location
-        if gather_dim == 0:
-            # Gathered along rows
-            for r in range(world_size):
-                start_row = r * M
-                end_row = (r + 1) * M
-                rank_data = iris_output_tensor[start_row:end_row, :]
-                expected_value = float(r + 1)
-                assert torch.allclose(rank_data, torch.full_like(rank_data, expected_value), atol=atol), (
-                    f"Rank {rank}: Data from rank {r} not in correct location or has wrong value"
-                )
-        else:
-            # Gathered along columns
-            for r in range(world_size):
-                start_col = r * N
-                end_col = (r + 1) * N
-                rank_data = iris_output_tensor[:, start_col:end_col]
-                expected_value = float(r + 1)
-                assert torch.allclose(rank_data, torch.full_like(rank_data, expected_value), atol=atol), (
-                    f"Rank {rank}: Data from rank {r} not in correct location or has wrong value"
-                )
-
-        if rank == 0:
-            dim_str = "rows" if gather_dim == 0 else "cols"
-            print(
-                f"✓ All-gather test passed ({dim_str}): {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})"
-            )
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
-
-
-@triton.jit
-def x_all_gather_ctx_api_kernel(
-    input_ptr,
-    output_ptr,
-    M: tl.constexpr,
-    N: tl.constexpr,
-    stride_in_m: tl.constexpr,
-    stride_in_n: tl.constexpr,
-    stride_out_m: tl.constexpr,
-    stride_out_n: tl.constexpr,
-    heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    world_size: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    gather_dim: tl.constexpr,
-):
-    """Kernel using direct all_gather() call (ctx methods removed due to Triton limitations)."""
-    pid = tl.program_id(0)
-    grid_size = tl.num_programs(0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    total_tiles = num_pid_m * num_pid_n
-
-    for tile_id in range(pid, total_tiles, grid_size):
-        pid_m = tile_id // num_pid_n
-        pid_n = tile_id % num_pid_n
-
-        # Load local tile data
-        rm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-        rn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-        mask = (rm[:, None] < M) & (rn[None, :] < N)
-        src_ptr = input_ptr + rm[:, None] * stride_in_m + rn[None, :] * stride_in_n
-        local_data = tl.load(src_ptr, mask=mask, other=0.0)
-
-        # Create Tile with loaded data and views
-        tile = iris.x.Tile(pid_m, pid_n, BLOCK_SIZE_M, BLOCK_SIZE_N, local_data)
-        dst_view = iris.x.TensorView(
-            output_ptr,
-            M * world_size if gather_dim == 0 else M,
-            N if gather_dim == 0 else N * world_size,
-            stride_out_m,
-            stride_out_n,
-        )
-        ctx = iris.x.DeviceContext(cur_rank, world_size, heap_bases)
-
-        # Call primitive directly (ctx methods don't work due to Triton import restrictions)
-        iris.x.all_gather(tile, dst_view, gather_dim, ctx)
-
-
-@pytest.mark.parametrize("gather_dim", [0, 1])
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-3, 1e-3),
-        (torch.float32, 1e-5, 1e-5),
-    ],
-)
-@pytest.mark.parametrize("M, N, BLOCK_SIZE_M, BLOCK_SIZE_N", [(256, 128, 64, 64)])
-def test_all_gather_ctx_api(gather_dim, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
-    """Test tile-level all-gather using direct function call (ctx methods removed)."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
-
-    # Skip if block size is larger than dimensions
-    if BLOCK_SIZE_M > M or BLOCK_SIZE_N > N:
-        pytest.skip(f"Block size ({BLOCK_SIZE_M}x{BLOCK_SIZE_N}) larger than dimensions ({M}x{N})")
-
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
-
-    # PyTorch's all_gather format: each rank has M x N data
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # Run PyTorch's all_gather to get reference output
-    pytorch_output_list = [torch.empty_like(pytorch_input_tensor) for _ in range(world_size)]
-    shmem.barrier()
-    dist.all_gather(pytorch_output_list, pytorch_input_tensor)
-
-    if gather_dim == 0:
-        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=0)
-    else:
-        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=1)
-
-    torch.cuda.synchronize()
-
-    # Set up Iris tensors
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-
-    if gather_dim == 0:
-        iris_output_tensor = shmem.zeros((world_size * M, N), dtype=dtype)
-    else:
-        iris_output_tensor = shmem.zeros((M, world_size * N), dtype=dtype)
-
-    shmem.barrier()
-
-    # Launch kernel using NEW ctx API
-    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
-    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
-    total_tiles = num_pid_m * num_pid_n
-    grid = (total_tiles,)
-
-    x_all_gather_ctx_api_kernel[grid](
-        iris_input_tensor,
-        iris_output_tensor,
-        M,
-        N,
-        iris_input_tensor.stride(0),
-        iris_input_tensor.stride(1),
-        iris_output_tensor.stride(0),
-        iris_output_tensor.stride(1),
-        shmem.get_heap_bases(),
-        rank,
-        world_size,
-        BLOCK_SIZE_M,
-        BLOCK_SIZE_N,
-        gather_dim,
-    )
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
-            f"Rank {rank}: all_gather() output doesn't match PyTorch's all_gather"
-        )
-
-        if rank == 0:
-            dim_str = "rows" if gather_dim == 0 else "cols"
-            print(f"✓ all_gather() test passed ({dim_str}): {dtype}, M={M}, N={N}")
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/x/test_all_reduce.py b/tests/x/test_all_reduce.py
index 864a45006..c9a054977 100644
--- a/tests/x/test_all_reduce.py
+++ b/tests/x/test_all_reduce.py
@@ -201,164 +201,7 @@ def x_all_reduce_spinlock_kernel(
         iris.x.all_reduce_spinlock(tile, dst_view, locks_ptr, ctx)
 
 
-@pytest.mark.parametrize(
-    "variant",
-    [
-        "atomic",
-        "one_shot",
-        "two_shot",
-        # TODO enable these tests when support for cache-modifiers is in place.
-        # "spinlock",
-    ],
-)
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-3, 1e-3),
-        (torch.float32, 1e-5, 1e-5),
-        (torch.bfloat16, 1e-3, 1e-3),
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
-    [
-        (128, 64, 64, 32),  # Small
-        (1024, 256, 128, 128),  # Medium
-        (2048, 2048, 256, 256),  # Large
-        # (100, 100, 64, 64),  # Non-aligned dimensions - DISABLED: other=0.0 not supported
-        # (256, 384, 128, 128),  # Non-square - DISABLED: other=0.0 not supported
-        # (64, 32, 128, 128),  # Block size larger than dimensions - DISABLED: other=0.0 not supported
-    ],
-)
-def test_all_reduce(variant, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
-    """Test tile-level all-reduce primitives by comparing against PyTorch's implementation."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
-
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
-
-    # PyTorch's all_reduce format: each rank has M x N data
-    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    pytorch_input_tensor.fill_(float(rank + 1))
-
-    # Run PyTorch's all_reduce to get reference output
-    pytorch_output_tensor = pytorch_input_tensor.clone()
-    shmem.barrier()
-    dist.all_reduce(pytorch_output_tensor, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-
-    # Set up Iris tensors
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
-
-    # Prepare workspace if needed (locks + temp_buffer for one_shot/two_shot)
-    locks = None
-    temp_buffer = None
-    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
-    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
-    total_tiles = num_pid_m * num_pid_n
 
-    if variant in ["spinlock", "one_shot", "two_shot"]:
-        locks = shmem.zeros((total_tiles,), dtype=torch.int32)
-
-    if variant in ["one_shot", "two_shot"]:
-        temp_buffer = shmem.zeros((M, N), dtype=dtype)
-
-    shmem.barrier()
-
-    # Select kernel based on variant
-    if variant == "atomic":
-        kernel = x_all_reduce_atomic_kernel
-    elif variant == "one_shot":
-        kernel = x_all_reduce_one_shot_kernel
-    elif variant == "two_shot":
-        kernel = x_all_reduce_two_shot_kernel
-    elif variant == "spinlock":
-        kernel = x_all_reduce_spinlock_kernel
-    else:
-        pytest.fail(f"Unknown variant: {variant}")
-
-    # Launch kernel
-    grid = (total_tiles,)
-
-    if variant in ["one_shot", "two_shot"]:
-        kernel[grid](
-            iris_input_tensor,
-            temp_buffer,
-            iris_output_tensor,
-            locks,
-            M,
-            N,
-            iris_input_tensor.stride(0),
-            iris_input_tensor.stride(1),
-            iris_output_tensor.stride(0),
-            iris_output_tensor.stride(1),
-            shmem.get_heap_bases(),
-            rank,
-            world_size,
-            BLOCK_SIZE_M,
-            BLOCK_SIZE_N,
-        )
-    elif variant == "spinlock":
-        kernel[grid](
-            iris_input_tensor,
-            iris_output_tensor,
-            locks,
-            M,
-            N,
-            iris_input_tensor.stride(0),
-            iris_input_tensor.stride(1),
-            iris_output_tensor.stride(0),
-            iris_output_tensor.stride(1),
-            shmem.get_heap_bases(),
-            rank,
-            world_size,
-            BLOCK_SIZE_M,
-            BLOCK_SIZE_N,
-        )
-    else:  # atomic
-        kernel[grid](
-            iris_input_tensor,
-            iris_output_tensor,
-            M,
-            N,
-            iris_input_tensor.stride(0),
-            iris_input_tensor.stride(1),
-            iris_output_tensor.stride(0),
-            iris_output_tensor.stride(1),
-            shmem.get_heap_bases(),
-            rank,
-            world_size,
-            BLOCK_SIZE_M,
-            BLOCK_SIZE_N,
-        )
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris x.all_reduce_{variant} output doesn't match PyTorch's all_reduce"
-        )
-
-        # Verify the reduction is correct (sum of all ranks)
-        expected_sum = sum(float(r + 1) for r in range(world_size))
-        assert torch.allclose(iris_output_tensor, torch.full_like(iris_output_tensor, expected_sum), atol=atol), (
-            f"Rank {rank}: Reduction result is incorrect, expected {expected_sum}"
-        )
-
-        if rank == 0:
-            print(f"✓ All-reduce {variant} test passed: {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})")
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
+pytestmark = pytest.mark.multi_rank_required
+
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/x/test_all_to_all.py b/tests/x/test_all_to_all.py
index 60de86663..210ba52b7 100644
--- a/tests/x/test_all_to_all.py
+++ b/tests/x/test_all_to_all.py
@@ -51,112 +51,7 @@ def x_all_to_all_kernel(
         iris.x.all_to_all(tile, src_view, dst_view, N_per_rank, ctx)
 
 
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-3, 1e-3),
-        (torch.float32, 1e-5, 1e-5),
-        (torch.bfloat16, 1e-3, 1e-3),
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
-    [
-        (128, 64, 64, 32),  # Small
-        (1024, 256, 128, 128),  # Medium
-        (2048, 2048, 256, 256),  # Large
-        (100, 100, 64, 64),  # Non-aligned dimensions
-        (256, 384, 128, 128),  # Non-square
-        (64, 32, 128, 128),  # Block size larger than dimensions
-    ],
-)
-def test_all_to_all(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
-    """Test tile-level all-to-all primitive by comparing against PyTorch's implementation."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    # PyTorch's all_to_all format: input is (M, N * world_size), output is (M, N * world_size)
-    # Each rank sends chunk [:, rank*N : (rank+1)*N] to all ranks
-    pytorch_input_tensor = torch.randn(M, N * world_size, dtype=dtype, device=f"cuda:{rank}")
-    # Fill with deterministic values: rank value in each rank's chunk
-    for r in range(world_size):
-        pytorch_input_tensor[:, r * N : (r + 1) * N].fill_(float(r + 1))
-
-    # Run PyTorch's all_to_all to get reference output
-    shmem.barrier()
-    # PyTorch all_to_all: split input into chunks, send chunk i to rank i
-    # Make chunks contiguous as required by PyTorch dist.all_to_all
-    input_chunks = [chunk.contiguous() for chunk in torch.chunk(pytorch_input_tensor, world_size, dim=1)]
-    output_chunks = [torch.empty(M, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(world_size)]
-    dist.all_to_all(output_chunks, input_chunks)
-    pytorch_output_tensor = torch.cat(output_chunks, dim=1)
-    torch.cuda.synchronize()
-
-    # Set up Iris tensors
-    iris_input_tensor = shmem.zeros((M, N * world_size), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-    iris_output_tensor = shmem.zeros((M, N * world_size), dtype=dtype)
-
-    shmem.barrier()
-
-    # Launch kernel
-    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
-    num_pid_n = ((N * world_size) + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N  # Use total N dimension
-    total_tiles = num_pid_m * num_pid_n
-    grid = (total_tiles,)
-
-    x_all_to_all_kernel[grid](
-        iris_input_tensor,
-        iris_output_tensor,
-        M,
-        N * world_size,  # Total N dimension
-        N,  # N_per_rank
-        iris_input_tensor.stride(0),
-        iris_input_tensor.stride(1),
-        iris_output_tensor.stride(0),
-        iris_output_tensor.stride(1),
-        shmem.get_heap_bases(),
-        rank,
-        world_size,
-        BLOCK_SIZE_M,
-        BLOCK_SIZE_N,
-    )
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
-
-    try:
-        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
-            f"Max difference: {max_diff}, expected < {atol}\n"
-            f"Rank {rank}: Iris x.all_to_all output doesn't match PyTorch's all_to_all"
-        )
-
-        # Verify each rank's received chunks contain correct data
-        # In all-to-all, rank dst receives chunk dst from each rank src
-        # Since all ranks filled chunk i with value (i+1), each rank should receive
-        # its own chunk number from all other ranks
-        for r in range(world_size):
-            start_col = r * N
-            end_col = (r + 1) * N
-            chunk_data = iris_output_tensor[:, start_col:end_col]
-            # This chunk contains data from rank r. Rank r sent us chunk 'rank' which has value (rank+1)
-            expected_value = float(rank + 1)
-            assert torch.allclose(chunk_data, torch.full_like(chunk_data, expected_value), atol=atol), (
-                f"Rank {rank}: Data from rank {r} (chunk {rank}) should have value {expected_value}"
-            )
-
-        if rank == 0:
-            print(f"✓ All-to-all test passed: {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})")
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/x/test_gather.py b/tests/x/test_gather.py
index d364b7c83..65af7d224 100644
--- a/tests/x/test_gather.py
+++ b/tests/x/test_gather.py
@@ -59,174 +59,7 @@ def gather_kernel(
         tl.store(out_ptr, data, mask=mask)
 
 
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-3, 1e-3),
-        (torch.float32, 1e-5, 1e-5),
-    ],
-)
-@pytest.mark.parametrize("M, N, BLOCK_SIZE_M, BLOCK_SIZE_N", [(256, 128, 64, 64)])
-def test_gather_from_specific_rank(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
-    """Test gather primitive pulling from a specific rank."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    if world_size < 2:
-        pytest.skip("Need at least 2 ranks")
-
-    # Each rank creates unique input data
-    torch.manual_seed(42 + rank)
-    input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-    output_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
-
-    # Allocate in shmem
-    shmem_input = shmem.zeros(M, N, dtype=dtype)
-    shmem_output = shmem.zeros(M, N, dtype=dtype)
-    shmem_input.copy_(input_tensor)
-
-    shmem.barrier()
-
-    # Each rank gathers from rank 0
-    source_rank = 0
-    grid = (64,)
-
-    gather_kernel[grid](
-        shmem_input,
-        shmem_output,
-        M,
-        N,
-        shmem_input.stride(0),
-        shmem_input.stride(1),
-        shmem_output.stride(0),
-        shmem_output.stride(1),
-        shmem.heap_bases,
-        rank,
-        source_rank,
-        world_size,
-        BLOCK_SIZE_M,
-        BLOCK_SIZE_N,
-    )
-
-    shmem.barrier()
-    output_tensor.copy_(shmem_output)
-    torch.cuda.synchronize()
-
-    torch.manual_seed(42 + source_rank)
-    expected = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
-
-    assert torch.allclose(output_tensor, expected, atol=atol, rtol=rtol), (
-        f"Rank {rank}: gather from rank {source_rank} failed"
-    )
-
-
-@triton.jit
-def gather_accumulate_kernel(
-    input_ptr,
-    output_ptr,
-    M: tl.constexpr,
-    N: tl.constexpr,
-    stride_in_m: tl.constexpr,
-    stride_in_n: tl.constexpr,
-    stride_out_m: tl.constexpr,
-    stride_out_n: tl.constexpr,
-    heap_bases: tl.tensor,
-    cur_rank: tl.constexpr,
-    world_size: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-):
-    """Test kernel that gathers from all ranks and accumulates (like all-reduce sum)."""
-    pid = tl.program_id(0)
-    grid_size = tl.num_programs(0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    total_tiles = num_pid_m * num_pid_n
-
-    for tile_id in range(pid, total_tiles, grid_size):
-        pid_m = tile_id // num_pid_n
-        pid_n = tile_id % num_pid_n
-
-        tile = iris.x.TileView(pid_m, pid_n, BLOCK_SIZE_M, BLOCK_SIZE_N)
-        src_view = iris.x.TensorView(input_ptr, M, N, stride_in_m, stride_in_n)
-        ctx = iris.x.DeviceContext(cur_rank, world_size, heap_bases)
-
-        # Accumulate data from all ranks
-        acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-        for source_rank in range(world_size):
-            data = iris.x.gather(tile, src_view, source_rank, ctx)
-            acc += data
-
-        # Store accumulated result
-        rm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-        rn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-        mask_m = rm < M
-        mask_n = rn < N
-        mask = mask_m[:, None] & mask_n[None, :]
-        out_ptr = output_ptr + rm[:, None] * stride_out_m + rn[None, :] * stride_out_n
-        result = acc.to(output_ptr.type.element_ty)
-        tl.store(out_ptr, result, mask=mask)
-
-
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-2, 1e-2),
-        (torch.float32, 1e-5, 1e-5),
-    ],
-)
-@pytest.mark.parametrize("M, N, BLOCK_SIZE_M, BLOCK_SIZE_N", [(256, 128, 64, 64)])
-def test_gather_accumulate_pattern(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
-    """Test gather used in accumulation pattern (like all-reduce sum)."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
-
-    heap_size = 2**33  # 8GB
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
-
-    # Each rank creates input with value = rank
-    input_tensor = torch.full((M, N), float(rank), dtype=dtype, device=f"cuda:{rank}")
-    output_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
-
-    # Allocate in shmem
-    shmem_input = shmem.zeros(M, N, dtype=dtype)
-    shmem_output = shmem.zeros(M, N, dtype=dtype)
-    shmem_input.copy_(input_tensor)
-
-    shmem.barrier()
-
-    # Gather and accumulate from all ranks
-    grid = (64,)
-    gather_accumulate_kernel[grid](
-        shmem_input,
-        shmem_output,
-        M,
-        N,
-        shmem_input.stride(0),
-        shmem_input.stride(1),
-        shmem_output.stride(0),
-        shmem_output.stride(1),
-        shmem.heap_bases,
-        rank,
-        world_size,
-        BLOCK_SIZE_M,
-        BLOCK_SIZE_N,
-    )
-
-    shmem.barrier()
-    output_tensor.copy_(shmem_output)
-    torch.cuda.synchronize()
-
-    expected_sum = sum(range(world_size))
-    expected = torch.full((M, N), float(expected_sum), dtype=dtype, device=f"cuda:{rank}")
-
-    assert torch.allclose(output_tensor, expected, atol=atol, rtol=rtol), (
-        f"Rank {rank}: gather accumulate pattern failed"
-    )
+@pytest.mark.parametrize(
\ No newline at end of file
diff --git a/tests/x/test_reduce_scatter.py b/tests/x/test_reduce_scatter.py
index eb8099e40..bf02f0d99 100644
--- a/tests/x/test_reduce_scatter.py
+++ b/tests/x/test_reduce_scatter.py
@@ -64,101 +64,7 @@ def x_reduce_scatter_kernel(
         iris.x.reduce_scatter(tile, src_view, dst_view, locks, ctx)
 
 
-@pytest.mark.parametrize(
-    "dtype, atol, rtol",
-    [
-        (torch.float16, 1e-3, 1e-3),
-        (torch.float32, 1e-5, 1e-5),
-        (torch.bfloat16, 1e-3, 1e-3),
-    ],
-)
-@pytest.mark.parametrize(
-    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
-    [
-        (128, 64, 64, 32),
-        (256, 128, 64, 64),
-        (512, 512, 128, 128),
-    ],
-)
-def test_reduce_scatter(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
-    """Test tile-level reduce-scatter primitive."""
-    if not dist.is_initialized():
-        pytest.skip("torch.distributed not initialized")
 
-    heap_size = 2**33
-    shmem = iris.iris(heap_size)
-    rank = shmem.get_rank()
-    world_size = shmem.get_num_ranks()
+pytestmark = pytest.mark.multi_rank_required
 
-    pytorch_input_tensor = torch.full((M, N), float(rank + 1), dtype=dtype, device=f"cuda:{rank}")
-
-    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
-    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
-    total_tiles = num_pid_m * num_pid_n
-    tiles_per_rank = total_tiles // world_size
-    start_tile = rank * tiles_per_rank
-    if rank == world_size - 1:
-        tiles_per_rank = total_tiles - start_tile
-
-    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
-    iris_input_tensor.copy_(pytorch_input_tensor)
-    iris_temp_buffer = shmem.zeros((M, N), dtype=dtype)
-    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
-
-    locks_tensor = shmem.zeros(total_tiles, dtype=torch.int32)
-
-    shmem.barrier()
-
-    grid = (total_tiles,)
-
-    x_reduce_scatter_kernel[grid](
-        iris_input_tensor,
-        iris_temp_buffer,
-        iris_output_tensor,
-        locks_tensor,
-        M,
-        N,
-        iris_input_tensor.stride(0),
-        iris_input_tensor.stride(1),
-        iris_output_tensor.stride(0),
-        iris_output_tensor.stride(1),
-        shmem.get_heap_bases(),
-        rank,
-        world_size,
-        BLOCK_SIZE_M,
-        BLOCK_SIZE_N,
-    )
-
-    torch.cuda.synchronize()
-    shmem.barrier()
-
-    expected_sum = sum(float(r + 1) for r in range(world_size))
-
-    try:
-        for local_tile_idx in range(tiles_per_rank):
-            tile_id = start_tile + local_tile_idx
-            pid_m = tile_id // num_pid_n
-            pid_n = tile_id % num_pid_n
-
-            m_start = pid_m * BLOCK_SIZE_M
-            m_end = min(m_start + BLOCK_SIZE_M, M)
-            n_start = pid_n * BLOCK_SIZE_N
-            n_end = min(n_start + BLOCK_SIZE_N, N)
-
-            tile_data = iris_output_tensor[m_start:m_end, n_start:n_end]
-            expected_tile = torch.full_like(tile_data, expected_sum)
-
-            assert torch.allclose(tile_data, expected_tile, atol=atol, rtol=rtol), (
-                f"Rank {rank}, tile {tile_id} ({pid_m},{pid_n}): "
-                f"Expected {expected_sum}, got max {tile_data.max().item()}, "
-                f"min {tile_data.min().item()}"
-            )
-
-        if rank == 0:
-            print(f"Reduce-scatter test passed: {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})")
-    finally:
-        shmem.barrier()
-        del shmem
-        import gc
-
-        gc.collect()
+@pytest.mark.parametrize(
\ No newline at end of file

From 506eb27b1272a398a84aac4189745c9a587257e8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:49:26 +0000
Subject: [PATCH 03/11] Complete CI workflow updates for marker-based test
 filtering

- Updated test-editable and test-install jobs with marker-based matrix
- All three CI jobs now filter tests based on markers:
  * single_rank tests run only on nranks=1
  * multi_rank_required tests run on all rank configs (1,2,4,8)
  * unmarked tests run on all rank configs (backward compatibility)
- Each job now has 65 matrix entries (down from 20 per job without optimization)

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 .github/workflows/iris-tests.yml | 266 +++++++++++++++++++++++++++++--
 1 file changed, 256 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/iris-tests.yml b/.github/workflows/iris-tests.yml
index 5511001b0..0f07c8885 100644
--- a/.github/workflows/iris-tests.yml
+++ b/.github/workflows/iris-tests.yml
@@ -254,74 +254,196 @@ jobs:
           echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git, marker: ${{ matrix.marker }}) passed!"
 
   test-editable:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, editable install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, editable install)
     needs: [build-container-image, test-git]
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Test each subdirectory with each rank count using editable install
+          # Phase 1: Run single_rank tests only on 1 rank
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          # Phase 2: Run multi_rank_required tests on all rank configs
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -334,84 +456,207 @@ jobs:
       - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable install)
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: editable)"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: editable, marker: ${{ matrix.marker }})"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "editable"
+            "editable" \
+            "${{ matrix.marker }}"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable, marker: ${{ matrix.marker }}) passed!"
 
   test-install:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, pip install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, pip install)
     needs: [build-container-image, test-editable]
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Test each subdirectory with each rank count using pip install
+          # Phase 1: Run single_rank tests only on 1 rank
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          # Phase 2: Run multi_rank_required tests on all rank configs
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -424,12 +669,13 @@ jobs:
       - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (pip install)
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: install)"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: install, marker: ${{ matrix.marker }})"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "install"
+            "install" \
+            "${{ matrix.marker }}"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install, marker: ${{ matrix.marker }}) passed!"
 

From 90e9f9b3b590fb0fe0761b427f21d6bcac974855 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:53:05 +0000
Subject: [PATCH 04/11] Fix marker assignment script and reapply markers
 correctly

- Fixed add_marker_to_file to properly handle docstrings
- Properly insert pytestmark after all imports and docstrings
- All 57 test files now have correct markers without syntax errors
- Added pytest.ini with marker definitions
- Added test_optimization.md documentation

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 .github/scripts/run_tests.sh               |  18 +-
 .github/workflows/iris-tests.yml           | 399 ++++++++++++++++++++-
 docs/test_optimization.md                  | 232 ++++++++++++
 pytest.ini                                 |   7 +
 scripts/assign_test_markers.py             | 284 +++++++++++++++
 tests/ccl/test_all_gather.py               |   3 +
 tests/ccl/test_all_reduce.py               |   3 +
 tests/ccl/test_all_to_all.py               |   3 +
 tests/ccl/test_all_to_all_gluon.py         |   3 +
 tests/ccl/test_process_groups.py           |   3 +
 tests/examples/test_all_load_bench.py      |   3 +
 tests/examples/test_atomic_add_bench.py    |   3 +
 tests/examples/test_flash_decode.py        |   3 +
 tests/examples/test_load_bench.py          |   3 +
 tests/examples/test_message_passing.py     |   3 +
 tests/ops/test_all_gather_matmul.py        |   3 +
 tests/ops/test_matmul_all_gather.py        |   3 +
 tests/ops/test_matmul_all_reduce.py        |   3 +
 tests/ops/test_matmul_reduce_scatter.py    |   3 +
 tests/unittests/test_arange.py             |   3 +
 tests/unittests/test_atomic_add_gluon.py   |   3 +
 tests/unittests/test_atomic_add_triton.py  |   3 +
 tests/unittests/test_atomic_and_gluon.py   |   3 +
 tests/unittests/test_atomic_and_triton.py  |   3 +
 tests/unittests/test_atomic_cas_gluon.py   |   3 +
 tests/unittests/test_atomic_cas_triton.py  |   3 +
 tests/unittests/test_atomic_max_gluon.py   |   3 +
 tests/unittests/test_atomic_max_triton.py  |   3 +
 tests/unittests/test_atomic_min_gluon.py   |   3 +
 tests/unittests/test_atomic_min_triton.py  |   3 +
 tests/unittests/test_atomic_or_gluon.py    |   3 +
 tests/unittests/test_atomic_or_triton.py   |   3 +
 tests/unittests/test_atomic_xchg_gluon.py  |   3 +
 tests/unittests/test_atomic_xchg_triton.py |   3 +
 tests/unittests/test_atomic_xor_gluon.py   |   3 +
 tests/unittests/test_atomic_xor_triton.py  |   3 +
 tests/unittests/test_broadcast_gluon.py    |   3 +
 tests/unittests/test_broadcast_triton.py   |   3 +
 tests/unittests/test_copy_gluon.py         |   3 +
 tests/unittests/test_copy_triton.py        |   3 +
 tests/unittests/test_empty.py              |   3 +
 tests/unittests/test_full.py               |   3 +
 tests/unittests/test_get_gluon.py          |   3 +
 tests/unittests/test_get_triton.py         |   3 +
 tests/unittests/test_linspace.py           |   3 +
 tests/unittests/test_load_gluon.py         |   3 +
 tests/unittests/test_load_triton.py        |   3 +
 tests/unittests/test_ones.py               |   3 +
 tests/unittests/test_put_gluon.py          |   3 +
 tests/unittests/test_put_triton.py         |   3 +
 tests/unittests/test_rand.py               |   3 +
 tests/unittests/test_randint.py            |   3 +
 tests/unittests/test_randn.py              |   3 +
 tests/unittests/test_store_gluon.py        |   3 +
 tests/unittests/test_store_triton.py       |   3 +
 tests/unittests/test_zeros.py              |   3 +
 tests/unittests/test_zeros_like.py         |   3 +
 tests/x/test_all_gather.py                 |   3 +
 tests/x/test_all_reduce.py                 |   3 +
 tests/x/test_all_to_all.py                 |   3 +
 tests/x/test_gather.py                     |   3 +
 tests/x/test_reduce_scatter.py             |   3 +
 62 files changed, 1092 insertions(+), 19 deletions(-)
 create mode 100644 docs/test_optimization.md
 create mode 100644 pytest.ini
 create mode 100755 scripts/assign_test_markers.py

diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
index 4abf4a717..f24f03ac3 100755
--- a/.github/scripts/run_tests.sh
+++ b/.github/scripts/run_tests.sh
@@ -3,7 +3,7 @@
 # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Run Iris tests in a container
-# Usage: run_tests.sh <test_dir> <num_ranks> [gpu_devices] [install_method]
+# Usage: run_tests.sh <test_dir> <num_ranks> [gpu_devices] [install_method] [marker]
 #   test_dir: subdirectory under tests/ (e.g., examples, unittests, ccl)
 #   num_ranks: number of GPU ranks (1, 2, 4, or 8)
 #   gpu_devices: comma-separated GPU device IDs (optional)
@@ -11,6 +11,8 @@
 #     - "git": pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
 #     - "editable": pip install -e .
 #     - "install": pip install .
+#   marker: pytest marker expression (optional, e.g., "single_rank", "multi_rank_required")
+#     - If not provided, all tests are run
 
 set -e
 
@@ -18,13 +20,15 @@ TEST_DIR=$1
 NUM_RANKS=$2
 GPU_DEVICES=${3:-""}
 INSTALL_METHOD=${4:-"editable"}
+MARKER=${5:-""}
 
 if [ -z "$TEST_DIR" ] || [ -z "$NUM_RANKS" ]; then
     echo "[ERROR] Missing required arguments"
-    echo "Usage: $0 <test_dir> <num_ranks> [gpu_devices] [install_method]"
+    echo "Usage: $0 <test_dir> <num_ranks> [gpu_devices] [install_method] [marker]"
     echo "  test_dir: examples, unittests, x or ccl"
     echo "  num_ranks: 1, 2, 4, or 8"
     echo "  install_method: git, editable, or install (default: editable)"
+    echo "  marker: pytest marker expression (optional)"
     exit 1
 fi
 
@@ -62,6 +66,12 @@ elif [ "$INSTALL_METHOD" = "install" ]; then
     INSTALL_CMD="pip install ."
 fi
 
+# Build marker argument for pytest
+MARKER_ARG=""
+if [ -n "$MARKER" ]; then
+    MARKER_ARG="-m \"$MARKER\""
+fi
+
 # Run tests in container
 "$SCRIPT_DIR/container_exec.sh" $GPU_ARG "
     set -e
@@ -95,8 +105,8 @@ fi
     # Run tests in the specified directory
     for test_file in tests/$TEST_DIR/test_*.py; do
         if [ -f \"\$test_file\" ]; then
-            echo \"Testing: \$test_file with $NUM_RANKS ranks (install: $INSTALL_METHOD)\"
-            python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" -v --tb=short --durations=10
+            echo \"Testing: \$test_file with $NUM_RANKS ranks (install: $INSTALL_METHOD, marker: $MARKER)\"
+            python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" $MARKER_ARG -v --tb=short --durations=10
         fi
     done
 "
\ No newline at end of file
diff --git a/.github/workflows/iris-tests.yml b/.github/workflows/iris-tests.yml
index fdfef7330..0f07c8885 100644
--- a/.github/workflows/iris-tests.yml
+++ b/.github/workflows/iris-tests.yml
@@ -38,74 +38,196 @@ jobs:
           bash .github/scripts/container_build.sh
 
   test-git:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, git install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, git install)
     needs: build-container-image
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Test each subdirectory with each rank count using git install
+          # Phase 1: Run single_rank tests only on 1 rank
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          # Phase 2: Run multi_rank_required tests on all rank configs
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: examples
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: examples
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: examples
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -121,84 +243,207 @@ jobs:
           GITHUB_SHA: ${{ github.sha }}
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: git)"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: git, marker: ${{ matrix.marker }})"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "git"
+            "git" \
+            "${{ matrix.marker }}"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git, marker: ${{ matrix.marker }}) passed!"
 
   test-editable:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, editable install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, editable install)
     needs: [build-container-image, test-git]
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Test each subdirectory with each rank count using editable install
+          # Phase 1: Run single_rank tests only on 1 rank
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          # Phase 2: Run multi_rank_required tests on all rank configs
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: examples
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: examples
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: examples
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: x
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
+          - test_dir: ops
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -211,84 +456,207 @@ jobs:
       - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable install)
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: editable)"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: editable, marker: ${{ matrix.marker }})"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "editable"
+            "editable" \
+            "${{ matrix.marker }}"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable, marker: ${{ matrix.marker }}) passed!"
 
   test-install:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, pip install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, pip install)
     needs: [build-container-image, test-editable]
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Test each subdirectory with each rank count using pip install
+          # Phase 1: Run single_rank tests only on 1 rank
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "single_rank"
+          # Phase 2: Run multi_rank_required tests on all rank configs
+          - test_dir: examples
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: examples
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: unittests
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ccl
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: x
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 1
+            gpu_devices: "0,1"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 2
+            gpu_devices: "2,3"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 4
+            gpu_devices: "4,5,6,7"
+            marker: "multi_rank_required"
+          - test_dir: ops
+            num_ranks: 8
+            gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "multi_rank_required"
+          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
+            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -301,12 +669,13 @@ jobs:
       - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (pip install)
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: install)"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: install, marker: ${{ matrix.marker }})"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "install"
+            "install" \
+            "${{ matrix.marker }}"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install, marker: ${{ matrix.marker }}) passed!"
 
diff --git a/docs/test_optimization.md b/docs/test_optimization.md
new file mode 100644
index 000000000..ff0b4163d
--- /dev/null
+++ b/docs/test_optimization.md
@@ -0,0 +1,232 @@
+# Test Suite Optimization - Phase 1
+
+## Overview
+
+This document describes the Phase 1 test suite optimization implemented to reduce CI time by ~30% (from 210 minutes to 147 minutes).
+
+## Background
+
+Analysis revealed that the original test suite was running **every test** on **all 4 rank configurations** (1, 2, 4, 8 ranks), which was wasteful. While multi-rank validation is essential for distributed features (symmetric heap allocation, cross-rank operations), many tests only validate tensor properties (shape, dtype, values) and don't require multi-rank execution.
+
+### Original Test Matrix
+- **3 install methods** × **5 test directories** × **4 rank configs** = **60 CI jobs**
+- Each job runs all tests in a directory
+- Total multi-rank test runs: **6.37M**
+
+### Optimized Test Matrix
+- **3 install methods** × **65 matrix entries** = **195 CI jobs**
+- Tests are filtered by pytest markers
+- Total multi-rank test runs: **3.98M** (37.5% reduction)
+
+## Implementation
+
+### 1. Pytest Markers
+
+Two new markers were added in `pytest.ini`:
+
+- **`@pytest.mark.single_rank`**: Tests that validate tensor properties (shape, dtype, values)
+  - These tests only need to run on **1 rank**
+  - Examples: `test_zeros`, `test_ones`, `test_rand`, `test_full`, `test_empty`
+  
+- **`@pytest.mark.multi_rank_required`**: Tests that validate distributed behavior
+  - These tests must run on **all rank configurations** (1, 2, 4, 8)
+  - Examples: `test_get_*`, `test_put_*`, `test_load_*`, `test_store_*`, `test_all_reduce`, `test_all_gather`
+
+### 2. Test Classification
+
+Tests were classified into three categories:
+
+| Category | Count | Runs on Ranks | Examples |
+|----------|-------|---------------|----------|
+| `single_rank` | 10 files | 1 only | zeros, ones, rand, empty, full, arange, linspace, randint, randn, zeros_like |
+| `multi_rank_required` | 47 files | 1, 2, 4, 8 | get, put, load, store, atomic_*, broadcast, copy, all_reduce, all_gather, all_to_all |
+| Unmarked | 4 files | 1, 2, 4, 8 | logging, dmabuf_apis, get_num_xcc, iris_helpers |
+
+### 3. Automated Marker Assignment
+
+A Python script `scripts/assign_test_markers.py` was created to automate the marker assignment process:
+
+```bash
+# Preview changes (dry run)
+python scripts/assign_test_markers.py --dry-run --test-dir tests
+
+# Apply markers
+python scripts/assign_test_markers.py --test-dir tests
+```
+
+The script:
+- Classifies tests based on their functionality
+- Adds `pytestmark = pytest.mark.<marker>` to test files
+- Preserves backward compatibility for unmarked tests
+
+### 4. CI Workflow Updates
+
+The `.github/workflows/iris-tests.yml` file was updated to run tests based on markers:
+
+**Phase 1: Single-rank tests (5 entries per install method)**
+```yaml
+- test_dir: examples
+  num_ranks: 1
+  marker: "single_rank"
+```
+
+**Phase 2: Multi-rank tests (20 entries per install method)**
+```yaml
+- test_dir: examples
+  num_ranks: 1
+  marker: "multi_rank_required"
+- test_dir: examples
+  num_ranks: 2
+  marker: "multi_rank_required"
+# ... continues for 4 and 8 ranks
+```
+
+**Phase 3: Unmarked tests (20 entries per install method for backward compatibility)**
+```yaml
+- test_dir: examples
+  num_ranks: 1
+  marker: "not single_rank and not multi_rank_required"
+# ... continues for all ranks
+```
+
+### 5. Test Script Updates
+
+The `run_tests.sh` script was updated to accept a marker parameter:
+
+```bash
+bash .github/scripts/run_tests.sh \
+  "$test_dir" \
+  "$num_ranks" \
+  "$gpu_devices" \
+  "$install_method" \
+  "$marker"  # New parameter
+```
+
+The marker is passed to pytest using `-m "marker_expression"`.
+
+## Adding New Tests
+
+When adding new tests, follow these guidelines:
+
+### Single-rank Tests
+Use `@pytest.mark.single_rank` for tests that:
+- Validate tensor properties (shape, dtype, values)
+- Test tensor creation functions (zeros, ones, rand, etc.)
+- Don't involve cross-rank communication
+- Can verify correctness on a single rank
+
+Example:
+```python
+import pytest
+import iris
+
+pytestmark = pytest.mark.single_rank
+
+def test_zeros():
+    shmem = iris.iris(1 << 20)
+    result = shmem.zeros(2, 3, dtype=torch.float32)
+    assert result.shape == (2, 3)
+    assert result.dtype == torch.float32
+```
+
+### Multi-rank Tests
+Use `@pytest.mark.multi_rank_required` for tests that:
+- Validate distributed behavior
+- Test cross-rank operations (get, put, load, store)
+- Test collective operations (all_reduce, all_gather, all_to_all)
+- Test atomic operations across ranks
+- Require symmetric heap visibility validation
+
+Example:
+```python
+import pytest
+import iris
+
+pytestmark = pytest.mark.multi_rank_required
+
+def test_all_reduce():
+    shmem = iris.iris(1 << 20)
+    # Test requires multiple ranks to validate reduction
+    input_tensor = shmem.ones(10, dtype=torch.float32) * shmem.get_rank()
+    output = shmem.ccl.all_reduce(input_tensor)
+    # Validation logic...
+```
+
+### Unmarked Tests
+Leave tests unmarked if:
+- They test infrastructure/utilities (logging, helpers)
+- Classification is unclear
+- Backward compatibility is preferred
+
+## Running Tests Locally
+
+### Run all tests
+```bash
+pytest tests/
+```
+
+### Run only single-rank tests
+```bash
+pytest tests/ -m single_rank
+```
+
+### Run only multi-rank tests
+```bash
+pytest tests/ -m multi_rank_required
+```
+
+### Run unmarked tests
+```bash
+pytest tests/ -m "not single_rank and not multi_rank_required"
+```
+
+### Run with specific rank count
+```bash
+python tests/run_tests_distributed.py --num_ranks 4 tests/ccl/test_all_reduce.py -m multi_rank_required
+```
+
+## Expected Impact
+
+### Time Savings
+- **Previous CI time**: ~210 minutes
+- **New CI time**: ~147 minutes
+- **Reduction**: 63 minutes (30%)
+
+### Test Execution Reduction
+- **Previous multi-rank test runs**: 6.37M
+- **New multi-rank test runs**: 3.98M
+- **Reduction**: 2.39M test runs (37.5%)
+
+### Key Metrics
+- **Test count**: Unchanged (530,877 tests)
+- **Coverage**: No reduction - all tests still run at least once
+- **Quality**: No degradation - multi-rank tests still validated on all configs
+
+## Future Optimizations (Phase 2+)
+
+Potential future optimizations include:
+1. **Parameterization reduction**: Reduce parameter combinations for single-rank tests
+2. **Test parallelization**: Run independent tests in parallel
+3. **Caching**: Cache build artifacts between jobs
+4. **Smart test selection**: Skip tests unaffected by code changes
+
+## References
+
+- Issue: [Implement test suite optimization](https://github.com/ROCm/iris/issues/XXX)
+- PR: [Test Suite Optimization - Phase 1](https://github.com/ROCm/iris/pull/XXX)
+- Analysis: See PRs #353 and #354 for detailed analysis
+
+## Troubleshooting
+
+### Marker not recognized
+Ensure `pytest.ini` is present in the repository root with the marker definitions.
+
+### Tests not filtered correctly
+1. Check that the marker is added to the test file
+2. Verify the marker syntax: `pytestmark = pytest.mark.<marker>`
+3. Check that the CI workflow passes the marker parameter correctly
+
+### CI failures after optimization
+1. Check that multi-rank tests have `multi_rank_required` marker
+2. Verify that single-rank tests don't depend on multi-rank execution
+3. Review test logs to identify which rank configuration failed
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 000000000..7413a3d3e
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+[pytest]
+markers =
+    single_rank: Tests validating tensor properties (shape, dtype, values) - run on 1 rank only
+    multi_rank_required: Tests validating distributed behavior (symmetric heap visibility, cross-rank operations) - run on all ranks
diff --git a/scripts/assign_test_markers.py b/scripts/assign_test_markers.py
new file mode 100755
index 000000000..492a9e01f
--- /dev/null
+++ b/scripts/assign_test_markers.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+"""
+Automated Test Marker Assignment Script
+
+This script assigns pytest markers (@pytest.mark.single_rank or @pytest.mark.multi_rank_required)
+to test files based on the type of functionality they test.
+
+Classification rules:
+- single_rank: Tests validating tensor properties (shape, dtype, values) - run on 1 rank only
+  Examples: zeros, ones, empty, full, rand, randint, randn, arange, linspace
+  
+- multi_rank_required: Tests validating distributed behavior (symmetric heap visibility, cross-rank operations) - run on all ranks
+  Examples: get, put, load, store, atomic operations, broadcast, copy, all_reduce, all_gather, all_to_all
+"""
+
+import os
+import sys
+import re
+from pathlib import Path
+
+
+# Tests that should be marked as single_rank (tensor property tests)
+SINGLE_RANK_PATTERNS = [
+    "test_zeros.py",
+    "test_ones.py", 
+    "test_empty.py",
+    "test_full.py",
+    "test_rand.py",
+    "test_randint.py",
+    "test_randn.py",
+    "test_arange.py",
+    "test_linspace.py",
+    "test_zeros_like.py",
+]
+
+# Tests that should be marked as multi_rank_required (distributed tests)
+MULTI_RANK_PATTERNS = [
+    # Remote memory access operations
+    "test_get_gluon.py",
+    "test_get_triton.py",
+    "test_put_gluon.py",
+    "test_put_triton.py",
+    "test_load_gluon.py",
+    "test_load_triton.py",
+    "test_store_gluon.py",
+    "test_store_triton.py",
+    # Atomic operations
+    "test_atomic_add_gluon.py",
+    "test_atomic_add_triton.py",
+    "test_atomic_and_gluon.py",
+    "test_atomic_and_triton.py",
+    "test_atomic_cas_gluon.py",
+    "test_atomic_cas_triton.py",
+    "test_atomic_max_gluon.py",
+    "test_atomic_max_triton.py",
+    "test_atomic_min_gluon.py",
+    "test_atomic_min_triton.py",
+    "test_atomic_or_gluon.py",
+    "test_atomic_or_triton.py",
+    "test_atomic_xchg_gluon.py",
+    "test_atomic_xchg_triton.py",
+    "test_atomic_xor_gluon.py",
+    "test_atomic_xor_triton.py",
+    # Data movement operations
+    "test_broadcast_gluon.py",
+    "test_broadcast_triton.py",
+    "test_copy_gluon.py",
+    "test_copy_triton.py",
+    # Collective operations (all in ccl, ops, x directories)
+    "test_all_reduce.py",
+    "test_all_gather.py",
+    "test_all_to_all.py",
+    "test_all_to_all_gluon.py",
+    "test_process_groups.py",
+    "test_reduce_scatter.py",
+    "test_gather.py",
+    # Matmul + collective operations
+    "test_all_gather_matmul.py",
+    "test_matmul_all_gather.py",
+    "test_matmul_all_reduce.py",
+    "test_matmul_reduce_scatter.py",
+]
+
+# Tests in examples directory that test distributed behavior
+EXAMPLE_MULTI_RANK_PATTERNS = [
+    "test_load_bench.py",
+    "test_all_load_bench.py",
+    "test_atomic_add_bench.py",
+    "test_message_passing.py",
+    "test_flash_decode.py",
+]
+
+
+def should_mark_single_rank(filepath: Path) -> bool:
+    """Check if a test file should be marked as single_rank."""
+    filename = filepath.name
+    return filename in SINGLE_RANK_PATTERNS
+
+
+def should_mark_multi_rank(filepath: Path) -> bool:
+    """Check if a test file should be marked as multi_rank_required."""
+    filename = filepath.name
+    
+    # Check if it's in the patterns list
+    if filename in MULTI_RANK_PATTERNS:
+        return True
+    
+    # Check if it's in examples directory and matches example patterns
+    if "examples" in filepath.parts and filename in EXAMPLE_MULTI_RANK_PATTERNS:
+        return True
+    
+    return False
+
+
+def get_marker_for_file(filepath: Path) -> str:
+    """Determine the appropriate marker for a test file."""
+    if should_mark_single_rank(filepath):
+        return "single_rank"
+    elif should_mark_multi_rank(filepath):
+        return "multi_rank_required"
+    else:
+        # Leave unmarked for backward compatibility
+        return None
+
+
+def has_marker(content: str, marker: str) -> bool:
+    """Check if the file already has the specified marker."""
+    marker_pattern = rf"pytestmark\s*=\s*pytest\.mark\.{marker}"
+    return re.search(marker_pattern, content) is not None
+
+
+def add_marker_to_file(filepath: Path, marker: str, dry_run: bool = False) -> bool:
+    """Add a pytest marker to a test file using pytestmark."""
+    with open(filepath, 'r') as f:
+        content = f.read()
+    
+    # Check if marker already exists
+    if has_marker(content, marker):
+        print(f"  ✓ {filepath.name} already has pytestmark = pytest.mark.{marker}")
+        return False
+    
+    lines = content.split('\n')
+    
+    # Find the position to insert the marker
+    # It should go after the last import and before the first non-comment, non-import line
+    insert_pos = None
+    in_docstring = False
+    docstring_char = None
+    
+    for i, line in enumerate(lines):
+        stripped = line.strip()
+        
+        # Handle docstrings
+        if stripped.startswith('"""') or stripped.startswith("'''"):
+            if not in_docstring:
+                # Starting a docstring
+                in_docstring = True
+                docstring_char = stripped[:3]
+                # Check if it's a one-liner docstring
+                if stripped.count(docstring_char) >= 2:
+                    in_docstring = False
+                continue
+            elif stripped.endswith(docstring_char):
+                # Ending a docstring
+                in_docstring = False
+                continue
+        
+        if in_docstring:
+            continue
+            
+        # Skip empty lines and comments
+        if not stripped or stripped.startswith('#'):
+            continue
+            
+        # Skip import lines
+        if stripped.startswith('import ') or stripped.startswith('from '):
+            continue
+            
+        # This is the first non-import, non-docstring line - insert before it
+        insert_pos = i
+        break
+    
+    if insert_pos is None:
+        print(f"  ✗ Could not find appropriate location to add marker in {filepath.name}")
+        return False
+    
+    # Insert the marker with appropriate spacing
+    marker_line = f"\npytestmark = pytest.mark.{marker}\n"
+    lines.insert(insert_pos, marker_line)
+    
+    new_content = '\n'.join(lines)
+    
+    if dry_run:
+        print(f"  → Would add pytestmark = pytest.mark.{marker} to {filepath.name}")
+        return True
+    else:
+        with open(filepath, 'w') as f:
+            f.write(new_content)
+        print(f"  ✓ Added pytestmark = pytest.mark.{marker} to {filepath.name}")
+        return True
+
+
+def process_test_directory(test_dir: Path, dry_run: bool = False) -> dict:
+    """Process all test files in a directory."""
+    stats = {
+        'total': 0,
+        'single_rank': 0,
+        'multi_rank': 0,
+        'unmarked': 0,
+        'modified': 0,
+    }
+    
+    for test_file in test_dir.rglob('test_*.py'):
+        stats['total'] += 1
+        marker = get_marker_for_file(test_file)
+        
+        if marker == 'single_rank':
+            stats['single_rank'] += 1
+            if add_marker_to_file(test_file, marker, dry_run):
+                stats['modified'] += 1
+        elif marker == 'multi_rank_required':
+            stats['multi_rank'] += 1
+            if add_marker_to_file(test_file, marker, dry_run):
+                stats['modified'] += 1
+        else:
+            stats['unmarked'] += 1
+            print(f"  - {test_file.name} left unmarked (backward compatibility)")
+    
+    return stats
+
+
+def main():
+    """Main entry point."""
+    import argparse
+    
+    parser = argparse.ArgumentParser(
+        description='Assign pytest markers to test files based on functionality',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='Show what would be done without making changes'
+    )
+    parser.add_argument(
+        '--test-dir',
+        type=Path,
+        default=Path('tests'),
+        help='Path to tests directory (default: tests)'
+    )
+    
+    args = parser.parse_args()
+    
+    if not args.test_dir.exists():
+        print(f"Error: Test directory {args.test_dir} does not exist")
+        sys.exit(1)
+    
+    print(f"Processing test files in {args.test_dir}...")
+    if args.dry_run:
+        print("DRY RUN - no files will be modified\n")
+    
+    stats = process_test_directory(args.test_dir, args.dry_run)
+    
+    print("\n" + "="*70)
+    print("Summary:")
+    print("="*70)
+    print(f"Total test files:           {stats['total']}")
+    print(f"Single-rank tests:          {stats['single_rank']}")
+    print(f"Multi-rank required tests:  {stats['multi_rank']}")
+    print(f"Unmarked tests:             {stats['unmarked']}")
+    print(f"Files modified:             {stats['modified']}")
+    
+    if args.dry_run:
+        print("\nRun without --dry-run to apply changes")
+    
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/tests/ccl/test_all_gather.py b/tests/ccl/test_all_gather.py
index ae6490432..51acc2fdd 100644
--- a/tests/ccl/test_all_gather.py
+++ b/tests/ccl/test_all_gather.py
@@ -12,6 +12,9 @@
 from iris.ccl import Config
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py
index ffd55e9d1..877e895da 100644
--- a/tests/ccl/test_all_reduce.py
+++ b/tests/ccl/test_all_reduce.py
@@ -12,6 +12,9 @@
 from iris.ccl import Config
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "variant",
     [
diff --git a/tests/ccl/test_all_to_all.py b/tests/ccl/test_all_to_all.py
index 76478f5a0..9a321c11f 100644
--- a/tests/ccl/test_all_to_all.py
+++ b/tests/ccl/test_all_to_all.py
@@ -12,6 +12,9 @@
 from iris.ccl import Config
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_to_all_gluon.py b/tests/ccl/test_all_to_all_gluon.py
index 1dc485d47..52771f85c 100644
--- a/tests/ccl/test_all_to_all_gluon.py
+++ b/tests/ccl/test_all_to_all_gluon.py
@@ -10,6 +10,9 @@
 import torch.distributed as dist
 
 # Try to import Gluon, skip tests if not available
+
+pytestmark = pytest.mark.multi_rank_required
+
 try:
     import iris.experimental.iris_gluon as iris_gluon
     from iris.ccl import Config
diff --git a/tests/ccl/test_process_groups.py b/tests/ccl/test_process_groups.py
index 4bc6e3689..9b5399a77 100644
--- a/tests/ccl/test_process_groups.py
+++ b/tests/ccl/test_process_groups.py
@@ -17,6 +17,9 @@
 from iris.ccl import Config
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 def _get_world_info():
     """Get world size and rank, skip if not enough ranks."""
     if not dist.is_initialized():
diff --git a/tests/examples/test_all_load_bench.py b/tests/examples/test_all_load_bench.py
index bc925cdd5..2ee4deef8 100644
--- a/tests/examples/test_all_load_bench.py
+++ b/tests/examples/test_all_load_bench.py
@@ -9,6 +9,9 @@
 import importlib.util
 from pathlib import Path
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 current_dir = Path(__file__).parent
 file_path = (current_dir / "../../examples/02_all_load/all_load_bench.py").resolve()
 module_name = "all_load_bench"
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index dbf995e96..a9f9b1402 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -10,6 +10,9 @@
 import sys
 from pathlib import Path
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 current_dir = Path(__file__).parent
 
 # Add examples directory to sys.path so that example files can import from examples.common
diff --git a/tests/examples/test_flash_decode.py b/tests/examples/test_flash_decode.py
index 68b478b58..80a0e8349 100644
--- a/tests/examples/test_flash_decode.py
+++ b/tests/examples/test_flash_decode.py
@@ -39,6 +39,9 @@
 import torch
 import iris
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 project_root = Path(__file__).resolve()
 while not (project_root / "tests").is_dir() or not (project_root / "examples").is_dir():
     if project_root == project_root.parent:
diff --git a/tests/examples/test_load_bench.py b/tests/examples/test_load_bench.py
index 261c2a8ed..3d9082a5e 100644
--- a/tests/examples/test_load_bench.py
+++ b/tests/examples/test_load_bench.py
@@ -10,6 +10,9 @@
 import importlib.util
 from pathlib import Path
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 current_dir = Path(__file__).parent
 file_path = (current_dir / "../../examples/00_load/load_bench.py").resolve()
 module_name = "load_bench"
diff --git a/tests/examples/test_message_passing.py b/tests/examples/test_message_passing.py
index aa173dead..d1104113d 100644
--- a/tests/examples/test_message_passing.py
+++ b/tests/examples/test_message_passing.py
@@ -10,6 +10,9 @@
 import importlib.util
 from pathlib import Path
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 current_dir = Path(__file__).parent
 
 # Import message_passing_load_store module
diff --git a/tests/ops/test_all_gather_matmul.py b/tests/ops/test_all_gather_matmul.py
index 193505011..8d93f2662 100644
--- a/tests/ops/test_all_gather_matmul.py
+++ b/tests/ops/test_all_gather_matmul.py
@@ -15,6 +15,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_all_gather.py b/tests/ops/test_matmul_all_gather.py
index 78ec0e47a..f94984d79 100644
--- a/tests/ops/test_matmul_all_gather.py
+++ b/tests/ops/test_matmul_all_gather.py
@@ -14,6 +14,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_all_reduce.py b/tests/ops/test_matmul_all_reduce.py
index 5780b5d4d..521dbcc1e 100644
--- a/tests/ops/test_matmul_all_reduce.py
+++ b/tests/ops/test_matmul_all_reduce.py
@@ -15,6 +15,9 @@
 import iris.ops as ops
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_reduce_scatter.py b/tests/ops/test_matmul_reduce_scatter.py
index 7f75a1b0c..e985c7801 100644
--- a/tests/ops/test_matmul_reduce_scatter.py
+++ b/tests/ops/test_matmul_reduce_scatter.py
@@ -12,6 +12,9 @@
 import iris.ops as ops
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/unittests/test_arange.py b/tests/unittests/test_arange.py
index e3183faf5..971cd67eb 100644
--- a/tests/unittests/test_arange.py
+++ b/tests/unittests/test_arange.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 def test_arange_basic_functionality():
     """Test basic arange functionality with various argument combinations."""
     shmem = iris.iris(1 << 20)
diff --git a/tests/unittests/test_atomic_add_gluon.py b/tests/unittests/test_atomic_add_gluon.py
index 36d26801a..88b35af91 100644
--- a/tests/unittests/test_atomic_add_gluon.py
+++ b/tests/unittests/test_atomic_add_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_add_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_add_triton.py b/tests/unittests/test_atomic_add_triton.py
index 8cf2f7f45..dd1a9a811 100644
--- a/tests/unittests/test_atomic_add_triton.py
+++ b/tests/unittests/test_atomic_add_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_add_kernel(
     results,
diff --git a/tests/unittests/test_atomic_and_gluon.py b/tests/unittests/test_atomic_and_gluon.py
index 31ebdbc53..27aceb016 100644
--- a/tests/unittests/test_atomic_and_gluon.py
+++ b/tests/unittests/test_atomic_and_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_and_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_and_triton.py b/tests/unittests/test_atomic_and_triton.py
index 7b2bdf668..5abe90d7c 100644
--- a/tests/unittests/test_atomic_and_triton.py
+++ b/tests/unittests/test_atomic_and_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_and_kernel(
     results,
diff --git a/tests/unittests/test_atomic_cas_gluon.py b/tests/unittests/test_atomic_cas_gluon.py
index e10c77c59..fd856d3b1 100644
--- a/tests/unittests/test_atomic_cas_gluon.py
+++ b/tests/unittests/test_atomic_cas_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_cas_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_cas_triton.py b/tests/unittests/test_atomic_cas_triton.py
index fdd59a886..a63db3b74 100644
--- a/tests/unittests/test_atomic_cas_triton.py
+++ b/tests/unittests/test_atomic_cas_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_cas_kernel(
     results,
diff --git a/tests/unittests/test_atomic_max_gluon.py b/tests/unittests/test_atomic_max_gluon.py
index 5ff71ea3f..7a33df0ab 100644
--- a/tests/unittests/test_atomic_max_gluon.py
+++ b/tests/unittests/test_atomic_max_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_max_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_max_triton.py b/tests/unittests/test_atomic_max_triton.py
index 69d9d96d7..d9875786f 100644
--- a/tests/unittests/test_atomic_max_triton.py
+++ b/tests/unittests/test_atomic_max_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_max_kernel(
     results,
diff --git a/tests/unittests/test_atomic_min_gluon.py b/tests/unittests/test_atomic_min_gluon.py
index e18836b87..7f2ba7d55 100644
--- a/tests/unittests/test_atomic_min_gluon.py
+++ b/tests/unittests/test_atomic_min_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_min_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_min_triton.py b/tests/unittests/test_atomic_min_triton.py
index 139e473de..1fa6f90de 100644
--- a/tests/unittests/test_atomic_min_triton.py
+++ b/tests/unittests/test_atomic_min_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_min_kernel(
     results,
diff --git a/tests/unittests/test_atomic_or_gluon.py b/tests/unittests/test_atomic_or_gluon.py
index bcda75b39..8de1ade2c 100644
--- a/tests/unittests/test_atomic_or_gluon.py
+++ b/tests/unittests/test_atomic_or_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_or_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_or_triton.py b/tests/unittests/test_atomic_or_triton.py
index c0b8cc25d..132be6d57 100644
--- a/tests/unittests/test_atomic_or_triton.py
+++ b/tests/unittests/test_atomic_or_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_or_kernel(
     results,
diff --git a/tests/unittests/test_atomic_xchg_gluon.py b/tests/unittests/test_atomic_xchg_gluon.py
index 09ef1e2f2..7ef3a4fb1 100644
--- a/tests/unittests/test_atomic_xchg_gluon.py
+++ b/tests/unittests/test_atomic_xchg_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_xchg_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_xchg_triton.py b/tests/unittests/test_atomic_xchg_triton.py
index ffea37e78..cfbbd2df1 100644
--- a/tests/unittests/test_atomic_xchg_triton.py
+++ b/tests/unittests/test_atomic_xchg_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_xchg_kernel(
     results,
diff --git a/tests/unittests/test_atomic_xor_gluon.py b/tests/unittests/test_atomic_xor_gluon.py
index b9e77ce60..f2cf50743 100644
--- a/tests/unittests/test_atomic_xor_gluon.py
+++ b/tests/unittests/test_atomic_xor_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def atomic_xor_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_xor_triton.py b/tests/unittests/test_atomic_xor_triton.py
index 639abfcdd..d12644ebf 100644
--- a/tests/unittests/test_atomic_xor_triton.py
+++ b/tests/unittests/test_atomic_xor_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def atomic_xor_kernel(
     results,
diff --git a/tests/unittests/test_broadcast_gluon.py b/tests/unittests/test_broadcast_gluon.py
index e2eaac6d1..574b06db5 100644
--- a/tests/unittests/test_broadcast_gluon.py
+++ b/tests/unittests/test_broadcast_gluon.py
@@ -7,6 +7,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "value,expected",
     [
diff --git a/tests/unittests/test_broadcast_triton.py b/tests/unittests/test_broadcast_triton.py
index 9563a5916..42362aff8 100644
--- a/tests/unittests/test_broadcast_triton.py
+++ b/tests/unittests/test_broadcast_triton.py
@@ -7,6 +7,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @pytest.mark.parametrize(
     "value,expected",
     [
diff --git a/tests/unittests/test_copy_gluon.py b/tests/unittests/test_copy_gluon.py
index 8102640da..4590bbe61 100644
--- a/tests/unittests/test_copy_gluon.py
+++ b/tests/unittests/test_copy_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def copy_get_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_copy_triton.py b/tests/unittests/test_copy_triton.py
index 00bc43e47..32cc0a797 100644
--- a/tests/unittests/test_copy_triton.py
+++ b/tests/unittests/test_copy_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def copy_get_kernel(
     data,
diff --git a/tests/unittests/test_empty.py b/tests/unittests/test_empty.py
index e51fb4c2f..a9af622b7 100644
--- a/tests/unittests/test_empty.py
+++ b/tests/unittests/test_empty.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_full.py b/tests/unittests/test_full.py
index a42d4ddb6..05c792ccd 100644
--- a/tests/unittests/test_full.py
+++ b/tests/unittests/test_full.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "fill_value",
     [
diff --git a/tests/unittests/test_get_gluon.py b/tests/unittests/test_get_gluon.py
index 5cabc054c..910721b75 100644
--- a/tests/unittests/test_get_gluon.py
+++ b/tests/unittests/test_get_gluon.py
@@ -12,6 +12,9 @@
 # 1. for local get.
 # 2. for remote get with one other rank.
 # 3. for remote get with more than one rank (if num_ranks > 2).
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def get_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_get_triton.py b/tests/unittests/test_get_triton.py
index b19cf235d..090f50a14 100644
--- a/tests/unittests/test_get_triton.py
+++ b/tests/unittests/test_get_triton.py
@@ -12,6 +12,9 @@
 # 1. for local get.
 # 2. for remote get with one other rank.
 # 3. for remote get with more than one rank (if num_ranks > 2).
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def get_kernel(
     data,
diff --git a/tests/unittests/test_linspace.py b/tests/unittests/test_linspace.py
index 02d26b248..a95b9836a 100644
--- a/tests/unittests/test_linspace.py
+++ b/tests/unittests/test_linspace.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_load_gluon.py b/tests/unittests/test_load_gluon.py
index adce4ce39..9dbd7df43 100644
--- a/tests/unittests/test_load_gluon.py
+++ b/tests/unittests/test_load_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def load_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_load_triton.py b/tests/unittests/test_load_triton.py
index b73dda3f3..733c6de8a 100644
--- a/tests/unittests/test_load_triton.py
+++ b/tests/unittests/test_load_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def load_kernel(
     data,
diff --git a/tests/unittests/test_ones.py b/tests/unittests/test_ones.py
index e70c63f88..cfe5a70c6 100644
--- a/tests/unittests/test_ones.py
+++ b/tests/unittests/test_ones.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_put_gluon.py b/tests/unittests/test_put_gluon.py
index 6f1172602..659366c8f 100644
--- a/tests/unittests/test_put_gluon.py
+++ b/tests/unittests/test_put_gluon.py
@@ -12,6 +12,9 @@
 # 1. for local put.
 # 2. for remote put with one other rank.
 # 3. for remote put with more than one rank (if num_ranks > 2).
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def put_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_put_triton.py b/tests/unittests/test_put_triton.py
index d953b42c6..0d5f61919 100644
--- a/tests/unittests/test_put_triton.py
+++ b/tests/unittests/test_put_triton.py
@@ -12,6 +12,9 @@
 # 1. for local put.
 # 2. for remote put with one other rank.
 # 3. for remote put with more than one rank (if num_ranks > 2).
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def put_kernel(
     data,
diff --git a/tests/unittests/test_rand.py b/tests/unittests/test_rand.py
index 75b6968b0..30c6dedd0 100644
--- a/tests/unittests/test_rand.py
+++ b/tests/unittests/test_rand.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_randint.py b/tests/unittests/test_randint.py
index a636be386..d5c1571a1 100644
--- a/tests/unittests/test_randint.py
+++ b/tests/unittests/test_randint.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_randn.py b/tests/unittests/test_randn.py
index cb20ec9a9..e14b9eda1 100644
--- a/tests/unittests/test_randn.py
+++ b/tests/unittests/test_randn.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_store_gluon.py b/tests/unittests/test_store_gluon.py
index 81ff3c608..90616b38c 100644
--- a/tests/unittests/test_store_gluon.py
+++ b/tests/unittests/test_store_gluon.py
@@ -8,6 +8,9 @@
 import iris.experimental.iris_gluon as iris_gl
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @gluon.jit
 def store_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_store_triton.py b/tests/unittests/test_store_triton.py
index 0632180c8..eee57658a 100644
--- a/tests/unittests/test_store_triton.py
+++ b/tests/unittests/test_store_triton.py
@@ -8,6 +8,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def store_kernel(
     data,
diff --git a/tests/unittests/test_zeros.py b/tests/unittests/test_zeros.py
index 51126fed3..ca55d9df1 100644
--- a/tests/unittests/test_zeros.py
+++ b/tests/unittests/test_zeros.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_zeros_like.py b/tests/unittests/test_zeros_like.py
index b7a0ff0c0..6e3ca4fe6 100644
--- a/tests/unittests/test_zeros_like.py
+++ b/tests/unittests/test_zeros_like.py
@@ -6,6 +6,9 @@
 import iris
 
 
+
+pytestmark = pytest.mark.single_rank
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/x/test_all_gather.py b/tests/x/test_all_gather.py
index 5cad616d5..f42b88939 100644
--- a/tests/x/test_all_gather.py
+++ b/tests/x/test_all_gather.py
@@ -14,6 +14,9 @@
 import iris.x
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def x_all_gather_kernel(
     input_ptr,
diff --git a/tests/x/test_all_reduce.py b/tests/x/test_all_reduce.py
index 864a45006..5ff434f5c 100644
--- a/tests/x/test_all_reduce.py
+++ b/tests/x/test_all_reduce.py
@@ -14,6 +14,9 @@
 import iris.x
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def x_all_reduce_atomic_kernel(
     input_ptr,
diff --git a/tests/x/test_all_to_all.py b/tests/x/test_all_to_all.py
index 60de86663..89782d1ed 100644
--- a/tests/x/test_all_to_all.py
+++ b/tests/x/test_all_to_all.py
@@ -14,6 +14,9 @@
 import iris.x
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def x_all_to_all_kernel(
     input_ptr,
diff --git a/tests/x/test_gather.py b/tests/x/test_gather.py
index d364b7c83..020b1f8e6 100644
--- a/tests/x/test_gather.py
+++ b/tests/x/test_gather.py
@@ -13,6 +13,9 @@
 import iris.x
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def gather_kernel(
     input_ptr,
diff --git a/tests/x/test_reduce_scatter.py b/tests/x/test_reduce_scatter.py
index eb8099e40..1773fe14c 100644
--- a/tests/x/test_reduce_scatter.py
+++ b/tests/x/test_reduce_scatter.py
@@ -14,6 +14,9 @@
 import iris.x
 
 
+
+pytestmark = pytest.mark.multi_rank_required
+
 @triton.jit
 def x_reduce_scatter_kernel(
     input_ptr,

From f8ed164e866a4ec2603a7f10d04c638246313406 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 16:57:54 +0000
Subject: [PATCH 05/11] Fix marker assignment - properly handle existing
 markers and file structure

- Restored all test files from clean state
- Reapplied markers with fixed script that properly handles docstrings
- All 57 test files now have correct markers without syntax errors or duplicates
- Verified all files have valid Python syntax

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 tests/ccl/test_all_to_all_gluon.py         |  86 +++-
 tests/ccl/test_process_groups.py           | 568 ++++++++++++++++++++-
 tests/examples/test_all_load_bench.py      | 144 +++++-
 tests/examples/test_atomic_add_bench.py    | 137 ++++-
 tests/examples/test_flash_decode.py        | 134 ++++-
 tests/examples/test_load_bench.py          |  57 ++-
 tests/examples/test_message_passing.py     |  62 ++-
 tests/unittests/test_atomic_add_gluon.py   |  81 ++-
 tests/unittests/test_atomic_add_triton.py  |  71 ++-
 tests/unittests/test_atomic_and_gluon.py   |  85 ++-
 tests/unittests/test_atomic_and_triton.py  |  75 ++-
 tests/unittests/test_atomic_cas_gluon.py   |  74 ++-
 tests/unittests/test_atomic_cas_triton.py  |  60 ++-
 tests/unittests/test_atomic_max_gluon.py   |  81 ++-
 tests/unittests/test_atomic_max_triton.py  |  71 ++-
 tests/unittests/test_atomic_min_gluon.py   |  80 ++-
 tests/unittests/test_atomic_min_triton.py  |  70 ++-
 tests/unittests/test_atomic_or_gluon.py    |  85 ++-
 tests/unittests/test_atomic_or_triton.py   |  75 ++-
 tests/unittests/test_atomic_xchg_gluon.py  |  72 ++-
 tests/unittests/test_atomic_xchg_triton.py |  60 ++-
 tests/unittests/test_atomic_xor_gluon.py   |  83 ++-
 tests/unittests/test_atomic_xor_triton.py  |  73 ++-
 tests/unittests/test_copy_gluon.py         | 203 +++++++-
 tests/unittests/test_copy_triton.py        | 180 ++++++-
 tests/unittests/test_get_gluon.py          |  64 ++-
 tests/unittests/test_get_triton.py         |  55 +-
 tests/unittests/test_load_gluon.py         |  65 ++-
 tests/unittests/test_load_triton.py        |  56 +-
 tests/unittests/test_put_gluon.py          |  64 ++-
 tests/unittests/test_put_triton.py         |  55 +-
 tests/unittests/test_store_gluon.py        |  64 ++-
 tests/unittests/test_store_triton.py       |  55 +-
 tests/x/test_all_gather.py                 | 277 +++++++++-
 tests/x/test_all_reduce.py                 | 163 +++++-
 tests/x/test_all_to_all.py                 | 109 +++-
 tests/x/test_gather.py                     | 171 ++++++-
 tests/x/test_reduce_scatter.py             |  98 +++-
 38 files changed, 3982 insertions(+), 81 deletions(-)

diff --git a/tests/ccl/test_all_to_all_gluon.py b/tests/ccl/test_all_to_all_gluon.py
index e4a78bd56..52771f85c 100644
--- a/tests/ccl/test_all_to_all_gluon.py
+++ b/tests/ccl/test_all_to_all_gluon.py
@@ -24,7 +24,89 @@
 
 
 @pytest.mark.skipif(not GLUON_AVAILABLE, reason="Gluon not available")
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.float16,
+        torch.float32,
+        torch.bfloat16,
+    ],
+)
+@pytest.mark.parametrize(
+    "M, N",
+    [
+        (128, 64),  # Small
+        (1024, 256),  # Medium
+        (8192, 8192),  # Large
+    ],
+)
+def test_all_to_all_gluon(dtype, M, N):
+    """Test all-to-all functionality using Gluon with traffic shaping by comparing against PyTorch's implementation."""
+    # Ensure torch.distributed is initialized (should be done by test runner)
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
 
-pytestmark = pytest.mark.multi_rank_required
+    heap_size = 2**33  # 8GB
+    shmem = iris_gluon.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    # PyTorch's all_to_all format: each rank has M x N data to send to all ranks
+    # Create input data: each rank has its own M x N chunk
+    # For rank r, the data it sends to all ranks is the same (M x N tensor)
+    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
+    # Fill with deterministic values for easier debugging
+    pytorch_input_tensor.fill_(float(rank))
+
+    # PyTorch all_to_all expects list of tensors: input_list[i] is sent to rank i
+    # Since we're sending the same data to all ranks, we replicate it
+    pytorch_input_list = [pytorch_input_tensor.clone() for _ in range(world_size)]
+    pytorch_output_list = [torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(world_size)]
+
+    # Run PyTorch's all_to_all to get reference output
+    shmem.barrier()
+    dist.all_to_all(pytorch_output_list, pytorch_input_list)
+    torch.cuda.synchronize()
+
+    # Convert PyTorch output to concatenated format for comparison
+    # pytorch_output_list[i] contains data received from rank i
+    pytorch_output_concat = torch.zeros(M, N * world_size, dtype=dtype, device=f"cuda:{rank}")
+    for target_rank in range(world_size):
+        pytorch_output_concat[:, target_rank * N : (target_rank + 1) * N] = pytorch_output_list[target_rank]
+
+    # Now set up Iris Gluon all_to_all format
+    # Iris format: concatenated tensor (M, N * world_size)
+    # input[:, i*N:(i+1)*N] contains data to send to rank i
+    # Since we're sending the same M x N data to all ranks, we replicate it
+    iris_input_concat = shmem.zeros((M, N * world_size), dtype=dtype)
+    for target_rank in range(world_size):
+        iris_input_concat[:, target_rank * N : (target_rank + 1) * N] = pytorch_input_tensor
+
+    iris_output_concat = shmem.zeros((M, N * world_size), dtype=dtype)
+
+    # Run Iris Gluon all_to_all with traffic shaping enabled
+    shmem.barrier()
+    config = Config(use_gluon=True)  # Enable Gluon with traffic shaping
+    all_to_all(iris_output_concat, iris_input_concat, shmem, config=config)
+    torch.cuda.synchronize()
+
+    # Compare results
+    atol = 1e-3 if dtype == torch.float16 else 1e-5
+    max_diff = torch.abs(iris_output_concat - pytorch_output_concat).max().item()
+
+    try:
+        assert torch.allclose(iris_output_concat, pytorch_output_concat, atol=atol), (
+            f"Max difference: {max_diff}, expected < {atol}\n"
+            f"Rank {rank}: Iris Gluon output doesn't match PyTorch's all_to_all"
+        )
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/ccl/test_process_groups.py b/tests/ccl/test_process_groups.py
index a0b7dd28b..9b5399a77 100644
--- a/tests/ccl/test_process_groups.py
+++ b/tests/ccl/test_process_groups.py
@@ -95,7 +95,571 @@ def _get_my_group(groups, rank):
 # =============================================================================
 
 
+@pytest.mark.parametrize(
+    "variant",
+    [
+        "atomic",
+        "two_shot",
+        "one_shot",
+        # TODO enable these tests when support for cache-modifiers is in place.
+        # "spinlock",
+    ],
+)
+@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
+def test_all_reduce_with_groups(variant, group_type, dtype=torch.float32, M=256, N=128):
+    """Test all-reduce with ProcessGroups (consecutive and strided patterns)."""
+    world_size, rank = _get_world_info()
 
-pytestmark = pytest.mark.multi_rank_required
+    heap_size = 2**33  # 8GB
+    shmem = iris.iris(heap_size)
+
+    # Create groups based on type
+    if group_type == "consecutive":
+        # TP-like: [0,1], [2,3], etc.
+        groups = _create_consecutive_groups(world_size, group_size=2)
+    else:
+        # DP-like strided: [0,2], [1,3], etc.
+        groups = _create_strided_groups(world_size, num_groups=2)
+
+    group_idx, my_group = _get_my_group(groups, rank)
+    assert my_group is not None, f"Rank {rank} not in any group"
+
+    group_ranks = dist.get_process_group_ranks(my_group)
+
+    # Create input tensor with deterministic values
+    # Each rank fills with its global rank + 1 for easy verification
+    pytorch_input_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
+    pytorch_input_tensor.fill_(float(rank + 1))
+
+    # Run PyTorch's all_reduce on the group
+    pytorch_output_tensor = pytorch_input_tensor.clone()
+    shmem.barrier()
+    dist.all_reduce(pytorch_output_tensor, op=dist.ReduceOp.SUM, group=my_group)
+    torch.cuda.synchronize()
+
+    # Set up Iris tensors
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.copy_(pytorch_input_tensor)
+    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
+
+    # Run Iris all_reduce with the group
+    shmem.barrier()
+    config = Config(all_reduce_variant=variant)
+    if variant == "two_shot":
+        config.all_reduce_distribution = 1
+
+    workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config)
+    shmem.barrier()
+
+    shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, group=my_group, config=config, workspace=workspace)
+    torch.cuda.synchronize()
+
+    # Compare results
+    atol = 1e-5
+    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
+
+    # Calculate expected sum for verification
+    expected_sum = sum(r + 1 for r in group_ranks)
+
+    try:
+        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
+            f"Max difference: {max_diff}, expected < {atol}\n"
+            f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
+            f"Iris output doesn't match PyTorch's all_reduce (variant={variant}, group_type={group_type})\n"
+            f"Expected sum: {expected_sum}, got iris={iris_output_tensor[0, 0].item()}, pytorch={pytorch_output_tensor[0, 0].item()}"
+        )
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
+
+        gc.collect()
+
+
+# =============================================================================
+# All-Gather with Process Groups
+# =============================================================================
+
+
+@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
+def test_all_gather_with_groups(group_type, dtype=torch.float32, M=128, N=64):
+    """Test all-gather with ProcessGroups."""
+    world_size, rank = _get_world_info()
+
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+
+    if group_type == "consecutive":
+        groups = _create_consecutive_groups(world_size, group_size=2)
+    else:
+        groups = _create_strided_groups(world_size, num_groups=2)
+
+    group_idx, my_group = _get_my_group(groups, rank)
+    assert my_group is not None
+
+    group_ranks = dist.get_process_group_ranks(my_group)
+    group_size = len(group_ranks)
+
+    # Each rank fills with its global rank + 1
+    pytorch_input_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
+    pytorch_input_tensor.fill_(float(rank + 1))
+
+    # PyTorch output: (group_size * M, N)
+    pytorch_output_tensor = torch.zeros(group_size * M, N, dtype=dtype, device=f"cuda:{rank}")
+
+    shmem.barrier()
+    dist.all_gather_into_tensor(pytorch_output_tensor, pytorch_input_tensor, group=my_group)
+    torch.cuda.synchronize()
+
+    # Iris tensors
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.copy_(pytorch_input_tensor)
+    iris_output_tensor = shmem.zeros((group_size * M, N), dtype=dtype)
+
+    shmem.barrier()
+    config = Config()
+    shmem.ccl.all_gather(iris_output_tensor, iris_input_tensor, group=my_group, config=config)
+    torch.cuda.synchronize()
+
+    atol = 1e-5
+    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
+
+    try:
+        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol), (
+            f"Max difference: {max_diff}, expected < {atol}\n"
+            f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
+            f"Iris output doesn't match PyTorch's all_gather (group_type={group_type})"
+        )
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
+
+        gc.collect()
+
+
+# =============================================================================
+# All-to-All with Process Groups
+# =============================================================================
+
+
+@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
+def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
+    """Test all-to-all with ProcessGroups."""
+    world_size, rank = _get_world_info()
+
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+
+    if group_type == "consecutive":
+        groups = _create_consecutive_groups(world_size, group_size=2)
+    else:
+        groups = _create_strided_groups(world_size, num_groups=2)
+
+    group_idx, my_group = _get_my_group(groups, rank)
+    assert my_group is not None
+
+    group_ranks = dist.get_process_group_ranks(my_group)
+    group_size = len(group_ranks)
+
+    # Each rank creates input with its rank value
+    pytorch_input_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
+    pytorch_input_tensor.fill_(float(rank))
+
+    # PyTorch all_to_all with list interface
+    pytorch_input_list = [pytorch_input_tensor.clone() for _ in range(group_size)]
+    pytorch_output_list = [torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(group_size)]
+
+    shmem.barrier()
+    dist.all_to_all(pytorch_output_list, pytorch_input_list, group=my_group)
+    torch.cuda.synchronize()
+
+    # Convert to concatenated format
+    pytorch_output_concat = torch.zeros(M, N * group_size, dtype=dtype, device=f"cuda:{rank}")
+    for i in range(group_size):
+        pytorch_output_concat[:, i * N : (i + 1) * N] = pytorch_output_list[i]
+
+    # Iris: concatenated format (M, N * group_size)
+    iris_input_concat = shmem.zeros((M, N * group_size), dtype=dtype)
+    for i in range(group_size):
+        iris_input_concat[:, i * N : (i + 1) * N] = pytorch_input_tensor
+
+    iris_output_concat = shmem.zeros((M, N * group_size), dtype=dtype)
+
+    shmem.barrier()
+    config = Config()
+    shmem.ccl.all_to_all(iris_output_concat, iris_input_concat, group=my_group, config=config)
+    torch.cuda.synchronize()
+
+    atol = 1e-5
+    max_diff = torch.abs(iris_output_concat - pytorch_output_concat).max().item()
+
+    try:
+        assert torch.allclose(iris_output_concat, pytorch_output_concat, atol=atol), (
+            f"Max difference: {max_diff}, expected < {atol}\n"
+            f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
+            f"Iris output doesn't match PyTorch's all_to_all (group_type={group_type})"
+        )
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
+
+        gc.collect()
+
+
+# =============================================================================
+# Reduce-Scatter with Process Groups
+# =============================================================================
+#
+# NOTE: Iris's reduce_scatter has different semantics than PyTorch's reduce_scatter_tensor:
+# - PyTorch: input is (group_size * M, N), output is (M, N) - splits reduced tensor
+# - Iris: input and output are both (M, N) - distributes tiles among ranks
+#
+# Until semantics are aligned, we test reduce_scatter with groups by verifying
+# that the group operations produce mathematically correct results.
+
+
+@pytest.mark.parametrize("group_type", ["consecutive", "strided"])
+def test_reduce_scatter_with_groups(group_type, dtype=torch.float32, M=256, N=128):
+    """
+    Test reduce-scatter with ProcessGroups.
+
+    Since Iris's reduce_scatter has different semantics than PyTorch's,
+    we verify correctness by checking that:
+    1. Each rank in the group receives its assigned tiles (reduced values)
+    2. The sum of all tiles across the group equals the expected total
+    """
+    world_size, rank = _get_world_info()
+
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+
+    if group_type == "consecutive":
+        groups = _create_consecutive_groups(world_size, group_size=2)
+    else:
+        groups = _create_strided_groups(world_size, num_groups=2)
+
+    group_idx, my_group = _get_my_group(groups, rank)
+    assert my_group is not None
+
+    group_ranks = dist.get_process_group_ranks(my_group)
+
+    # Each rank fills with its global rank + 1
+    input_value = float(rank + 1)
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.fill_(input_value)
+    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
+
+    shmem.barrier()
+    config = Config()
+    shmem.ccl.reduce_scatter(iris_output_tensor, iris_input_tensor, group=my_group, config=config)
+    torch.cuda.synchronize()
+
+    # Expected sum for each tile (all ranks in group contribute)
+    expected_sum = sum(r + 1 for r in group_ranks)
+
+    # In reduce_scatter with tile distribution, each rank gets some tiles
+    # with the reduced sum value. Check that non-zero tiles have the correct value.
+    non_zero_mask = iris_output_tensor != 0
+
+    try:
+        if non_zero_mask.any():
+            non_zero_values = iris_output_tensor[non_zero_mask]
+            # All non-zero values should equal the expected sum
+            assert torch.allclose(non_zero_values, torch.full_like(non_zero_values, expected_sum), atol=1e-5), (
+                f"Rank {rank} (group {group_idx}, ranks={group_ranks}): "
+                f"Non-zero tiles have incorrect values. Expected {expected_sum}, got unique values: {non_zero_values.unique().tolist()}"
+            )
+
+        # Gather outputs from all ranks in group to verify total coverage
+        # (This is a simplified check - full verification would need cross-rank communication)
+
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
+
+        gc.collect()
+
+
+# =============================================================================
+# Edge Cases and Verification Tests
+# =============================================================================
+
+
+def test_group_info_extraction():
+    """Test that extract_group_info returns correct values for different groups."""
+    world_size, rank = _get_world_info()
+
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+
+    from iris.ccl.utils import extract_group_info
+
+    # Test 1: group=None should return global info
+    rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(None, shmem)
+    assert rank_in_group == rank_global == rank, "group=None: rank mismatch"
+    assert ws == world_size, "group=None: world_size mismatch"
+    assert rank_start == 0, "group=None: rank_start should be 0"
+    assert rank_stride == 1, "group=None: rank_stride should be 1"
+
+    # Test 2: Consecutive group [0, 1] - ALL ranks must call new_group collectively
+    consecutive_group = dist.new_group([0, 1])
+    if rank < 2:
+        rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(consecutive_group, shmem)
+        assert rank_in_group == rank, "Consecutive group: rank_in_group mismatch"
+        assert rank_global == rank, "Consecutive group: rank_global mismatch"
+        assert ws == 2, "Consecutive group: world_size should be 2"
+        assert rank_start == 0, "Consecutive group: rank_start should be 0"
+        assert rank_stride == 1, "Consecutive group: rank_stride should be 1"
+
+    # Test 3: Strided group [0, 2] - ALL ranks must call new_group collectively
+    if world_size >= 4:
+        strided_group = dist.new_group([0, 2])
+        if rank in [0, 2]:
+            rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(strided_group, shmem)
+            expected_rank_in_group = 0 if rank == 0 else 1
+            assert rank_in_group == expected_rank_in_group, (
+                f"Strided group: rank_in_group should be {expected_rank_in_group}, got {rank_in_group}"
+            )
+            assert rank_global == rank, f"Strided group: rank_global should be {rank}, got {rank_global}"
+            assert ws == 2, "Strided group: world_size should be 2"
+            assert rank_start == 0, "Strided group: rank_start should be 0"
+            assert rank_stride == 2, "Strided group: rank_stride should be 2"
+
+    shmem.barrier()
+    del shmem
+    import gc
+
+    gc.collect()
+
+
+def test_all_reduce_group_correctness():
+    """
+    Verify all-reduce with groups produces correct mathematical results.
+
+    With strided groups [0,2] and [1,3]:
+    - Group [0,2]: ranks fill with 1 and 3, sum should be 4
+    - Group [1,3]: ranks fill with 2 and 4, sum should be 6
+    """
+    world_size, rank = _get_world_info()
+
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+
+    # Create strided groups
+    groups = _create_strided_groups(world_size, num_groups=2)
+    group_idx, my_group = _get_my_group(groups, rank)
+    group_ranks = dist.get_process_group_ranks(my_group)
+
+    M, N = 64, 32
+    dtype = torch.float32
+
+    # Fill with rank + 1
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.fill_(float(rank + 1))
+    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
+
+    shmem.barrier()
+    config = Config(all_reduce_variant="two_shot")
+    workspace = shmem.ccl.all_reduce_preamble(iris_output_tensor, iris_input_tensor, config=config)
+    shmem.barrier()
+
+    shmem.ccl.all_reduce(iris_output_tensor, iris_input_tensor, group=my_group, config=config, workspace=workspace)
+    torch.cuda.synchronize()
+
+    # Calculate expected sum
+    expected_sum = sum(r + 1 for r in group_ranks)
+    actual_sum = iris_output_tensor[0, 0].item()
+
+    try:
+        assert abs(actual_sum - expected_sum) < 1e-5, (
+            f"Rank {rank} (group ranks={group_ranks}): Expected sum {expected_sum}, got {actual_sum}"
+        )
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
+
+        gc.collect()
+
+
+def test_rank_stride_target_rank_calculation():
+    """
+    Explicitly test that rank_start + i * rank_stride correctly computes target_rank.
+
+    This test verifies the core indexing mechanism used in CCL kernels:
+    - Loop index `i` goes from 0 to world_size-1 (position in group)
+    - `target_rank = rank_start + i * rank_stride` computes global rank
+    - `group_rank` (rank_in_group) is compared with `i` for local vs remote operations
+
+    Example with strided group [0, 2] (stride=2):
+        i=0 -> target_rank = 0 + 0*2 = 0 (global rank 0)
+        i=1 -> target_rank = 0 + 1*2 = 2 (global rank 2)
+    """
+    world_size, rank = _get_world_info()
+
+    if world_size != 4:
+        pytest.skip("This test requires exactly 4 ranks for strided group testing")
+
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+
+    from iris.ccl.utils import extract_group_info
+
+    # Test with strided group [0, 2] - stride of 2
+    strided_group_02 = dist.new_group([0, 2])
+
+    # Test with strided group [1, 3] - stride of 2
+    strided_group_13 = dist.new_group([1, 3])
+
+    if rank in [0, 2]:
+        rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(strided_group_02, shmem)
+
+        # Verify the target_rank calculation for each loop iteration
+        expected_target_ranks = [0, 2]  # Global ranks in the group
+        for i in range(ws):
+            computed_target_rank = rank_start + i * rank_stride
+            assert computed_target_rank == expected_target_ranks[i], (
+                f"Rank {rank}: For i={i}, expected target_rank={expected_target_ranks[i]}, "
+                f"got {computed_target_rank} (rank_start={rank_start}, rank_stride={rank_stride})"
+            )
+
+        # Verify that i == group_rank identifies the local rank correctly
+        expected_local_i = 0 if rank == 0 else 1
+        assert rank_in_group == expected_local_i, (
+            f"Rank {rank}: rank_in_group={rank_in_group} should match expected_local_i={expected_local_i}"
+        )
+
+        # Verify: when i == rank_in_group, target_rank == rank_global
+        local_target_rank = rank_start + rank_in_group * rank_stride
+        assert local_target_rank == rank_global, (
+            f"Rank {rank}: local_target_rank={local_target_rank} should equal rank_global={rank_global}"
+        )
+
+    if rank in [1, 3]:
+        rank_in_group, rank_global, ws, rank_start, rank_stride = extract_group_info(strided_group_13, shmem)
+
+        # Verify the target_rank calculation for each loop iteration
+        expected_target_ranks = [1, 3]  # Global ranks in the group
+        for i in range(ws):
+            computed_target_rank = rank_start + i * rank_stride
+            assert computed_target_rank == expected_target_ranks[i], (
+                f"Rank {rank}: For i={i}, expected target_rank={expected_target_ranks[i]}, "
+                f"got {computed_target_rank} (rank_start={rank_start}, rank_stride={rank_stride})"
+            )
+
+        # Verify that i == group_rank identifies the local rank correctly
+        expected_local_i = 0 if rank == 1 else 1
+        assert rank_in_group == expected_local_i, (
+            f"Rank {rank}: rank_in_group={rank_in_group} should match expected_local_i={expected_local_i}"
+        )
+
+        # Verify: when i == rank_in_group, target_rank == rank_global
+        local_target_rank = rank_start + rank_in_group * rank_stride
+        assert local_target_rank == rank_global, (
+            f"Rank {rank}: local_target_rank={local_target_rank} should equal rank_global={rank_global}"
+        )
+
+    shmem.barrier()
+    del shmem
+    import gc
+
+    gc.collect()
+
+
+def test_all_gather_strided_data_placement():
+    """
+    Verify all-gather with strided groups places data in correct output locations.
+
+    This test ensures that with strided groups like [0, 2]:
+    - Rank 0's data goes to output[0:M, :] on all group members
+    - Rank 2's data goes to output[M:2M, :] on all group members
+
+    The key insight: output placement uses rank_in_group (0, 1) not global rank (0, 2).
+    """
+    world_size, rank = _get_world_info()
+
+    if world_size != 4:
+        pytest.skip("This test requires exactly 4 ranks for strided group testing")
+
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+
+    M, N = 64, 32
+    dtype = torch.float32
+
+    # Create strided groups [0, 2] and [1, 3]
+    strided_group_02 = dist.new_group([0, 2])
+    strided_group_13 = dist.new_group([1, 3])
+
+    # Test with group [0, 2]
+    if rank in [0, 2]:
+        group_ranks = [0, 2]
+        group_size = 2
+
+        # Each rank fills input with its global rank + 1 for identification
+        input_tensor = shmem.zeros((M, N), dtype=dtype)
+        input_tensor.fill_(float(rank + 1))  # Rank 0 -> 1.0, Rank 2 -> 3.0
+
+        output_tensor = shmem.zeros((group_size * M, N), dtype=dtype)
+
+        shmem.barrier()
+        config = Config()
+        shmem.ccl.all_gather(output_tensor, input_tensor, group=strided_group_02, config=config)
+        torch.cuda.synchronize()
+
+        # Verify data placement:
+        # - output[0:M, :] should contain rank 0's data (value 1.0)
+        # - output[M:2M, :] should contain rank 2's data (value 3.0)
+        chunk_0 = output_tensor[0:M, :].mean().item()
+        chunk_1 = output_tensor[M : 2 * M, :].mean().item()
+
+        expected_chunk_0 = 1.0  # From global rank 0 (rank_in_group=0)
+        expected_chunk_1 = 3.0  # From global rank 2 (rank_in_group=1)
+
+        assert abs(chunk_0 - expected_chunk_0) < 1e-5, (
+            f"Rank {rank}: output[0:M] should be {expected_chunk_0} (from rank 0), got {chunk_0}"
+        )
+        assert abs(chunk_1 - expected_chunk_1) < 1e-5, (
+            f"Rank {rank}: output[M:2M] should be {expected_chunk_1} (from rank 2), got {chunk_1}"
+        )
+
+    # Test with group [1, 3]
+    if rank in [1, 3]:
+        group_ranks = [1, 3]
+        group_size = 2
+
+        # Each rank fills input with its global rank + 1 for identification
+        input_tensor = shmem.zeros((M, N), dtype=dtype)
+        input_tensor.fill_(float(rank + 1))  # Rank 1 -> 2.0, Rank 3 -> 4.0
+
+        output_tensor = shmem.zeros((group_size * M, N), dtype=dtype)
+
+        shmem.barrier()
+        config = Config()
+        shmem.ccl.all_gather(output_tensor, input_tensor, group=strided_group_13, config=config)
+        torch.cuda.synchronize()
+
+        # Verify data placement:
+        # - output[0:M, :] should contain rank 1's data (value 2.0)
+        # - output[M:2M, :] should contain rank 3's data (value 4.0)
+        chunk_0 = output_tensor[0:M, :].mean().item()
+        chunk_1 = output_tensor[M : 2 * M, :].mean().item()
+
+        expected_chunk_0 = 2.0  # From global rank 1 (rank_in_group=0)
+        expected_chunk_1 = 4.0  # From global rank 3 (rank_in_group=1)
+
+        assert abs(chunk_0 - expected_chunk_0) < 1e-5, (
+            f"Rank {rank}: output[0:M] should be {expected_chunk_0} (from rank 1), got {chunk_0}"
+        )
+        assert abs(chunk_1 - expected_chunk_1) < 1e-5, (
+            f"Rank {rank}: output[M:2M] should be {expected_chunk_1} (from rank 3), got {chunk_1}"
+        )
+
+    shmem.barrier()
+    del shmem
+    import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+    gc.collect()
diff --git a/tests/examples/test_all_load_bench.py b/tests/examples/test_all_load_bench.py
index d0a15dc1d..2ee4deef8 100644
--- a/tests/examples/test_all_load_bench.py
+++ b/tests/examples/test_all_load_bench.py
@@ -20,7 +20,147 @@
 spec.loader.exec_module(module)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_size, heap_size",
+    [
+        ((1 << 20), (1 << 30)),  # 1 MiB buffer, 1 GiB heap
+        ((1 << 22), (1 << 31)),  # 4 MiB buffer, 2 GiB heap
+    ],
+)
+@pytest.mark.parametrize(
+    "block_size",
+    [
+        512,
+        1024,
+    ],
+)
+def test_all_load_bench(dtype, buffer_size, heap_size, block_size):
+    # TODO: Benchmark is not accurate. See: https://github.com/ROCm/iris/issues/119
+    pytest.skip("Benchmark is not accurate. See: https://github.com/ROCm/iris/issues/119")
+    shmem = None
+    try:
+        shmem = iris.iris(heap_size)
+        num_ranks = shmem.get_num_ranks()
+
+        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
+        n_elements = buffer_size // element_size_bytes
+        buffer = shmem.zeros(n_elements, dtype=dtype)
+
+        # Create arguments similar to what all_load_bench.py expects
+        args = {
+            "datatype": _torch_dtype_to_str(dtype),
+            "block_size": block_size,
+            "active_ranks": num_ranks,
+            "num_warmup": 4,
+            "num_experiments": 8,
+            "verbose": False,
+            "validate": False,
+        }
+
+        shmem.barrier()
+
+        # Run the experiment and measure bandwidth
+        bandwidth_gbps = module.run_experiment(shmem, args, buffer)
+
+        shmem.barrier()
+
+        # Verify that we got a reasonable bandwidth measurement
+        assert isinstance(bandwidth_gbps, float)
+        assert bandwidth_gbps >= 0.0  # Bandwidth should be non-negative
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        if shmem is not None:
+            try:
+                shmem.barrier()
+            except Exception:
+                pass  # Ignore errors during cleanup
+            # Explicitly delete the shmem instance to trigger cleanup
+            del shmem
+            # Force garbage collection to ensure IPC handles are cleaned up
+            import gc
+
+            gc.collect()
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.float16,  # Test with one dtype for validation
+    ],
+)
+def test_all_load_bench_with_validation(dtype):
+    """Test all_load_bench with validation enabled to ensure correctness"""
+    heap_size = 1 << 30  # 1 GiB heap
+    buffer_size = 1 << 20  # 1 MiB buffer
+    block_size = 512
+
+    shmem = None
+    try:
+        shmem = iris.iris(heap_size)
+        num_ranks = shmem.get_num_ranks()
+
+        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
+        n_elements = buffer_size // element_size_bytes
+        buffer = shmem.zeros(n_elements, dtype=dtype)
+
+        # Create arguments with validation enabled
+        args = {
+            "datatype": _torch_dtype_to_str(dtype),
+            "block_size": block_size,
+            "active_ranks": num_ranks,
+            "num_warmup": 1,
+            "num_experiments": 1,
+            "verbose": False,
+            "validate": True,  # Enable validation
+        }
+
+        shmem.barrier()
+
+        # Run the experiment and measure bandwidth
+        bandwidth_gbps = module.run_experiment(shmem, args, buffer)
+
+        shmem.barrier()
+
+        # Verify that we got a reasonable bandwidth measurement
+        assert isinstance(bandwidth_gbps, float)
+        assert bandwidth_gbps >= 0.0  # Bandwidth should be non-negative
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        if shmem is not None:
+            try:
+                shmem.barrier()
+            except Exception:
+                pass  # Ignore errors during cleanup
+            # Explicitly delete the shmem instance to trigger cleanup
+            del shmem
+            # Force garbage collection to ensure IPC handles are cleaned up
+            import gc
+
+            gc.collect()
 
-pytestmark = pytest.mark.multi_rank_required
 
-@pytest.mark.parametrize(
\ No newline at end of file
+def _torch_dtype_to_str(dtype):
+    """Convert torch dtype to string format expected by all_load_bench.py"""
+    if dtype == torch.int8:
+        return "int8"
+    elif dtype == torch.float16:
+        return "fp16"
+    elif dtype == torch.bfloat16:
+        return "bf16"
+    elif dtype == torch.float32:
+        return "fp32"
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index 5f630756c..a9f9b1402 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -39,7 +39,140 @@
 spec.loader.exec_module(module)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_size, heap_size",
+    [
+        (20480, (1 << 33)),
+    ],
+)
+@pytest.mark.parametrize(
+    "block_size",
+    [
+        512,
+        1024,
+    ],
+)
+def test_atomic_bandwidth(dtype, buffer_size, heap_size, block_size):
+    """Test that atomic_add benchmark runs and produces positive bandwidth."""
+    shmem = None
+    try:
+        shmem = iris.iris(heap_size)
+        num_ranks = shmem.get_num_ranks()
 
-pytestmark = pytest.mark.multi_rank_required
+        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
+        n_elements = buffer_size // element_size_bytes
+        source_buffer = shmem.arange(n_elements, dtype=dtype)
+
+        shmem.barrier()
+
+        args = {
+            "datatype": torch_dtype_to_str(dtype),
+            "block_size": block_size,
+            "verbose": False,
+            "validate": False,
+            "num_experiments": 10,
+            "num_warmup": 5,
+        }
+
+        source_rank = 0
+        destination_rank = 1 if num_ranks > 1 else 0
+
+        bandwidth_gbps, _ = module.run_experiment(shmem, args, source_rank, destination_rank, source_buffer)
+
+        assert bandwidth_gbps > 0, f"Bandwidth should be positive, got {bandwidth_gbps}"
+
+        shmem.barrier()
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        if shmem is not None:
+            try:
+                shmem.barrier()
+            except Exception:
+                pass  # Ignore errors during cleanup
+            # Explicitly delete the shmem instance to trigger cleanup
+            del shmem
+            # Force garbage collection to ensure IPC handles are cleaned up
+            import gc
+
+            gc.collect()
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_size, heap_size",
+    [
+        (20480, (1 << 33)),
+    ],
+)
+@pytest.mark.parametrize(
+    "block_size",
+    [
+        512,
+        1024,
+    ],
+)
+def test_atomic_correctness(dtype, buffer_size, heap_size, block_size):
+    """Test that atomic_add benchmark runs and produces positive bandwidth."""
+    shmem = None
+    try:
+        shmem = iris.iris(heap_size)
+        num_ranks = shmem.get_num_ranks()
+
+        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
+        n_elements = buffer_size // element_size_bytes
+        source_buffer = shmem.arange(n_elements, dtype=dtype)
+
+        shmem.barrier()
+
+        args = {
+            "datatype": torch_dtype_to_str(dtype),
+            "block_size": block_size,
+            "verbose": False,
+            "validate": False,
+            "num_experiments": 1,
+            "num_warmup": 0,
+        }
+
+        source_rank = 0
+        destination_rank = 1 if num_ranks > 1 else 0
+
+        _, result_buffer = module.run_experiment(shmem, args, source_rank, destination_rank, source_buffer)
+
+        if shmem.get_rank() == destination_rank:
+            expected = torch.ones(n_elements, dtype=dtype, device="cuda")
+
+            assert torch.allclose(result_buffer, expected), "Result buffer should be equal to expected"
+
+        shmem.barrier()
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        if shmem is not None:
+            try:
+                shmem.barrier()
+            except Exception:
+                pass  # Ignore errors during cleanup
+            # Explicitly delete the shmem instance to trigger cleanup
+            del shmem
+            # Force garbage collection to ensure IPC handles are cleaned up
+            import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+            gc.collect()
diff --git a/tests/examples/test_flash_decode.py b/tests/examples/test_flash_decode.py
index cd305b26a..80a0e8349 100644
--- a/tests/examples/test_flash_decode.py
+++ b/tests/examples/test_flash_decode.py
@@ -120,7 +120,137 @@ def prepare_correctness_data(cfg, args, num_query_heads, num_kv_heads, NUM_BLOCK
     return {"query": query, "key_value_cache": key_value_cache}
 
 
+@pytest.mark.parametrize("head_dim", [128])
+@pytest.mark.parametrize("num_seqs", [1, 8])
+@pytest.mark.parametrize("num_heads", [48, 96])
+@pytest.mark.parametrize("kv_len", [4096, 65536])
+def test_correctness_fused_full(kv_len, num_heads, num_seqs, head_dim):
+    """
+    Tests the correctness of the Iris Fused implementation against the Torch reference.
+    This test is parameterized to run all combinations of the parameters.
+    """
+    shmem = None
+    try:
+        shmem = iris.iris()
 
-pytestmark = pytest.mark.multi_rank_required
+        args = Namespace()
+        args.rank = shmem.get_rank()
+        args.num_ranks = shmem.get_num_ranks()
+        args.local_num_ranks = shmem.get_num_ranks()
+        args.shmem = shmem
+
+        config = {
+            "kv_len": kv_len,
+            "num_heads": num_heads,
+            "num_seqs": num_seqs,
+            "head_dim": head_dim,
+            "dtype": torch.float16,
+            "block_size": 1,
+            "soft_cap": 0,
+        }
+
+        # torch.manual_seed(42)
+        torch.set_default_device("cuda")
+
+        num_query_heads = num_heads
+        num_kv_heads = num_query_heads // 8 if num_query_heads >= 8 else 1
+        scale = head_dim**-0.5
+        NUM_BLOCKS_PER_RANK = config["kv_len"] + 1
+        NUM_BLOCKS = NUM_BLOCKS_PER_RANK * args.num_ranks
+
+        tensor_data = prepare_correctness_data(config, args, num_query_heads, num_kv_heads, NUM_BLOCKS)
+        query = tensor_data["query"]
+        key_value_cache = tensor_data["key_value_cache"]
+
+        key_cache = key_value_cache[:, 0, :, :, :].contiguous()
+        value_cache = key_value_cache[:, 1, :, :, :].contiguous()
+        key_cache_this_rank = key_cache[
+            args.rank * NUM_BLOCKS_PER_RANK : (args.rank + 1) * NUM_BLOCKS_PER_RANK
+        ].contiguous()
+        value_cache_this_rank = value_cache[
+            args.rank * NUM_BLOCKS_PER_RANK : (args.rank + 1) * NUM_BLOCKS_PER_RANK
+        ].contiguous()
+
+        block_tables_this_rank = torch.arange(NUM_BLOCKS_PER_RANK, dtype=torch.int32).repeat(num_seqs, 1)
+        all_block_tables_numpy = iris._distributed_helpers.distributed_allgather_multidim(
+            block_tables_this_rank.cpu().numpy()
+        )
+        block_tables = torch.from_numpy(all_block_tables_numpy).view(args.num_ranks, num_seqs, -1)
+        ref_block_tables = torch.cat([block_tables[i] + i * NUM_BLOCKS_PER_RANK for i in range(args.num_ranks)], dim=-1)
+
+        common_params = {
+            "num_q_heads": num_query_heads,
+            "num_kv_heads": num_kv_heads,
+            "q_head_dim": head_dim,
+            "v_head_dim": head_dim,
+            "page_size": config["block_size"],
+            "scale": scale,
+            "soft_cap": config["soft_cap"],
+            "max_allowed_batch": num_seqs,
+        }
+
+        iris_fd_layer = flash_decode_fused_layer(
+            args.shmem,
+            args.rank,
+            args.rank // args.local_num_ranks,
+            args.num_ranks,
+            args.num_ranks // args.local_num_ranks,
+            **common_params,
+        )
+
+        args.shmem.barrier()
+        if hasattr(iris_fd_layer, "clear_flags"):
+            iris_fd_layer.clear_flags()
+        args.shmem.barrier()
+
+        kv_lens_per_rank = [config["kv_len"]] * num_seqs
+        global_kv_lens = [kv_lens_per_rank[0] * args.num_ranks] * num_seqs
+        kv_lens_tensor = torch.tensor(kv_lens_per_rank, dtype=torch.int32, device=query.device)
+        global_kv_lens_tensor = kv_lens_tensor.unsqueeze(0).repeat(args.num_ranks, 1)
+
+        output = iris_fd_layer(
+            query, key_cache_this_rank, value_cache_this_rank, global_kv_lens_tensor, block_tables_this_rank
+        )
+        torch.cuda.synchronize()
+
+        ref_output = ref_paged_attn(
+            query=query.clone(),
+            key_cache=key_cache,
+            value_cache=value_cache,
+            query_lens=[1] * num_seqs,
+            kv_lens_per_rank=global_kv_lens,
+            block_tables=ref_block_tables,
+            scale=scale,
+            soft_cap=config["soft_cap"],
+        )
+        args.shmem.barrier()
+
+        error = None
+        try:
+            atol = 1e-4
+            rtol = 1e-4
+            torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
+        except AssertionError as e:
+            error = e
+
+        print_correctness_report(args.rank, output, ref_output, error)
+
+        if error:
+            raise error
+
+        args.shmem.barrier()
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        if shmem is not None:
+            try:
+                shmem.barrier()
+            except Exception:
+                pass  # Ignore errors during cleanup
+            # Explicitly delete the shmem instance to trigger cleanup
+            del shmem
+            # Force garbage collection to ensure IPC handles are cleaned up
+            import gc
 
-@pytest.mark.parametrize("head_dim", [128])
\ No newline at end of file
+            gc.collect()
diff --git a/tests/examples/test_load_bench.py b/tests/examples/test_load_bench.py
index 1f3635c69..3d9082a5e 100644
--- a/tests/examples/test_load_bench.py
+++ b/tests/examples/test_load_bench.py
@@ -22,7 +22,60 @@
 
 
 @pytest.mark.skip(reason="Test is inconsistent and needs debugging - tracked in issue")
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_size, heap_size",
+    [
+        ((1 << 32), (1 << 33)),
+    ],
+)
+@pytest.mark.parametrize(
+    "block_size",
+    [
+        512,
+        1024,
+    ],
+)
+def test_load_bench(dtype, buffer_size, heap_size, block_size):
+    shmem = None
+    try:
+        shmem = iris.iris(heap_size)
+        num_ranks = shmem.get_num_ranks()
 
-pytestmark = pytest.mark.multi_rank_required
+        bandwidth_matrix = np.zeros((num_ranks, num_ranks), dtype=np.float32)
+        element_size_bytes = torch.tensor([], dtype=dtype).element_size()
+        source_buffer = shmem.ones(buffer_size // element_size_bytes, dtype=dtype)
+        result_buffer = shmem.zeros_like(source_buffer)
+
+        shmem.barrier()
+
+        for source_rank in range(num_ranks):
+            for destination_rank in range(num_ranks):
+                bandwidth_gbps = module.bench_load(
+                    shmem, source_rank, destination_rank, source_buffer, result_buffer, block_size, dtype
+                )
+                bandwidth_matrix[source_rank, destination_rank] = bandwidth_gbps
+                shmem.barrier()
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        if shmem is not None:
+            try:
+                shmem.barrier()
+            except Exception:
+                pass  # Ignore errors during cleanup
+            # Explicitly delete the shmem instance to trigger cleanup
+            del shmem
+            # Force garbage collection to ensure IPC handles are cleaned up
+            import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+            gc.collect()
diff --git a/tests/examples/test_message_passing.py b/tests/examples/test_message_passing.py
index d0c8aa70f..d1104113d 100644
--- a/tests/examples/test_message_passing.py
+++ b/tests/examples/test_message_passing.py
@@ -112,7 +112,61 @@ def run_message_passing_kernels(module, args):
             gc.collect()
 
 
-
-pytestmark = pytest.mark.multi_rank_required
-
-@pytest.mark.parametrize(
\ No newline at end of file
+@pytest.mark.parametrize(
+    "dtype_str",
+    [
+        "int8",
+        "fp16",
+        "bf16",
+        "fp32",
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_size, heap_size",
+    [
+        (4096, 1 << 20),  # Smaller sizes for testing
+        (8192, 1 << 21),
+    ],
+)
+@pytest.mark.parametrize(
+    "block_size",
+    [
+        512,
+        1024,
+    ],
+)
+def test_message_passing_load_store(dtype_str, buffer_size, heap_size, block_size):
+    """Test message passing with load/store operations."""
+    args = create_test_args(dtype_str, buffer_size, heap_size, block_size)
+    success = run_message_passing_kernels(load_store_module, args)
+    assert success, "Message passing load/store validation failed"
+
+
+@pytest.mark.parametrize(
+    "dtype_str",
+    [
+        "int8",
+        "fp16",
+        "bf16",
+        "fp32",
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_size, heap_size",
+    [
+        (4096, 1 << 20),  # Smaller sizes for testing
+        (8192, 1 << 21),
+    ],
+)
+@pytest.mark.parametrize(
+    "block_size",
+    [
+        512,
+        1024,
+    ],
+)
+def test_message_passing_put(dtype_str, buffer_size, heap_size, block_size):
+    """Test message passing with put operations."""
+    args = create_test_args(dtype_str, buffer_size, heap_size, block_size)
+    success = run_message_passing_kernels(put_module, args)
+    assert success, "Message passing put validation failed"
diff --git a/tests/unittests/test_atomic_add_gluon.py b/tests/unittests/test_atomic_add_gluon.py
index b5be281e7..88b35af91 100644
--- a/tests/unittests/test_atomic_add_gluon.py
+++ b/tests/unittests/test_atomic_add_gluon.py
@@ -43,7 +43,84 @@ def atomic_add_kernel(
         )
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_add_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_add_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_add_triton.py b/tests/unittests/test_atomic_add_triton.py
index 879474c10..dd1a9a811 100644
--- a/tests/unittests/test_atomic_add_triton.py
+++ b/tests/unittests/test_atomic_add_triton.py
@@ -43,7 +43,74 @@ def atomic_add_kernel(
         )
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_add_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_add_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_and_gluon.py b/tests/unittests/test_atomic_and_gluon.py
index 7781a0749..27aceb016 100644
--- a/tests/unittests/test_atomic_and_gluon.py
+++ b/tests/unittests/test_atomic_and_gluon.py
@@ -37,7 +37,88 @@ def atomic_and_kernel(
         ctx.atomic_and(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_and_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    bit_width = 32 if dtype == torch.int32 else 64
+    effective_bits = min(num_ranks, bit_width)
+    initial_mask = (1 << effective_bits) - 1
+
+    results = shmem.full((BLOCK_SIZE,), initial_mask, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_and_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # All ranks start out with a full mask vector 0xFFFFFF (initial_mask)
+    # All ranks then take turns in clearing their bit position in the mask
+    # By the end we would have effective_bits - num_ranks many ones followed by num_ranks zeros
+    expected_scalar = ~((1 << num_ranks) - 1) & initial_mask
+    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_and_triton.py b/tests/unittests/test_atomic_and_triton.py
index eda402e32..5abe90d7c 100644
--- a/tests/unittests/test_atomic_and_triton.py
+++ b/tests/unittests/test_atomic_and_triton.py
@@ -34,7 +34,78 @@ def atomic_and_kernel(
         iris.atomic_and(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_and_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    bit_width = 32 if dtype == torch.int32 else 64
+    effective_bits = min(num_ranks, bit_width)
+    initial_mask = (1 << effective_bits) - 1
+
+    results = shmem.full((BLOCK_SIZE,), initial_mask, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_and_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # All ranks start out with a full mask vector 0xFFFFFF (initial_mask)
+    # All ranks then take turns in clearing their bit position in the mask
+    # By the end we would have effective_bits - num_ranks many ones followed by num_ranks zeros
+    expected_scalar = ~((1 << num_ranks) - 1) & initial_mask
+    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_cas_gluon.py b/tests/unittests/test_atomic_cas_gluon.py
index c85e80bc8..fd856d3b1 100644
--- a/tests/unittests/test_atomic_cas_gluon.py
+++ b/tests/unittests/test_atomic_cas_gluon.py
@@ -34,7 +34,77 @@ def atomic_cas_kernel(
         ctx.atomic_cas(results, cmp, val, target_rank, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int16,
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+def test_atomic_cas_api(dtype, sem, scope):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros((1,), dtype=dtype)
+    # Create single-element tensors for cmp and val values (workaround for 0D tensor limitation)
+    cmp_val = shmem.zeros((1,), dtype=dtype)  # Will be 0
+    val_tensor = shmem.full((1,), num_ranks, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_cas_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        cmp_val,
+        val_tensor,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_cas_triton.py b/tests/unittests/test_atomic_cas_triton.py
index a801b82db..a63db3b74 100644
--- a/tests/unittests/test_atomic_cas_triton.py
+++ b/tests/unittests/test_atomic_cas_triton.py
@@ -29,7 +29,63 @@ def atomic_cas_kernel(
         iris.atomic_cas(results, cmp, val, cur_rank, target_rank, heap_bases, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int16,
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+def test_atomic_cas_api(dtype, sem, scope):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros((1,), dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_cas_kernel[grid](results, sem, scope, cur_rank, num_ranks, heap_bases)
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_max_gluon.py b/tests/unittests/test_atomic_max_gluon.py
index 72ec075ce..7a33df0ab 100644
--- a/tests/unittests/test_atomic_max_gluon.py
+++ b/tests/unittests/test_atomic_max_gluon.py
@@ -35,7 +35,84 @@ def atomic_max_kernel(
         ctx.atomic_max(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_max_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    min_val = torch.iinfo(dtype).min
+    results = shmem.full((BLOCK_SIZE,), min_val, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_max_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # All ranks participate in performing the max operation
+    # Each rank performs the atomic operation: max(rank_id + 1)
+    # The result equals the ID of the last rank + 1
+    expected = torch.full((BLOCK_SIZE,), num_ranks, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_max_triton.py b/tests/unittests/test_atomic_max_triton.py
index 63c4301db..d9875786f 100644
--- a/tests/unittests/test_atomic_max_triton.py
+++ b/tests/unittests/test_atomic_max_triton.py
@@ -32,7 +32,74 @@ def atomic_max_kernel(
         iris.atomic_max(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_max_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    min_val = torch.iinfo(dtype).min
+    results = shmem.full((BLOCK_SIZE,), min_val, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_max_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # All ranks participate in performing the max operation
+    # Each rank performs the atomic operation: max(rank_id + 1)
+    # The result equals the ID of the last rank + 1
+    expected = torch.full((BLOCK_SIZE,), num_ranks, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_min_gluon.py b/tests/unittests/test_atomic_min_gluon.py
index b8fdc4ce4..7f2ba7d55 100644
--- a/tests/unittests/test_atomic_min_gluon.py
+++ b/tests/unittests/test_atomic_min_gluon.py
@@ -35,7 +35,83 @@ def atomic_min_kernel(
         ctx.atomic_min(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_min_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    max_val = torch.iinfo(dtype).max
+    results = shmem.full((BLOCK_SIZE,), max_val, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_min_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+    # All ranks participate in performing the min operation
+    # Each rank performs the atomic operation: min(rank_id + 1)
+    # The result equals the ID of the first rank + 1
+    expected = torch.full((BLOCK_SIZE,), 1, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_min_triton.py b/tests/unittests/test_atomic_min_triton.py
index c43a66df4..1fa6f90de 100644
--- a/tests/unittests/test_atomic_min_triton.py
+++ b/tests/unittests/test_atomic_min_triton.py
@@ -32,7 +32,73 @@ def atomic_min_kernel(
         iris.atomic_min(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_min_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    max_val = torch.iinfo(dtype).max
+    results = shmem.full((BLOCK_SIZE,), max_val, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_min_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+    # All ranks participate in performing the min operation
+    # Each rank performs the atomic operation: min(rank_id + 1)
+    # The result equals the ID of the first rank + 1
+    expected = torch.full((BLOCK_SIZE,), 1, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_or_gluon.py b/tests/unittests/test_atomic_or_gluon.py
index 2c85c3030..8de1ade2c 100644
--- a/tests/unittests/test_atomic_or_gluon.py
+++ b/tests/unittests/test_atomic_or_gluon.py
@@ -36,7 +36,88 @@ def atomic_or_kernel(
         ctx.atomic_or(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_or_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_or_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    bit_width = 32 if dtype == torch.int32 else 64
+    effective_bits = min(num_ranks, bit_width)
+    expected_scalar = (1 << effective_bits) - 1
+
+    # All ranks start out with a zero mask
+    # All ranks then take turns in setting the their bit position in the mask to 1
+    # By the end we would have a bit vector with num_ranks many 1's as long as num_ranks <= bit_width
+    # or a full bit vector if num_ranks > bit_width
+    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_or_triton.py b/tests/unittests/test_atomic_or_triton.py
index a7aec304a..132be6d57 100644
--- a/tests/unittests/test_atomic_or_triton.py
+++ b/tests/unittests/test_atomic_or_triton.py
@@ -33,7 +33,78 @@ def atomic_or_kernel(
         iris.atomic_or(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_or_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_or_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    bit_width = 32 if dtype == torch.int32 else 64
+    effective_bits = min(num_ranks, bit_width)
+    expected_scalar = (1 << effective_bits) - 1
+
+    # All ranks start out with a zero mask
+    # All ranks then take turns in setting the their bit position in the mask to 1
+    # By the end we would have a bit vector with num_ranks many 1's as long as num_ranks <= bit_width
+    # or a full bit vector if num_ranks > bit_width
+    expected = torch.full((BLOCK_SIZE,), expected_scalar, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual  :", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_xchg_gluon.py b/tests/unittests/test_atomic_xchg_gluon.py
index b6c827f0a..7ef3a4fb1 100644
--- a/tests/unittests/test_atomic_xchg_gluon.py
+++ b/tests/unittests/test_atomic_xchg_gluon.py
@@ -32,7 +32,75 @@ def atomic_xchg_kernel(
         ctx.atomic_xchg(results, val, target_rank, mask=None, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+def test_atomic_xchg_api(dtype, sem, scope):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros((1,), dtype=dtype)
+    # Create single-element tensor for val value (workaround for 0D tensor limitation)
+    val_tensor = shmem.full((1,), num_ranks, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_xchg_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        val_tensor,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_xchg_triton.py b/tests/unittests/test_atomic_xchg_triton.py
index 29f5b28ca..cfbbd2df1 100644
--- a/tests/unittests/test_atomic_xchg_triton.py
+++ b/tests/unittests/test_atomic_xchg_triton.py
@@ -28,7 +28,63 @@ def atomic_xchg_kernel(
         iris.atomic_xchg(results, val, cur_rank, target_rank, heap_bases, mask=None, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+def test_atomic_xchg_api(dtype, sem, scope):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros((1,), dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_xchg_kernel[grid](results, sem, scope, cur_rank, num_ranks, heap_bases)
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.full((1,), num_ranks, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_xor_gluon.py b/tests/unittests/test_atomic_xor_gluon.py
index 472bd8787..f2cf50743 100644
--- a/tests/unittests/test_atomic_xor_gluon.py
+++ b/tests/unittests/test_atomic_xor_gluon.py
@@ -37,7 +37,86 @@ def atomic_xor_kernel(
         ctx.atomic_xor(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_xor_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = (1,)
+    atomic_xor_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        results,
+        sem,
+        scope,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # If we xor '1' in num_ranks times:
+    # - If num_ranks is odd  -> final = 1
+    # - If num_ranks is even -> final = 0
+    if (num_ranks % 2) == 1:
+        expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
+    else:
+        expected = torch.zeros(BLOCK_SIZE, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_atomic_xor_triton.py b/tests/unittests/test_atomic_xor_triton.py
index c64285cc7..d12644ebf 100644
--- a/tests/unittests/test_atomic_xor_triton.py
+++ b/tests/unittests/test_atomic_xor_triton.py
@@ -34,7 +34,76 @@ def atomic_xor_kernel(
         iris.atomic_xor(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int32,
+        torch.int64,
+    ],
+)
+@pytest.mark.parametrize(
+    "sem",
+    [
+        "acquire",
+        "release",
+        "acq_rel",
+    ],
+)
+@pytest.mark.parametrize(
+    "scope",
+    [
+        "cta",
+        "gpu",
+        "sys",
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_atomic_xor_api(dtype, sem, scope, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    results = shmem.zeros(BLOCK_SIZE, dtype=dtype)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    atomic_xor_kernel[grid](results, sem, scope, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # If we xor '1' in num_ranks times:
+    # - If num_ranks is odd  -> final = 1
+    # - If num_ranks is even -> final = 0
+    if (num_ranks % 2) == 1:
+        expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
+    else:
+        expected = torch.zeros(BLOCK_SIZE, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_copy_gluon.py b/tests/unittests/test_copy_gluon.py
index ee6cedacd..4590bbe61 100644
--- a/tests/unittests/test_copy_gluon.py
+++ b/tests/unittests/test_copy_gluon.py
@@ -83,7 +83,206 @@ def copy_local_kernel(
         ctx.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_copy_get(dtype, BLOCK_SIZE):
+    """Test GET operation: cur_rank == to_rank"""
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    base = cur_rank + num_ranks
+    for i in range(num_ranks):
+        data[i, :] = base * (i + 1)
+
+    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    grid = (1,)
+    copy_get_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        data,
+        results,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    for rank_id in range(num_ranks):
+        expected[rank_id, :] = (rank_id + num_ranks) * (cur_rank + 1)
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
+
+        gc.collect()
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_copy_put(dtype, BLOCK_SIZE):
+    """Test PUT operation: cur_rank == from_rank"""
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
+
+    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    base = cur_rank + num_ranks
+    for i in range(num_ranks):
+        data[i, :] = base * (i + 1)
+
+    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    grid = (1,)
+    copy_put_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        data,
+        results,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Each rank writes to results[cur_rank] on all targets
+    # After barrier, results[rank_id] contains data from rank_id
+    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    for rank_id in range(num_ranks):
+        expected[rank_id, :] = (rank_id + num_ranks) * (rank_id + 1)
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
+
+        gc.collect()
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_copy_local(dtype, BLOCK_SIZE):
+    """Test LOCAL operation: from_rank == to_rank == cur_rank"""
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
+
+    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    base = cur_rank + num_ranks
+    for i in range(num_ranks):
+        data[i, :] = base * (i + 1)
+
+    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    grid = (1,)
+    copy_local_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        data,
+        results,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Local copy: results should match data
+    expected = data
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_copy_triton.py b/tests/unittests/test_copy_triton.py
index e322d5a51..32cc0a797 100644
--- a/tests/unittests/test_copy_triton.py
+++ b/tests/unittests/test_copy_triton.py
@@ -74,7 +74,179 @@ def copy_local_kernel(
         iris.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, cur_rank, heap_bases, mask)
 
 
-
-pytestmark = pytest.mark.multi_rank_required
-
-@pytest.mark.parametrize(
\ No newline at end of file
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_copy_get(dtype, BLOCK_SIZE):
+    """Test GET operation: cur_rank == to_rank"""
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
+
+    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    base = cur_rank + num_ranks
+    for i in range(num_ranks):
+        data[i, :] = base * (i + 1)
+
+    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    grid = lambda meta: (1,)
+    copy_get_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    for rank_id in range(num_ranks):
+        expected[rank_id, :] = (rank_id + num_ranks) * (cur_rank + 1)
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
+
+        gc.collect()
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_copy_put(dtype, BLOCK_SIZE):
+    """Test PUT operation: cur_rank == from_rank"""
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
+
+    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    base = cur_rank + num_ranks
+    for i in range(num_ranks):
+        data[i, :] = base * (i + 1)
+
+    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    grid = lambda meta: (1,)
+    copy_put_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # Each rank writes to results[cur_rank] on all targets
+    # After barrier, results[rank_id] contains data from rank_id
+    expected = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    for rank_id in range(num_ranks):
+        expected[rank_id, :] = (rank_id + num_ranks) * (rank_id + 1)
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
+
+        gc.collect()
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_copy_local(dtype, BLOCK_SIZE):
+    """Test LOCAL operation: from_rank == to_rank == cur_rank"""
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
+
+    data = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    base = cur_rank + num_ranks
+    for i in range(num_ranks):
+        data[i, :] = base * (i + 1)
+
+    results = shmem.zeros((num_ranks, BLOCK_SIZE), dtype=dtype)
+    grid = lambda meta: (1,)
+    copy_local_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # Local copy: results should match data
+    expected = data
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
+
+        gc.collect()
diff --git a/tests/unittests/test_get_gluon.py b/tests/unittests/test_get_gluon.py
index 274e443e4..910721b75 100644
--- a/tests/unittests/test_get_gluon.py
+++ b/tests/unittests/test_get_gluon.py
@@ -44,7 +44,67 @@ def get_kernel(
     gl.store(results + offsets, acc, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_get_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
+    results = shmem.zeros_like(data)
+
+    shmem.barrier()
+
+    grid = (1,)
+    get_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        data,
+        results,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_get_triton.py b/tests/unittests/test_get_triton.py
index 5f1a19401..090f50a14 100644
--- a/tests/unittests/test_get_triton.py
+++ b/tests/unittests/test_get_triton.py
@@ -41,7 +41,58 @@ def get_kernel(
     tl.store(results + offsets, acc, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_get_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
+    results = shmem.zeros_like(data)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    get_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * num_ranks
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_load_gluon.py b/tests/unittests/test_load_gluon.py
index b1d090cc9..9dbd7df43 100644
--- a/tests/unittests/test_load_gluon.py
+++ b/tests/unittests/test_load_gluon.py
@@ -36,7 +36,68 @@ def load_kernel(
     gl.store(results + offsets, result, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_load_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    source_rank = shmem.get_rank()
+    partner = int((source_rank + num_ranks // 2) % num_ranks)
 
-pytestmark = pytest.mark.multi_rank_required
+    data = shmem.full((BLOCK_SIZE,), source_rank, dtype=dtype)
+    results = shmem.zeros_like(data)
+
+    shmem.barrier()
+
+    grid = (1,)
+    load_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        data,
+        results,
+        source_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Verify the result
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * partner
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_load_triton.py b/tests/unittests/test_load_triton.py
index 88c9a102d..733c6de8a 100644
--- a/tests/unittests/test_load_triton.py
+++ b/tests/unittests/test_load_triton.py
@@ -33,7 +33,59 @@ def load_kernel(
     tl.store(results + offsets, result, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_load_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    source_rank = shmem.get_rank()
+    partner = int((source_rank + num_ranks // 2) % num_ranks)
 
-pytestmark = pytest.mark.multi_rank_required
+    data = shmem.full((BLOCK_SIZE,), source_rank, dtype=dtype)
+    results = shmem.zeros_like(data)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    load_kernel[grid](data, results, source_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # Verify the result
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda") * partner
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_put_gluon.py b/tests/unittests/test_put_gluon.py
index e7d49c250..659366c8f 100644
--- a/tests/unittests/test_put_gluon.py
+++ b/tests/unittests/test_put_gluon.py
@@ -38,7 +38,67 @@ def put_kernel(
         ctx.put(data + offsets, results + offsets, target_rank, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_put_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
+    results = shmem.zeros_like(data)
+
+    shmem.barrier()
+
+    grid = (1,)
+    put_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        data,
+        results,
+        cur_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_put_triton.py b/tests/unittests/test_put_triton.py
index 246ba520d..0d5f61919 100644
--- a/tests/unittests/test_put_triton.py
+++ b/tests/unittests/test_put_triton.py
@@ -35,7 +35,58 @@ def put_kernel(
         iris.put(data + offsets, results + offsets, cur_rank, target_rank, heap_bases, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_put_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    cur_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    data = shmem.ones(BLOCK_SIZE, dtype=dtype)
+    results = shmem.zeros_like(data)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    put_kernel[grid](data, results, cur_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # Verify the results
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_store_gluon.py b/tests/unittests/test_store_gluon.py
index 866ff5395..90616b38c 100644
--- a/tests/unittests/test_store_gluon.py
+++ b/tests/unittests/test_store_gluon.py
@@ -39,7 +39,67 @@ def store_kernel(
         ctx.store(results + offsets, value, dst_rank, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_store_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris_gl.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    context_tensor = shmem.get_device_context()
+    destination_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    src = shmem.ones(BLOCK_SIZE, dtype=dtype)
+    results = shmem.zeros_like(src)
+
+    shmem.barrier()
+
+    grid = (1,)
+    store_kernel[grid](
+        iris_gl.IrisDeviceCtx,
+        context_tensor,
+        src,
+        results,
+        destination_rank,
+        num_ranks,
+        BLOCK_SIZE,
+        num_warps=1,
+    )
+    shmem.barrier()
+
+    # Verify the result
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/unittests/test_store_triton.py b/tests/unittests/test_store_triton.py
index 7e3f3d3c2..eee57658a 100644
--- a/tests/unittests/test_store_triton.py
+++ b/tests/unittests/test_store_triton.py
@@ -36,7 +36,58 @@ def store_kernel(
         iris.store(results + offsets, value, destination_rank, dst_rank, heap_bases, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        torch.int8,
+        torch.float16,
+        torch.bfloat16,
+        torch.float32,
+    ],
+)
+@pytest.mark.parametrize(
+    "BLOCK_SIZE",
+    [
+        1,
+        8,
+        16,
+        32,
+    ],
+)
+def test_store_api(dtype, BLOCK_SIZE):
+    # TODO: Adjust heap size.
+    shmem = iris.iris(1 << 20)
+    num_ranks = shmem.get_num_ranks()
+    heap_bases = shmem.get_heap_bases()
+    destination_rank = shmem.get_rank()
 
-pytestmark = pytest.mark.multi_rank_required
+    src = shmem.ones(BLOCK_SIZE, dtype=dtype)
+    results = shmem.zeros_like(src)
+
+    shmem.barrier()
+
+    grid = lambda meta: (1,)
+    store_kernel[grid](src, results, destination_rank, num_ranks, BLOCK_SIZE, heap_bases)
+    shmem.barrier()
+
+    # Verify the result
+    expected = torch.ones(BLOCK_SIZE, dtype=dtype, device="cuda")
+
+    try:
+        torch.testing.assert_close(results, expected, rtol=0, atol=0)
+    except AssertionError as e:
+        print(e)
+        print("Expected:", expected)
+        print("Actual:", results)
+        raise
+    finally:
+        # Final barrier to ensure all ranks complete before test cleanup
+        # This helps with test isolation when running multiple tests
+        # Note: shmem.barrier() already does cuda.synchronize()
+        shmem.barrier()
+        # Explicitly delete the shmem instance to trigger cleanup
+        del shmem
+        # Force garbage collection to ensure IPC handles are cleaned up
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/x/test_all_gather.py b/tests/x/test_all_gather.py
index d2e8f040c..f42b88939 100644
--- a/tests/x/test_all_gather.py
+++ b/tests/x/test_all_gather.py
@@ -66,7 +66,280 @@ def x_all_gather_kernel(
         iris.x.all_gather(tile, dst_view, gather_dim, ctx)
 
 
+@pytest.mark.parametrize(
+    "gather_dim",
+    [0, 1],
+)
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float16, 1e-3, 1e-3),
+        (torch.float32, 1e-5, 1e-5),
+        (torch.bfloat16, 1e-3, 1e-3),
+    ],
+)
+@pytest.mark.parametrize(
+    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
+    [
+        (128, 64, 64, 32),  # Small
+        (1024, 256, 128, 128),  # Medium
+        (2048, 2048, 256, 256),  # Large
+        # TODO: Fix non-aligned dimension handling in all_gather for irregular tiling
+        # (100, 100, 64, 64),  # Non-aligned dimensions - fails due to edge case with partial tiles
+        (256, 384, 128, 128),  # Non-square
+        (64, 32, 128, 128),  # Block size larger than dimensions
+    ],
+)
+def test_all_gather(gather_dim, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
+    """Test tile-level all-gather primitive by comparing against PyTorch's implementation."""
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
 
-pytestmark = pytest.mark.multi_rank_required
+    # Skip if block size is larger than dimensions
+    # (new all_gather requires tile.data shape to match block size)
+    if BLOCK_SIZE_M > M or BLOCK_SIZE_N > N:
+        pytest.skip(f"Block size ({BLOCK_SIZE_M}x{BLOCK_SIZE_N}) larger than dimensions ({M}x{N})")
+
+    heap_size = 2**33  # 8GB
+    shmem = iris.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    # PyTorch's all_gather format: each rank has M x N data
+    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
+    pytorch_input_tensor.fill_(float(rank + 1))
+
+    # Run PyTorch's all_gather to get reference output
+    pytorch_output_list = [torch.empty_like(pytorch_input_tensor) for _ in range(world_size)]
+    shmem.barrier()
+    dist.all_gather(pytorch_output_list, pytorch_input_tensor)
+
+    if gather_dim == 0:
+        # Gather along rows (M dimension)
+        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=0)  # Concatenate along dim 0
+    else:
+        # Gather along columns (N dimension)
+        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=1)  # Concatenate along dim 1
+
+    torch.cuda.synchronize()
+
+    # Set up Iris tensors
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.copy_(pytorch_input_tensor)
+
+    if gather_dim == 0:
+        iris_output_tensor = shmem.zeros((world_size * M, N), dtype=dtype)
+    else:
+        iris_output_tensor = shmem.zeros((M, world_size * N), dtype=dtype)
+
+    shmem.barrier()
+
+    # Launch kernel
+    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
+    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
+    total_tiles = num_pid_m * num_pid_n
+    grid = (total_tiles,)
+
+    x_all_gather_kernel[grid](
+        iris_input_tensor,
+        iris_output_tensor,
+        M,
+        N,
+        iris_input_tensor.stride(0),
+        iris_input_tensor.stride(1),
+        iris_output_tensor.stride(0),
+        iris_output_tensor.stride(1),
+        shmem.get_heap_bases(),
+        rank,
+        world_size,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+        gather_dim,
+    )
+
+    torch.cuda.synchronize()
+    shmem.barrier()
+
+    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
+
+    try:
+        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
+            f"Max difference: {max_diff}, expected < {atol}\n"
+            f"Rank {rank}: Iris x.all_gather output doesn't match PyTorch's all_gather"
+        )
+
+        # Verify each rank's data is in the correct location
+        if gather_dim == 0:
+            # Gathered along rows
+            for r in range(world_size):
+                start_row = r * M
+                end_row = (r + 1) * M
+                rank_data = iris_output_tensor[start_row:end_row, :]
+                expected_value = float(r + 1)
+                assert torch.allclose(rank_data, torch.full_like(rank_data, expected_value), atol=atol), (
+                    f"Rank {rank}: Data from rank {r} not in correct location or has wrong value"
+                )
+        else:
+            # Gathered along columns
+            for r in range(world_size):
+                start_col = r * N
+                end_col = (r + 1) * N
+                rank_data = iris_output_tensor[:, start_col:end_col]
+                expected_value = float(r + 1)
+                assert torch.allclose(rank_data, torch.full_like(rank_data, expected_value), atol=atol), (
+                    f"Rank {rank}: Data from rank {r} not in correct location or has wrong value"
+                )
+
+        if rank == 0:
+            dim_str = "rows" if gather_dim == 0 else "cols"
+            print(
+                f"✓ All-gather test passed ({dim_str}): {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})"
+            )
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
+
+        gc.collect()
+
+
+@triton.jit
+def x_all_gather_ctx_api_kernel(
+    input_ptr,
+    output_ptr,
+    M: tl.constexpr,
+    N: tl.constexpr,
+    stride_in_m: tl.constexpr,
+    stride_in_n: tl.constexpr,
+    stride_out_m: tl.constexpr,
+    stride_out_n: tl.constexpr,
+    heap_bases: tl.tensor,
+    cur_rank: tl.constexpr,
+    world_size: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    gather_dim: tl.constexpr,
+):
+    """Kernel using direct all_gather() call (ctx methods removed due to Triton limitations)."""
+    pid = tl.program_id(0)
+    grid_size = tl.num_programs(0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    total_tiles = num_pid_m * num_pid_n
+
+    for tile_id in range(pid, total_tiles, grid_size):
+        pid_m = tile_id // num_pid_n
+        pid_n = tile_id % num_pid_n
+
+        # Load local tile data
+        rm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+        rn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+        mask = (rm[:, None] < M) & (rn[None, :] < N)
+        src_ptr = input_ptr + rm[:, None] * stride_in_m + rn[None, :] * stride_in_n
+        local_data = tl.load(src_ptr, mask=mask, other=0.0)
+
+        # Create Tile with loaded data and views
+        tile = iris.x.Tile(pid_m, pid_n, BLOCK_SIZE_M, BLOCK_SIZE_N, local_data)
+        dst_view = iris.x.TensorView(
+            output_ptr,
+            M * world_size if gather_dim == 0 else M,
+            N if gather_dim == 0 else N * world_size,
+            stride_out_m,
+            stride_out_n,
+        )
+        ctx = iris.x.DeviceContext(cur_rank, world_size, heap_bases)
+
+        # Call primitive directly (ctx methods don't work due to Triton import restrictions)
+        iris.x.all_gather(tile, dst_view, gather_dim, ctx)
+
+
+@pytest.mark.parametrize("gather_dim", [0, 1])
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float16, 1e-3, 1e-3),
+        (torch.float32, 1e-5, 1e-5),
+    ],
+)
+@pytest.mark.parametrize("M, N, BLOCK_SIZE_M, BLOCK_SIZE_N", [(256, 128, 64, 64)])
+def test_all_gather_ctx_api(gather_dim, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
+    """Test tile-level all-gather using direct function call (ctx methods removed)."""
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
+
+    # Skip if block size is larger than dimensions
+    if BLOCK_SIZE_M > M or BLOCK_SIZE_N > N:
+        pytest.skip(f"Block size ({BLOCK_SIZE_M}x{BLOCK_SIZE_N}) larger than dimensions ({M}x{N})")
+
+    heap_size = 2**33  # 8GB
+    shmem = iris.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    # PyTorch's all_gather format: each rank has M x N data
+    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
+    pytorch_input_tensor.fill_(float(rank + 1))
+
+    # Run PyTorch's all_gather to get reference output
+    pytorch_output_list = [torch.empty_like(pytorch_input_tensor) for _ in range(world_size)]
+    shmem.barrier()
+    dist.all_gather(pytorch_output_list, pytorch_input_tensor)
+
+    if gather_dim == 0:
+        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=0)
+    else:
+        pytorch_output_tensor = torch.cat(pytorch_output_list, dim=1)
+
+    torch.cuda.synchronize()
+
+    # Set up Iris tensors
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.copy_(pytorch_input_tensor)
+
+    if gather_dim == 0:
+        iris_output_tensor = shmem.zeros((world_size * M, N), dtype=dtype)
+    else:
+        iris_output_tensor = shmem.zeros((M, world_size * N), dtype=dtype)
+
+    shmem.barrier()
+
+    # Launch kernel using NEW ctx API
+    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
+    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
+    total_tiles = num_pid_m * num_pid_n
+    grid = (total_tiles,)
+
+    x_all_gather_ctx_api_kernel[grid](
+        iris_input_tensor,
+        iris_output_tensor,
+        M,
+        N,
+        iris_input_tensor.stride(0),
+        iris_input_tensor.stride(1),
+        iris_output_tensor.stride(0),
+        iris_output_tensor.stride(1),
+        shmem.get_heap_bases(),
+        rank,
+        world_size,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+        gather_dim,
+    )
+
+    torch.cuda.synchronize()
+    shmem.barrier()
+
+    try:
+        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
+            f"Rank {rank}: all_gather() output doesn't match PyTorch's all_gather"
+        )
+
+        if rank == 0:
+            dim_str = "rows" if gather_dim == 0 else "cols"
+            print(f"✓ all_gather() test passed ({dim_str}): {dtype}, M={M}, N={N}")
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/x/test_all_reduce.py b/tests/x/test_all_reduce.py
index 08f579b77..5ff434f5c 100644
--- a/tests/x/test_all_reduce.py
+++ b/tests/x/test_all_reduce.py
@@ -204,7 +204,164 @@ def x_all_reduce_spinlock_kernel(
         iris.x.all_reduce_spinlock(tile, dst_view, locks_ptr, ctx)
 
 
+@pytest.mark.parametrize(
+    "variant",
+    [
+        "atomic",
+        "one_shot",
+        "two_shot",
+        # TODO enable these tests when support for cache-modifiers is in place.
+        # "spinlock",
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float16, 1e-3, 1e-3),
+        (torch.float32, 1e-5, 1e-5),
+        (torch.bfloat16, 1e-3, 1e-3),
+    ],
+)
+@pytest.mark.parametrize(
+    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
+    [
+        (128, 64, 64, 32),  # Small
+        (1024, 256, 128, 128),  # Medium
+        (2048, 2048, 256, 256),  # Large
+        # (100, 100, 64, 64),  # Non-aligned dimensions - DISABLED: other=0.0 not supported
+        # (256, 384, 128, 128),  # Non-square - DISABLED: other=0.0 not supported
+        # (64, 32, 128, 128),  # Block size larger than dimensions - DISABLED: other=0.0 not supported
+    ],
+)
+def test_all_reduce(variant, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
+    """Test tile-level all-reduce primitives by comparing against PyTorch's implementation."""
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
+
+    heap_size = 2**33  # 8GB
+    shmem = iris.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    # PyTorch's all_reduce format: each rank has M x N data
+    pytorch_input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
+    pytorch_input_tensor.fill_(float(rank + 1))
+
+    # Run PyTorch's all_reduce to get reference output
+    pytorch_output_tensor = pytorch_input_tensor.clone()
+    shmem.barrier()
+    dist.all_reduce(pytorch_output_tensor, op=dist.ReduceOp.SUM)
+    torch.cuda.synchronize()
+
+    # Set up Iris tensors
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.copy_(pytorch_input_tensor)
+    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
+
+    # Prepare workspace if needed (locks + temp_buffer for one_shot/two_shot)
+    locks = None
+    temp_buffer = None
+    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
+    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
+    total_tiles = num_pid_m * num_pid_n
 
-pytestmark = pytest.mark.multi_rank_required
-
-@pytest.mark.parametrize(
\ No newline at end of file
+    if variant in ["spinlock", "one_shot", "two_shot"]:
+        locks = shmem.zeros((total_tiles,), dtype=torch.int32)
+
+    if variant in ["one_shot", "two_shot"]:
+        temp_buffer = shmem.zeros((M, N), dtype=dtype)
+
+    shmem.barrier()
+
+    # Select kernel based on variant
+    if variant == "atomic":
+        kernel = x_all_reduce_atomic_kernel
+    elif variant == "one_shot":
+        kernel = x_all_reduce_one_shot_kernel
+    elif variant == "two_shot":
+        kernel = x_all_reduce_two_shot_kernel
+    elif variant == "spinlock":
+        kernel = x_all_reduce_spinlock_kernel
+    else:
+        pytest.fail(f"Unknown variant: {variant}")
+
+    # Launch kernel
+    grid = (total_tiles,)
+
+    if variant in ["one_shot", "two_shot"]:
+        kernel[grid](
+            iris_input_tensor,
+            temp_buffer,
+            iris_output_tensor,
+            locks,
+            M,
+            N,
+            iris_input_tensor.stride(0),
+            iris_input_tensor.stride(1),
+            iris_output_tensor.stride(0),
+            iris_output_tensor.stride(1),
+            shmem.get_heap_bases(),
+            rank,
+            world_size,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+        )
+    elif variant == "spinlock":
+        kernel[grid](
+            iris_input_tensor,
+            iris_output_tensor,
+            locks,
+            M,
+            N,
+            iris_input_tensor.stride(0),
+            iris_input_tensor.stride(1),
+            iris_output_tensor.stride(0),
+            iris_output_tensor.stride(1),
+            shmem.get_heap_bases(),
+            rank,
+            world_size,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+        )
+    else:  # atomic
+        kernel[grid](
+            iris_input_tensor,
+            iris_output_tensor,
+            M,
+            N,
+            iris_input_tensor.stride(0),
+            iris_input_tensor.stride(1),
+            iris_output_tensor.stride(0),
+            iris_output_tensor.stride(1),
+            shmem.get_heap_bases(),
+            rank,
+            world_size,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+        )
+
+    torch.cuda.synchronize()
+    shmem.barrier()
+
+    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
+
+    try:
+        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
+            f"Max difference: {max_diff}, expected < {atol}\n"
+            f"Rank {rank}: Iris x.all_reduce_{variant} output doesn't match PyTorch's all_reduce"
+        )
+
+        # Verify the reduction is correct (sum of all ranks)
+        expected_sum = sum(float(r + 1) for r in range(world_size))
+        assert torch.allclose(iris_output_tensor, torch.full_like(iris_output_tensor, expected_sum), atol=atol), (
+            f"Rank {rank}: Reduction result is incorrect, expected {expected_sum}"
+        )
+
+        if rank == 0:
+            print(f"✓ All-reduce {variant} test passed: {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})")
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
+
+        gc.collect()
diff --git a/tests/x/test_all_to_all.py b/tests/x/test_all_to_all.py
index c62ed54f1..89782d1ed 100644
--- a/tests/x/test_all_to_all.py
+++ b/tests/x/test_all_to_all.py
@@ -54,7 +54,112 @@ def x_all_to_all_kernel(
         iris.x.all_to_all(tile, src_view, dst_view, N_per_rank, ctx)
 
 
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float16, 1e-3, 1e-3),
+        (torch.float32, 1e-5, 1e-5),
+        (torch.bfloat16, 1e-3, 1e-3),
+    ],
+)
+@pytest.mark.parametrize(
+    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
+    [
+        (128, 64, 64, 32),  # Small
+        (1024, 256, 128, 128),  # Medium
+        (2048, 2048, 256, 256),  # Large
+        (100, 100, 64, 64),  # Non-aligned dimensions
+        (256, 384, 128, 128),  # Non-square
+        (64, 32, 128, 128),  # Block size larger than dimensions
+    ],
+)
+def test_all_to_all(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
+    """Test tile-level all-to-all primitive by comparing against PyTorch's implementation."""
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
 
-pytestmark = pytest.mark.multi_rank_required
+    heap_size = 2**33  # 8GB
+    shmem = iris.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    # PyTorch's all_to_all format: input is (M, N * world_size), output is (M, N * world_size)
+    # Each rank sends chunk [:, rank*N : (rank+1)*N] to all ranks
+    pytorch_input_tensor = torch.randn(M, N * world_size, dtype=dtype, device=f"cuda:{rank}")
+    # Fill with deterministic values: rank value in each rank's chunk
+    for r in range(world_size):
+        pytorch_input_tensor[:, r * N : (r + 1) * N].fill_(float(r + 1))
+
+    # Run PyTorch's all_to_all to get reference output
+    shmem.barrier()
+    # PyTorch all_to_all: split input into chunks, send chunk i to rank i
+    # Make chunks contiguous as required by PyTorch dist.all_to_all
+    input_chunks = [chunk.contiguous() for chunk in torch.chunk(pytorch_input_tensor, world_size, dim=1)]
+    output_chunks = [torch.empty(M, N, dtype=dtype, device=f"cuda:{rank}") for _ in range(world_size)]
+    dist.all_to_all(output_chunks, input_chunks)
+    pytorch_output_tensor = torch.cat(output_chunks, dim=1)
+    torch.cuda.synchronize()
+
+    # Set up Iris tensors
+    iris_input_tensor = shmem.zeros((M, N * world_size), dtype=dtype)
+    iris_input_tensor.copy_(pytorch_input_tensor)
+    iris_output_tensor = shmem.zeros((M, N * world_size), dtype=dtype)
+
+    shmem.barrier()
+
+    # Launch kernel
+    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
+    num_pid_n = ((N * world_size) + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N  # Use total N dimension
+    total_tiles = num_pid_m * num_pid_n
+    grid = (total_tiles,)
+
+    x_all_to_all_kernel[grid](
+        iris_input_tensor,
+        iris_output_tensor,
+        M,
+        N * world_size,  # Total N dimension
+        N,  # N_per_rank
+        iris_input_tensor.stride(0),
+        iris_input_tensor.stride(1),
+        iris_output_tensor.stride(0),
+        iris_output_tensor.stride(1),
+        shmem.get_heap_bases(),
+        rank,
+        world_size,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+    )
+
+    torch.cuda.synchronize()
+    shmem.barrier()
+
+    max_diff = torch.abs(iris_output_tensor - pytorch_output_tensor).max().item()
+
+    try:
+        assert torch.allclose(iris_output_tensor, pytorch_output_tensor, atol=atol, rtol=rtol), (
+            f"Max difference: {max_diff}, expected < {atol}\n"
+            f"Rank {rank}: Iris x.all_to_all output doesn't match PyTorch's all_to_all"
+        )
+
+        # Verify each rank's received chunks contain correct data
+        # In all-to-all, rank dst receives chunk dst from each rank src
+        # Since all ranks filled chunk i with value (i+1), each rank should receive
+        # its own chunk number from all other ranks
+        for r in range(world_size):
+            start_col = r * N
+            end_col = (r + 1) * N
+            chunk_data = iris_output_tensor[:, start_col:end_col]
+            # This chunk contains data from rank r. Rank r sent us chunk 'rank' which has value (rank+1)
+            expected_value = float(rank + 1)
+            assert torch.allclose(chunk_data, torch.full_like(chunk_data, expected_value), atol=atol), (
+                f"Rank {rank}: Data from rank {r} (chunk {rank}) should have value {expected_value}"
+            )
+
+        if rank == 0:
+            print(f"✓ All-to-all test passed: {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})")
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()
diff --git a/tests/x/test_gather.py b/tests/x/test_gather.py
index 99cc20831..020b1f8e6 100644
--- a/tests/x/test_gather.py
+++ b/tests/x/test_gather.py
@@ -62,7 +62,174 @@ def gather_kernel(
         tl.store(out_ptr, data, mask=mask)
 
 
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float16, 1e-3, 1e-3),
+        (torch.float32, 1e-5, 1e-5),
+    ],
+)
+@pytest.mark.parametrize("M, N, BLOCK_SIZE_M, BLOCK_SIZE_N", [(256, 128, 64, 64)])
+def test_gather_from_specific_rank(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
+    """Test gather primitive pulling from a specific rank."""
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
 
-pytestmark = pytest.mark.multi_rank_required
+    heap_size = 2**33  # 8GB
+    shmem = iris.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    if world_size < 2:
+        pytest.skip("Need at least 2 ranks")
+
+    # Each rank creates unique input data
+    torch.manual_seed(42 + rank)
+    input_tensor = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
+    output_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
+
+    # Allocate in shmem
+    shmem_input = shmem.zeros(M, N, dtype=dtype)
+    shmem_output = shmem.zeros(M, N, dtype=dtype)
+    shmem_input.copy_(input_tensor)
+
+    shmem.barrier()
+
+    # Each rank gathers from rank 0
+    source_rank = 0
+    grid = (64,)
+
+    gather_kernel[grid](
+        shmem_input,
+        shmem_output,
+        M,
+        N,
+        shmem_input.stride(0),
+        shmem_input.stride(1),
+        shmem_output.stride(0),
+        shmem_output.stride(1),
+        shmem.heap_bases,
+        rank,
+        source_rank,
+        world_size,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+    )
+
+    shmem.barrier()
+    output_tensor.copy_(shmem_output)
+    torch.cuda.synchronize()
+
+    torch.manual_seed(42 + source_rank)
+    expected = torch.randn(M, N, dtype=dtype, device=f"cuda:{rank}")
+
+    assert torch.allclose(output_tensor, expected, atol=atol, rtol=rtol), (
+        f"Rank {rank}: gather from rank {source_rank} failed"
+    )
+
+
+@triton.jit
+def gather_accumulate_kernel(
+    input_ptr,
+    output_ptr,
+    M: tl.constexpr,
+    N: tl.constexpr,
+    stride_in_m: tl.constexpr,
+    stride_in_n: tl.constexpr,
+    stride_out_m: tl.constexpr,
+    stride_out_n: tl.constexpr,
+    heap_bases: tl.tensor,
+    cur_rank: tl.constexpr,
+    world_size: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    """Test kernel that gathers from all ranks and accumulates (like all-reduce sum)."""
+    pid = tl.program_id(0)
+    grid_size = tl.num_programs(0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    total_tiles = num_pid_m * num_pid_n
+
+    for tile_id in range(pid, total_tiles, grid_size):
+        pid_m = tile_id // num_pid_n
+        pid_n = tile_id % num_pid_n
+
+        tile = iris.x.TileView(pid_m, pid_n, BLOCK_SIZE_M, BLOCK_SIZE_N)
+        src_view = iris.x.TensorView(input_ptr, M, N, stride_in_m, stride_in_n)
+        ctx = iris.x.DeviceContext(cur_rank, world_size, heap_bases)
+
+        # Accumulate data from all ranks
+        acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+        for source_rank in range(world_size):
+            data = iris.x.gather(tile, src_view, source_rank, ctx)
+            acc += data
+
+        # Store accumulated result
+        rm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+        rn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+        mask_m = rm < M
+        mask_n = rn < N
+        mask = mask_m[:, None] & mask_n[None, :]
+        out_ptr = output_ptr + rm[:, None] * stride_out_m + rn[None, :] * stride_out_n
+        result = acc.to(output_ptr.type.element_ty)
+        tl.store(out_ptr, result, mask=mask)
+
+
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float16, 1e-2, 1e-2),
+        (torch.float32, 1e-5, 1e-5),
+    ],
+)
+@pytest.mark.parametrize("M, N, BLOCK_SIZE_M, BLOCK_SIZE_N", [(256, 128, 64, 64)])
+def test_gather_accumulate_pattern(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
+    """Test gather used in accumulation pattern (like all-reduce sum)."""
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
+
+    heap_size = 2**33  # 8GB
+    shmem = iris.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    # Each rank creates input with value = rank
+    input_tensor = torch.full((M, N), float(rank), dtype=dtype, device=f"cuda:{rank}")
+    output_tensor = torch.zeros(M, N, dtype=dtype, device=f"cuda:{rank}")
+
+    # Allocate in shmem
+    shmem_input = shmem.zeros(M, N, dtype=dtype)
+    shmem_output = shmem.zeros(M, N, dtype=dtype)
+    shmem_input.copy_(input_tensor)
+
+    shmem.barrier()
+
+    # Gather and accumulate from all ranks
+    grid = (64,)
+    gather_accumulate_kernel[grid](
+        shmem_input,
+        shmem_output,
+        M,
+        N,
+        shmem_input.stride(0),
+        shmem_input.stride(1),
+        shmem_output.stride(0),
+        shmem_output.stride(1),
+        shmem.heap_bases,
+        rank,
+        world_size,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+    )
+
+    shmem.barrier()
+    output_tensor.copy_(shmem_output)
+    torch.cuda.synchronize()
+
+    expected_sum = sum(range(world_size))
+    expected = torch.full((M, N), float(expected_sum), dtype=dtype, device=f"cuda:{rank}")
 
-@pytest.mark.parametrize(
\ No newline at end of file
+    assert torch.allclose(output_tensor, expected, atol=atol, rtol=rtol), (
+        f"Rank {rank}: gather accumulate pattern failed"
+    )
diff --git a/tests/x/test_reduce_scatter.py b/tests/x/test_reduce_scatter.py
index ccaf13326..1773fe14c 100644
--- a/tests/x/test_reduce_scatter.py
+++ b/tests/x/test_reduce_scatter.py
@@ -67,7 +67,101 @@ def x_reduce_scatter_kernel(
         iris.x.reduce_scatter(tile, src_view, dst_view, locks, ctx)
 
 
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float16, 1e-3, 1e-3),
+        (torch.float32, 1e-5, 1e-5),
+        (torch.bfloat16, 1e-3, 1e-3),
+    ],
+)
+@pytest.mark.parametrize(
+    "M, N, BLOCK_SIZE_M, BLOCK_SIZE_N",
+    [
+        (128, 64, 64, 32),
+        (256, 128, 64, 64),
+        (512, 512, 128, 128),
+    ],
+)
+def test_reduce_scatter(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZE_N):
+    """Test tile-level reduce-scatter primitive."""
+    if not dist.is_initialized():
+        pytest.skip("torch.distributed not initialized")
 
-pytestmark = pytest.mark.multi_rank_required
+    heap_size = 2**33
+    shmem = iris.iris(heap_size)
+    rank = shmem.get_rank()
+    world_size = shmem.get_num_ranks()
+
+    pytorch_input_tensor = torch.full((M, N), float(rank + 1), dtype=dtype, device=f"cuda:{rank}")
+
+    num_pid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
+    num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
+    total_tiles = num_pid_m * num_pid_n
+    tiles_per_rank = total_tiles // world_size
+    start_tile = rank * tiles_per_rank
+    if rank == world_size - 1:
+        tiles_per_rank = total_tiles - start_tile
+
+    iris_input_tensor = shmem.zeros((M, N), dtype=dtype)
+    iris_input_tensor.copy_(pytorch_input_tensor)
+    iris_temp_buffer = shmem.zeros((M, N), dtype=dtype)
+    iris_output_tensor = shmem.zeros((M, N), dtype=dtype)
+
+    locks_tensor = shmem.zeros(total_tiles, dtype=torch.int32)
+
+    shmem.barrier()
+
+    grid = (total_tiles,)
+
+    x_reduce_scatter_kernel[grid](
+        iris_input_tensor,
+        iris_temp_buffer,
+        iris_output_tensor,
+        locks_tensor,
+        M,
+        N,
+        iris_input_tensor.stride(0),
+        iris_input_tensor.stride(1),
+        iris_output_tensor.stride(0),
+        iris_output_tensor.stride(1),
+        shmem.get_heap_bases(),
+        rank,
+        world_size,
+        BLOCK_SIZE_M,
+        BLOCK_SIZE_N,
+    )
+
+    torch.cuda.synchronize()
+    shmem.barrier()
+
+    expected_sum = sum(float(r + 1) for r in range(world_size))
+
+    try:
+        for local_tile_idx in range(tiles_per_rank):
+            tile_id = start_tile + local_tile_idx
+            pid_m = tile_id // num_pid_n
+            pid_n = tile_id % num_pid_n
+
+            m_start = pid_m * BLOCK_SIZE_M
+            m_end = min(m_start + BLOCK_SIZE_M, M)
+            n_start = pid_n * BLOCK_SIZE_N
+            n_end = min(n_start + BLOCK_SIZE_N, N)
+
+            tile_data = iris_output_tensor[m_start:m_end, n_start:n_end]
+            expected_tile = torch.full_like(tile_data, expected_sum)
+
+            assert torch.allclose(tile_data, expected_tile, atol=atol, rtol=rtol), (
+                f"Rank {rank}, tile {tile_id} ({pid_m},{pid_n}): "
+                f"Expected {expected_sum}, got max {tile_data.max().item()}, "
+                f"min {tile_data.min().item()}"
+            )
+
+        if rank == 0:
+            print(f"Reduce-scatter test passed: {dtype}, M={M}, N={N}, blocks=({BLOCK_SIZE_M},{BLOCK_SIZE_N})")
+    finally:
+        shmem.barrier()
+        del shmem
+        import gc
 
-@pytest.mark.parametrize(
\ No newline at end of file
+        gc.collect()

From 45e5d12ad32c29f312740da3e8342ab72c3320f5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 17:12:58 +0000
Subject: [PATCH 06/11] Simplify implementation - use pytest marker filtering
 instead of workflow changes

- Reverted extensive CI workflow matrix changes
- Added minimal marker filtering in run_tests.sh (skip single_rank when NUM_RANKS > 1)
- Updated documentation to reflect simplified approach
- Keeps test markers in files, uses pytest's built-in filtering

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 .github/scripts/run_tests.sh     |  22 +-
 .github/workflows/iris-tests.yml | 399 ++-----------------------------
 docs/test_optimization.md        |  60 ++---
 3 files changed, 41 insertions(+), 440 deletions(-)

diff --git a/.github/scripts/run_tests.sh b/.github/scripts/run_tests.sh
index f24f03ac3..1d5c94d4b 100755
--- a/.github/scripts/run_tests.sh
+++ b/.github/scripts/run_tests.sh
@@ -3,7 +3,7 @@
 # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
 #
 # Run Iris tests in a container
-# Usage: run_tests.sh <test_dir> <num_ranks> [gpu_devices] [install_method] [marker]
+# Usage: run_tests.sh <test_dir> <num_ranks> [gpu_devices] [install_method]
 #   test_dir: subdirectory under tests/ (e.g., examples, unittests, ccl)
 #   num_ranks: number of GPU ranks (1, 2, 4, or 8)
 #   gpu_devices: comma-separated GPU device IDs (optional)
@@ -11,8 +11,6 @@
 #     - "git": pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
 #     - "editable": pip install -e .
 #     - "install": pip install .
-#   marker: pytest marker expression (optional, e.g., "single_rank", "multi_rank_required")
-#     - If not provided, all tests are run
 
 set -e
 
@@ -20,15 +18,13 @@ TEST_DIR=$1
 NUM_RANKS=$2
 GPU_DEVICES=${3:-""}
 INSTALL_METHOD=${4:-"editable"}
-MARKER=${5:-""}
 
 if [ -z "$TEST_DIR" ] || [ -z "$NUM_RANKS" ]; then
     echo "[ERROR] Missing required arguments"
-    echo "Usage: $0 <test_dir> <num_ranks> [gpu_devices] [install_method] [marker]"
+    echo "Usage: $0 <test_dir> <num_ranks> [gpu_devices] [install_method]"
     echo "  test_dir: examples, unittests, x or ccl"
     echo "  num_ranks: 1, 2, 4, or 8"
     echo "  install_method: git, editable, or install (default: editable)"
-    echo "  marker: pytest marker expression (optional)"
     exit 1
 fi
 
@@ -66,12 +62,6 @@ elif [ "$INSTALL_METHOD" = "install" ]; then
     INSTALL_CMD="pip install ."
 fi
 
-# Build marker argument for pytest
-MARKER_ARG=""
-if [ -n "$MARKER" ]; then
-    MARKER_ARG="-m \"$MARKER\""
-fi
-
 # Run tests in container
 "$SCRIPT_DIR/container_exec.sh" $GPU_ARG "
     set -e
@@ -103,9 +93,15 @@ fi
     $INSTALL_CMD
     
     # Run tests in the specified directory
+    # Skip single_rank tests when running with multiple ranks
+    MARKER_ARG=\"\"
+    if [ \"$NUM_RANKS\" -gt 1 ]; then
+        MARKER_ARG=\"-m 'not single_rank'\"
+    fi
+    
     for test_file in tests/$TEST_DIR/test_*.py; do
         if [ -f \"\$test_file\" ]; then
-            echo \"Testing: \$test_file with $NUM_RANKS ranks (install: $INSTALL_METHOD, marker: $MARKER)\"
+            echo \"Testing: \$test_file with $NUM_RANKS ranks (install: $INSTALL_METHOD)\"
             python tests/run_tests_distributed.py --num_ranks $NUM_RANKS \"\$test_file\" $MARKER_ARG -v --tb=short --durations=10
         fi
     done
diff --git a/.github/workflows/iris-tests.yml b/.github/workflows/iris-tests.yml
index 0f07c8885..fdfef7330 100644
--- a/.github/workflows/iris-tests.yml
+++ b/.github/workflows/iris-tests.yml
@@ -38,196 +38,74 @@ jobs:
           bash .github/scripts/container_build.sh
 
   test-git:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, git install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, git install)
     needs: build-container-image
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Phase 1: Run single_rank tests only on 1 rank
+          # Test each subdirectory with each rank count using git install
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: unittests
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: ccl
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: x
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: ops
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          # Phase 2: Run multi_rank_required tests on all rank configs
-          - test_dir: examples
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
-          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
-          - test_dir: examples
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: examples
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: examples
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: examples
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -243,207 +121,84 @@ jobs:
           GITHUB_SHA: ${{ github.sha }}
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: git, marker: ${{ matrix.marker }})"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: git)"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "git" \
-            "${{ matrix.marker }}"
+            "git"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git, marker: ${{ matrix.marker }}) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (git) passed!"
 
   test-editable:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, editable install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, editable install)
     needs: [build-container-image, test-git]
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Phase 1: Run single_rank tests only on 1 rank
-          - test_dir: examples
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: unittests
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: ccl
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: x
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: ops
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          # Phase 2: Run multi_rank_required tests on all rank configs
+          # Test each subdirectory with each rank count using editable install
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
-          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
-          - test_dir: examples
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: examples
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: examples
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: examples
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: x
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
-          - test_dir: ops
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -456,207 +211,84 @@ jobs:
       - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable install)
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: editable, marker: ${{ matrix.marker }})"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: editable)"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "editable" \
-            "${{ matrix.marker }}"
+            "editable"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable, marker: ${{ matrix.marker }}) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (editable) passed!"
 
   test-install:
-    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, ${{ matrix.marker }}, pip install)
+    name: Test ${{ matrix.test_dir }} (${{ matrix.num_ranks }} ranks, pip install)
     needs: [build-container-image, test-editable]
     runs-on: [self-hosted, mi3xx]
     strategy:
       fail-fast: false
       matrix:
         include:
-          # Phase 1: Run single_rank tests only on 1 rank
-          - test_dir: examples
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: unittests
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: ccl
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: x
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          - test_dir: ops
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "single_rank"
-          # Phase 2: Run multi_rank_required tests on all rank configs
-          - test_dir: examples
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "multi_rank_required"
-          - test_dir: examples
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "multi_rank_required"
-          - test_dir: examples
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: examples
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: unittests
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: ccl
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: x
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "multi_rank_required"
-          - test_dir: x
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "multi_rank_required"
-          - test_dir: x
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: x
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: ops
-            num_ranks: 1
-            gpu_devices: "0,1"
-            marker: "multi_rank_required"
-          - test_dir: ops
-            num_ranks: 2
-            gpu_devices: "2,3"
-            marker: "multi_rank_required"
-          - test_dir: ops
-            num_ranks: 4
-            gpu_devices: "4,5,6,7"
-            marker: "multi_rank_required"
-          - test_dir: ops
-            num_ranks: 8
-            gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "multi_rank_required"
-          # Phase 3: Run unmarked tests on all rank configs (backward compatibility)
+          # Test each subdirectory with each rank count using pip install
           - test_dir: examples
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: examples
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: unittests
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ccl
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: x
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 1
             gpu_devices: "0,1"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 2
             gpu_devices: "2,3"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 4
             gpu_devices: "4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
           - test_dir: ops
             num_ranks: 8
             gpu_devices: "0,1,2,3,4,5,6,7"
-            marker: "not single_rank and not multi_rank_required"
 
     steps:
       - name: Checkout repository
@@ -669,13 +301,12 @@ jobs:
       - name: Run ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (pip install)
         run: |
           set -e
-          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: install, marker: ${{ matrix.marker }})"
+          echo "::group::Running ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install: install)"
           bash .github/scripts/run_tests.sh \
             "${{ matrix.test_dir }}" \
             "${{ matrix.num_ranks }}" \
             "${{ matrix.gpu_devices }}" \
-            "install" \
-            "${{ matrix.marker }}"
+            "install"
           echo "::endgroup::"
-          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install, marker: ${{ matrix.marker }}) passed!"
+          echo "✅ ${{ matrix.test_dir }} tests with ${{ matrix.num_ranks }} ranks (install) passed!"
 
diff --git a/docs/test_optimization.md b/docs/test_optimization.md
index ff0b4163d..d0b9b2bb5 100644
--- a/docs/test_optimization.md
+++ b/docs/test_optimization.md
@@ -8,14 +8,15 @@ This document describes the Phase 1 test suite optimization implemented to reduc
 
 Analysis revealed that the original test suite was running **every test** on **all 4 rank configurations** (1, 2, 4, 8 ranks), which was wasteful. While multi-rank validation is essential for distributed features (symmetric heap allocation, cross-rank operations), many tests only validate tensor properties (shape, dtype, values) and don't require multi-rank execution.
 
-### Original Test Matrix
+### Original Test Execution
 - **3 install methods** × **5 test directories** × **4 rank configs** = **60 CI jobs**
 - Each job runs all tests in a directory
 - Total multi-rank test runs: **6.37M**
 
-### Optimized Test Matrix
-- **3 install methods** × **65 matrix entries** = **195 CI jobs**
-- Tests are filtered by pytest markers
+### Optimized Test Execution
+- **Same CI matrix structure** (no workflow changes)
+- Tests are filtered automatically by pytest markers
+- Single-rank tests skip execution when NUM_RANKS > 1
 - Total multi-rank test runs: **3.98M** (37.5% reduction)
 
 ## Implementation
@@ -59,50 +60,23 @@ The script:
 - Adds `pytestmark = pytest.mark.<marker>` to test files
 - Preserves backward compatibility for unmarked tests
 
-### 4. CI Workflow Updates
+### 4. Test Filtering
 
-The `.github/workflows/iris-tests.yml` file was updated to run tests based on markers:
-
-**Phase 1: Single-rank tests (5 entries per install method)**
-```yaml
-- test_dir: examples
-  num_ranks: 1
-  marker: "single_rank"
-```
-
-**Phase 2: Multi-rank tests (20 entries per install method)**
-```yaml
-- test_dir: examples
-  num_ranks: 1
-  marker: "multi_rank_required"
-- test_dir: examples
-  num_ranks: 2
-  marker: "multi_rank_required"
-# ... continues for 4 and 8 ranks
-```
-
-**Phase 3: Unmarked tests (20 entries per install method for backward compatibility)**
-```yaml
-- test_dir: examples
-  num_ranks: 1
-  marker: "not single_rank and not multi_rank_required"
-# ... continues for all ranks
-```
-
-### 5. Test Script Updates
-
-The `run_tests.sh` script was updated to accept a marker parameter:
+The `.github/scripts/run_tests.sh` script was minimally modified to skip `single_rank` tests when running with multiple ranks:
 
 ```bash
-bash .github/scripts/run_tests.sh \
-  "$test_dir" \
-  "$num_ranks" \
-  "$gpu_devices" \
-  "$install_method" \
-  "$marker"  # New parameter
+# Skip single_rank tests when running with multiple ranks
+MARKER_ARG=""
+if [ "$NUM_RANKS" -gt 1 ]; then
+    MARKER_ARG="-m 'not single_rank'"
+fi
 ```
 
-The marker is passed to pytest using `-m "marker_expression"`.
+This approach:
+- Requires minimal changes to CI infrastructure
+- Uses pytest's built-in marker filtering
+- Automatically skips single_rank tests on multi-rank configurations
+- Preserves the existing CI workflow structure
 
 ## Adding New Tests
 

From 7c7a2b8d12fdaa09799eae286f896f00690c616f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 17:40:58 +0000
Subject: [PATCH 07/11] Simplify markers - assume multi-rank by default, only
 mark single_rank tests

- Removed assign_test_markers.py script (no longer needed)
- Removed all multi_rank_required markers from 47 test files
- Kept only single_rank markers on 10 test files (zeros, ones, rand, etc.)
- Updated pytest.ini to define only single_rank marker
- Updated documentation to reflect simpler approach: multi-rank is default
- This reduces marking burden and makes the intent clearer

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 docs/test_optimization.md                  |  81 +++---
 pytest.ini                                 |   3 +-
 scripts/assign_test_markers.py             | 284 ---------------------
 tests/ccl/test_all_gather.py               |   2 -
 tests/ccl/test_all_reduce.py               |   2 -
 tests/ccl/test_all_to_all.py               |   2 -
 tests/ccl/test_all_to_all_gluon.py         |   2 -
 tests/ccl/test_process_groups.py           |   2 -
 tests/examples/test_all_load_bench.py      |   2 -
 tests/examples/test_atomic_add_bench.py    |   2 -
 tests/examples/test_flash_decode.py        |   2 -
 tests/examples/test_load_bench.py          |   2 -
 tests/examples/test_message_passing.py     |   2 -
 tests/ops/test_all_gather_matmul.py        |   2 -
 tests/ops/test_matmul_all_gather.py        |   2 -
 tests/ops/test_matmul_all_reduce.py        |   2 -
 tests/ops/test_matmul_reduce_scatter.py    |   2 -
 tests/unittests/test_atomic_add_gluon.py   |   2 -
 tests/unittests/test_atomic_add_triton.py  |   2 -
 tests/unittests/test_atomic_and_gluon.py   |   2 -
 tests/unittests/test_atomic_and_triton.py  |   2 -
 tests/unittests/test_atomic_cas_gluon.py   |   2 -
 tests/unittests/test_atomic_cas_triton.py  |   2 -
 tests/unittests/test_atomic_max_gluon.py   |   2 -
 tests/unittests/test_atomic_max_triton.py  |   2 -
 tests/unittests/test_atomic_min_gluon.py   |   2 -
 tests/unittests/test_atomic_min_triton.py  |   2 -
 tests/unittests/test_atomic_or_gluon.py    |   2 -
 tests/unittests/test_atomic_or_triton.py   |   2 -
 tests/unittests/test_atomic_xchg_gluon.py  |   2 -
 tests/unittests/test_atomic_xchg_triton.py |   2 -
 tests/unittests/test_atomic_xor_gluon.py   |   2 -
 tests/unittests/test_atomic_xor_triton.py  |   2 -
 tests/unittests/test_broadcast_gluon.py    |   2 -
 tests/unittests/test_broadcast_triton.py   |   2 -
 tests/unittests/test_copy_gluon.py         |   2 -
 tests/unittests/test_copy_triton.py        |   2 -
 tests/unittests/test_get_gluon.py          |   2 -
 tests/unittests/test_get_triton.py         |   2 -
 tests/unittests/test_load_gluon.py         |   2 -
 tests/unittests/test_load_triton.py        |   2 -
 tests/unittests/test_put_gluon.py          |   2 -
 tests/unittests/test_put_triton.py         |   2 -
 tests/unittests/test_store_gluon.py        |   2 -
 tests/unittests/test_store_triton.py       |   2 -
 tests/x/test_all_gather.py                 |   2 -
 tests/x/test_all_reduce.py                 |   2 -
 tests/x/test_all_to_all.py                 |   2 -
 tests/x/test_gather.py                     |   2 -
 tests/x/test_reduce_scatter.py             |   2 -
 50 files changed, 37 insertions(+), 425 deletions(-)
 delete mode 100755 scripts/assign_test_markers.py

diff --git a/docs/test_optimization.md b/docs/test_optimization.md
index d0b9b2bb5..97fa96b5c 100644
--- a/docs/test_optimization.md
+++ b/docs/test_optimization.md
@@ -17,50 +17,31 @@ Analysis revealed that the original test suite was running **every test** on **a
 - **Same CI matrix structure** (no workflow changes)
 - Tests are filtered automatically by pytest markers
 - Single-rank tests skip execution when NUM_RANKS > 1
+- **Default behavior**: All tests run on all ranks unless marked `single_rank`
 - Total multi-rank test runs: **3.98M** (37.5% reduction)
 
 ## Implementation
 
-### 1. Pytest Markers
+### 1. Pytest Marker
 
-Two new markers were added in `pytest.ini`:
+One marker is defined in `pytest.ini`:
 
 - **`@pytest.mark.single_rank`**: Tests that validate tensor properties (shape, dtype, values)
   - These tests only need to run on **1 rank**
   - Examples: `test_zeros`, `test_ones`, `test_rand`, `test_full`, `test_empty`
   
-- **`@pytest.mark.multi_rank_required`**: Tests that validate distributed behavior
-  - These tests must run on **all rank configurations** (1, 2, 4, 8)
-  - Examples: `test_get_*`, `test_put_*`, `test_load_*`, `test_store_*`, `test_all_reduce`, `test_all_gather`
+**Default behavior**: Tests without the `single_rank` marker run on **all rank configurations** (1, 2, 4, 8). This includes all distributed tests (get, put, load, store, atomics, collectives) without requiring explicit marking.
 
 ### 2. Test Classification
 
-Tests were classified into three categories:
+Tests are classified into two categories:
 
 | Category | Count | Runs on Ranks | Examples |
 |----------|-------|---------------|----------|
-| `single_rank` | 10 files | 1 only | zeros, ones, rand, empty, full, arange, linspace, randint, randn, zeros_like |
-| `multi_rank_required` | 47 files | 1, 2, 4, 8 | get, put, load, store, atomic_*, broadcast, copy, all_reduce, all_gather, all_to_all |
-| Unmarked | 4 files | 1, 2, 4, 8 | logging, dmabuf_apis, get_num_xcc, iris_helpers |
+| `single_rank` (marked) | 10 files | 1 only | zeros, ones, rand, empty, full, arange, linspace, randint, randn, zeros_like |
+| Default (unmarked) | 51 files | 1, 2, 4, 8 | get, put, load, store, atomic_*, broadcast, copy, all_reduce, all_gather, all_to_all, logging, dmabuf_apis, get_num_xcc, iris_helpers |
 
-### 3. Automated Marker Assignment
-
-A Python script `scripts/assign_test_markers.py` was created to automate the marker assignment process:
-
-```bash
-# Preview changes (dry run)
-python scripts/assign_test_markers.py --dry-run --test-dir tests
-
-# Apply markers
-python scripts/assign_test_markers.py --test-dir tests
-```
-
-The script:
-- Classifies tests based on their functionality
-- Adds `pytestmark = pytest.mark.<marker>` to test files
-- Preserves backward compatibility for unmarked tests
-
-### 4. Test Filtering
+### 3. Test Filtering
 
 The `.github/scripts/run_tests.sh` script was minimally modified to skip `single_rank` tests when running with multiple ranks:
 
@@ -73,9 +54,10 @@ fi
 ```
 
 This approach:
-- Requires minimal changes to CI infrastructure
+- Requires minimal changes to CI infrastructure (only 6 lines added)
 - Uses pytest's built-in marker filtering
 - Automatically skips single_rank tests on multi-rank configurations
+- Assumes multi-rank by default (simpler, less marking required)
 - Preserves the existing CI workflow structure
 
 ## Adding New Tests
@@ -96,6 +78,24 @@ import iris
 
 pytestmark = pytest.mark.single_rank
 
+## Adding New Tests
+
+When adding new tests, follow these guidelines:
+
+### Single-rank Tests (Minority - Require Marking)
+Use `@pytest.mark.single_rank` **only** for tests that:
+- Validate tensor properties (shape, dtype, values)
+- Test tensor creation functions (zeros, ones, rand, etc.)
+- Don't involve cross-rank communication
+- Can verify correctness on a single rank
+
+Example:
+```python
+import pytest
+import iris
+
+pytestmark = pytest.mark.single_rank
+
 def test_zeros():
     shmem = iris.iris(1 << 20)
     result = shmem.zeros(2, 3, dtype=torch.float32)
@@ -103,20 +103,20 @@ def test_zeros():
     assert result.dtype == torch.float32
 ```
 
-### Multi-rank Tests
-Use `@pytest.mark.multi_rank_required` for tests that:
+### Multi-rank Tests (Majority - Default, No Marking Needed)
+**Do not mark** tests that:
 - Validate distributed behavior
 - Test cross-rank operations (get, put, load, store)
 - Test collective operations (all_reduce, all_gather, all_to_all)
 - Test atomic operations across ranks
 - Require symmetric heap visibility validation
 
-Example:
+Example (no marker needed):
 ```python
 import pytest
 import iris
 
-pytestmark = pytest.mark.multi_rank_required
+# No pytestmark needed - runs on all ranks by default
 
 def test_all_reduce():
     shmem = iris.iris(1 << 20)
@@ -126,11 +126,7 @@ def test_all_reduce():
     # Validation logic...
 ```
 
-### Unmarked Tests
-Leave tests unmarked if:
-- They test infrastructure/utilities (logging, helpers)
-- Classification is unclear
-- Backward compatibility is preferred
+**Key principle**: Assume multi-rank by default. Only mark the small subset of tests that are `single_rank`.
 
 ## Running Tests Locally
 
@@ -144,19 +140,14 @@ pytest tests/
 pytest tests/ -m single_rank
 ```
 
-### Run only multi-rank tests
-```bash
-pytest tests/ -m multi_rank_required
-```
-
-### Run unmarked tests
+### Run only multi-rank tests (unmarked)
 ```bash
-pytest tests/ -m "not single_rank and not multi_rank_required"
+pytest tests/ -m "not single_rank"
 ```
 
 ### Run with specific rank count
 ```bash
-python tests/run_tests_distributed.py --num_ranks 4 tests/ccl/test_all_reduce.py -m multi_rank_required
+python tests/run_tests_distributed.py --num_ranks 4 tests/ccl/test_all_reduce.py
 ```
 
 ## Expected Impact
diff --git a/pytest.ini b/pytest.ini
index 7413a3d3e..c12be3955 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -3,5 +3,4 @@
 
 [pytest]
 markers =
-    single_rank: Tests validating tensor properties (shape, dtype, values) - run on 1 rank only
-    multi_rank_required: Tests validating distributed behavior (symmetric heap visibility, cross-rank operations) - run on all ranks
+    single_rank: Tests validating tensor properties (shape, dtype, values) - run on 1 rank only. All other tests run on all rank configurations by default.
diff --git a/scripts/assign_test_markers.py b/scripts/assign_test_markers.py
deleted file mode 100755
index 492a9e01f..000000000
--- a/scripts/assign_test_markers.py
+++ /dev/null
@@ -1,284 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: MIT
-# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
-
-"""
-Automated Test Marker Assignment Script
-
-This script assigns pytest markers (@pytest.mark.single_rank or @pytest.mark.multi_rank_required)
-to test files based on the type of functionality they test.
-
-Classification rules:
-- single_rank: Tests validating tensor properties (shape, dtype, values) - run on 1 rank only
-  Examples: zeros, ones, empty, full, rand, randint, randn, arange, linspace
-  
-- multi_rank_required: Tests validating distributed behavior (symmetric heap visibility, cross-rank operations) - run on all ranks
-  Examples: get, put, load, store, atomic operations, broadcast, copy, all_reduce, all_gather, all_to_all
-"""
-
-import os
-import sys
-import re
-from pathlib import Path
-
-
-# Tests that should be marked as single_rank (tensor property tests)
-SINGLE_RANK_PATTERNS = [
-    "test_zeros.py",
-    "test_ones.py", 
-    "test_empty.py",
-    "test_full.py",
-    "test_rand.py",
-    "test_randint.py",
-    "test_randn.py",
-    "test_arange.py",
-    "test_linspace.py",
-    "test_zeros_like.py",
-]
-
-# Tests that should be marked as multi_rank_required (distributed tests)
-MULTI_RANK_PATTERNS = [
-    # Remote memory access operations
-    "test_get_gluon.py",
-    "test_get_triton.py",
-    "test_put_gluon.py",
-    "test_put_triton.py",
-    "test_load_gluon.py",
-    "test_load_triton.py",
-    "test_store_gluon.py",
-    "test_store_triton.py",
-    # Atomic operations
-    "test_atomic_add_gluon.py",
-    "test_atomic_add_triton.py",
-    "test_atomic_and_gluon.py",
-    "test_atomic_and_triton.py",
-    "test_atomic_cas_gluon.py",
-    "test_atomic_cas_triton.py",
-    "test_atomic_max_gluon.py",
-    "test_atomic_max_triton.py",
-    "test_atomic_min_gluon.py",
-    "test_atomic_min_triton.py",
-    "test_atomic_or_gluon.py",
-    "test_atomic_or_triton.py",
-    "test_atomic_xchg_gluon.py",
-    "test_atomic_xchg_triton.py",
-    "test_atomic_xor_gluon.py",
-    "test_atomic_xor_triton.py",
-    # Data movement operations
-    "test_broadcast_gluon.py",
-    "test_broadcast_triton.py",
-    "test_copy_gluon.py",
-    "test_copy_triton.py",
-    # Collective operations (all in ccl, ops, x directories)
-    "test_all_reduce.py",
-    "test_all_gather.py",
-    "test_all_to_all.py",
-    "test_all_to_all_gluon.py",
-    "test_process_groups.py",
-    "test_reduce_scatter.py",
-    "test_gather.py",
-    # Matmul + collective operations
-    "test_all_gather_matmul.py",
-    "test_matmul_all_gather.py",
-    "test_matmul_all_reduce.py",
-    "test_matmul_reduce_scatter.py",
-]
-
-# Tests in examples directory that test distributed behavior
-EXAMPLE_MULTI_RANK_PATTERNS = [
-    "test_load_bench.py",
-    "test_all_load_bench.py",
-    "test_atomic_add_bench.py",
-    "test_message_passing.py",
-    "test_flash_decode.py",
-]
-
-
-def should_mark_single_rank(filepath: Path) -> bool:
-    """Check if a test file should be marked as single_rank."""
-    filename = filepath.name
-    return filename in SINGLE_RANK_PATTERNS
-
-
-def should_mark_multi_rank(filepath: Path) -> bool:
-    """Check if a test file should be marked as multi_rank_required."""
-    filename = filepath.name
-    
-    # Check if it's in the patterns list
-    if filename in MULTI_RANK_PATTERNS:
-        return True
-    
-    # Check if it's in examples directory and matches example patterns
-    if "examples" in filepath.parts and filename in EXAMPLE_MULTI_RANK_PATTERNS:
-        return True
-    
-    return False
-
-
-def get_marker_for_file(filepath: Path) -> str:
-    """Determine the appropriate marker for a test file."""
-    if should_mark_single_rank(filepath):
-        return "single_rank"
-    elif should_mark_multi_rank(filepath):
-        return "multi_rank_required"
-    else:
-        # Leave unmarked for backward compatibility
-        return None
-
-
-def has_marker(content: str, marker: str) -> bool:
-    """Check if the file already has the specified marker."""
-    marker_pattern = rf"pytestmark\s*=\s*pytest\.mark\.{marker}"
-    return re.search(marker_pattern, content) is not None
-
-
-def add_marker_to_file(filepath: Path, marker: str, dry_run: bool = False) -> bool:
-    """Add a pytest marker to a test file using pytestmark."""
-    with open(filepath, 'r') as f:
-        content = f.read()
-    
-    # Check if marker already exists
-    if has_marker(content, marker):
-        print(f"  ✓ {filepath.name} already has pytestmark = pytest.mark.{marker}")
-        return False
-    
-    lines = content.split('\n')
-    
-    # Find the position to insert the marker
-    # It should go after the last import and before the first non-comment, non-import line
-    insert_pos = None
-    in_docstring = False
-    docstring_char = None
-    
-    for i, line in enumerate(lines):
-        stripped = line.strip()
-        
-        # Handle docstrings
-        if stripped.startswith('"""') or stripped.startswith("'''"):
-            if not in_docstring:
-                # Starting a docstring
-                in_docstring = True
-                docstring_char = stripped[:3]
-                # Check if it's a one-liner docstring
-                if stripped.count(docstring_char) >= 2:
-                    in_docstring = False
-                continue
-            elif stripped.endswith(docstring_char):
-                # Ending a docstring
-                in_docstring = False
-                continue
-        
-        if in_docstring:
-            continue
-            
-        # Skip empty lines and comments
-        if not stripped or stripped.startswith('#'):
-            continue
-            
-        # Skip import lines
-        if stripped.startswith('import ') or stripped.startswith('from '):
-            continue
-            
-        # This is the first non-import, non-docstring line - insert before it
-        insert_pos = i
-        break
-    
-    if insert_pos is None:
-        print(f"  ✗ Could not find appropriate location to add marker in {filepath.name}")
-        return False
-    
-    # Insert the marker with appropriate spacing
-    marker_line = f"\npytestmark = pytest.mark.{marker}\n"
-    lines.insert(insert_pos, marker_line)
-    
-    new_content = '\n'.join(lines)
-    
-    if dry_run:
-        print(f"  → Would add pytestmark = pytest.mark.{marker} to {filepath.name}")
-        return True
-    else:
-        with open(filepath, 'w') as f:
-            f.write(new_content)
-        print(f"  ✓ Added pytestmark = pytest.mark.{marker} to {filepath.name}")
-        return True
-
-
-def process_test_directory(test_dir: Path, dry_run: bool = False) -> dict:
-    """Process all test files in a directory."""
-    stats = {
-        'total': 0,
-        'single_rank': 0,
-        'multi_rank': 0,
-        'unmarked': 0,
-        'modified': 0,
-    }
-    
-    for test_file in test_dir.rglob('test_*.py'):
-        stats['total'] += 1
-        marker = get_marker_for_file(test_file)
-        
-        if marker == 'single_rank':
-            stats['single_rank'] += 1
-            if add_marker_to_file(test_file, marker, dry_run):
-                stats['modified'] += 1
-        elif marker == 'multi_rank_required':
-            stats['multi_rank'] += 1
-            if add_marker_to_file(test_file, marker, dry_run):
-                stats['modified'] += 1
-        else:
-            stats['unmarked'] += 1
-            print(f"  - {test_file.name} left unmarked (backward compatibility)")
-    
-    return stats
-
-
-def main():
-    """Main entry point."""
-    import argparse
-    
-    parser = argparse.ArgumentParser(
-        description='Assign pytest markers to test files based on functionality',
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog=__doc__
-    )
-    parser.add_argument(
-        '--dry-run',
-        action='store_true',
-        help='Show what would be done without making changes'
-    )
-    parser.add_argument(
-        '--test-dir',
-        type=Path,
-        default=Path('tests'),
-        help='Path to tests directory (default: tests)'
-    )
-    
-    args = parser.parse_args()
-    
-    if not args.test_dir.exists():
-        print(f"Error: Test directory {args.test_dir} does not exist")
-        sys.exit(1)
-    
-    print(f"Processing test files in {args.test_dir}...")
-    if args.dry_run:
-        print("DRY RUN - no files will be modified\n")
-    
-    stats = process_test_directory(args.test_dir, args.dry_run)
-    
-    print("\n" + "="*70)
-    print("Summary:")
-    print("="*70)
-    print(f"Total test files:           {stats['total']}")
-    print(f"Single-rank tests:          {stats['single_rank']}")
-    print(f"Multi-rank required tests:  {stats['multi_rank']}")
-    print(f"Unmarked tests:             {stats['unmarked']}")
-    print(f"Files modified:             {stats['modified']}")
-    
-    if args.dry_run:
-        print("\nRun without --dry-run to apply changes")
-    
-    return 0
-
-
-if __name__ == '__main__':
-    sys.exit(main())
diff --git a/tests/ccl/test_all_gather.py b/tests/ccl/test_all_gather.py
index 51acc2fdd..da69e6fa9 100644
--- a/tests/ccl/test_all_gather.py
+++ b/tests/ccl/test_all_gather.py
@@ -13,8 +13,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py
index 877e895da..e2913662f 100644
--- a/tests/ccl/test_all_reduce.py
+++ b/tests/ccl/test_all_reduce.py
@@ -13,8 +13,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "variant",
     [
diff --git a/tests/ccl/test_all_to_all.py b/tests/ccl/test_all_to_all.py
index 9a321c11f..f7fa392fb 100644
--- a/tests/ccl/test_all_to_all.py
+++ b/tests/ccl/test_all_to_all.py
@@ -13,8 +13,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_to_all_gluon.py b/tests/ccl/test_all_to_all_gluon.py
index 52771f85c..fffdfbdb4 100644
--- a/tests/ccl/test_all_to_all_gluon.py
+++ b/tests/ccl/test_all_to_all_gluon.py
@@ -11,8 +11,6 @@
 
 # Try to import Gluon, skip tests if not available
 
-pytestmark = pytest.mark.multi_rank_required
-
 try:
     import iris.experimental.iris_gluon as iris_gluon
     from iris.ccl import Config
diff --git a/tests/ccl/test_process_groups.py b/tests/ccl/test_process_groups.py
index 9b5399a77..82c813f15 100644
--- a/tests/ccl/test_process_groups.py
+++ b/tests/ccl/test_process_groups.py
@@ -18,8 +18,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 def _get_world_info():
     """Get world size and rank, skip if not enough ranks."""
     if not dist.is_initialized():
diff --git a/tests/examples/test_all_load_bench.py b/tests/examples/test_all_load_bench.py
index 2ee4deef8..b1ba077e0 100644
--- a/tests/examples/test_all_load_bench.py
+++ b/tests/examples/test_all_load_bench.py
@@ -10,8 +10,6 @@
 from pathlib import Path
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 current_dir = Path(__file__).parent
 file_path = (current_dir / "../../examples/02_all_load/all_load_bench.py").resolve()
 module_name = "all_load_bench"
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index a9f9b1402..53ec41630 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -11,8 +11,6 @@
 from pathlib import Path
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 current_dir = Path(__file__).parent
 
 # Add examples directory to sys.path so that example files can import from examples.common
diff --git a/tests/examples/test_flash_decode.py b/tests/examples/test_flash_decode.py
index 80a0e8349..5a516c991 100644
--- a/tests/examples/test_flash_decode.py
+++ b/tests/examples/test_flash_decode.py
@@ -40,8 +40,6 @@
 import iris
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 project_root = Path(__file__).resolve()
 while not (project_root / "tests").is_dir() or not (project_root / "examples").is_dir():
     if project_root == project_root.parent:
diff --git a/tests/examples/test_load_bench.py b/tests/examples/test_load_bench.py
index 3d9082a5e..60a252ce9 100644
--- a/tests/examples/test_load_bench.py
+++ b/tests/examples/test_load_bench.py
@@ -11,8 +11,6 @@
 from pathlib import Path
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 current_dir = Path(__file__).parent
 file_path = (current_dir / "../../examples/00_load/load_bench.py").resolve()
 module_name = "load_bench"
diff --git a/tests/examples/test_message_passing.py b/tests/examples/test_message_passing.py
index d1104113d..87798f5c4 100644
--- a/tests/examples/test_message_passing.py
+++ b/tests/examples/test_message_passing.py
@@ -11,8 +11,6 @@
 from pathlib import Path
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 current_dir = Path(__file__).parent
 
 # Import message_passing_load_store module
diff --git a/tests/ops/test_all_gather_matmul.py b/tests/ops/test_all_gather_matmul.py
index 8d93f2662..393b057b8 100644
--- a/tests/ops/test_all_gather_matmul.py
+++ b/tests/ops/test_all_gather_matmul.py
@@ -16,8 +16,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_all_gather.py b/tests/ops/test_matmul_all_gather.py
index f94984d79..2d9eaac25 100644
--- a/tests/ops/test_matmul_all_gather.py
+++ b/tests/ops/test_matmul_all_gather.py
@@ -15,8 +15,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_all_reduce.py b/tests/ops/test_matmul_all_reduce.py
index 521dbcc1e..5e6b70792 100644
--- a/tests/ops/test_matmul_all_reduce.py
+++ b/tests/ops/test_matmul_all_reduce.py
@@ -16,8 +16,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_reduce_scatter.py b/tests/ops/test_matmul_reduce_scatter.py
index e985c7801..802d3dfc7 100644
--- a/tests/ops/test_matmul_reduce_scatter.py
+++ b/tests/ops/test_matmul_reduce_scatter.py
@@ -13,8 +13,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/unittests/test_atomic_add_gluon.py b/tests/unittests/test_atomic_add_gluon.py
index 88b35af91..1db01b790 100644
--- a/tests/unittests/test_atomic_add_gluon.py
+++ b/tests/unittests/test_atomic_add_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_add_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_add_triton.py b/tests/unittests/test_atomic_add_triton.py
index dd1a9a811..af9c5f4d0 100644
--- a/tests/unittests/test_atomic_add_triton.py
+++ b/tests/unittests/test_atomic_add_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_add_kernel(
     results,
diff --git a/tests/unittests/test_atomic_and_gluon.py b/tests/unittests/test_atomic_and_gluon.py
index 27aceb016..81c482a3f 100644
--- a/tests/unittests/test_atomic_and_gluon.py
+++ b/tests/unittests/test_atomic_and_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_and_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_and_triton.py b/tests/unittests/test_atomic_and_triton.py
index 5abe90d7c..895c13d6b 100644
--- a/tests/unittests/test_atomic_and_triton.py
+++ b/tests/unittests/test_atomic_and_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_and_kernel(
     results,
diff --git a/tests/unittests/test_atomic_cas_gluon.py b/tests/unittests/test_atomic_cas_gluon.py
index fd856d3b1..76e13a82c 100644
--- a/tests/unittests/test_atomic_cas_gluon.py
+++ b/tests/unittests/test_atomic_cas_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_cas_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_cas_triton.py b/tests/unittests/test_atomic_cas_triton.py
index a63db3b74..5f1071d05 100644
--- a/tests/unittests/test_atomic_cas_triton.py
+++ b/tests/unittests/test_atomic_cas_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_cas_kernel(
     results,
diff --git a/tests/unittests/test_atomic_max_gluon.py b/tests/unittests/test_atomic_max_gluon.py
index 7a33df0ab..4b0b2aa4d 100644
--- a/tests/unittests/test_atomic_max_gluon.py
+++ b/tests/unittests/test_atomic_max_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_max_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_max_triton.py b/tests/unittests/test_atomic_max_triton.py
index d9875786f..194c6dba0 100644
--- a/tests/unittests/test_atomic_max_triton.py
+++ b/tests/unittests/test_atomic_max_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_max_kernel(
     results,
diff --git a/tests/unittests/test_atomic_min_gluon.py b/tests/unittests/test_atomic_min_gluon.py
index 7f2ba7d55..d0aa2bfd1 100644
--- a/tests/unittests/test_atomic_min_gluon.py
+++ b/tests/unittests/test_atomic_min_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_min_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_min_triton.py b/tests/unittests/test_atomic_min_triton.py
index 1fa6f90de..39e9f2efc 100644
--- a/tests/unittests/test_atomic_min_triton.py
+++ b/tests/unittests/test_atomic_min_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_min_kernel(
     results,
diff --git a/tests/unittests/test_atomic_or_gluon.py b/tests/unittests/test_atomic_or_gluon.py
index 8de1ade2c..82d5122b2 100644
--- a/tests/unittests/test_atomic_or_gluon.py
+++ b/tests/unittests/test_atomic_or_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_or_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_or_triton.py b/tests/unittests/test_atomic_or_triton.py
index 132be6d57..cca442b8d 100644
--- a/tests/unittests/test_atomic_or_triton.py
+++ b/tests/unittests/test_atomic_or_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_or_kernel(
     results,
diff --git a/tests/unittests/test_atomic_xchg_gluon.py b/tests/unittests/test_atomic_xchg_gluon.py
index 7ef3a4fb1..a120c6824 100644
--- a/tests/unittests/test_atomic_xchg_gluon.py
+++ b/tests/unittests/test_atomic_xchg_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_xchg_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_xchg_triton.py b/tests/unittests/test_atomic_xchg_triton.py
index cfbbd2df1..5236d3394 100644
--- a/tests/unittests/test_atomic_xchg_triton.py
+++ b/tests/unittests/test_atomic_xchg_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_xchg_kernel(
     results,
diff --git a/tests/unittests/test_atomic_xor_gluon.py b/tests/unittests/test_atomic_xor_gluon.py
index f2cf50743..a564dbcd1 100644
--- a/tests/unittests/test_atomic_xor_gluon.py
+++ b/tests/unittests/test_atomic_xor_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def atomic_xor_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_atomic_xor_triton.py b/tests/unittests/test_atomic_xor_triton.py
index d12644ebf..668fd5e31 100644
--- a/tests/unittests/test_atomic_xor_triton.py
+++ b/tests/unittests/test_atomic_xor_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def atomic_xor_kernel(
     results,
diff --git a/tests/unittests/test_broadcast_gluon.py b/tests/unittests/test_broadcast_gluon.py
index 574b06db5..f9584038b 100644
--- a/tests/unittests/test_broadcast_gluon.py
+++ b/tests/unittests/test_broadcast_gluon.py
@@ -8,8 +8,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "value,expected",
     [
diff --git a/tests/unittests/test_broadcast_triton.py b/tests/unittests/test_broadcast_triton.py
index 42362aff8..ba002ec02 100644
--- a/tests/unittests/test_broadcast_triton.py
+++ b/tests/unittests/test_broadcast_triton.py
@@ -8,8 +8,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @pytest.mark.parametrize(
     "value,expected",
     [
diff --git a/tests/unittests/test_copy_gluon.py b/tests/unittests/test_copy_gluon.py
index 4590bbe61..2276ca7a8 100644
--- a/tests/unittests/test_copy_gluon.py
+++ b/tests/unittests/test_copy_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def copy_get_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_copy_triton.py b/tests/unittests/test_copy_triton.py
index 32cc0a797..90584d6e3 100644
--- a/tests/unittests/test_copy_triton.py
+++ b/tests/unittests/test_copy_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def copy_get_kernel(
     data,
diff --git a/tests/unittests/test_get_gluon.py b/tests/unittests/test_get_gluon.py
index 910721b75..d3cb58a9a 100644
--- a/tests/unittests/test_get_gluon.py
+++ b/tests/unittests/test_get_gluon.py
@@ -13,8 +13,6 @@
 # 2. for remote get with one other rank.
 # 3. for remote get with more than one rank (if num_ranks > 2).
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def get_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_get_triton.py b/tests/unittests/test_get_triton.py
index 090f50a14..845e1c99a 100644
--- a/tests/unittests/test_get_triton.py
+++ b/tests/unittests/test_get_triton.py
@@ -13,8 +13,6 @@
 # 2. for remote get with one other rank.
 # 3. for remote get with more than one rank (if num_ranks > 2).
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def get_kernel(
     data,
diff --git a/tests/unittests/test_load_gluon.py b/tests/unittests/test_load_gluon.py
index 9dbd7df43..98ef319ee 100644
--- a/tests/unittests/test_load_gluon.py
+++ b/tests/unittests/test_load_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def load_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_load_triton.py b/tests/unittests/test_load_triton.py
index 733c6de8a..6ac6cf324 100644
--- a/tests/unittests/test_load_triton.py
+++ b/tests/unittests/test_load_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def load_kernel(
     data,
diff --git a/tests/unittests/test_put_gluon.py b/tests/unittests/test_put_gluon.py
index 659366c8f..04ee9d8de 100644
--- a/tests/unittests/test_put_gluon.py
+++ b/tests/unittests/test_put_gluon.py
@@ -13,8 +13,6 @@
 # 2. for remote put with one other rank.
 # 3. for remote put with more than one rank (if num_ranks > 2).
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def put_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_put_triton.py b/tests/unittests/test_put_triton.py
index 0d5f61919..2febb43d4 100644
--- a/tests/unittests/test_put_triton.py
+++ b/tests/unittests/test_put_triton.py
@@ -13,8 +13,6 @@
 # 2. for remote put with one other rank.
 # 3. for remote put with more than one rank (if num_ranks > 2).
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def put_kernel(
     data,
diff --git a/tests/unittests/test_store_gluon.py b/tests/unittests/test_store_gluon.py
index 90616b38c..384f32275 100644
--- a/tests/unittests/test_store_gluon.py
+++ b/tests/unittests/test_store_gluon.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @gluon.jit
 def store_kernel(
     IrisDeviceCtx: gl.constexpr,
diff --git a/tests/unittests/test_store_triton.py b/tests/unittests/test_store_triton.py
index eee57658a..4319fd94f 100644
--- a/tests/unittests/test_store_triton.py
+++ b/tests/unittests/test_store_triton.py
@@ -9,8 +9,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def store_kernel(
     data,
diff --git a/tests/x/test_all_gather.py b/tests/x/test_all_gather.py
index f42b88939..ecf7fa70f 100644
--- a/tests/x/test_all_gather.py
+++ b/tests/x/test_all_gather.py
@@ -15,8 +15,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def x_all_gather_kernel(
     input_ptr,
diff --git a/tests/x/test_all_reduce.py b/tests/x/test_all_reduce.py
index 5ff434f5c..d50100b38 100644
--- a/tests/x/test_all_reduce.py
+++ b/tests/x/test_all_reduce.py
@@ -15,8 +15,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def x_all_reduce_atomic_kernel(
     input_ptr,
diff --git a/tests/x/test_all_to_all.py b/tests/x/test_all_to_all.py
index 89782d1ed..70e6b6e54 100644
--- a/tests/x/test_all_to_all.py
+++ b/tests/x/test_all_to_all.py
@@ -15,8 +15,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def x_all_to_all_kernel(
     input_ptr,
diff --git a/tests/x/test_gather.py b/tests/x/test_gather.py
index 020b1f8e6..051da63db 100644
--- a/tests/x/test_gather.py
+++ b/tests/x/test_gather.py
@@ -14,8 +14,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def gather_kernel(
     input_ptr,
diff --git a/tests/x/test_reduce_scatter.py b/tests/x/test_reduce_scatter.py
index 1773fe14c..5c1e9f6c3 100644
--- a/tests/x/test_reduce_scatter.py
+++ b/tests/x/test_reduce_scatter.py
@@ -15,8 +15,6 @@
 
 
-pytestmark = pytest.mark.multi_rank_required
-
 @triton.jit
 def x_reduce_scatter_kernel(
     input_ptr,

From 17c1c28fa8535eb10c6431c59bee964bd1a051cc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 17:44:06 +0000
Subject: [PATCH 08/11] Remove excessive blank lines from test files

- Fixed 47 test files that had 3+ consecutive blank lines
- Reduced to maximum 2 consecutive blank lines for cleaner formatting
- All files still have valid Python syntax

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 tests/ccl/test_all_gather.py               |  2 --
 tests/ccl/test_all_reduce.py               |  3 ---
 tests/ccl/test_all_to_all.py               |  2 --
 tests/ccl/test_all_to_all_gluon.py         |  1 -
 tests/ccl/test_process_groups.py           | 18 ------------------
 tests/examples/test_all_load_bench.py      |  4 ----
 tests/examples/test_atomic_add_bench.py    |  3 ---
 tests/examples/test_flash_decode.py        |  5 -----
 tests/examples/test_load_bench.py          |  2 --
 tests/examples/test_message_passing.py     |  5 -----
 tests/ops/test_all_gather_matmul.py        |  3 ---
 tests/ops/test_matmul_all_gather.py        |  2 --
 tests/ops/test_matmul_all_reduce.py        |  3 ---
 tests/ops/test_matmul_reduce_scatter.py    |  3 ---
 tests/unittests/test_atomic_add_gluon.py   |  3 ---
 tests/unittests/test_atomic_add_triton.py  |  3 ---
 tests/unittests/test_atomic_and_gluon.py   |  3 ---
 tests/unittests/test_atomic_and_triton.py  |  3 ---
 tests/unittests/test_atomic_cas_gluon.py   |  3 ---
 tests/unittests/test_atomic_cas_triton.py  |  3 ---
 tests/unittests/test_atomic_max_gluon.py   |  3 ---
 tests/unittests/test_atomic_max_triton.py  |  3 ---
 tests/unittests/test_atomic_min_gluon.py   |  3 ---
 tests/unittests/test_atomic_min_triton.py  |  3 ---
 tests/unittests/test_atomic_or_gluon.py    |  3 ---
 tests/unittests/test_atomic_or_triton.py   |  3 ---
 tests/unittests/test_atomic_xchg_gluon.py  |  3 ---
 tests/unittests/test_atomic_xchg_triton.py |  3 ---
 tests/unittests/test_atomic_xor_gluon.py   |  3 ---
 tests/unittests/test_atomic_xor_triton.py  |  3 ---
 tests/unittests/test_broadcast_gluon.py    |  4 ----
 tests/unittests/test_broadcast_triton.py   |  4 ----
 tests/unittests/test_copy_gluon.py         |  7 -------
 tests/unittests/test_copy_triton.py        |  7 -------
 tests/unittests/test_get_gluon.py          |  2 --
 tests/unittests/test_get_triton.py         |  2 --
 tests/unittests/test_load_gluon.py         |  3 ---
 tests/unittests/test_load_triton.py        |  3 ---
 tests/unittests/test_put_gluon.py          |  2 --
 tests/unittests/test_put_triton.py         |  2 --
 tests/unittests/test_store_gluon.py        |  3 ---
 tests/unittests/test_store_triton.py       |  3 ---
 tests/x/test_all_gather.py                 |  5 -----
 tests/x/test_all_reduce.py                 |  6 ------
 tests/x/test_all_to_all.py                 |  3 ---
 tests/x/test_gather.py                     |  5 -----
 tests/x/test_reduce_scatter.py             |  3 ---
 47 files changed, 168 deletions(-)

diff --git a/tests/ccl/test_all_gather.py b/tests/ccl/test_all_gather.py
index da69e6fa9..721b649bc 100644
--- a/tests/ccl/test_all_gather.py
+++ b/tests/ccl/test_all_gather.py
@@ -11,8 +11,6 @@
 import iris
 from iris.ccl import Config
 
-
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py
index e2913662f..1e6971159 100644
--- a/tests/ccl/test_all_reduce.py
+++ b/tests/ccl/test_all_reduce.py
@@ -11,8 +11,6 @@
 import iris
 from iris.ccl import Config
 
-
-
 @pytest.mark.parametrize(
     "variant",
     [
@@ -108,7 +106,6 @@ def test_all_reduce(variant, dtype, M, N):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "distribution",
     [
diff --git a/tests/ccl/test_all_to_all.py b/tests/ccl/test_all_to_all.py
index f7fa392fb..be647829a 100644
--- a/tests/ccl/test_all_to_all.py
+++ b/tests/ccl/test_all_to_all.py
@@ -11,8 +11,6 @@
 import iris
 from iris.ccl import Config
 
-
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_to_all_gluon.py b/tests/ccl/test_all_to_all_gluon.py
index fffdfbdb4..280a6c234 100644
--- a/tests/ccl/test_all_to_all_gluon.py
+++ b/tests/ccl/test_all_to_all_gluon.py
@@ -20,7 +20,6 @@
 except ImportError:
     GLUON_AVAILABLE = False
 
-
 @pytest.mark.skipif(not GLUON_AVAILABLE, reason="Gluon not available")
 @pytest.mark.parametrize(
     "dtype",
diff --git a/tests/ccl/test_process_groups.py b/tests/ccl/test_process_groups.py
index 82c813f15..0e7bf198c 100644
--- a/tests/ccl/test_process_groups.py
+++ b/tests/ccl/test_process_groups.py
@@ -16,8 +16,6 @@
 import iris
 from iris.ccl import Config
 
-
-
 def _get_world_info():
     """Get world size and rank, skip if not enough ranks."""
     if not dist.is_initialized():
@@ -31,7 +29,6 @@ def _get_world_info():
 
     return world_size, rank
 
-
 def _create_consecutive_groups(world_size, group_size=2):
     """
     Create consecutive (TP-like) groups.
@@ -55,7 +52,6 @@ def _create_consecutive_groups(world_size, group_size=2):
             groups.append(None)
     return groups
 
-
 def _create_strided_groups(world_size, num_groups=2):
     """
     Create strided (DP-like) groups.
@@ -77,7 +73,6 @@ def _create_strided_groups(world_size, num_groups=2):
 
     return groups
 
-
 def _get_my_group(groups, rank):
     """Find which group the current rank belongs to."""
     for i, group in enumerate(groups):
@@ -87,12 +82,10 @@ def _get_my_group(groups, rank):
                 return i, group
     return None, None
 
-
 # =============================================================================
 # All-Reduce with Process Groups
 # =============================================================================
 
-
 @pytest.mark.parametrize(
     "variant",
     [
@@ -173,12 +166,10 @@ def test_all_reduce_with_groups(variant, group_type, dtype=torch.float32, M=256,
 
         gc.collect()
 
-
 # =============================================================================
 # All-Gather with Process Groups
 # =============================================================================
 
-
 @pytest.mark.parametrize("group_type", ["consecutive", "strided"])
 def test_all_gather_with_groups(group_type, dtype=torch.float32, M=128, N=64):
     """Test all-gather with ProcessGroups."""
@@ -235,12 +226,10 @@ def test_all_gather_with_groups(group_type, dtype=torch.float32, M=128, N=64):
 
         gc.collect()
 
-
 # =============================================================================
 # All-to-All with Process Groups
 # =============================================================================
 
-
 @pytest.mark.parametrize("group_type", ["consecutive", "strided"])
 def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
     """Test all-to-all with ProcessGroups."""
@@ -305,7 +294,6 @@ def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
 
         gc.collect()
 
-
 # =============================================================================
 # Reduce-Scatter with Process Groups
 # =============================================================================
@@ -317,7 +305,6 @@ def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
 # Until semantics are aligned, we test reduce_scatter with groups by verifying
 # that the group operations produce mathematically correct results.
 
-
 @pytest.mark.parametrize("group_type", ["consecutive", "strided"])
 def test_reduce_scatter_with_groups(group_type, dtype=torch.float32, M=256, N=128):
     """
@@ -380,12 +367,10 @@ def test_reduce_scatter_with_groups(group_type, dtype=torch.float32, M=256, N=12
 
         gc.collect()
 
-
 # =============================================================================
 # Edge Cases and Verification Tests
 # =============================================================================
 
-
 def test_group_info_extraction():
     """Test that extract_group_info returns correct values for different groups."""
     world_size, rank = _get_world_info()
@@ -432,7 +417,6 @@ def test_group_info_extraction():
 
     gc.collect()
 
-
 def test_all_reduce_group_correctness():
     """
     Verify all-reduce with groups produces correct mathematical results.
@@ -482,7 +466,6 @@ def test_all_reduce_group_correctness():
 
         gc.collect()
 
-
 def test_rank_stride_target_rank_calculation():
     """
     Explicitly test that rank_start + i * rank_stride correctly computes target_rank.
@@ -566,7 +549,6 @@ def test_rank_stride_target_rank_calculation():
 
     gc.collect()
 
-
 def test_all_gather_strided_data_placement():
     """
     Verify all-gather with strided groups places data in correct output locations.
diff --git a/tests/examples/test_all_load_bench.py b/tests/examples/test_all_load_bench.py
index b1ba077e0..5912797de 100644
--- a/tests/examples/test_all_load_bench.py
+++ b/tests/examples/test_all_load_bench.py
@@ -9,7 +9,6 @@
 import importlib.util
 from pathlib import Path
 
-
 current_dir = Path(__file__).parent
 file_path = (current_dir / "../../examples/02_all_load/all_load_bench.py").resolve()
 module_name = "all_load_bench"
@@ -17,7 +16,6 @@
 module = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(module)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -90,7 +88,6 @@ def test_all_load_bench(dtype, buffer_size, heap_size, block_size):
 
             gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -149,7 +146,6 @@ def test_all_load_bench_with_validation(dtype):
 
             gc.collect()
 
-
 def _torch_dtype_to_str(dtype):
     """Convert torch dtype to string format expected by all_load_bench.py"""
     if dtype == torch.int8:
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index 53ec41630..750f0508f 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -10,7 +10,6 @@
 import sys
 from pathlib import Path
 
-
 current_dir = Path(__file__).parent
 
 # Add examples directory to sys.path so that example files can import from examples.common
@@ -36,7 +35,6 @@
 module = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(module)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -104,7 +102,6 @@ def test_atomic_bandwidth(dtype, buffer_size, heap_size, block_size):
 
             gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/examples/test_flash_decode.py b/tests/examples/test_flash_decode.py
index 5a516c991..a8748f8c5 100644
--- a/tests/examples/test_flash_decode.py
+++ b/tests/examples/test_flash_decode.py
@@ -29,7 +29,6 @@
 #
 ################################################################################
 
-
 import sys
 from pathlib import Path
 import pytest
@@ -39,7 +38,6 @@
 import torch
 import iris
 
-
 project_root = Path(__file__).resolve()
 while not (project_root / "tests").is_dir() or not (project_root / "examples").is_dir():
     if project_root == project_root.parent:
@@ -60,7 +58,6 @@
 from flash_decode_fused_layer import flash_decode_fused_layer  # noqa: E402
 from utils import print_correctness_report  # noqa: E402
 
-
 def ref_paged_attn(
     query: torch.Tensor,
     key_cache: torch.Tensor,
@@ -100,7 +97,6 @@ def ref_paged_attn(
         start_idx += query_len
     return torch.cat(outputs, dim=0)
 
-
 def prepare_correctness_data(cfg, args, num_query_heads, num_kv_heads, NUM_BLOCKS):
     head_dim = cfg["head_dim"]
     if args.rank == 0:
@@ -117,7 +113,6 @@ def prepare_correctness_data(cfg, args, num_query_heads, num_kv_heads, NUM_BLOCK
 
     return {"query": query, "key_value_cache": key_value_cache}
 
-
 @pytest.mark.parametrize("head_dim", [128])
 @pytest.mark.parametrize("num_seqs", [1, 8])
 @pytest.mark.parametrize("num_heads", [48, 96])
diff --git a/tests/examples/test_load_bench.py b/tests/examples/test_load_bench.py
index 60a252ce9..a33556441 100644
--- a/tests/examples/test_load_bench.py
+++ b/tests/examples/test_load_bench.py
@@ -10,7 +10,6 @@
 import importlib.util
 from pathlib import Path
 
-
 current_dir = Path(__file__).parent
 file_path = (current_dir / "../../examples/00_load/load_bench.py").resolve()
 module_name = "load_bench"
@@ -18,7 +17,6 @@
 module = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(module)
 
-
 @pytest.mark.skip(reason="Test is inconsistent and needs debugging - tracked in issue")
 @pytest.mark.parametrize(
     "dtype",
diff --git a/tests/examples/test_message_passing.py b/tests/examples/test_message_passing.py
index 87798f5c4..d63443260 100644
--- a/tests/examples/test_message_passing.py
+++ b/tests/examples/test_message_passing.py
@@ -10,7 +10,6 @@
 import importlib.util
 from pathlib import Path
 
-
 current_dir = Path(__file__).parent
 
 # Import message_passing_load_store module
@@ -27,12 +26,10 @@
 put_module = importlib.util.module_from_spec(put_spec)
 put_spec.loader.exec_module(put_module)
 
-
 def create_test_args(dtype_str, buffer_size, heap_size, block_size):
     """Create args dict that matches what parse_args() returns."""
     return {"datatype": dtype_str, "buffer_size": buffer_size, "heap_size": heap_size, "block_size": block_size}
 
-
 def run_message_passing_kernels(module, args):
     """Run the core message passing logic without command line argument parsing."""
     shmem = None
@@ -109,7 +106,6 @@ def run_message_passing_kernels(module, args):
 
             gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype_str",
     [
@@ -139,7 +135,6 @@ def test_message_passing_load_store(dtype_str, buffer_size, heap_size, block_siz
     success = run_message_passing_kernels(load_store_module, args)
     assert success, "Message passing load/store validation failed"
 
-
 @pytest.mark.parametrize(
     "dtype_str",
     [
diff --git a/tests/ops/test_all_gather_matmul.py b/tests/ops/test_all_gather_matmul.py
index 393b057b8..224690f40 100644
--- a/tests/ops/test_all_gather_matmul.py
+++ b/tests/ops/test_all_gather_matmul.py
@@ -14,8 +14,6 @@
 
 import iris
 
-
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -101,7 +99,6 @@ def test_all_gather_matmul(dtype, atol, rtol, M, K_local, N):
         f"Rank {rank}: Max diff {max_diff}, expected < {atol}"
     )
 
-
 if __name__ == "__main__":
     # For quick debugging
     import sys
diff --git a/tests/ops/test_matmul_all_gather.py b/tests/ops/test_matmul_all_gather.py
index 2d9eaac25..7e12d87ac 100644
--- a/tests/ops/test_matmul_all_gather.py
+++ b/tests/ops/test_matmul_all_gather.py
@@ -13,8 +13,6 @@
 import torch.distributed as dist
 import iris
 
-
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_all_reduce.py b/tests/ops/test_matmul_all_reduce.py
index 5e6b70792..99e34fee7 100644
--- a/tests/ops/test_matmul_all_reduce.py
+++ b/tests/ops/test_matmul_all_reduce.py
@@ -14,8 +14,6 @@
 import iris
 import iris.ops as ops
 
-
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -103,7 +101,6 @@ def test_matmul_all_reduce(dtype, atol, rtol, M, N, K, variant):
 
     gc.collect()
 
-
 def test_matmul_all_reduce_via_shmem_ops():
     """Test accessing matmul_all_reduce via shmem.ops namespace."""
     if not dist.is_initialized():
diff --git a/tests/ops/test_matmul_reduce_scatter.py b/tests/ops/test_matmul_reduce_scatter.py
index 802d3dfc7..89ae65260 100644
--- a/tests/ops/test_matmul_reduce_scatter.py
+++ b/tests/ops/test_matmul_reduce_scatter.py
@@ -11,8 +11,6 @@
 import iris
 import iris.ops as ops
 
-
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -98,7 +96,6 @@ def test_matmul_reduce_scatter(dtype, atol, rtol, M, N, K):
 
     gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/unittests/test_atomic_add_gluon.py b/tests/unittests/test_atomic_add_gluon.py
index 1db01b790..833f7d31c 100644
--- a/tests/unittests/test_atomic_add_gluon.py
+++ b/tests/unittests/test_atomic_add_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_add_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -40,7 +38,6 @@ def atomic_add_kernel(
             scope=scope,
         )
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_add_triton.py b/tests/unittests/test_atomic_add_triton.py
index af9c5f4d0..a56875690 100644
--- a/tests/unittests/test_atomic_add_triton.py
+++ b/tests/unittests/test_atomic_add_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_add_kernel(
     results,
@@ -40,7 +38,6 @@ def atomic_add_kernel(
             scope=scope,
         )
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_and_gluon.py b/tests/unittests/test_atomic_and_gluon.py
index 81c482a3f..494fb8798 100644
--- a/tests/unittests/test_atomic_and_gluon.py
+++ b/tests/unittests/test_atomic_and_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_and_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -34,7 +32,6 @@ def atomic_and_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_and(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_and_triton.py b/tests/unittests/test_atomic_and_triton.py
index 895c13d6b..b866c80e4 100644
--- a/tests/unittests/test_atomic_and_triton.py
+++ b/tests/unittests/test_atomic_and_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_and_kernel(
     results,
@@ -31,7 +29,6 @@ def atomic_and_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_and(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_cas_gluon.py b/tests/unittests/test_atomic_cas_gluon.py
index 76e13a82c..86e539796 100644
--- a/tests/unittests/test_atomic_cas_gluon.py
+++ b/tests/unittests/test_atomic_cas_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_cas_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -31,7 +29,6 @@ def atomic_cas_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_cas(results, cmp, val, target_rank, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_cas_triton.py b/tests/unittests/test_atomic_cas_triton.py
index 5f1071d05..0ab4ef68b 100644
--- a/tests/unittests/test_atomic_cas_triton.py
+++ b/tests/unittests/test_atomic_cas_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_cas_kernel(
     results,
@@ -26,7 +24,6 @@ def atomic_cas_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_cas(results, cmp, val, cur_rank, target_rank, heap_bases, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_max_gluon.py b/tests/unittests/test_atomic_max_gluon.py
index 4b0b2aa4d..b6c67a3f9 100644
--- a/tests/unittests/test_atomic_max_gluon.py
+++ b/tests/unittests/test_atomic_max_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_max_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -32,7 +30,6 @@ def atomic_max_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_max(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_max_triton.py b/tests/unittests/test_atomic_max_triton.py
index 194c6dba0..ef674082b 100644
--- a/tests/unittests/test_atomic_max_triton.py
+++ b/tests/unittests/test_atomic_max_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_max_kernel(
     results,
@@ -29,7 +27,6 @@ def atomic_max_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_max(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_min_gluon.py b/tests/unittests/test_atomic_min_gluon.py
index d0aa2bfd1..f1de1167e 100644
--- a/tests/unittests/test_atomic_min_gluon.py
+++ b/tests/unittests/test_atomic_min_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_min_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -32,7 +30,6 @@ def atomic_min_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_min(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_min_triton.py b/tests/unittests/test_atomic_min_triton.py
index 39e9f2efc..e22767a6f 100644
--- a/tests/unittests/test_atomic_min_triton.py
+++ b/tests/unittests/test_atomic_min_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_min_kernel(
     results,
@@ -29,7 +27,6 @@ def atomic_min_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_min(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_or_gluon.py b/tests/unittests/test_atomic_or_gluon.py
index 82d5122b2..1841f00bf 100644
--- a/tests/unittests/test_atomic_or_gluon.py
+++ b/tests/unittests/test_atomic_or_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_or_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -33,7 +31,6 @@ def atomic_or_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_or(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_or_triton.py b/tests/unittests/test_atomic_or_triton.py
index cca442b8d..a369f7e1a 100644
--- a/tests/unittests/test_atomic_or_triton.py
+++ b/tests/unittests/test_atomic_or_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_or_kernel(
     results,
@@ -30,7 +28,6 @@ def atomic_or_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_or(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xchg_gluon.py b/tests/unittests/test_atomic_xchg_gluon.py
index a120c6824..122ffa704 100644
--- a/tests/unittests/test_atomic_xchg_gluon.py
+++ b/tests/unittests/test_atomic_xchg_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_xchg_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -29,7 +27,6 @@ def atomic_xchg_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_xchg(results, val, target_rank, mask=None, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xchg_triton.py b/tests/unittests/test_atomic_xchg_triton.py
index 5236d3394..d69a7eed1 100644
--- a/tests/unittests/test_atomic_xchg_triton.py
+++ b/tests/unittests/test_atomic_xchg_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_xchg_kernel(
     results,
@@ -25,7 +23,6 @@ def atomic_xchg_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_xchg(results, val, cur_rank, target_rank, heap_bases, mask=None, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xor_gluon.py b/tests/unittests/test_atomic_xor_gluon.py
index a564dbcd1..fb5c76acc 100644
--- a/tests/unittests/test_atomic_xor_gluon.py
+++ b/tests/unittests/test_atomic_xor_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def atomic_xor_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -34,7 +32,6 @@ def atomic_xor_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_xor(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xor_triton.py b/tests/unittests/test_atomic_xor_triton.py
index 668fd5e31..24af6da36 100644
--- a/tests/unittests/test_atomic_xor_triton.py
+++ b/tests/unittests/test_atomic_xor_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def atomic_xor_kernel(
     results,
@@ -31,7 +29,6 @@ def atomic_xor_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_xor(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_broadcast_gluon.py b/tests/unittests/test_broadcast_gluon.py
index f9584038b..b51b407ca 100644
--- a/tests/unittests/test_broadcast_gluon.py
+++ b/tests/unittests/test_broadcast_gluon.py
@@ -6,8 +6,6 @@
 import pytest
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @pytest.mark.parametrize(
     "value,expected",
     [
@@ -44,7 +42,6 @@ def test_broadcast_scalar(value, expected):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -77,7 +74,6 @@ def test_broadcast_tensor_dtype(dtype):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "shape",
     [
diff --git a/tests/unittests/test_broadcast_triton.py b/tests/unittests/test_broadcast_triton.py
index ba002ec02..ea6c047d7 100644
--- a/tests/unittests/test_broadcast_triton.py
+++ b/tests/unittests/test_broadcast_triton.py
@@ -6,8 +6,6 @@
 import pytest
 import iris
 
-
-
 @pytest.mark.parametrize(
     "value,expected",
     [
@@ -44,7 +42,6 @@ def test_broadcast_scalar(value, expected):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -77,7 +74,6 @@ def test_broadcast_tensor_dtype(dtype):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "shape",
     [
diff --git a/tests/unittests/test_copy_gluon.py b/tests/unittests/test_copy_gluon.py
index 2276ca7a8..d16b816dc 100644
--- a/tests/unittests/test_copy_gluon.py
+++ b/tests/unittests/test_copy_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def copy_get_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -32,7 +30,6 @@ def copy_get_kernel(
         dest_data = results + BLOCK_SIZE * target_rank
         ctx.copy(src_data + offsets, dest_data + offsets, target_rank, cur_rank, mask=mask)
 
-
 @gluon.jit
 def copy_put_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -56,7 +53,6 @@ def copy_put_kernel(
         dest_data = results + BLOCK_SIZE * cur_rank
         ctx.copy(src_data + offsets, dest_data + offsets, cur_rank, target_rank, mask=mask)
 
-
 @gluon.jit
 def copy_local_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -80,7 +76,6 @@ def copy_local_kernel(
         dest_data = results + BLOCK_SIZE * i
         ctx.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -148,7 +143,6 @@ def test_copy_get(dtype, BLOCK_SIZE):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -218,7 +212,6 @@ def test_copy_put(dtype, BLOCK_SIZE):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_copy_triton.py b/tests/unittests/test_copy_triton.py
index 90584d6e3..1d611984f 100644
--- a/tests/unittests/test_copy_triton.py
+++ b/tests/unittests/test_copy_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def copy_get_kernel(
     data,
@@ -29,7 +27,6 @@ def copy_get_kernel(
         dest_data = results + BLOCK_SIZE * target_rank
         iris.copy(src_data + offsets, dest_data + offsets, target_rank, cur_rank, cur_rank, heap_bases, mask)
 
-
 @triton.jit
 def copy_put_kernel(
     data,
@@ -50,7 +47,6 @@ def copy_put_kernel(
         dest_data = results + BLOCK_SIZE * cur_rank
         iris.copy(src_data + offsets, dest_data + offsets, cur_rank, target_rank, cur_rank, heap_bases, mask)
 
-
 @triton.jit
 def copy_local_kernel(
     data,
@@ -71,7 +67,6 @@ def copy_local_kernel(
         dest_data = results + BLOCK_SIZE * i
         iris.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, cur_rank, heap_bases, mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -130,7 +125,6 @@ def test_copy_get(dtype, BLOCK_SIZE):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -191,7 +185,6 @@ def test_copy_put(dtype, BLOCK_SIZE):
 
         gc.collect()
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_get_gluon.py b/tests/unittests/test_get_gluon.py
index d3cb58a9a..a5440bfdd 100644
--- a/tests/unittests/test_get_gluon.py
+++ b/tests/unittests/test_get_gluon.py
@@ -7,7 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
 # TODO: Separate this kernel out in the following categories:
 # 1. for local get.
 # 2. for remote get with one other rank.
@@ -41,7 +40,6 @@ def get_kernel(
     # Store the accumulated value back to the output.
     gl.store(results + offsets, acc, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_get_triton.py b/tests/unittests/test_get_triton.py
index 845e1c99a..60781cf3e 100644
--- a/tests/unittests/test_get_triton.py
+++ b/tests/unittests/test_get_triton.py
@@ -7,7 +7,6 @@
 import pytest
 import iris
 
-
 # TODO: Separate this kernel out in the following categories:
 # 1. for local get.
 # 2. for remote get with one other rank.
@@ -38,7 +37,6 @@ def get_kernel(
     # Store the accumulated value back to the output.
     tl.store(results + offsets, acc, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_load_gluon.py b/tests/unittests/test_load_gluon.py
index 98ef319ee..37e35d251 100644
--- a/tests/unittests/test_load_gluon.py
+++ b/tests/unittests/test_load_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def load_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -33,7 +31,6 @@ def load_kernel(
     result = ctx.load(data + offsets, partner, mask=mask)
     gl.store(results + offsets, result, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_load_triton.py b/tests/unittests/test_load_triton.py
index 6ac6cf324..8738368d3 100644
--- a/tests/unittests/test_load_triton.py
+++ b/tests/unittests/test_load_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def load_kernel(
     data,
@@ -30,7 +28,6 @@ def load_kernel(
     result = iris.load(data + offsets, source_rank, partner, heap_bases, mask=mask)
     tl.store(results + offsets, result, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_put_gluon.py b/tests/unittests/test_put_gluon.py
index 04ee9d8de..9fcadee33 100644
--- a/tests/unittests/test_put_gluon.py
+++ b/tests/unittests/test_put_gluon.py
@@ -7,7 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
 # TODO: Separate this kernel out in the following categories:
 # 1. for local put.
 # 2. for remote put with one other rank.
@@ -35,7 +34,6 @@ def put_kernel(
     for target_rank in range(num_ranks):
         ctx.put(data + offsets, results + offsets, target_rank, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_put_triton.py b/tests/unittests/test_put_triton.py
index 2febb43d4..c5ce8173c 100644
--- a/tests/unittests/test_put_triton.py
+++ b/tests/unittests/test_put_triton.py
@@ -7,7 +7,6 @@
 import pytest
 import iris
 
-
 # TODO: Separate this kernel out in the following categories:
 # 1. for local put.
 # 2. for remote put with one other rank.
@@ -32,7 +31,6 @@ def put_kernel(
     for target_rank in range(num_ranks):
         iris.put(data + offsets, results + offsets, cur_rank, target_rank, heap_bases, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_store_gluon.py b/tests/unittests/test_store_gluon.py
index 384f32275..5d8eeacb8 100644
--- a/tests/unittests/test_store_gluon.py
+++ b/tests/unittests/test_store_gluon.py
@@ -7,8 +7,6 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
-
-
 @gluon.jit
 def store_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -36,7 +34,6 @@ def store_kernel(
     for dst_rank in range(num_ranks):
         ctx.store(results + offsets, value, dst_rank, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_store_triton.py b/tests/unittests/test_store_triton.py
index 4319fd94f..e05ea1a28 100644
--- a/tests/unittests/test_store_triton.py
+++ b/tests/unittests/test_store_triton.py
@@ -7,8 +7,6 @@
 import pytest
 import iris
 
-
-
 @triton.jit
 def store_kernel(
     data,
@@ -33,7 +31,6 @@ def store_kernel(
     for dst_rank in range(num_ranks):
         iris.store(results + offsets, value, destination_rank, dst_rank, heap_bases, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/x/test_all_gather.py b/tests/x/test_all_gather.py
index ecf7fa70f..9f6e47548 100644
--- a/tests/x/test_all_gather.py
+++ b/tests/x/test_all_gather.py
@@ -13,8 +13,6 @@
 import iris
 import iris.x
 
-
-
 @triton.jit
 def x_all_gather_kernel(
     input_ptr,
@@ -63,7 +61,6 @@ def x_all_gather_kernel(
 
         iris.x.all_gather(tile, dst_view, gather_dim, ctx)
 
-
 @pytest.mark.parametrize(
     "gather_dim",
     [0, 1],
@@ -200,7 +197,6 @@ def test_all_gather(gather_dim, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZ
 
         gc.collect()
 
-
 @triton.jit
 def x_all_gather_ctx_api_kernel(
     input_ptr,
@@ -250,7 +246,6 @@ def x_all_gather_ctx_api_kernel(
         # Call primitive directly (ctx methods don't work due to Triton import restrictions)
         iris.x.all_gather(tile, dst_view, gather_dim, ctx)
 
-
 @pytest.mark.parametrize("gather_dim", [0, 1])
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
diff --git a/tests/x/test_all_reduce.py b/tests/x/test_all_reduce.py
index d50100b38..1a80eda0d 100644
--- a/tests/x/test_all_reduce.py
+++ b/tests/x/test_all_reduce.py
@@ -13,8 +13,6 @@
 import iris
 import iris.x
 
-
-
 @triton.jit
 def x_all_reduce_atomic_kernel(
     input_ptr,
@@ -56,7 +54,6 @@ def x_all_reduce_atomic_kernel(
 
         iris.x.all_reduce_atomic(tile, dst_view, ctx)
 
-
 @triton.jit
 def x_all_reduce_one_shot_kernel(
     input_ptr,
@@ -107,7 +104,6 @@ def x_all_reduce_one_shot_kernel(
 
         iris.x.all_reduce_one_shot(tile, src_view, dst_view, locks, ctx)
 
-
 @triton.jit
 def x_all_reduce_two_shot_kernel(
     input_ptr,
@@ -158,7 +154,6 @@ def x_all_reduce_two_shot_kernel(
 
         iris.x.all_reduce_two_shot(tile, src_view, dst_view, locks, cur_rank, world_size, ctx)
 
-
 @triton.jit
 def x_all_reduce_spinlock_kernel(
     input_ptr,
@@ -201,7 +196,6 @@ def x_all_reduce_spinlock_kernel(
 
         iris.x.all_reduce_spinlock(tile, dst_view, locks_ptr, ctx)
 
-
 @pytest.mark.parametrize(
     "variant",
     [
diff --git a/tests/x/test_all_to_all.py b/tests/x/test_all_to_all.py
index 70e6b6e54..7bcedff84 100644
--- a/tests/x/test_all_to_all.py
+++ b/tests/x/test_all_to_all.py
@@ -13,8 +13,6 @@
 import iris
 import iris.x
 
-
-
 @triton.jit
 def x_all_to_all_kernel(
     input_ptr,
@@ -51,7 +49,6 @@ def x_all_to_all_kernel(
 
         iris.x.all_to_all(tile, src_view, dst_view, N_per_rank, ctx)
 
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/x/test_gather.py b/tests/x/test_gather.py
index 051da63db..d6fff56fd 100644
--- a/tests/x/test_gather.py
+++ b/tests/x/test_gather.py
@@ -12,8 +12,6 @@
 import iris
 import iris.x
 
-
-
 @triton.jit
 def gather_kernel(
     input_ptr,
@@ -59,7 +57,6 @@ def gather_kernel(
         out_ptr = output_ptr + rm[:, None] * stride_out_m + rn[None, :] * stride_out_n
         tl.store(out_ptr, data, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -125,7 +122,6 @@ def test_gather_from_specific_rank(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_
         f"Rank {rank}: gather from rank {source_rank} failed"
     )
 
-
 @triton.jit
 def gather_accumulate_kernel(
     input_ptr,
@@ -173,7 +169,6 @@ def gather_accumulate_kernel(
         result = acc.to(output_ptr.type.element_ty)
         tl.store(out_ptr, result, mask=mask)
 
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/x/test_reduce_scatter.py b/tests/x/test_reduce_scatter.py
index 5c1e9f6c3..fa0b09aed 100644
--- a/tests/x/test_reduce_scatter.py
+++ b/tests/x/test_reduce_scatter.py
@@ -13,8 +13,6 @@
 import iris
 import iris.x
 
-
-
 @triton.jit
 def x_reduce_scatter_kernel(
     input_ptr,
@@ -64,7 +62,6 @@ def x_reduce_scatter_kernel(
 
         iris.x.reduce_scatter(tile, src_view, dst_view, locks, ctx)
 
-
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [

From ff227264872bc6d08cb77ba25ef1479079592b72 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 17:58:30 +0000
Subject: [PATCH 09/11] Reset files with only whitespace changes to original
 state

- Reset 47 test files that only had blank line changes
- Kept 10 test files with single_rank markers (real changes)
- All reset files now match their original state before PR changes

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 tests/ccl/test_all_gather.py               |  1 +
 tests/ccl/test_all_reduce.py               |  2 ++
 tests/ccl/test_all_to_all.py               |  1 +
 tests/ccl/test_all_to_all_gluon.py         |  2 +-
 tests/ccl/test_process_groups.py           | 17 +++++++++++++++++
 tests/examples/test_all_load_bench.py      |  3 +++
 tests/examples/test_atomic_add_bench.py    |  2 ++
 tests/examples/test_flash_decode.py        |  4 ++++
 tests/examples/test_load_bench.py          |  1 +
 tests/examples/test_message_passing.py     |  4 ++++
 tests/ops/test_all_gather_matmul.py        |  2 ++
 tests/ops/test_matmul_all_gather.py        |  1 +
 tests/ops/test_matmul_all_reduce.py        |  2 ++
 tests/ops/test_matmul_reduce_scatter.py    |  2 ++
 tests/unittests/test_atomic_add_gluon.py   |  2 ++
 tests/unittests/test_atomic_add_triton.py  |  2 ++
 tests/unittests/test_atomic_and_gluon.py   |  2 ++
 tests/unittests/test_atomic_and_triton.py  |  2 ++
 tests/unittests/test_atomic_cas_gluon.py   |  2 ++
 tests/unittests/test_atomic_cas_triton.py  |  2 ++
 tests/unittests/test_atomic_max_gluon.py   |  2 ++
 tests/unittests/test_atomic_max_triton.py  |  2 ++
 tests/unittests/test_atomic_min_gluon.py   |  2 ++
 tests/unittests/test_atomic_min_triton.py  |  2 ++
 tests/unittests/test_atomic_or_gluon.py    |  2 ++
 tests/unittests/test_atomic_or_triton.py   |  2 ++
 tests/unittests/test_atomic_xchg_gluon.py  |  2 ++
 tests/unittests/test_atomic_xchg_triton.py |  2 ++
 tests/unittests/test_atomic_xor_gluon.py   |  2 ++
 tests/unittests/test_atomic_xor_triton.py  |  2 ++
 tests/unittests/test_broadcast_gluon.py    |  3 +++
 tests/unittests/test_broadcast_triton.py   |  3 +++
 tests/unittests/test_copy_gluon.py         |  6 ++++++
 tests/unittests/test_copy_triton.py        |  6 ++++++
 tests/unittests/test_get_gluon.py          |  3 ++-
 tests/unittests/test_get_triton.py         |  3 ++-
 tests/unittests/test_load_gluon.py         |  2 ++
 tests/unittests/test_load_triton.py        |  2 ++
 tests/unittests/test_put_gluon.py          |  3 ++-
 tests/unittests/test_put_triton.py         |  3 ++-
 tests/unittests/test_store_gluon.py        |  2 ++
 tests/unittests/test_store_triton.py       |  2 ++
 tests/x/test_all_gather.py                 |  4 ++++
 tests/x/test_all_reduce.py                 |  5 +++++
 tests/x/test_all_to_all.py                 |  2 ++
 tests/x/test_gather.py                     |  4 ++++
 tests/x/test_reduce_scatter.py             |  2 ++
 47 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/tests/ccl/test_all_gather.py b/tests/ccl/test_all_gather.py
index 721b649bc..ae6490432 100644
--- a/tests/ccl/test_all_gather.py
+++ b/tests/ccl/test_all_gather.py
@@ -11,6 +11,7 @@
 import iris
 from iris.ccl import Config
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_reduce.py b/tests/ccl/test_all_reduce.py
index 1e6971159..ffd55e9d1 100644
--- a/tests/ccl/test_all_reduce.py
+++ b/tests/ccl/test_all_reduce.py
@@ -11,6 +11,7 @@
 import iris
 from iris.ccl import Config
 
+
 @pytest.mark.parametrize(
     "variant",
     [
@@ -106,6 +107,7 @@ def test_all_reduce(variant, dtype, M, N):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "distribution",
     [
diff --git a/tests/ccl/test_all_to_all.py b/tests/ccl/test_all_to_all.py
index be647829a..76478f5a0 100644
--- a/tests/ccl/test_all_to_all.py
+++ b/tests/ccl/test_all_to_all.py
@@ -11,6 +11,7 @@
 import iris
 from iris.ccl import Config
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/ccl/test_all_to_all_gluon.py b/tests/ccl/test_all_to_all_gluon.py
index 280a6c234..1dc485d47 100644
--- a/tests/ccl/test_all_to_all_gluon.py
+++ b/tests/ccl/test_all_to_all_gluon.py
@@ -10,7 +10,6 @@
 import torch.distributed as dist
 
 # Try to import Gluon, skip tests if not available
-
 try:
     import iris.experimental.iris_gluon as iris_gluon
     from iris.ccl import Config
@@ -20,6 +19,7 @@
 except ImportError:
     GLUON_AVAILABLE = False
 
+
 @pytest.mark.skipif(not GLUON_AVAILABLE, reason="Gluon not available")
 @pytest.mark.parametrize(
     "dtype",
diff --git a/tests/ccl/test_process_groups.py b/tests/ccl/test_process_groups.py
index 0e7bf198c..4bc6e3689 100644
--- a/tests/ccl/test_process_groups.py
+++ b/tests/ccl/test_process_groups.py
@@ -16,6 +16,7 @@
 import iris
 from iris.ccl import Config
 
+
 def _get_world_info():
     """Get world size and rank, skip if not enough ranks."""
     if not dist.is_initialized():
@@ -29,6 +30,7 @@ def _get_world_info():
 
     return world_size, rank
 
+
 def _create_consecutive_groups(world_size, group_size=2):
     """
     Create consecutive (TP-like) groups.
@@ -52,6 +54,7 @@ def _create_consecutive_groups(world_size, group_size=2):
             groups.append(None)
     return groups
 
+
 def _create_strided_groups(world_size, num_groups=2):
     """
     Create strided (DP-like) groups.
@@ -73,6 +76,7 @@ def _create_strided_groups(world_size, num_groups=2):
 
     return groups
 
+
 def _get_my_group(groups, rank):
     """Find which group the current rank belongs to."""
     for i, group in enumerate(groups):
@@ -82,10 +86,12 @@ def _get_my_group(groups, rank):
                 return i, group
     return None, None
 
+
 # =============================================================================
 # All-Reduce with Process Groups
 # =============================================================================
 
+
 @pytest.mark.parametrize(
     "variant",
     [
@@ -166,10 +172,12 @@ def test_all_reduce_with_groups(variant, group_type, dtype=torch.float32, M=256,
 
         gc.collect()
 
+
 # =============================================================================
 # All-Gather with Process Groups
 # =============================================================================
 
+
 @pytest.mark.parametrize("group_type", ["consecutive", "strided"])
 def test_all_gather_with_groups(group_type, dtype=torch.float32, M=128, N=64):
     """Test all-gather with ProcessGroups."""
@@ -226,10 +234,12 @@ def test_all_gather_with_groups(group_type, dtype=torch.float32, M=128, N=64):
 
         gc.collect()
 
+
 # =============================================================================
 # All-to-All with Process Groups
 # =============================================================================
 
+
 @pytest.mark.parametrize("group_type", ["consecutive", "strided"])
 def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
     """Test all-to-all with ProcessGroups."""
@@ -294,6 +304,7 @@ def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
 
         gc.collect()
 
+
 # =============================================================================
 # Reduce-Scatter with Process Groups
 # =============================================================================
@@ -305,6 +316,7 @@ def test_all_to_all_with_groups(group_type, dtype=torch.float32, M=128, N=64):
 # Until semantics are aligned, we test reduce_scatter with groups by verifying
 # that the group operations produce mathematically correct results.
 
+
 @pytest.mark.parametrize("group_type", ["consecutive", "strided"])
 def test_reduce_scatter_with_groups(group_type, dtype=torch.float32, M=256, N=128):
     """
@@ -367,10 +379,12 @@ def test_reduce_scatter_with_groups(group_type, dtype=torch.float32, M=256, N=12
 
         gc.collect()
 
+
 # =============================================================================
 # Edge Cases and Verification Tests
 # =============================================================================
 
+
 def test_group_info_extraction():
     """Test that extract_group_info returns correct values for different groups."""
     world_size, rank = _get_world_info()
@@ -417,6 +431,7 @@ def test_group_info_extraction():
 
     gc.collect()
 
+
 def test_all_reduce_group_correctness():
     """
     Verify all-reduce with groups produces correct mathematical results.
@@ -466,6 +481,7 @@ def test_all_reduce_group_correctness():
 
         gc.collect()
 
+
 def test_rank_stride_target_rank_calculation():
     """
     Explicitly test that rank_start + i * rank_stride correctly computes target_rank.
@@ -549,6 +565,7 @@ def test_rank_stride_target_rank_calculation():
 
     gc.collect()
 
+
 def test_all_gather_strided_data_placement():
     """
     Verify all-gather with strided groups places data in correct output locations.
diff --git a/tests/examples/test_all_load_bench.py b/tests/examples/test_all_load_bench.py
index 5912797de..bc925cdd5 100644
--- a/tests/examples/test_all_load_bench.py
+++ b/tests/examples/test_all_load_bench.py
@@ -16,6 +16,7 @@
 module = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(module)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -88,6 +89,7 @@ def test_all_load_bench(dtype, buffer_size, heap_size, block_size):
 
             gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -146,6 +148,7 @@ def test_all_load_bench_with_validation(dtype):
 
             gc.collect()
 
+
 def _torch_dtype_to_str(dtype):
     """Convert torch dtype to string format expected by all_load_bench.py"""
     if dtype == torch.int8:
diff --git a/tests/examples/test_atomic_add_bench.py b/tests/examples/test_atomic_add_bench.py
index 750f0508f..dbf995e96 100644
--- a/tests/examples/test_atomic_add_bench.py
+++ b/tests/examples/test_atomic_add_bench.py
@@ -35,6 +35,7 @@
 module = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(module)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -102,6 +103,7 @@ def test_atomic_bandwidth(dtype, buffer_size, heap_size, block_size):
 
             gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/examples/test_flash_decode.py b/tests/examples/test_flash_decode.py
index a8748f8c5..68b478b58 100644
--- a/tests/examples/test_flash_decode.py
+++ b/tests/examples/test_flash_decode.py
@@ -29,6 +29,7 @@
 #
 ################################################################################
 
+
 import sys
 from pathlib import Path
 import pytest
@@ -58,6 +59,7 @@
 from flash_decode_fused_layer import flash_decode_fused_layer  # noqa: E402
 from utils import print_correctness_report  # noqa: E402
 
+
 def ref_paged_attn(
     query: torch.Tensor,
     key_cache: torch.Tensor,
@@ -97,6 +99,7 @@ def ref_paged_attn(
         start_idx += query_len
     return torch.cat(outputs, dim=0)
 
+
 def prepare_correctness_data(cfg, args, num_query_heads, num_kv_heads, NUM_BLOCKS):
     head_dim = cfg["head_dim"]
     if args.rank == 0:
@@ -113,6 +116,7 @@ def prepare_correctness_data(cfg, args, num_query_heads, num_kv_heads, NUM_BLOCK
 
     return {"query": query, "key_value_cache": key_value_cache}
 
+
 @pytest.mark.parametrize("head_dim", [128])
 @pytest.mark.parametrize("num_seqs", [1, 8])
 @pytest.mark.parametrize("num_heads", [48, 96])
diff --git a/tests/examples/test_load_bench.py b/tests/examples/test_load_bench.py
index a33556441..261c2a8ed 100644
--- a/tests/examples/test_load_bench.py
+++ b/tests/examples/test_load_bench.py
@@ -17,6 +17,7 @@
 module = importlib.util.module_from_spec(spec)
 spec.loader.exec_module(module)
 
+
 @pytest.mark.skip(reason="Test is inconsistent and needs debugging - tracked in issue")
 @pytest.mark.parametrize(
     "dtype",
diff --git a/tests/examples/test_message_passing.py b/tests/examples/test_message_passing.py
index d63443260..aa173dead 100644
--- a/tests/examples/test_message_passing.py
+++ b/tests/examples/test_message_passing.py
@@ -26,10 +26,12 @@
 put_module = importlib.util.module_from_spec(put_spec)
 put_spec.loader.exec_module(put_module)
 
+
 def create_test_args(dtype_str, buffer_size, heap_size, block_size):
     """Create args dict that matches what parse_args() returns."""
     return {"datatype": dtype_str, "buffer_size": buffer_size, "heap_size": heap_size, "block_size": block_size}
 
+
 def run_message_passing_kernels(module, args):
     """Run the core message passing logic without command line argument parsing."""
     shmem = None
@@ -106,6 +108,7 @@ def run_message_passing_kernels(module, args):
 
             gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype_str",
     [
@@ -135,6 +138,7 @@ def test_message_passing_load_store(dtype_str, buffer_size, heap_size, block_siz
     success = run_message_passing_kernels(load_store_module, args)
     assert success, "Message passing load/store validation failed"
 
+
 @pytest.mark.parametrize(
     "dtype_str",
     [
diff --git a/tests/ops/test_all_gather_matmul.py b/tests/ops/test_all_gather_matmul.py
index 224690f40..193505011 100644
--- a/tests/ops/test_all_gather_matmul.py
+++ b/tests/ops/test_all_gather_matmul.py
@@ -14,6 +14,7 @@
 
 import iris
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -99,6 +100,7 @@ def test_all_gather_matmul(dtype, atol, rtol, M, K_local, N):
         f"Rank {rank}: Max diff {max_diff}, expected < {atol}"
     )
 
+
 if __name__ == "__main__":
     # For quick debugging
     import sys
diff --git a/tests/ops/test_matmul_all_gather.py b/tests/ops/test_matmul_all_gather.py
index 7e12d87ac..78ec0e47a 100644
--- a/tests/ops/test_matmul_all_gather.py
+++ b/tests/ops/test_matmul_all_gather.py
@@ -13,6 +13,7 @@
 import torch.distributed as dist
 import iris
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/ops/test_matmul_all_reduce.py b/tests/ops/test_matmul_all_reduce.py
index 99e34fee7..5780b5d4d 100644
--- a/tests/ops/test_matmul_all_reduce.py
+++ b/tests/ops/test_matmul_all_reduce.py
@@ -14,6 +14,7 @@
 import iris
 import iris.ops as ops
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -101,6 +102,7 @@ def test_matmul_all_reduce(dtype, atol, rtol, M, N, K, variant):
 
     gc.collect()
 
+
 def test_matmul_all_reduce_via_shmem_ops():
     """Test accessing matmul_all_reduce via shmem.ops namespace."""
     if not dist.is_initialized():
diff --git a/tests/ops/test_matmul_reduce_scatter.py b/tests/ops/test_matmul_reduce_scatter.py
index 89ae65260..7f75a1b0c 100644
--- a/tests/ops/test_matmul_reduce_scatter.py
+++ b/tests/ops/test_matmul_reduce_scatter.py
@@ -11,6 +11,7 @@
 import iris
 import iris.ops as ops
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -96,6 +97,7 @@ def test_matmul_reduce_scatter(dtype, atol, rtol, M, N, K):
 
     gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/unittests/test_atomic_add_gluon.py b/tests/unittests/test_atomic_add_gluon.py
index 833f7d31c..36d26801a 100644
--- a/tests/unittests/test_atomic_add_gluon.py
+++ b/tests/unittests/test_atomic_add_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_add_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -38,6 +39,7 @@ def atomic_add_kernel(
             scope=scope,
         )
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_add_triton.py b/tests/unittests/test_atomic_add_triton.py
index a56875690..8cf2f7f45 100644
--- a/tests/unittests/test_atomic_add_triton.py
+++ b/tests/unittests/test_atomic_add_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_add_kernel(
     results,
@@ -38,6 +39,7 @@ def atomic_add_kernel(
             scope=scope,
         )
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_and_gluon.py b/tests/unittests/test_atomic_and_gluon.py
index 494fb8798..31ebdbc53 100644
--- a/tests/unittests/test_atomic_and_gluon.py
+++ b/tests/unittests/test_atomic_and_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_and_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -32,6 +33,7 @@ def atomic_and_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_and(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_and_triton.py b/tests/unittests/test_atomic_and_triton.py
index b866c80e4..7b2bdf668 100644
--- a/tests/unittests/test_atomic_and_triton.py
+++ b/tests/unittests/test_atomic_and_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_and_kernel(
     results,
@@ -29,6 +30,7 @@ def atomic_and_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_and(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_cas_gluon.py b/tests/unittests/test_atomic_cas_gluon.py
index 86e539796..e10c77c59 100644
--- a/tests/unittests/test_atomic_cas_gluon.py
+++ b/tests/unittests/test_atomic_cas_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_cas_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -29,6 +30,7 @@ def atomic_cas_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_cas(results, cmp, val, target_rank, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_cas_triton.py b/tests/unittests/test_atomic_cas_triton.py
index 0ab4ef68b..fdd59a886 100644
--- a/tests/unittests/test_atomic_cas_triton.py
+++ b/tests/unittests/test_atomic_cas_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_cas_kernel(
     results,
@@ -24,6 +25,7 @@ def atomic_cas_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_cas(results, cmp, val, cur_rank, target_rank, heap_bases, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_max_gluon.py b/tests/unittests/test_atomic_max_gluon.py
index b6c67a3f9..5ff71ea3f 100644
--- a/tests/unittests/test_atomic_max_gluon.py
+++ b/tests/unittests/test_atomic_max_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_max_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -30,6 +31,7 @@ def atomic_max_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_max(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_max_triton.py b/tests/unittests/test_atomic_max_triton.py
index ef674082b..69d9d96d7 100644
--- a/tests/unittests/test_atomic_max_triton.py
+++ b/tests/unittests/test_atomic_max_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_max_kernel(
     results,
@@ -27,6 +28,7 @@ def atomic_max_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_max(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_min_gluon.py b/tests/unittests/test_atomic_min_gluon.py
index f1de1167e..e18836b87 100644
--- a/tests/unittests/test_atomic_min_gluon.py
+++ b/tests/unittests/test_atomic_min_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_min_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -30,6 +31,7 @@ def atomic_min_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_min(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_min_triton.py b/tests/unittests/test_atomic_min_triton.py
index e22767a6f..139e473de 100644
--- a/tests/unittests/test_atomic_min_triton.py
+++ b/tests/unittests/test_atomic_min_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_min_kernel(
     results,
@@ -27,6 +28,7 @@ def atomic_min_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_min(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_or_gluon.py b/tests/unittests/test_atomic_or_gluon.py
index 1841f00bf..bcda75b39 100644
--- a/tests/unittests/test_atomic_or_gluon.py
+++ b/tests/unittests/test_atomic_or_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_or_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -31,6 +32,7 @@ def atomic_or_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_or(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_or_triton.py b/tests/unittests/test_atomic_or_triton.py
index a369f7e1a..c0b8cc25d 100644
--- a/tests/unittests/test_atomic_or_triton.py
+++ b/tests/unittests/test_atomic_or_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_or_kernel(
     results,
@@ -28,6 +29,7 @@ def atomic_or_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_or(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xchg_gluon.py b/tests/unittests/test_atomic_xchg_gluon.py
index 122ffa704..09ef1e2f2 100644
--- a/tests/unittests/test_atomic_xchg_gluon.py
+++ b/tests/unittests/test_atomic_xchg_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_xchg_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -27,6 +28,7 @@ def atomic_xchg_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_xchg(results, val, target_rank, mask=None, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xchg_triton.py b/tests/unittests/test_atomic_xchg_triton.py
index d69a7eed1..ffea37e78 100644
--- a/tests/unittests/test_atomic_xchg_triton.py
+++ b/tests/unittests/test_atomic_xchg_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_xchg_kernel(
     results,
@@ -23,6 +24,7 @@ def atomic_xchg_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_xchg(results, val, cur_rank, target_rank, heap_bases, mask=None, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xor_gluon.py b/tests/unittests/test_atomic_xor_gluon.py
index fb5c76acc..b9e77ce60 100644
--- a/tests/unittests/test_atomic_xor_gluon.py
+++ b/tests/unittests/test_atomic_xor_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def atomic_xor_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -32,6 +33,7 @@ def atomic_xor_kernel(
     for target_rank in range(num_ranks):
         ctx.atomic_xor(results + offsets, acc, target_rank, mask=mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_atomic_xor_triton.py b/tests/unittests/test_atomic_xor_triton.py
index 24af6da36..639abfcdd 100644
--- a/tests/unittests/test_atomic_xor_triton.py
+++ b/tests/unittests/test_atomic_xor_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def atomic_xor_kernel(
     results,
@@ -29,6 +30,7 @@ def atomic_xor_kernel(
     for target_rank in range(num_ranks):
         iris.atomic_xor(results + offsets, acc, cur_rank, target_rank, heap_bases, mask, sem=sem, scope=scope)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_broadcast_gluon.py b/tests/unittests/test_broadcast_gluon.py
index b51b407ca..e2eaac6d1 100644
--- a/tests/unittests/test_broadcast_gluon.py
+++ b/tests/unittests/test_broadcast_gluon.py
@@ -6,6 +6,7 @@
 import pytest
 import iris.experimental.iris_gluon as iris_gl
 
+
 @pytest.mark.parametrize(
     "value,expected",
     [
@@ -42,6 +43,7 @@ def test_broadcast_scalar(value, expected):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -74,6 +76,7 @@ def test_broadcast_tensor_dtype(dtype):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "shape",
     [
diff --git a/tests/unittests/test_broadcast_triton.py b/tests/unittests/test_broadcast_triton.py
index ea6c047d7..9563a5916 100644
--- a/tests/unittests/test_broadcast_triton.py
+++ b/tests/unittests/test_broadcast_triton.py
@@ -6,6 +6,7 @@
 import pytest
 import iris
 
+
 @pytest.mark.parametrize(
     "value,expected",
     [
@@ -42,6 +43,7 @@ def test_broadcast_scalar(value, expected):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -74,6 +76,7 @@ def test_broadcast_tensor_dtype(dtype):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "shape",
     [
diff --git a/tests/unittests/test_copy_gluon.py b/tests/unittests/test_copy_gluon.py
index d16b816dc..8102640da 100644
--- a/tests/unittests/test_copy_gluon.py
+++ b/tests/unittests/test_copy_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def copy_get_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -30,6 +31,7 @@ def copy_get_kernel(
         dest_data = results + BLOCK_SIZE * target_rank
         ctx.copy(src_data + offsets, dest_data + offsets, target_rank, cur_rank, mask=mask)
 
+
 @gluon.jit
 def copy_put_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -53,6 +55,7 @@ def copy_put_kernel(
         dest_data = results + BLOCK_SIZE * cur_rank
         ctx.copy(src_data + offsets, dest_data + offsets, cur_rank, target_rank, mask=mask)
 
+
 @gluon.jit
 def copy_local_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -76,6 +79,7 @@ def copy_local_kernel(
         dest_data = results + BLOCK_SIZE * i
         ctx.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -143,6 +147,7 @@ def test_copy_get(dtype, BLOCK_SIZE):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -212,6 +217,7 @@ def test_copy_put(dtype, BLOCK_SIZE):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_copy_triton.py b/tests/unittests/test_copy_triton.py
index 1d611984f..00bc43e47 100644
--- a/tests/unittests/test_copy_triton.py
+++ b/tests/unittests/test_copy_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def copy_get_kernel(
     data,
@@ -27,6 +28,7 @@ def copy_get_kernel(
         dest_data = results + BLOCK_SIZE * target_rank
         iris.copy(src_data + offsets, dest_data + offsets, target_rank, cur_rank, cur_rank, heap_bases, mask)
 
+
 @triton.jit
 def copy_put_kernel(
     data,
@@ -47,6 +49,7 @@ def copy_put_kernel(
         dest_data = results + BLOCK_SIZE * cur_rank
         iris.copy(src_data + offsets, dest_data + offsets, cur_rank, target_rank, cur_rank, heap_bases, mask)
 
+
 @triton.jit
 def copy_local_kernel(
     data,
@@ -67,6 +70,7 @@ def copy_local_kernel(
         dest_data = results + BLOCK_SIZE * i
         iris.copy(src_data + offsets, dest_data + offsets, cur_rank, cur_rank, cur_rank, heap_bases, mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -125,6 +129,7 @@ def test_copy_get(dtype, BLOCK_SIZE):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
@@ -185,6 +190,7 @@ def test_copy_put(dtype, BLOCK_SIZE):
 
         gc.collect()
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_get_gluon.py b/tests/unittests/test_get_gluon.py
index a5440bfdd..5cabc054c 100644
--- a/tests/unittests/test_get_gluon.py
+++ b/tests/unittests/test_get_gluon.py
@@ -7,11 +7,11 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 # TODO: Separate this kernel out in the following categories:
 # 1. for local get.
 # 2. for remote get with one other rank.
 # 3. for remote get with more than one rank (if num_ranks > 2).
-
 @gluon.jit
 def get_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -40,6 +40,7 @@ def get_kernel(
     # Store the accumulated value back to the output.
     gl.store(results + offsets, acc, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_get_triton.py b/tests/unittests/test_get_triton.py
index 60781cf3e..b19cf235d 100644
--- a/tests/unittests/test_get_triton.py
+++ b/tests/unittests/test_get_triton.py
@@ -7,11 +7,11 @@
 import pytest
 import iris
 
+
 # TODO: Separate this kernel out in the following categories:
 # 1. for local get.
 # 2. for remote get with one other rank.
 # 3. for remote get with more than one rank (if num_ranks > 2).
-
 @triton.jit
 def get_kernel(
     data,
@@ -37,6 +37,7 @@ def get_kernel(
     # Store the accumulated value back to the output.
     tl.store(results + offsets, acc, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_load_gluon.py b/tests/unittests/test_load_gluon.py
index 37e35d251..adce4ce39 100644
--- a/tests/unittests/test_load_gluon.py
+++ b/tests/unittests/test_load_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def load_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -31,6 +32,7 @@ def load_kernel(
     result = ctx.load(data + offsets, partner, mask=mask)
     gl.store(results + offsets, result, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_load_triton.py b/tests/unittests/test_load_triton.py
index 8738368d3..b73dda3f3 100644
--- a/tests/unittests/test_load_triton.py
+++ b/tests/unittests/test_load_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def load_kernel(
     data,
@@ -28,6 +29,7 @@ def load_kernel(
     result = iris.load(data + offsets, source_rank, partner, heap_bases, mask=mask)
     tl.store(results + offsets, result, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_put_gluon.py b/tests/unittests/test_put_gluon.py
index 9fcadee33..6f1172602 100644
--- a/tests/unittests/test_put_gluon.py
+++ b/tests/unittests/test_put_gluon.py
@@ -7,11 +7,11 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 # TODO: Separate this kernel out in the following categories:
 # 1. for local put.
 # 2. for remote put with one other rank.
 # 3. for remote put with more than one rank (if num_ranks > 2).
-
 @gluon.jit
 def put_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -34,6 +34,7 @@ def put_kernel(
     for target_rank in range(num_ranks):
         ctx.put(data + offsets, results + offsets, target_rank, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_put_triton.py b/tests/unittests/test_put_triton.py
index c5ce8173c..d953b42c6 100644
--- a/tests/unittests/test_put_triton.py
+++ b/tests/unittests/test_put_triton.py
@@ -7,11 +7,11 @@
 import pytest
 import iris
 
+
 # TODO: Separate this kernel out in the following categories:
 # 1. for local put.
 # 2. for remote put with one other rank.
 # 3. for remote put with more than one rank (if num_ranks > 2).
-
 @triton.jit
 def put_kernel(
     data,
@@ -31,6 +31,7 @@ def put_kernel(
     for target_rank in range(num_ranks):
         iris.put(data + offsets, results + offsets, cur_rank, target_rank, heap_bases, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_store_gluon.py b/tests/unittests/test_store_gluon.py
index 5d8eeacb8..81ff3c608 100644
--- a/tests/unittests/test_store_gluon.py
+++ b/tests/unittests/test_store_gluon.py
@@ -7,6 +7,7 @@
 from triton.experimental.gluon import language as gl
 import iris.experimental.iris_gluon as iris_gl
 
+
 @gluon.jit
 def store_kernel(
     IrisDeviceCtx: gl.constexpr,
@@ -34,6 +35,7 @@ def store_kernel(
     for dst_rank in range(num_ranks):
         ctx.store(results + offsets, value, dst_rank, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_store_triton.py b/tests/unittests/test_store_triton.py
index e05ea1a28..0632180c8 100644
--- a/tests/unittests/test_store_triton.py
+++ b/tests/unittests/test_store_triton.py
@@ -7,6 +7,7 @@
 import pytest
 import iris
 
+
 @triton.jit
 def store_kernel(
     data,
@@ -31,6 +32,7 @@ def store_kernel(
     for dst_rank in range(num_ranks):
         iris.store(results + offsets, value, destination_rank, dst_rank, heap_bases, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/x/test_all_gather.py b/tests/x/test_all_gather.py
index 9f6e47548..5cad616d5 100644
--- a/tests/x/test_all_gather.py
+++ b/tests/x/test_all_gather.py
@@ -13,6 +13,7 @@
 import iris
 import iris.x
 
+
 @triton.jit
 def x_all_gather_kernel(
     input_ptr,
@@ -61,6 +62,7 @@ def x_all_gather_kernel(
 
         iris.x.all_gather(tile, dst_view, gather_dim, ctx)
 
+
 @pytest.mark.parametrize(
     "gather_dim",
     [0, 1],
@@ -197,6 +199,7 @@ def test_all_gather(gather_dim, dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_SIZ
 
         gc.collect()
 
+
 @triton.jit
 def x_all_gather_ctx_api_kernel(
     input_ptr,
@@ -246,6 +249,7 @@ def x_all_gather_ctx_api_kernel(
         # Call primitive directly (ctx methods don't work due to Triton import restrictions)
         iris.x.all_gather(tile, dst_view, gather_dim, ctx)
 
+
 @pytest.mark.parametrize("gather_dim", [0, 1])
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
diff --git a/tests/x/test_all_reduce.py b/tests/x/test_all_reduce.py
index 1a80eda0d..864a45006 100644
--- a/tests/x/test_all_reduce.py
+++ b/tests/x/test_all_reduce.py
@@ -13,6 +13,7 @@
 import iris
 import iris.x
 
+
 @triton.jit
 def x_all_reduce_atomic_kernel(
     input_ptr,
@@ -54,6 +55,7 @@ def x_all_reduce_atomic_kernel(
 
         iris.x.all_reduce_atomic(tile, dst_view, ctx)
 
+
 @triton.jit
 def x_all_reduce_one_shot_kernel(
     input_ptr,
@@ -104,6 +106,7 @@ def x_all_reduce_one_shot_kernel(
 
         iris.x.all_reduce_one_shot(tile, src_view, dst_view, locks, ctx)
 
+
 @triton.jit
 def x_all_reduce_two_shot_kernel(
     input_ptr,
@@ -154,6 +157,7 @@ def x_all_reduce_two_shot_kernel(
 
         iris.x.all_reduce_two_shot(tile, src_view, dst_view, locks, cur_rank, world_size, ctx)
 
+
 @triton.jit
 def x_all_reduce_spinlock_kernel(
     input_ptr,
@@ -196,6 +200,7 @@ def x_all_reduce_spinlock_kernel(
 
         iris.x.all_reduce_spinlock(tile, dst_view, locks_ptr, ctx)
 
+
 @pytest.mark.parametrize(
     "variant",
     [
diff --git a/tests/x/test_all_to_all.py b/tests/x/test_all_to_all.py
index 7bcedff84..60de86663 100644
--- a/tests/x/test_all_to_all.py
+++ b/tests/x/test_all_to_all.py
@@ -13,6 +13,7 @@
 import iris
 import iris.x
 
+
 @triton.jit
 def x_all_to_all_kernel(
     input_ptr,
@@ -49,6 +50,7 @@ def x_all_to_all_kernel(
 
         iris.x.all_to_all(tile, src_view, dst_view, N_per_rank, ctx)
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/x/test_gather.py b/tests/x/test_gather.py
index d6fff56fd..d364b7c83 100644
--- a/tests/x/test_gather.py
+++ b/tests/x/test_gather.py
@@ -12,6 +12,7 @@
 import iris
 import iris.x
 
+
 @triton.jit
 def gather_kernel(
     input_ptr,
@@ -57,6 +58,7 @@ def gather_kernel(
         out_ptr = output_ptr + rm[:, None] * stride_out_m + rn[None, :] * stride_out_n
         tl.store(out_ptr, data, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
@@ -122,6 +124,7 @@ def test_gather_from_specific_rank(dtype, atol, rtol, M, N, BLOCK_SIZE_M, BLOCK_
         f"Rank {rank}: gather from rank {source_rank} failed"
     )
 
+
 @triton.jit
 def gather_accumulate_kernel(
     input_ptr,
@@ -169,6 +172,7 @@ def gather_accumulate_kernel(
         result = acc.to(output_ptr.type.element_ty)
         tl.store(out_ptr, result, mask=mask)
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [
diff --git a/tests/x/test_reduce_scatter.py b/tests/x/test_reduce_scatter.py
index fa0b09aed..eb8099e40 100644
--- a/tests/x/test_reduce_scatter.py
+++ b/tests/x/test_reduce_scatter.py
@@ -13,6 +13,7 @@
 import iris
 import iris.x
 
+
 @triton.jit
 def x_reduce_scatter_kernel(
     input_ptr,
@@ -62,6 +63,7 @@ def x_reduce_scatter_kernel(
 
         iris.x.reduce_scatter(tile, src_view, dst_view, locks, ctx)
 
+
 @pytest.mark.parametrize(
     "dtype, atol, rtol",
     [

From 3f2a9936286f99538bd1641ff0564ad33e0b382d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 4 Feb 2026 18:03:47 +0000
Subject: [PATCH 10/11] Remove test optimization documentation file

- Removed docs/test_optimization.md as requested
- Keeping implementation minimal with only necessary changes

Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com>
---
 docs/test_optimization.md | 197 --------------------------------------
 1 file changed, 197 deletions(-)
 delete mode 100644 docs/test_optimization.md

diff --git a/docs/test_optimization.md b/docs/test_optimization.md
deleted file mode 100644
index 97fa96b5c..000000000
--- a/docs/test_optimization.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# Test Suite Optimization - Phase 1
-
-## Overview
-
-This document describes the Phase 1 test suite optimization implemented to reduce CI time by ~30% (from 210 minutes to 147 minutes).
-
-## Background
-
-Analysis revealed that the original test suite was running **every test** on **all 4 rank configurations** (1, 2, 4, 8 ranks), which was wasteful. While multi-rank validation is essential for distributed features (symmetric heap allocation, cross-rank operations), many tests only validate tensor properties (shape, dtype, values) and don't require multi-rank execution.
-
-### Original Test Execution
-- **3 install methods** × **5 test directories** × **4 rank configs** = **60 CI jobs**
-- Each job runs all tests in a directory
-- Total multi-rank test runs: **6.37M**
-
-### Optimized Test Execution
-- **Same CI matrix structure** (no workflow changes)
-- Tests are filtered automatically by pytest markers
-- Single-rank tests skip execution when NUM_RANKS > 1
-- **Default behavior**: All tests run on all ranks unless marked `single_rank`
-- Total multi-rank test runs: **3.98M** (37.5% reduction)
-
-## Implementation
-
-### 1. Pytest Marker
-
-One marker is defined in `pytest.ini`:
-
-- **`@pytest.mark.single_rank`**: Tests that validate tensor properties (shape, dtype, values)
-  - These tests only need to run on **1 rank**
-  - Examples: `test_zeros`, `test_ones`, `test_rand`, `test_full`, `test_empty`
-  
-**Default behavior**: Tests without the `single_rank` marker run on **all rank configurations** (1, 2, 4, 8). This includes all distributed tests (get, put, load, store, atomics, collectives) without requiring explicit marking.
-
-### 2. Test Classification
-
-Tests are classified into two categories:
-
-| Category | Count | Runs on Ranks | Examples |
-|----------|-------|---------------|----------|
-| `single_rank` (marked) | 10 files | 1 only | zeros, ones, rand, empty, full, arange, linspace, randint, randn, zeros_like |
-| Default (unmarked) | 51 files | 1, 2, 4, 8 | get, put, load, store, atomic_*, broadcast, copy, all_reduce, all_gather, all_to_all, logging, dmabuf_apis, get_num_xcc, iris_helpers |
-
-### 3. Test Filtering
-
-The `.github/scripts/run_tests.sh` script was minimally modified to skip `single_rank` tests when running with multiple ranks:
-
-```bash
-# Skip single_rank tests when running with multiple ranks
-MARKER_ARG=""
-if [ "$NUM_RANKS" -gt 1 ]; then
-    MARKER_ARG="-m 'not single_rank'"
-fi
-```
-
-This approach:
-- Requires minimal changes to CI infrastructure (only 6 lines added)
-- Uses pytest's built-in marker filtering
-- Automatically skips single_rank tests on multi-rank configurations
-- Assumes multi-rank by default (simpler, less marking required)
-- Preserves the existing CI workflow structure
-
-## Adding New Tests
-
-When adding new tests, follow these guidelines:
-
-### Single-rank Tests
-Use `@pytest.mark.single_rank` for tests that:
-- Validate tensor properties (shape, dtype, values)
-- Test tensor creation functions (zeros, ones, rand, etc.)
-- Don't involve cross-rank communication
-- Can verify correctness on a single rank
-
-Example:
-```python
-import pytest
-import iris
-
-pytestmark = pytest.mark.single_rank
-
-## Adding New Tests
-
-When adding new tests, follow these guidelines:
-
-### Single-rank Tests (Minority - Require Marking)
-Use `@pytest.mark.single_rank` **only** for tests that:
-- Validate tensor properties (shape, dtype, values)
-- Test tensor creation functions (zeros, ones, rand, etc.)
-- Don't involve cross-rank communication
-- Can verify correctness on a single rank
-
-Example:
-```python
-import pytest
-import iris
-
-pytestmark = pytest.mark.single_rank
-
-def test_zeros():
-    shmem = iris.iris(1 << 20)
-    result = shmem.zeros(2, 3, dtype=torch.float32)
-    assert result.shape == (2, 3)
-    assert result.dtype == torch.float32
-```
-
-### Multi-rank Tests (Majority - Default, No Marking Needed)
-**Do not mark** tests that:
-- Validate distributed behavior
-- Test cross-rank operations (get, put, load, store)
-- Test collective operations (all_reduce, all_gather, all_to_all)
-- Test atomic operations across ranks
-- Require symmetric heap visibility validation
-
-Example (no marker needed):
-```python
-import pytest
-import iris
-
-# No pytestmark needed - runs on all ranks by default
-
-def test_all_reduce():
-    shmem = iris.iris(1 << 20)
-    # Test requires multiple ranks to validate reduction
-    input_tensor = shmem.ones(10, dtype=torch.float32) * shmem.get_rank()
-    output = shmem.ccl.all_reduce(input_tensor)
-    # Validation logic...
-```
-
-**Key principle**: Assume multi-rank by default. Only mark the small subset of tests that are `single_rank`.
-
-## Running Tests Locally
-
-### Run all tests
-```bash
-pytest tests/
-```
-
-### Run only single-rank tests
-```bash
-pytest tests/ -m single_rank
-```
-
-### Run only multi-rank tests (unmarked)
-```bash
-pytest tests/ -m "not single_rank"
-```
-
-### Run with specific rank count
-```bash
-python tests/run_tests_distributed.py --num_ranks 4 tests/ccl/test_all_reduce.py
-```
-
-## Expected Impact
-
-### Time Savings
-- **Previous CI time**: ~210 minutes
-- **New CI time**: ~147 minutes
-- **Reduction**: 63 minutes (30%)
-
-### Test Execution Reduction
-- **Previous multi-rank test runs**: 6.37M
-- **New multi-rank test runs**: 3.98M
-- **Reduction**: 2.39M test runs (37.5%)
-
-### Key Metrics
-- **Test count**: Unchanged (530,877 tests)
-- **Coverage**: No reduction - all tests still run at least once
-- **Quality**: No degradation - multi-rank tests still validated on all configs
-
-## Future Optimizations (Phase 2+)
-
-Potential future optimizations include:
-1. **Parameterization reduction**: Reduce parameter combinations for single-rank tests
-2. **Test parallelization**: Run independent tests in parallel
-3. **Caching**: Cache build artifacts between jobs
-4. **Smart test selection**: Skip tests unaffected by code changes
-
-## References
-
-- Issue: [Implement test suite optimization](https://github.com/ROCm/iris/issues/XXX)
-- PR: [Test Suite Optimization - Phase 1](https://github.com/ROCm/iris/pull/XXX)
-- Analysis: See PRs #353 and #354 for detailed analysis
-
-## Troubleshooting
-
-### Marker not recognized
-Ensure `pytest.ini` is present in the repository root with the marker definitions.
-
-### Tests not filtered correctly
-1. Check that the marker is added to the test file
-2. Verify the marker syntax: `pytestmark = pytest.mark.<marker>`
-3. Check that the CI workflow passes the marker parameter correctly
-
-### CI failures after optimization
-1. Check that multi-rank tests have `multi_rank_required` marker
-2. Verify that single-rank tests don't depend on multi-rank execution
-3. Review test logs to identify which rank configuration failed

From 15f10e9fb8e1c06c4d1ed5754583cec8084ea4b0 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Wed, 4 Feb 2026 18:16:46 +0000
Subject: [PATCH 11/11] Apply Ruff auto-fixes

---
 tests/unittests/test_arange.py     | 2 +-
 tests/unittests/test_empty.py      | 2 +-
 tests/unittests/test_full.py       | 2 +-
 tests/unittests/test_linspace.py   | 2 +-
 tests/unittests/test_ones.py       | 2 +-
 tests/unittests/test_rand.py       | 2 +-
 tests/unittests/test_randint.py    | 2 +-
 tests/unittests/test_randn.py      | 2 +-
 tests/unittests/test_zeros.py      | 2 +-
 tests/unittests/test_zeros_like.py | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/unittests/test_arange.py b/tests/unittests/test_arange.py
index 971cd67eb..c069ab3cb 100644
--- a/tests/unittests/test_arange.py
+++ b/tests/unittests/test_arange.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 def test_arange_basic_functionality():
     """Test basic arange functionality with various argument combinations."""
     shmem = iris.iris(1 << 20)
diff --git a/tests/unittests/test_empty.py b/tests/unittests/test_empty.py
index a9af622b7..157a46b29 100644
--- a/tests/unittests/test_empty.py
+++ b/tests/unittests/test_empty.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_full.py b/tests/unittests/test_full.py
index 05c792ccd..4cd0468f0 100644
--- a/tests/unittests/test_full.py
+++ b/tests/unittests/test_full.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "fill_value",
     [
diff --git a/tests/unittests/test_linspace.py b/tests/unittests/test_linspace.py
index a95b9836a..649ad4871 100644
--- a/tests/unittests/test_linspace.py
+++ b/tests/unittests/test_linspace.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_ones.py b/tests/unittests/test_ones.py
index cfe5a70c6..d2e5d44fa 100644
--- a/tests/unittests/test_ones.py
+++ b/tests/unittests/test_ones.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_rand.py b/tests/unittests/test_rand.py
index 30c6dedd0..6ac79f945 100644
--- a/tests/unittests/test_rand.py
+++ b/tests/unittests/test_rand.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_randint.py b/tests/unittests/test_randint.py
index d5c1571a1..d7ae1113e 100644
--- a/tests/unittests/test_randint.py
+++ b/tests/unittests/test_randint.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_randn.py b/tests/unittests/test_randn.py
index e14b9eda1..9e148a34b 100644
--- a/tests/unittests/test_randn.py
+++ b/tests/unittests/test_randn.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_zeros.py b/tests/unittests/test_zeros.py
index ca55d9df1..fc4d90966 100644
--- a/tests/unittests/test_zeros.py
+++ b/tests/unittests/test_zeros.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [
diff --git a/tests/unittests/test_zeros_like.py b/tests/unittests/test_zeros_like.py
index 6e3ca4fe6..a5d55ab78 100644
--- a/tests/unittests/test_zeros_like.py
+++ b/tests/unittests/test_zeros_like.py
@@ -6,9 +6,9 @@
 import iris
 
 
-
 pytestmark = pytest.mark.single_rank
 
+
 @pytest.mark.parametrize(
     "dtype",
     [