Snapchat · kmontemayor2-sc · Jan 21, 2026 · Jan 10, 2026 · Jan 10, 2026 · Jan 13, 2026
@@ -2,12 +2,12 @@
 import os
 import socket
 import unittest
-from typing import Optional
+from typing import Optional, Union
 from unittest import mock
 
 import torch
 import torch.multiprocessing as mp
-from torch_geometric.data import Data
+from torch_geometric.data import Data, HeteroData
 
 from gigl.common import Uri
 from gigl.common.logger import Logger
@@ -25,10 +25,11 @@
     COMPUTE_CLUSTER_LOCAL_WORLD_SIZE_ENV_KEY,
     GraphStoreInfo,
 )
-from gigl.src.common.types.graph_data import EdgeType
+from gigl.src.common.types.graph_data import EdgeType, NodeType, Relation
 from gigl.src.mocking.lib.versioning import get_mocked_dataset_artifact_metadata
 from gigl.src.mocking.mocking_assets.mocked_datasets_for_pipeline_tests import (
     CORA_USER_DEFINED_NODE_ANCHOR_MOCKED_DATASET_INFO,
+    DBLP_GRAPH_NODE_ANCHOR_MOCKED_DATASET_INFO,
 )
 from tests.test_assets.distributed.utils import assert_tensor_equality
 
@@ -58,13 +59,18 @@ def _assert_sampler_input(
     torch.distributed.barrier()
 
 
-def _run_client_process(
+def _run_compute_tests(
     client_rank: int,
     cluster_info: GraphStoreInfo,
     mp_sharing_dict: dict[str, torch.Tensor],
+    node_type: Optional[NodeType],
     expected_sampler_input: dict[int, list[torch.Tensor]],
     expected_edge_types: Optional[list[EdgeType]],
 ) -> None:
+    """
+    Process target for "compute" nodes.
+    Each "Client Process" (e.g. cluster_info.num_compute_nodes) will spawn as a process for each "num_processes_per_compute"
+    """
     init_compute_process(client_rank, cluster_info, compute_world_backend="gloo")
 
     remote_dist_dataset = RemoteDistDataset(
@@ -100,36 +106,44 @@ def _run_client_process(
     torch.distributed.barrier()
     logger.info("Verified that all ranks received the same free ports")
 
-    sampler_input = remote_dist_dataset.get_node_ids()
+    sampler_input = remote_dist_dataset.get_node_ids(node_type=node_type)
     _assert_sampler_input(cluster_info, sampler_input, expected_sampler_input)
 
     # test "simple" case where we don't have mp sharing dict too
     simple_sampler_input = RemoteDistDataset(
         cluster_info=cluster_info,
         local_rank=client_rank,
         mp_sharing_dict=None,
-    ).get_node_ids()
+    ).get_node_ids(node_type=node_type)
     _assert_sampler_input(cluster_info, simple_sampler_input, expected_sampler_input)
 
-    # Check that the edge types are correct
     assert (
         remote_dist_dataset.get_edge_types() == expected_edge_types
     ), f"Expected edge types {expected_edge_types}, got {remote_dist_dataset.get_edge_types()}"
 
     torch.distributed.barrier()
-
+    if node_type is not None:
+        input_nodes: Union[list[torch.Tensor], tuple[NodeType, list[torch.Tensor]]] = (
+            node_type,
+            sampler_input,
+        )
+    else:
+        input_nodes = sampler_input
     # Test the DistNeighborLoader
     loader = DistNeighborLoader(
         dataset=remote_dist_dataset,
         num_neighbors=[2, 2],
         pin_memory_device=torch.device("cpu"),
-        input_nodes=sampler_input,
+        input_nodes=input_nodes,
         num_workers=2,
         worker_concurrency=2,
     )
     count = 0
     for datum in loader:
-        assert isinstance(datum, Data)
+        if node_type is not None:
+            assert isinstance(datum, HeteroData)
+        else:
+            assert isinstance(datum, Data)
         count += 1
     torch.distributed.barrier()
     logger.info(f"Rank {torch.distributed.get_rank()} loaded {count} batches")
@@ -148,6 +162,7 @@ def _run_client_process(
 def _client_process(
     client_rank: int,
     cluster_info: GraphStoreInfo,
+    node_type: Optional[NodeType],
     expected_sampler_input: dict[int, list[torch.Tensor]],
     expected_edge_types: Optional[list[EdgeType]],
 ) -> None:
@@ -161,11 +176,12 @@ def _client_process(
     logger.info("Starting client processes")
     for i in range(cluster_info.num_processes_per_compute):
         client_process = mp_context.Process(
-            target=_run_client_process,
+            target=_run_compute_tests,
             args=[
                 i,  # client_rank
                 cluster_info,  # cluster_info
                 mp_sharing_dict,  # mp_sharing_dict
+                node_type,  # node_type
                 expected_sampler_input,  # expected_sampler_input
                 expected_edge_types,  # expected_edge_types
             ],
@@ -239,7 +255,7 @@ def _get_expected_input_nodes_by_rank(
 
 
 class GraphStoreIntegrationTest(unittest.TestCase):
-    def test_graph_store_locally(self):
+    def test_graph_store_homogeneous(self):
         # Simulating two server machine, two compute machines.
         # Each machine has one process.
         cora_supervised_info = get_mocked_dataset_artifact_metadata()[
@@ -283,7 +299,7 @@ def test_graph_store_locally(self):
                     "MASTER_ADDR": host_ip,
                     "MASTER_PORT": str(master_port),
                     "RANK": str(i),
-                    "WORLD_SIZE": str(cluster_info.compute_cluster_world_size),
+                    "WORLD_SIZE": str(cluster_info.num_cluster_nodes),
                     COMPUTE_CLUSTER_LOCAL_WORLD_SIZE_ENV_KEY: str(
                         cluster_info.num_processes_per_compute
                     ),
@@ -295,6 +311,7 @@ def test_graph_store_locally(self):
                     args=[
                         i,  # client_rank
                         cluster_info,  # cluster_info
+                        None,  # node_type - None for homogeneous dataset
                         expected_sampler_input,  # expected_sampler_input
                         None,  # expected_edge_types - None for homogeneous dataset
                     ],
@@ -310,7 +327,108 @@ def test_graph_store_locally(self):
                     "MASTER_ADDR": host_ip,
                     "MASTER_PORT": str(master_port),
                     "RANK": str(i + cluster_info.num_compute_nodes),
-                    "WORLD_SIZE": str(cluster_info.compute_cluster_world_size),
+                    "WORLD_SIZE": str(cluster_info.num_cluster_nodes),
+                    COMPUTE_CLUSTER_LOCAL_WORLD_SIZE_ENV_KEY: str(
+                        cluster_info.num_processes_per_compute
+                    ),
+                },
+                clear=False,
+            ):
+                server_process = ctx.Process(
+                    target=_run_server_processes,
+                    args=[
+                        cluster_info,  # cluster_info
+                        task_config_uri,  # task_config_uri
+                        True,  # is_inference
+                    ],
+                )
+                server_process.start()
+                server_processes.append(server_process)
+
+        for client_process in client_processes:
+            client_process.join()
+        for server_process in server_processes:
+            server_process.join()
+
+    # TODO: (mkolodner-sc) - Figure out why this test is failing on Google Cloud Build
+    @unittest.skip("Failing on Google Cloud Build - skiping for now")
+    def test_graph_store_heterogeneous(self):
+        # Simulating two server machine, two compute machines.
+        # Each machine has one process.
+        dblp_supervised_info = get_mocked_dataset_artifact_metadata()[
+            DBLP_GRAPH_NODE_ANCHOR_MOCKED_DATASET_INFO.name
+        ]
+        task_config_uri = dblp_supervised_info.frozen_gbml_config_uri
+        (
+            cluster_master_port,
+            storage_cluster_master_port,
+            compute_cluster_master_port,
+            master_port,
+            rpc_master_port,
+            rpc_wait_port,
+        ) = get_free_ports(num_ports=6)
+        host_ip = socket.gethostbyname(socket.gethostname())
+        cluster_info = GraphStoreInfo(
+            num_storage_nodes=2,
+            num_compute_nodes=2,
+            num_processes_per_compute=2,
+            cluster_master_ip=host_ip,
+            storage_cluster_master_ip=host_ip,
+            compute_cluster_master_ip=host_ip,
+            cluster_master_port=cluster_master_port,
+            storage_cluster_master_port=storage_cluster_master_port,
+            compute_cluster_master_port=compute_cluster_master_port,
+            rpc_master_port=rpc_master_port,
+            rpc_wait_port=rpc_wait_port,
+        )
+
+        num_dblp_nodes = 4057
+        expected_sampler_input = _get_expected_input_nodes_by_rank(
+            num_dblp_nodes, cluster_info
+        )
+        expected_edge_types = [
+            EdgeType(NodeType("author"), Relation("to"), NodeType("paper")),
+            EdgeType(NodeType("paper"), Relation("to"), NodeType("author")),
+            EdgeType(NodeType("term"), Relation("to"), NodeType("paper")),
+        ]
+        ctx = mp.get_context("spawn")
+        client_processes: list = []
+        for i in range(cluster_info.num_compute_nodes):
+            with mock.patch.dict(
+                os.environ,
+                {
+                    "MASTER_ADDR": host_ip,
+                    "MASTER_PORT": str(master_port),
+                    "RANK": str(i),
+                    "WORLD_SIZE": str(cluster_info.num_cluster_nodes),
+                    COMPUTE_CLUSTER_LOCAL_WORLD_SIZE_ENV_KEY: str(
+                        cluster_info.num_processes_per_compute
+                    ),
+                },
+                clear=False,
+            ):
+                client_process = ctx.Process(
+                    target=_client_process,
+                    args=[
+                        i,  # client_rank
+                        cluster_info,  # cluster_info
+                        NodeType("author"),  # node_type
+                        expected_sampler_input,  # expected_sampler_input
+                        expected_edge_types,  # expected_edge_types
+                    ],
+                )
+                client_process.start()
+                client_processes.append(client_process)
+        # Start server process
+        server_processes = []
+        for i in range(cluster_info.num_storage_nodes):
+            with mock.patch.dict(
+                os.environ,
+                {
+                    "MASTER_ADDR": host_ip,
+                    "MASTER_PORT": str(master_port),
+                    "RANK": str(i + cluster_info.num_compute_nodes),
+                    "WORLD_SIZE": str(cluster_info.num_cluster_nodes),
                     COMPUTE_CLUSTER_LOCAL_WORLD_SIZE_ENV_KEY: str(
                         cluster_info.num_processes_per_compute
                     ),