Fix HGTConv edge_type_vec construction (pyg-team#7194)

berkekisin · berke.kisin · toenshoff · web-flow · commit c9fef62c2f6f · 2023-04-25T10:34:14.000+05:30
This PR fixes the utility function https://github.com/pyg-team/pytorch_geometric/blob/3d4836bc24dbb1b180f29cbbbdbcd18b94116dd7/torch_geometric/nn/conv/hgt_conv.py#L123, which constructs the `type_vec` of edges incorrectly and also crashes if some edge types are not present in the current `edge_index_dict`.

Consider the following scenario:

```python
# N=2, D=2, H=2 (2 nodes, head_dim 2, 2 heads)
k = [ [0,0,1,1], [2,2,3,3] ]
```

After calling this line: https://github.com/pyg-team/pytorch_geometric/blob/3d4836bc24dbb1b180f29cbbbdbcd18b94116dd7/torch_geometric/nn/conv/hgt_conv.py#L141 the matrix `k` looks like this:

```python
k = [ [0,0], [1,1], [2,2], [3,3] ]
# the type vec should look like this
type_vec = [0,1,0,1]
# but with the current implementation it would look like this
type_vec = [0,0,1,1]
```

After the reshape, the attention heads are interleaved, but the type vector that is currently constructed is sorted. We fixed this issue by constructing an interleaved type vec. Alternatively, we can transpose `k` before the reshape to ensure that we can use a sorted type vec. This will also allow us to set `is_sorted=True` for the heterolinear `k_rel`, which would be more efficient.

Also note that we added a test case for a missing edge type in `edge_index_dict`.

---------

Co-authored-by: berke.kisin <kisin@newton.lics.rwth-aachen.de>
Co-authored-by: toensoff <toenshoff@informatik.rwth-aachen.de>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jinu Sunil <jinu.sunil@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
+- Fixed `HGTConv` utility function `_construct_src_node_feat` ([#7194](https://github.com/pyg-team/pytorch_geometric/pull/7194))
 - Extend dataset summary to create stats for each node/edge type ([#7203](https://github.com/pyg-team/pytorch_geometric/pull/7203))
 - Added an optional `batch_size` argument to `avg_pool_x` and `max_pool_x` ([#7216](https://github.com/pyg-team/pytorch_geometric/pull/7216))
 - Fixed `subgraph` on unordered inputs ([#7187](https://github.com/pyg-team/pytorch_geometric/pull/7187))
diff --git a/test/nn/conv/test_hgt_conv.py b/test/nn/conv/test_hgt_conv.py
@@ -193,7 +193,45 @@ def test_hgt_conv_missing_dst_node_type():
     out_dict = conv(data.x_dict, data.edge_index_dict)
     assert out_dict['author'].size() == (4, 64)
     assert out_dict['paper'].size() == (6, 64)
-    assert out_dict['university'] is None
+    assert 'university' not in out_dict
+
+
+def test_hgt_conv_missing_input_node_type():
+    data = HeteroData()
+    data['author'].x = torch.randn(4, 16)
+    data['paper'].x = torch.randn(6, 32)
+    data['author', 'writes',
+         'paper'].edge_index = get_random_edge_index(4, 6, 20)
+
+    # Some nodes from metadata are missing in data.
+    # This might happen while using NeighborLoader.
+    metadata = (['author', 'paper',
+                 'university'], [('author', 'writes', 'paper')])
+    conv = HGTConv(-1, 64, metadata, heads=1)
+
+    out_dict = conv(data.x_dict, data.edge_index_dict)
+    assert out_dict['paper'].size() == (6, 64)
+    assert 'university' not in out_dict
+
+
+def test_hgt_conv_missing_edge_type():
+    data = HeteroData()
+    data['author'].x = torch.randn(4, 16)
+    data['paper'].x = torch.randn(6, 32)
+    data['university'].x = torch.randn(10, 32)
+
+    data['author', 'writes',
+         'paper'].edge_index = get_random_edge_index(4, 6, 20)
+
+    metadata = (['author', 'paper',
+                 'university'], [('author', 'writes', 'paper'),
+                                 ('university', 'employs', 'author')])
+    conv = HGTConv(-1, 64, metadata, heads=1)
+
+    out_dict = conv(data.x_dict, data.edge_index_dict)
+    assert out_dict['author'].size() == (4, 64)
+    assert out_dict['paper'].size() == (6, 64)
+    assert 'university' not in out_dict
 
 
 if __name__ == '__main__':
diff --git a/torch_geometric/nn/conv/hgt_conv.py b/torch_geometric/nn/conv/hgt_conv.py
@@ -71,6 +71,10 @@ def __init__(
         self.heads = heads
         self.node_types = metadata[0]
         self.edge_types = metadata[1]
+        self.edge_types_map = {
+            edge_type: i
+            for i, edge_type in enumerate(metadata[1])
+        }
 
         self.dst_node_types = set([key[-1] for key in self.edge_types])
 
@@ -83,10 +87,10 @@ def __init__(
         dim = out_channels // heads
         num_types = heads * len(self.edge_types)
 
-        self.k_rel = HeteroLinear(dim, dim, num_types, is_sorted=True,
-                                  bias=False)
-        self.v_rel = HeteroLinear(dim, dim, num_types, is_sorted=True,
-                                  bias=False)
+        self.k_rel = HeteroLinear(dim, dim, num_types, bias=False,
+                                  is_sorted=True)
+        self.v_rel = HeteroLinear(dim, dim, num_types, bias=False,
+                                  is_sorted=True)
 
         self.skip = ParameterDict({
             node_type: Parameter(torch.Tensor(1))
@@ -121,36 +125,40 @@ def _cat(self, x_dict: Dict[str, Tensor]) -> Tuple[Tensor, Dict[str, int]]:
         return torch.cat(outs, dim=0), offset
 
     def _construct_src_node_feat(
-        self,
-        k_dict: Dict[str, Tensor],
-        v_dict: Dict[str, Tensor],
+        self, k_dict: Dict[str, Tensor], v_dict: Dict[str, Tensor],
+        edge_index_dict: Dict[EdgeType, Adj]
     ) -> Tuple[Tensor, Tensor, Dict[EdgeType, int]]:
         """Constructs the source node representations."""
-        count = 0
         cumsum = 0
+        num_edge_types = len(self.edge_types)
         H, D = self.heads, self.out_channels // self.heads
 
         # Flatten into a single tensor with shape [num_edge_types * heads, D]:
         ks: List[Tensor] = []
         vs: List[Tensor] = []
-        type_list: List[int] = []
+        type_list: List[Tensor] = []
         offset: Dict[EdgeType] = {}
-        for edge_type in self.edge_types:
-            src, _, _ = edge_type
-
-            ks.append(k_dict[src].reshape(-1, D))
-            vs.append(v_dict[src].reshape(-1, D))
-
+        for edge_type in edge_index_dict.keys():
+            src = edge_type[0]
             N = k_dict[src].size(0)
-            for _ in range(H):
-                type_list.append(torch.full((N, ), count, dtype=torch.long))
-                count += 1
             offset[edge_type] = cumsum
             cumsum += N
 
-        type_vec = torch.cat(type_list, dim=0)
-        k = self.k_rel(torch.cat(ks, dim=0), type_vec).view(-1, H, D)
-        v = self.v_rel(torch.cat(vs, dim=0), type_vec).view(-1, H, D)
+            # Construct type_vec for the current edge_type with shape [H, N]:
+            edge_type_offset = self.edge_types_map[edge_type]
+            type_vec = torch.arange(H, dtype=torch.long).view(-1, 1).repeat(
+                1, N) * num_edge_types + edge_type_offset
+
+            type_list.append(type_vec)
+            ks.append(k_dict[src])
+            vs.append(v_dict[src])
+
+        ks = torch.cat(ks, dim=0).transpose(0, 1).reshape(-1, D)
+        vs = torch.cat(vs, dim=0).transpose(0, 1).reshape(-1, D)
+        type_vec = torch.cat(type_list, dim=1).flatten()
+
+        k = self.k_rel(ks, type_vec).view(H, -1, D).transpose(0, 1)
+        v = self.v_rel(vs, type_vec).view(H, -1, D).transpose(0, 1)
 
         return k, v, offset
 
@@ -184,12 +192,14 @@ def forward(
         # Compute K, Q, V over node types:
         kqv_dict = self.kqv_lin(x_dict)
         for key, val in kqv_dict.items():
-            k_dict[key] = val[:, :F].view(-1, H, D)
-            q_dict[key] = val[:, F:2 * F].view(-1, H, D)
-            v_dict[key] = val[:, 2 * F:].view(-1, H, D)
+            k, q, v = torch.tensor_split(val, 3, dim=1)
+            k_dict[key] = k.view(-1, H, D)
+            q_dict[key] = q.view(-1, H, D)
+            v_dict[key] = v.view(-1, H, D)
 
         q, dst_offset = self._cat(q_dict)
-        k, v, src_offset = self._construct_src_node_feat(k_dict, v_dict)
+        k, v, src_offset = self._construct_src_node_feat(
+            k_dict, v_dict, edge_index_dict)
 
         edge_index, edge_attr = construct_bipartite_edge_index(
             edge_index_dict, src_offset, dst_offset, edge_attr_dict=self.p_rel)
@@ -200,7 +210,8 @@ def forward(
         # Reconstruct output node embeddings dict:
         for node_type, start_offset in dst_offset.items():
             end_offset = start_offset + q_dict[node_type].size(0)
-            out_dict[node_type] = out[start_offset:end_offset]
+            if node_type in self.dst_node_types:
+                out_dict[node_type] = out[start_offset:end_offset]
 
         # Transform output node embeddings:
         a_dict = self.out_lin({
@@ -210,11 +221,7 @@ def forward(
 
         # Iterate over node types:
         for node_type, out in out_dict.items():
-            if node_type not in self.dst_node_types:
-                out_dict[node_type] = None
-                continue
-            else:
-                out = a_dict[node_type]
+            out = a_dict[node_type]
 
             if out.size(-1) == x_dict[node_type].size(-1):
                 alpha = self.skip[node_type].sigmoid()
diff --git a/torch_geometric/nn/dense/linear.py b/torch_geometric/nn/dense/linear.py
@@ -387,7 +387,7 @@ def forward(
                     biases.append(lin.bias)
             biases = None if biases[0] is None else biases
             outs = pyg_lib.ops.grouped_matmul(xs, weights, biases)
-            for key, out in zip(self.lins.keys(), outs):
+            for key, out in zip(x_dict.keys(), outs):
                 if key in x_dict:
                     out_dict[key] = out
         else: