This repository was archived by the owner on Mar 14, 2024. It is now read-only.

Commit 5e75d70

lw authored and facebook-github-bot committed
Make edge importing more efficient
Summary: Now that we have support for buffered streaming edge appending, we can make use of it to avoid storing all the edges in memory at the same time. This also showcases how to use that streaming API, so that other users can draw inspiration from it for their own importers.

Also, increase the chunk size in HDF5 files from 1MiB to 50MiB. I had used 1MiB because it was suggested in the h5py docs, but I now suspect that recommendation was aimed at random access to the dataset. That is not our case, so we can use larger chunks, which make writing more efficient.

Reviewed By: adamlerer

Differential Revision: D17525462

fbshipit-source-id: 646e6f733b3262128a17c3ab3372acbe27c8c6a0
1 parent 50c9038 commit 5e75d70

2 files changed

Lines changed: 15 additions & 17 deletions


torchbiggraph/converters/import_from_tsv.py

Lines changed: 14 additions & 16 deletions
@@ -8,9 +8,10 @@

 import argparse
 import random
+from contextlib import ExitStack
 from itertools import chain
 from pathlib import Path
-from typing import Any, Counter, DefaultDict, Dict, List, Optional, Tuple
+from typing import Any, Counter, Dict, List, Optional, Tuple

 import torch

@@ -25,6 +26,7 @@
 from torchbiggraph.edgelist import EdgeList
 from torchbiggraph.entitylist import EntityList
 from torchbiggraph.graph_storages import (
+    AbstractEdgeAppender,
     AbstractEdgeStorage,
     AbstractEntityStorage,
     AbstractRelationTypeStorage,
@@ -181,12 +183,12 @@ def generate_edge_path_files(

     print(f"- Edges will be partitioned in {num_lhs_parts} x {num_rhs_parts} buckets.")

-    buckets: DefaultDict[Tuple[int, int], List[Tuple[int, int, int]]] = \
-        DefaultDict(list)
     processed = 0
     skipped = 0

-    with edge_file_in.open("rt") as tf:
+    # We use an ExitStack in order to close the dynamically-created edge appenders.
+    with edge_file_in.open("rt") as tf, ExitStack() as appender_stack:
+        appenders: Dict[Tuple[int, int], AbstractEdgeAppender] = {}
         for line_num, line in enumerate(tf, start=1):
             words = line.split()
             try:
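
The comment introduced in this hunk refers to contextlib.ExitStack, which lets a single with statement adopt context managers that are only created while a loop runs, and closes all of them on exit. As a minimal standalone sketch of that pattern (the file names here are made up for illustration, not part of the importer):

from contextlib import ExitStack

# Open output files lazily, one per key, and rely on the stack to close
# every file that was opened, however many there turn out to be.
with ExitStack() as stack:
    files = {}
    for key in ["a", "b", "a", "c"]:
        if key not in files:
            # enter_context registers the file for closing on stack exit
            files[key] = stack.enter_context(open(f"bucket_{key}.txt", "w"))
        files[key].write(f"row for {key}\n")
# All opened files are closed here, even if an exception was raised.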
@@ -225,7 +227,14 @@
                 skipped += 1
                 continue

-            buckets[lhs_part, rhs_part].append((lhs_offset, rhs_offset, rel_id))
+            if (lhs_part, rhs_part) not in appenders:
+                appenders[lhs_part, rhs_part] = appender_stack.enter_context(
+                    edge_storage.save_edges_by_appending(lhs_part, rhs_part))
+            appenders[lhs_part, rhs_part].append_edges(EdgeList(
+                EntityList.from_tensor(torch.tensor([lhs_offset], dtype=torch.long)),
+                EntityList.from_tensor(torch.tensor([rhs_offset], dtype=torch.long)),
+                torch.tensor([rel_id], dtype=torch.long),
+            ))

             processed = processed + 1
             if processed % 100000 == 0:
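
Distilled out of the two hunks above, the streaming import pattern the summary mentions looks roughly like this; edge_storage and the rows iterable are hypothetical stand-ins, while save_edges_by_appending, append_edges, EdgeList, and EntityList are the API shown in the diff:

from contextlib import ExitStack

import torch

from torchbiggraph.edgelist import EdgeList
from torchbiggraph.entitylist import EntityList

def stream_edges(edge_storage, rows):
    # rows yields (lhs_part, rhs_part, lhs_offset, rhs_offset, rel_id) tuples.
    with ExitStack() as stack:
        appenders = {}
        for lhs_part, rhs_part, lhs_offset, rhs_offset, rel_id in rows:
            key = (lhs_part, rhs_part)
            if key not in appenders:
                # One appender per bucket, opened on first use; the stack
                # flushes and closes all of them when the with block ends.
                appenders[key] = stack.enter_context(
                    edge_storage.save_edges_by_appending(lhs_part, rhs_part))
            # Append a single-edge EdgeList; the appender buffers writes,
            # so the edges never all live in memory at once.
            appenders[key].append_edges(EdgeList(
                EntityList.from_tensor(torch.tensor([lhs_offset], dtype=torch.long)),
                EntityList.from_tensor(torch.tensor([rhs_offset], dtype=torch.long)),
                torch.tensor([rel_id], dtype=torch.long),
            ))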
@@ -237,17 +246,6 @@
           f"entities were unknown (either not given in the config or "
           f"filtered out as too rare).")

-    for i in range(num_lhs_parts):
-        for j in range(num_rhs_parts):
-            print(f"- Writing bucket ({i}, {j}), "
-                  f"containing {len(buckets[i, j])} edges...")
-            edges = torch.tensor(buckets[i, j], dtype=torch.long).view((-1, 3))
-            edge_storage.save_edges(i, j, EdgeList(
-                EntityList.from_tensor(edges[:, 0]),
-                EntityList.from_tensor(edges[:, 1]),
-                edges[:, 2],
-            ))
-

 def convert_input_data(
     entity_configs: Dict[str, EntitySchema],

torchbiggraph/graph_storages.py

Lines changed: 1 addition & 1 deletion
@@ -262,7 +262,7 @@ def torch_to_numpy_dtype(dtype):
 class BufferedDataset:

     DATA_TYPE = torch.long  # int64, 8 bytes
-    BUFFER_SIZE = 2 ** 20 // 8  # 1MiB
+    BUFFER_SIZE = 50 * 2 ** 20 // 8  # 50MiB

     def __init__(self, hf: h5py.File, dataset_name: str) -> None:
         self.hf: h5py.File = hf
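
Note that BUFFER_SIZE is measured in elements, not bytes: DATA_TYPE is torch.long (int64, 8 bytes each), so 50 * 2 ** 20 // 8 = 6,553,600 elements correspond to 50MiB. In h5py the chunk shape is fixed when the dataset is created; here is a minimal sketch of creating a chunked, resizable int64 dataset with such a chunk size (file and dataset names are illustrative, and this is not the actual BufferedDataset implementation):

import h5py
import numpy as np

BUFFER_SIZE = 50 * 2 ** 20 // 8  # 6,553,600 int64 elements = 50MiB

with h5py.File("edges_example.h5", "w") as hf:
    dset = hf.create_dataset(
        "lhs",
        shape=(0,),              # start empty
        maxshape=(None,),        # allow unlimited growth
        chunks=(BUFFER_SIZE,),   # one chunk holds 50MiB of int64 data
        dtype=np.int64,
    )
    # Appending means resizing, then writing into the new tail region.
    batch = np.arange(1000, dtype=np.int64)
    old_size = dset.shape[0]
    dset.resize((old_size + batch.shape[0],))
    dset[old_size:] = batch

Larger chunks make each write bigger and less frequent, which suits this sequential, append-only workload; small chunks pay off mainly for random access.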
