
Commit 9c9e809

lw authored and facebook-github-bot committed
Auto-tune num_edge_chunks
Summary: One reason we allow chunking edge lists is to be able to load, piece by piece, edge lists that are too big to fit in memory all at once. (There are other reasons too, for example more frequent mixing of edges from different buckets.) The former goal can be achieved automatically: given a maximum size that an edge list may occupy in memory, PBG can figure out the smallest number of edge chunks that achieves it.

Reviewed By: adamlerer

Differential Revision: D17571778

fbshipit-source-id: e4977078c35f4bdb212ad163acf137eb94d33994
1 parent 7f98961 commit 9c9e809
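
To make the auto-tuning concrete, here is a minimal runnable sketch of the arithmetic involved (the 3.2-billion-edge bucket is a made-up figure; the actual logic lives in the new get_num_edge_chunks in torchbiggraph/train.py, shown in the diff below):

    import math

    # Assumed example: the largest bucket holds 3.2 billion edges, and the new
    # max_edges_per_chunk option keeps its default of 1 billion edges.
    max_edges_per_bucket = 3_200_000_000
    max_edges_per_chunk = 1_000_000_000

    # Smallest number of equally-sized chunks such that none exceeds the cap.
    num_edge_chunks = max(1, math.ceil(max_edges_per_bucket / max_edges_per_chunk))
    print(num_edge_chunks)  # 4, i.e. chunks of about 800 million edges each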

4 files changed: 66 additions & 8 deletions


docs/source/configuration_file.rst (6 additions & 2 deletions)

@@ -139,9 +139,13 @@ See :ref:`batch-preparation` for more details.
 
     The number of times the training loop iterates over all the edges.
 
-- ``num_edge_chunks`` (type: integer; default: ``1``)
+- ``num_edge_chunks`` (type: integer or null; default: ``null``)
 
-    The number of equally-sized parts each bucket will be split into. Training will first proceed over all the first chunks of all buckets, then over all the second chunks, and so on. A higher value allows better mixing of partitions, at the cost of more time spent on I/O.
+    The number of equally-sized parts each bucket will be split into. Training will first proceed over all the first chunks of all buckets, then over all the second chunks, and so on. A higher value allows better mixing of partitions, at the cost of more time spent on I/O. If unset, will be automatically calculated so that no chunk has more than max_edges_per_chunk edges.
+
+- ``max_edges_per_chunk`` (type: integer, default: ``1000000000``)
+
+    The maximum number of edges that each edge chunk should contain if the number of edge chunks is left unspecified and has to be automatically figured out. Each edge takes up at least 12 bytes (3 int64s), more if using featurized entities.
 
 - ``bucket_order`` (type: string, either ``"random"``, ``"affinity"``, ``"inside_out"`` or ``"outside_in"``; default: ``"inside_out"``)
 
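For illustration, a hypothetical config fragment exercising the new option could look as follows (a sketch only: the entity, relation, and path values are placeholders, and only num_edge_chunks and max_edges_per_chunk relate to this commit):

    def get_torchbiggraph_config():
        return dict(
            entity_path="data/example",  # placeholder paths
            edge_paths=["data/example/edges"],
            checkpoint_path="model/example",
            entities={"all": {"num_partitions": 4}},
            relations=[
                {"name": "all_edges", "lhs": "all", "rhs": "all", "operator": "none"},
            ],
            dimension=400,
            num_epochs=10,
            # num_edge_chunks is deliberately left unset (null): PBG will pick
            # the smallest chunk count so that no chunk exceeds this cap.
            max_edges_per_chunk=500_000_000,
        )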

torchbiggraph/config.py (15 additions & 4 deletions)

@@ -222,15 +222,26 @@ class ConfigSchema(Schema):
         metadata={'help': "The number of times the training loop iterates over "
                           "all the edges."},
     )
-    num_edge_chunks: int = attr.ib(
-        default=1,
-        validator=positive,
+    num_edge_chunks: Optional[int] = attr.ib(
+        default=None,
+        validator=optional(positive),
         metadata={'help': "The number of equally-sized parts each bucket will "
                           "be split into. Training will first proceed over all "
                           "the first chunks of all buckets, then over all the "
                           "second chunks, and so on. A higher value allows "
                           "better mixing of partitions, at the cost of more "
-                          "time spent on I/O."},
+                          "time spent on I/O. If unset, will be automatically "
+                          "calculated so that no chunk has more than "
+                          "max_edges_per_chunk edges."},
+    )
+    max_edges_per_chunk: int = attr.ib(
+        default=1_000_000_000,  # Each edge having 3 int64s, this is 12GB.
+        validator=positive,
+        metadata={'help': "The maximum number of edges that each edge chunk "
+                          "should contain if the number of edge chunks is left "
+                          "unspecified and has to be automatically figured "
+                          "out. Each edge takes up at least 12 bytes (3 "
+                          "int64s), more if using featurized entities."},
     )
     bucket_order: BucketOrder = attr.ib(
         default=BucketOrder.INSIDE_OUT,
torchbiggraph/graph_storages.py (13 additions & 0 deletions)

@@ -122,6 +122,10 @@ def has_edges(self, lhs_p: int, rhs_p: int) -> bool:
     def load_edges(self, lhs_p: int, rhs_p: int) -> EdgeList:
         return self.load_chunk_of_edges(lhs_p, rhs_p, chunk_idx=0, num_chunks=1)
 
+    @abstractmethod
+    def get_number_of_edges(self, lhs_p: int, rhs_p: int) -> int:
+        pass
+
     @abstractmethod
     def load_chunk_of_edges(
         self,
@@ -388,6 +392,15 @@ def has_edges(
     ) -> bool:
         return self.get_edges_file(lhs_p, rhs_p).is_file()
 
+    def get_number_of_edges(self, lhs_p: int, rhs_p: int) -> int:
+        file_path = self.get_edges_file(lhs_p, rhs_p)
+        if not file_path.is_file():
+            raise RuntimeError(f"{file_path} does not exist")
+        with h5py.File(file_path, "r") as hf:
+            if hf.attrs.get(FORMAT_VERSION_ATTR, None) != FORMAT_VERSION:
+                raise RuntimeError(f"Version mismatch in edge file {file_path}")
+            return hf["rel"].len()
+
     def load_chunk_of_edges(
         self,
         lhs_p: int,
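
Note that get_number_of_edges only inspects the length of the "rel" dataset, so it never loads a bucket's edges into memory: HDF5 keeps dataset shapes in the file metadata. A hypothetical standalone equivalent (the file name is an assumed example of PBG's edges_<lhs>_<rhs>.h5 layout, and this sketch skips the format-version check):

    import h5py

    def count_edges(path: str) -> int:
        # Reading a dataset's shape touches only HDF5 metadata, not edge data.
        with h5py.File(path, "r") as hf:
            return hf["rel"].shape[0]

    print(count_edges("data/example/edges_0_0.h5"))  # assumed file name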

torchbiggraph/train.py (32 additions & 2 deletions)

@@ -8,6 +8,7 @@
 
 import argparse
 import logging
+import math
 import time
 from abc import ABC, abstractmethod
 from functools import partial
@@ -283,6 +284,30 @@ def should_preserve_old_checkpoint(
     return is_checkpoint_epoch and is_first_edge_path and is_first_edge_chunk
 
 
+def get_num_edge_chunks(
+    edge_paths: List[str],
+    nparts_lhs: int,
+    nparts_rhs: int,
+    max_edges_per_chunk: int,
+) -> int:
+    max_edges_per_bucket = 0
+    # We should check all edge paths, all lhs partitions and all rhs partitions,
+    # but the combinatorial explosion could lead to thousands of checks. Let's
+    # assume that edges are uniformly distributed among buckets (this is not
+    # exactly the case, as it's the entities that are uniformly distributed
+    # among the partitions, and edge assignments to buckets are a function of
+    # that, thus, for example, very high degree entities could skew this), and
+    # use the size of bucket (0, 0) as an estimate of the average bucket size.
+    # We still do it for all edge paths as there could be semantic differences
+    # between them which lead to different sizes.
+    for edge_path in edge_paths:
+        edge_storage = EDGE_STORAGES.make_instance(edge_path)
+        max_edges_per_bucket = max(
+            max_edges_per_bucket,
+            edge_storage.get_number_of_edges(0, 0))
+    return max(1, math.ceil(max_edges_per_bucket / max_edges_per_chunk))
+
+
 def train_and_report_stats(
     config: ConfigSchema,
     model: Optional[MultiRelationEmbedder] = None,
@@ -446,8 +471,13 @@ def make_optimizer(params: Iterable[torch.nn.Parameter], is_emb: bool) -> Optimizer:
     checkpoint_manager.register_metadata_provider(ConfigMetadataProvider(config))
     checkpoint_manager.write_config(config)
 
+    if config.num_edge_chunks is not None:
+        num_edge_chunks = config.num_edge_chunks
+    else:
+        num_edge_chunks = get_num_edge_chunks(
+            config.edge_paths, nparts_lhs, nparts_rhs, config.max_edges_per_chunk)
     iteration_manager = IterationManager(
-        config.num_epochs, config.edge_paths, config.num_edge_chunks,
+        config.num_epochs, config.edge_paths, num_edge_chunks,
         iteration_idx=checkpoint_manager.checkpoint_version)
     checkpoint_manager.register_metadata_provider(iteration_manager)
 
@@ -680,7 +710,7 @@ def swap_partitioned_embeddings(
 
     bucket_logger.debug("Loading edges")
     edges = edge_storage.load_chunk_of_edges(
-        cur_b.lhs, cur_b.rhs, edge_chunk_idx, config.num_edge_chunks)
+        cur_b.lhs, cur_b.rhs, edge_chunk_idx, iteration_manager.num_edge_chunks)
     num_edges = len(edges)
     # this might be off in the case of tensorlist or extra edge fields
     io_bytes += edges.lhs.tensor.numel() * edges.lhs.tensor.element_size()
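
A usage note on the heuristic above: regardless of how many partitions there are, get_num_edge_chunks opens only bucket (0, 0) of each edge path, so auto-tuning costs one metadata read per path. A hedged example call (paths, partition counts, and bucket size are invented):

    # If the larger of the two (0, 0) buckets held 2.5 billion edges, this
    # would return ceil(2.5e9 / 1e9) = 3.
    num_edge_chunks = get_num_edge_chunks(
        edge_paths=["data/train_2019", "data/train_2020"],  # invented paths
        nparts_lhs=16,
        nparts_rhs=16,
        max_edges_per_chunk=1_000_000_000,
    )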
