Skip to content
This repository was archived by the owner on Mar 14, 2024. It is now read-only.

Commit f0dd9a4

Browse files
chandlerzuo authored and facebook-github-bot committed
Checkpoint Learning Stats (#78)
Summary: Pull Request resolved: #78 Currently, when resuming from a failed training, learning curve stats history is lost. This diff adds the learning curve stats in the checkpoint file. Reviewed By: lerks Differential Revision: D15977931 fbshipit-source-id: 7031e61f28fa9dc11f9424a67e1447ec64899f01
1 parent 75885ed commit f0dd9a4

5 files changed

Lines changed: 107 additions & 4 deletions

File tree

test/test_functional.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import time
1515
from functools import partial
1616
from tempfile import TemporaryDirectory
17-
from typing import Dict, Iterable, List, NamedTuple, Tuple
17+
from typing import Dict, Iterable, List, Mapping, NamedTuple, Tuple, Union
1818
from unittest import TestCase, main
1919

2020
import attr
@@ -28,6 +28,7 @@
2828
)
2929
from torchbiggraph.eval import do_eval
3030
from torchbiggraph.partitionserver import run_partition_server
31+
from torchbiggraph.stats import SerializedStats
3132
from torchbiggraph.train import train
3233
from torchbiggraph.util import (
3334
call_one_after_the_other,
@@ -225,6 +226,20 @@ def assertIsEmbeddings(
225226
self.assertTrue(np.all(np.isfinite(dataset[...])))
226227
self.assertTrue(np.all(np.linalg.norm(dataset[...], axis=-1) != 0))
227228

229+
def assertIsStatsDict(self, stats: Mapping[str, Union[int, SerializedStats]]) -> None:
    """Check that *stats* is a dict with an int under "index" and, under every
    other key, a serialized stats dict of the form
    {"count": int, "metrics": {metric_name: float}}.
    """
    self.assertIsInstance(stats, dict)
    self.assertIn("index", stats)
    for key, value in stats.items():
        if key == "index":
            self.assertIsInstance(value, int)
            continue
        # Any non-"index" entry must be a serialized Stats payload.
        self.assertIsInstance(value, dict)
        self.assertCountEqual(value.keys(), ["count", "metrics"])
        self.assertIsInstance(value["count"], int)
        self.assertIsInstance(value["metrics"], dict)
        for metric_value in value["metrics"].values():
            self.assertIsInstance(metric_value, float)
242+
228243
def assertCheckpointWritten(self, config: ConfigSchema, *, version: int) -> None:
229244
with open(os.path.join(config.checkpoint_path, "checkpoint_version.txt"), "rt") as tf:
230245
self.assertEqual(version, int(tf.read().strip()))
@@ -239,6 +254,10 @@ def assertCheckpointWritten(self, config: ConfigSchema, *, version: int) -> None
239254
self.assertIsModelParameters(hf["model"])
240255
self.assertIsOptimStateDict(hf["optimizer/state_dict"])
241256

257+
with open(os.path.join(config.checkpoint_path, "training_stats.json"), "rt") as tf:
258+
for line in tf:
259+
self.assertIsStatsDict(json.loads(line))
260+
242261
for entity_name, entity in config.entities.items():
243262
for partition in range(entity.num_partitions):
244263
with open(os.path.join(

torchbiggraph/checkpoint_manager.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,18 @@
1414
import re
1515
from abc import ABC, abstractmethod
1616
from collections import OrderedDict
17-
from typing import Any, Callable, Dict, List, Optional, Set, Tuple
17+
from typing import (
18+
Any,
19+
Callable,
20+
Dict,
21+
Generator,
22+
List,
23+
Mapping,
24+
Optional,
25+
Set,
26+
Tuple,
27+
Union,
28+
)
1829

1930
import numpy as np
2031
import torch
@@ -27,6 +38,7 @@
2738
)
2839
from torchbiggraph.config import ConfigSchema
2940
from torchbiggraph.parameter_sharing import ParameterClient
41+
from torchbiggraph.stats import SerializedStats
3042
from torchbiggraph.types import (
3143
ByteTensorType,
3244
EntityName,
@@ -418,6 +430,22 @@ def read_config(self) -> ConfigSchema:
418430
config_json = self.storage.load_config()
419431
return ConfigSchema.from_dict(json.loads(config_json))
420432

433+
def append_stats(
    self,
    stats: Mapping[str, Union[int, SerializedStats]],
) -> None:
    """Serialize *stats* to JSON and append it to the backing stats log."""
    serialized = json.dumps(stats)
    self.storage.append_stats(serialized)
438+
439+
def read_stats(self) -> Generator[Dict[str, Union[int, SerializedStats]], None, None]:
    """Yield each entry of the backing stats log, decoded from JSON."""
    yield from map(json.loads, self.storage.load_stats())
442+
443+
def maybe_read_stats(self) -> Generator[Dict[str, Union[int, SerializedStats]], None, None]:
    """Like read_stats, but yields nothing when the stats log cannot be loaded
    (e.g. on the first run, before any stats have been written)."""
    try:
        for entry in self.read_stats():
            yield entry
    except CouldNotLoadData:
        # Best-effort: a missing log is not an error, just an empty history.
        pass
448+
421449
def write_new_version(self, config: ConfigSchema) -> None:
422450
if self.background:
423451
self._sync()

torchbiggraph/checkpoint_storage.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import os
1212
from abc import ABC, abstractmethod
1313
from pathlib import Path
14-
from typing import Any, Dict, NamedTuple, Optional, Tuple
14+
from typing import Any, Dict, Generator, NamedTuple, Optional, Tuple
1515

1616
import h5py
1717
import numpy as np
@@ -116,6 +116,14 @@ def save_config(self, config_json: str) -> None:
116116
def load_config(self) -> str:
117117
pass
118118

119+
@abstractmethod
def append_stats(self, stats_json: str) -> None:
    """Persist one JSON-encoded stats entry by appending it to the stats log."""
122+
123+
@abstractmethod
def load_stats(self) -> Generator[str, None, None]:
    """Yield the JSON-encoded stats entries previously appended to the log."""
126+
119127
@abstractmethod
120128
def prepare_snapshot(self, version: int, epoch_idx: int) -> None:
121129
pass
@@ -278,6 +286,11 @@ def get_model_file(self, version: int, *, path: Optional[Path] = None) -> Path:
278286
path = self.path
279287
return path / f"model.v{version}.h5"
280288

289+
def get_stats_file(self, *, path: Optional[Path] = None) -> Path:
    """Return the location of the training-stats log file, under *path* if
    given, otherwise under this storage's own directory."""
    base = self.path if path is None else path
    return base / "training_stats.json"
293+
281294
def get_snapshot_path(self, epoch_idx: int) -> Path:
282295
return self.path / f"epoch_{epoch_idx}"
283296

@@ -417,6 +430,18 @@ def load_config(self) -> str:
417430
except FileNotFoundError as err:
418431
raise CouldNotLoadData() from err
419432

433+
def append_stats(self, stats_json: str) -> None:
    """Append one JSON-encoded stats entry, as its own line, to the stats file."""
    stats_file = self.get_stats_file()
    # Append mode, so earlier entries (and earlier runs) are preserved.
    with stats_file.open("at") as out:
        out.write(stats_json + "\n")
436+
437+
def load_stats(self) -> Generator[str, None, None]:
    """Yield the raw lines (trailing newlines included) of the stats file.

    Raises CouldNotLoadData when the file does not exist yet.
    """
    try:
        with self.get_stats_file().open("rt") as stats_file:
            yield from stats_file
    except FileNotFoundError as err:
        raise CouldNotLoadData() from err
444+
420445
def prepare_snapshot(self, version: int, epoch_idx: int) -> None:
421446
self.get_snapshot_path(epoch_idx).mkdir(parents=True, exist_ok=True)
422447

torchbiggraph/stats.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from collections import defaultdict
1010
from statistics import mean
11-
from typing import Iterable, Type
11+
from typing import Dict, Iterable, Type, Union
1212

1313
from torchbiggraph.types import FloatTensorType
1414

@@ -17,6 +17,9 @@ def average_of_sums(*tensors: FloatTensorType) -> float:
1717
return mean(t.sum().item() for t in tensors)
1818

1919

20+
# JSON-friendly form of a Stats object: {"count": int, "metrics": {name: value}}.
SerializedStats = Dict[str, Union[int, Dict[str, float]]]
21+
22+
2023
class Stats:
2124
"""A class collecting a set of metrics.
2225
@@ -66,3 +69,14 @@ def __eq__(self, other: "Stats") -> bool:
6669
return (isinstance(other, Stats)
6770
and self.count == other.count
6871
and self.metrics == other.metrics)
72+
73+
def to_dict(self) -> SerializedStats:
    """Return a JSON-serializable dict holding this object's count and metrics."""
    serialized: SerializedStats = {
        "count": self.count,
        "metrics": self.metrics,
    }
    return serialized
75+
76+
@classmethod
77+
def from_dict(cls, d: SerializedStats) -> "Stats":
78+
if set(d.keys()) != {"count", "metrics"}:
79+
raise ValueError(
80+
f"Expect keys ['count', 'metrics'] from input but get {list(d.keys())}."
81+
)
82+
return Stats(count=d["count"], **d["metrics"])

torchbiggraph/train.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,15 @@ def swap_partitioned_embeddings(
612612

613613
return io_bytes
614614

615+
if rank == RANK_ZERO:
616+
for stats in checkpoint_manager.maybe_read_stats():
617+
yield (
618+
stats["index"],
619+
Stats.from_dict(stats["eval_stats_before"]),
620+
Stats.from_dict(stats["stats"]),
621+
Stats.from_dict(stats["eval_stats_after"]),
622+
)
623+
615624
# Start of the main training loop.
616625
for epoch_idx, edge_path_idx, edge_chunk_idx in iteration_manager:
617626
logger.info(
@@ -762,6 +771,14 @@ def swap_partitioned_embeddings(
762771
bucket_logger.info(f"Stats after training: {eval_stats_after}")
763772

764773
# Add train/eval metrics to queue
774+
checkpoint_manager.append_stats(
775+
{
776+
"index": current_index,
777+
"eval_stats_before": eval_stats_before.to_dict(),
778+
"stats": stats.to_dict(),
779+
"eval_stats_after": eval_stats_after.to_dict(),
780+
}
781+
)
765782
yield current_index, eval_stats_before, stats, eval_stats_after
766783

767784
swap_partitioned_embeddings(cur_b, None)

0 commit comments

Comments (0)