Skip to content
This repository was archived by the owner on Mar 14, 2024. It is now read-only.

Commit 3edcc3e

Browse files
lw authored and facebook-github-bot committed
Make entity count readers also responsible for writing, and call them storages
Summary: Also make them handle the dynamic relation count. By enclosing all the I/O code that actually deals with files inside a single class, it is now enough to register a new implementation of that class to transparently use a new storage backend across all of PBG, including import and export! Reviewed By: adamlerer Differential Revision: D17183917 fbshipit-source-id: ae0a1bebeb93b8868acea8827a5ef139b60703ea
1 parent f4a819b commit 3edcc3e

12 files changed

Lines changed: 309 additions & 158 deletions

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,6 @@ During preprocessing, the entities and relation types had their identifiers conv
139139
```bash
140140
torchbiggraph_export_to_tsv \
141141
torchbiggraph/examples/configs/fb15k_config.py \
142-
--dict data/FB15k/dictionary.json \
143142
--checkpoint model/fb15k \
144143
--entities-output entity_embeddings.tsv \
145144
--relation-types-output relation_types_parameters.tsv

docs/source/downstream_tasks.rst

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,20 @@ Reading the HDF5 format
3030
Suppose that you have completed the training of the ``torchbiggraph_example_fb15k`` command and want to now
3131
look up the embedding of some entity. For that, we'll need to read:
3232

33-
- the embeddings, from the checkpoint files (the :file:`.h5` files in the `model/fb15k` directory, or
33+
- the embeddings, from the checkpoint files (the :file:`.h5` files in the :file:`model/fb15k` directory, or
3434
whatever directory was specified as the ``checkpoint_path``); and
35-
- the mapping from entity names to their partitions and offsets, from the :file:`data/FB15k/dictionary.json`
36-
file created by the ``torchbiggraph_import_from_tsv`` command.
35+
- the names of the entities of a certain type and partition (ordered by their offset), from the files in the
36+
:file:`data/FB15k` directory (or an alternative directory given as the ``entity_path``), created by the
37+
``torchbiggraph_import_from_tsv`` command.
3738

3839
The embedding of, say, entity ``/m/05hf_5`` can be found as follows::
3940

4041
import json
4142
import h5py
4243

43-
with open("data/FB15k/dictionary.json", "rt") as tf:
44-
dictionary = json.load(tf)
45-
offset = dictionary["entities"]["all"].index("/m/05hf_5")
44+
with open("data/FB15k/entity_names_all_0.json", "rt") as tf:
45+
names = json.load(tf)
46+
offset = names.index("/m/05hf_5")
4647

4748
with h5py.File("model/fb15k/embeddings_all_0.v50.h5", "r") as hf:
4849
embedding = hf["embeddings"][offset, :]
@@ -162,12 +163,16 @@ being the capital of France::
162163
operator.load_state_dict(operator_state_dict)
163164
comparator = DotComparator()
164165

165-
# Load the offsets of the entities and the index of the relation type
166-
with open("data/FB15k/dictionary.json", "rt") as tf:
167-
dictionary = json.load(tf)
168-
src_entity_offset = dictionary["entities"]["all"].index("/m/0f8l9c") # France
169-
dest_entity_offset = dictionary["entities"]["all"].index("/m/05qtj") # Paris
170-
rel_type_index = dictionary["relations"].index("/location/country/capital")
166+
# Load the names of the entities, ordered by offset.
167+
with open("data/FB15k/entity_names_all_0.json", "rt") as tf:
168+
entity_names = json.load(tf)
169+
src_entity_offset = entity_names.index("/m/0f8l9c") # France
170+
dest_entity_offset = entity_names.index("/m/05qtj") # Paris
171+
172+
# Load the names of the relation types, ordered by index.
173+
with open("data/FB15k/dynamic_rel_names.json", "rt") as tf:
174+
rel_type_names = json.load(tf)
175+
rel_type_index = rel_type_names.index("/location/country/capital")
171176

172177
# Load the trained embeddings
173178
with h5py.File("model/fb15k/embeddings_all_0.v50.h5", "r") as hf:
@@ -220,10 +225,12 @@ entities are most likely to be the capital of France::
220225
comparator = DotComparator()
221226

222227
# Load the offsets of the entities and the index of the relation type
223-
with open("data/FB15k/dictionary.json", "rt") as tf:
224-
dictionary = json.load(tf)
225-
src_entity_offset = dictionary["entities"]["all"].index("/m/0f8l9c") # France
226-
rel_type_index = dictionary["relations"].index("/location/country/capital")
228+
with open("data/FB15k/entity_names_all_0.json", "rt") as tf:
229+
entity_names = json.load(tf)
230+
src_entity_offset = entity_names.index("/m/0f8l9c") # France
231+
with open("data/FB15k/dynamic_rel_names.json", "rt") as tf:
232+
rel_type_names = json.load(tf)
233+
rel_type_index = rel_type_names.index("/location/country/capital")
227234

228235
# Load the trained embeddings
229236
with h5py.File("model/fb15k/embeddings_all_0.v50.h5", "r") as hf:
@@ -245,7 +252,7 @@ entities are most likely to be the capital of France::
245252

246253
# Sort the entities by their score
247254
permutation = scores.flatten().argsort(descending=True)
248-
top5_entities = [dictionary["entities"]["all"][index] for index in permutation[:5]]
255+
top5_entities = [entity_names[index] for index in permutation[:5]]
249256

250257
print(top5_entities)
251258

@@ -271,17 +278,17 @@ library. The following code looks for the entities that are closest to Paris::
271278
index.add(hf["embeddings"][...])
272279

273280
# Get trained embedding of Paris
274-
with open("data/FB15k/dictionary.json", "rt") as f:
275-
dictionary = json.load(f)
276-
target_entity_offset = dictionary["entities"]["all"].index("/m/05qtj") # Paris
281+
with open("data/FB15k/entity_names_all_0.json", "rt") as tf:
282+
entity_names = json.load(tf)
283+
target_entity_offset = entity_names.index("/m/05qtj") # Paris
277284
with h5py.File("model/fb15k/embeddings_all_0.v50.h5", "r") as hf:
278285
target_embedding = hf["embeddings"][target_entity_offset, :]
279286

280287
# Search nearest neighbors
281288
_, neighbors = index.search(target_embedding.reshape((1, 400)), 5)
282289

283290
# Map back to entity names
284-
top5_entities = [dictionary["entities"]["all"][index] for index in neighbors[0]]
291+
top5_entities = [entity_names[index] for index in neighbors[0]]
285292

286293
print(top5_entities)
287294

torchbiggraph/checkpoint_manager.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from torchbiggraph.checkpoint_storage import (
2424
AbstractCheckpointStorage,
2525
CHECKPOINT_STORAGES,
26-
CouldNotLoadData,
2726
ModelParameter,
2827
)
2928
from torchbiggraph.config import ConfigSchema
@@ -37,7 +36,7 @@
3736
Partition,
3837
Rank,
3938
)
40-
from torchbiggraph.util import create_pool, get_async_result
39+
from torchbiggraph.util import CouldNotLoadData, create_pool, get_async_result
4140

4241

4342
logger = logging.getLogger("torchbiggraph")

torchbiggraph/checkpoint_storage.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,12 @@
2424
ModuleStateDict,
2525
Partition,
2626
)
27+
from torchbiggraph.util import CouldNotLoadData
2728

2829

2930
logger = logging.getLogger("torchbiggraph")
3031

3132

32-
class CouldNotLoadData(Exception):
33-
pass
34-
35-
3633
class ModelParameter(NamedTuple):
3734
# This is the "internal" name, the one of the model's state dict, which is
3835
# considered an implementation detail. Thus the parameters are stored under

torchbiggraph/converters/dictionary.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# This source code is licensed under the BSD-style license found in the
77
# LICENSE.txt file in the root directory of this source tree.
88

9+
import math
910
from typing import Dict, List, Tuple
1011

1112

@@ -27,18 +28,27 @@ def get_id(self, word: str) -> int:
2728
def size(self) -> int:
2829
return len(self.ix_to_word)
2930

31+
def get_list(self) -> List[str]:
32+
return self.ix_to_word
33+
34+
def part_start(self, part: int) -> int:
35+
return math.ceil(part / self.num_parts * self.size())
36+
37+
def part_end(self, part: int) -> int:
38+
return self.part_start(part + 1)
39+
3040
def part_size(self, part: int) -> int:
3141
if not 0 <= part < self.num_parts:
32-
raise ValueError("%d not in [0, %d)" % (part, self.num_parts))
33-
part_begin = (part * self.size() - 1) // self.num_parts + 1
34-
part_end = ((part + 1) * self.size() - 1) // self.num_parts
35-
return part_end - part_begin + 1
42+
raise ValueError(f"{part} not in [0, {self.num_parts})")
43+
return self.part_end(part) - self.part_start(part)
3644

3745
def get_partition(self, word: str) -> Tuple[int, int]:
3846
idx = self.get_id(word)
39-
part = idx * self.num_parts // self.size()
40-
part_begin = (part * self.size() - 1) // self.num_parts + 1
41-
return part, idx - part_begin
47+
part = math.floor(idx / self.size() * self.num_parts)
48+
assert self.part_start(part) <= idx < self.part_end(part)
49+
return part, idx - self.part_start(part)
4250

43-
def get_list(self) -> List[str]:
44-
return self.ix_to_word
51+
def get_part_list(self, part: int) -> List[str]:
52+
if not 0 <= part < self.num_parts:
53+
raise ValueError(f"{part} not in [0, {self.num_parts})")
54+
return self.ix_to_word[self.part_start(part):self.part_end(part)]

torchbiggraph/converters/export_to_tsv.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,17 @@
77
# LICENSE.txt file in the root directory of this source tree.
88

99
import argparse
10-
import json
1110
from itertools import chain
12-
from typing import Dict, Iterable, List, TextIO
11+
from typing import Iterable, TextIO
1312

1413
from torchbiggraph.checkpoint_manager import CheckpointManager
1514
from torchbiggraph.config import ConfigFileLoader, ConfigSchema
15+
from torchbiggraph.graph_storages import (
16+
AbstractEntityStorage,
17+
AbstractRelationTypeStorage,
18+
ENTITY_STORAGES,
19+
RELATION_TYPE_STORAGES,
20+
)
1621
from torchbiggraph.model import MultiRelationEmbedder, make_model
1722

1823

@@ -23,11 +28,13 @@ def write(outf: TextIO, key: Iterable[str], value: Iterable[float]) -> None:
2328
def make_tsv(
2429
config: ConfigSchema,
2530
checkpoint: str,
26-
entities_by_type: Dict[str, List[str]],
27-
relation_types: List[str],
2831
entities_tf: TextIO,
2932
relation_types_tf: TextIO,
3033
) -> None:
34+
print("Loading relation types and entities...")
35+
entity_storage = ENTITY_STORAGES.make_instance(config.entity_path)
36+
relation_type_storage = RELATION_TYPE_STORAGES.make_instance(config.entity_path)
37+
3138
print("Initializing model...")
3239
model = make_model(config)
3340

@@ -40,29 +47,28 @@ def make_tsv(
4047
make_tsv_for_entities(
4148
model,
4249
checkpoint_manager,
43-
entities_by_type,
50+
entity_storage,
4451
entities_tf,
4552
)
4653
make_tsv_for_relation_types(
4754
model,
48-
relation_types,
55+
relation_type_storage,
4956
relation_types_tf,
5057
)
5158

5259

5360
def make_tsv_for_entities(
5461
model: MultiRelationEmbedder,
5562
checkpoint_manager: CheckpointManager,
56-
entities_by_type: Dict[str, List[str]],
63+
entity_storage: AbstractEntityStorage,
5764
entities_tf: TextIO,
5865
) -> None:
5966
print("Writing entity embeddings...")
6067
for ent_t_name, ent_t_config in model.entities.items():
61-
entities = entities_by_type[ent_t_name]
62-
partition_offset = 0
6368
for partition in range(ent_t_config.num_partitions):
6469
print(f"Reading embeddings for entity type {ent_t_name} partition "
6570
f"{partition} from checkpoint...")
71+
entities = entity_storage.load_names(ent_t_name, partition)
6672
embeddings, _ = checkpoint_manager.read(ent_t_name, partition)
6773

6874
if model.global_embs is not None:
@@ -71,23 +77,22 @@ def make_tsv_for_entities(
7177
print(f"Writing embeddings for entity type {ent_t_name} partition "
7278
f"{partition} to output file...")
7379
for ix in range(len(embeddings)):
74-
write(entities_tf, (entities[partition_offset + ix],), embeddings[ix])
80+
write(entities_tf, (entities[ix],), embeddings[ix])
7581
if (ix + 1) % 5000 == 0:
7682
print(f"- Processed {ix+1}/{len(embeddings)} entities so far...")
7783
print(f"- Processed all {len(embeddings)} entities")
7884

79-
partition_offset += len(embeddings)
80-
8185
entities_output_filename = getattr(entities_tf, "name", "the output file")
8286
print(f"Done exporting entity data to {entities_output_filename}")
8387

8488

8589
def make_tsv_for_relation_types(
8690
model: MultiRelationEmbedder,
87-
relation_types: List[str],
91+
relation_type_storage: AbstractRelationTypeStorage,
8892
relation_types_tf: TextIO,
8993
) -> None:
9094
print("Writing relation type parameters...")
95+
relation_types = relation_type_storage.load_names()
9196
if model.num_dynamic_rels > 0:
9297
rel_t_config, = model.relations
9398
op_name = rel_t_config.operator
@@ -132,7 +137,6 @@ def main():
132137
parser.add_argument('config', help="Path to config file")
133138
parser.add_argument('-p', '--param', action='append', nargs='*')
134139
parser.add_argument('--checkpoint')
135-
parser.add_argument('--dict', required=True)
136140
parser.add_argument('--entities-output', required=True)
137141
parser.add_argument('--relation-types-output', required=True)
138142
opt = parser.parse_args()
@@ -144,17 +148,11 @@ def main():
144148
loader = ConfigFileLoader()
145149
config = loader.load_config(opt.config, overrides)
146150

147-
print("Loading relation types and entities...")
148-
with open(opt.dict, "rt") as tf:
149-
dump = json.load(tf)
150-
151151
with open(opt.entities_output, "xt") as entities_tf, \
152152
open(opt.relation_types_output, "xt") as relation_types_tf:
153153
make_tsv(
154154
config,
155155
opt.checkpoint,
156-
dump["entities"],
157-
dump["relations"],
158156
entities_tf,
159157
relation_types_tf,
160158
)

0 commit comments

Comments (0)