Skip to content
This repository was archived by the owner on Mar 14, 2024. It is now read-only.

Commit 50c9038

Browse files
lwfacebook-github-bot
authored and committed
Fix to actually be able to use non-file edge paths in import_from_tsv
Summary: In `import_from_tsv` we constructed the output edge paths by converting the paths passed as command-line args (stripping the extension and appending `_partitioned`). Thus, the output edge paths are always file-based. In order to be able to use different schemas we need to be able to separately specify the output edge paths: it makes perfect sense to use the config file for this (we are already using it for the entity path, and may one day use it for the initial data). The input edge paths (from the command line) and the output ones (from the config) are then matched (i.e., zipped). Thus order now matters. Therefore in the README we explicitly list them, rather than using a glob wildcard. Reviewed By: adamlerer Differential Revision: D17502000 fbshipit-source-id: 9cd2e534ab70c600a35e28c4dae57a6015c2bc1e
1 parent f6b309c commit 50c9038

7 files changed

Lines changed: 70 additions & 59 deletions

File tree

README.md

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,9 @@ Luckily, there is a command that does all of this:
7676
torchbiggraph_import_from_tsv \
7777
--lhs-col=0 --rel-col=1 --rhs-col=2 \
7878
torchbiggraph/examples/configs/fb15k_config.py \
79-
data/FB15k/freebase_mtr100_mte100-*.txt
79+
data/FB15k/freebase_mtr100_mte100-train.txt \
80+
data/FB15k/freebase_mtr100_mte100-valid.txt \
81+
data/FB15k/freebase_mtr100_mte100-test.txt
8082
```
8183
The outputs will be stored next to the inputs in the `data/FB15k` directory.
8284

torchbiggraph/converters/import_from_tsv.py

Lines changed: 25 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
override_config_dict,
2323
)
2424
from torchbiggraph.converters.dictionary import Dictionary
25-
from torchbiggraph.converters.utils import convert_path
2625
from torchbiggraph.edgelist import EdgeList
2726
from torchbiggraph.entitylist import EntityList
2827
from torchbiggraph.graph_storages import (
@@ -254,18 +253,23 @@ def convert_input_data(
254253
entity_configs: Dict[str, EntitySchema],
255254
relation_configs: List[RelationSchema],
256255
entity_path: str,
257-
edge_paths: List[Path],
256+
edge_paths_out: List[str],
257+
edge_paths_in: List[Path],
258258
lhs_col: int,
259259
rhs_col: int,
260260
rel_col: Optional[int] = None,
261261
entity_min_count: int = 1,
262262
relation_type_min_count: int = 1,
263263
dynamic_relations: bool = False,
264264
) -> None:
265+
if len(edge_paths_in) != len(edge_paths_out):
266+
raise ValueError(
267+
f"The edge paths passed as inputs ({edge_paths_in}) don't match "
268+
f"the ones specified as outputs ({edge_paths_out})")
269+
265270
entity_storage = ENTITY_STORAGES.make_instance(entity_path)
266271
relation_type_storage = RELATION_TYPE_STORAGES.make_instance(entity_path)
267-
edge_paths_out = [convert_path(ep) for ep in edge_paths]
268-
edge_storages = [EDGE_STORAGES.make_instance(str(ep)) for ep in edge_paths_out]
272+
edge_storages = [EDGE_STORAGES.make_instance(ep) for ep in edge_paths_out]
269273

270274
some_files_exists = []
271275
some_files_exists.extend(
@@ -291,7 +295,7 @@ def convert_input_data(
291295

292296
relation_types = collect_relation_types(
293297
relation_configs,
294-
edge_paths,
298+
edge_paths_in,
295299
dynamic_relations,
296300
rel_col,
297301
relation_type_min_count,
@@ -301,7 +305,7 @@ def convert_input_data(
301305
relation_types,
302306
entity_configs,
303307
relation_configs,
304-
edge_paths,
308+
edge_paths_in,
305309
dynamic_relations,
306310
lhs_col,
307311
rhs_col,
@@ -317,10 +321,10 @@ def convert_input_data(
317321
dynamic_relations,
318322
)
319323

320-
for edge_path, edge_path_out, edge_storage \
321-
in zip(edge_paths, edge_paths_out, edge_storages):
324+
for edge_path_in, edge_path_out, edge_storage \
325+
in zip(edge_paths_in, edge_paths_out, edge_storages):
322326
generate_edge_path_files(
323-
edge_path,
327+
edge_path_in,
324328
edge_path_out,
325329
edge_storage,
326330
entities_by_type,
@@ -339,6 +343,7 @@ def parse_config_partial(
339343
entities_config = config_dict.get("entities")
340344
relations_config = config_dict.get("relations")
341345
entity_path = config_dict.get("entity_path")
346+
edge_paths = config_dict.get("edge_paths")
342347
dynamic_relations = config_dict.get("dynamic_relations", False)
343348
if not isinstance(entities_config, dict):
344349
raise TypeError("Config entities is not of type dict")
@@ -348,6 +353,10 @@ def parse_config_partial(
348353
raise TypeError("Config relations is not of type list")
349354
if not isinstance(entity_path, str):
350355
raise TypeError("Config entity_path is not of type str")
356+
if not isinstance(edge_paths, list):
357+
raise TypeError("Config edge_paths is not of type list")
358+
if any(not isinstance(p, str) for p in edge_paths):
359+
raise TypeError("Config edge_paths has some items that are not of type str")
351360
if not isinstance(dynamic_relations, bool):
352361
raise TypeError("Config dynamic_relations is not of type bool")
353362

@@ -358,7 +367,7 @@ def parse_config_partial(
358367
for relation in relations_config:
359368
relations.append(RelationSchema.from_dict(relation))
360369

361-
return entities, relations, entity_path, dynamic_relations
370+
return entities, relations, entity_path, edge_paths, dynamic_relations
362371

363372

364373
def main():
@@ -390,13 +399,18 @@ def main():
390399
overrides = chain.from_iterable(opt.param) # flatten
391400
config_dict = override_config_dict(config_dict, overrides)
392401

393-
entity_configs, relation_configs, entity_path, dynamic_relations = \
402+
entity_configs, relation_configs, entity_path, edge_paths, dynamic_relations = \
394403
parse_config_partial(config_dict)
395404

405+
if len(opt.edge_paths) != len(edge_paths):
406+
print(f"The edge paths provided on the command line ({opt.edge_paths}) "
407+
f"don't match the ones found in the config file ({edge_paths})")
408+
396409
convert_input_data(
397410
entity_configs,
398411
relation_configs,
399412
entity_path,
413+
edge_paths,
400414
opt.edge_paths,
401415
opt.lhs_col,
402416
opt.rhs_col,

torchbiggraph/converters/utils.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,10 +17,6 @@
1717
from tqdm import tqdm
1818

1919

20-
def convert_path(fname: Path) -> Path:
21-
return fname.parent / f"{fname.stem}_partitioned"
22-
23-
2420
def extract_gzip(gzip_path: Path, remove_finished: bool = False) -> str:
2521
print(f"Extracting {gzip_path}")
2622
if gzip_path.suffix != ".gz":

torchbiggraph/examples/configs/fb15k_config.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,16 +6,18 @@
66
# This source code is licensed under the BSD-style license found in the
77
# LICENSE.txt file in the root directory of this source tree.
88

9-
entity_base = "data/FB15k"
10-
119

1210
def get_torchbiggraph_config():
1311

1412
config = dict(
1513
# I/O data
16-
entity_path=entity_base,
17-
edge_paths=[],
18-
checkpoint_path='model/fb15k',
14+
entity_path="data/FB15k",
15+
edge_paths=[
16+
"data/FB15k/freebase_mtr100_mte100-train_partitioned",
17+
"data/FB15k/freebase_mtr100_mte100-valid_partitioned",
18+
"data/FB15k/freebase_mtr100_mte100-test_partitioned",
19+
],
20+
checkpoint_path="model/fb15k",
1921

2022
# Graph structure
2123
entities={

torchbiggraph/examples/configs/livejournal_config.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,16 +6,17 @@
66
# This source code is licensed under the BSD-style license found in the
77
# LICENSE.txt file in the root directory of this source tree.
88

9-
entities_base = 'data/livejournal'
10-
119

1210
def get_torchbiggraph_config():
1311

1412
config = dict(
1513
# I/O data
16-
entity_path=entities_base,
17-
edge_paths=[],
18-
checkpoint_path='model/livejournal',
14+
entity_path="data/livejournal",
15+
edge_paths=[
16+
"data/train_partitioned",
17+
"data/test_partitioned",
18+
],
19+
checkpoint_path="model/livejournal",
1920

2021
# Graph structure
2122
entities={

torchbiggraph/examples/fb15k.py

Lines changed: 14 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
import attr
1414
import pkg_resources
1515

16-
from torchbiggraph.converters.utils import convert_path, download_url, extract_tar
16+
from torchbiggraph.converters.utils import download_url, extract_tar
1717
from torchbiggraph.config import add_to_sys_path, ConfigFileLoader
1818
from torchbiggraph.converters.import_from_tsv import convert_input_data
1919
from torchbiggraph.eval import do_eval
@@ -27,11 +27,11 @@
2727

2828

2929
FB15K_URL = 'https://dl.fbaipublicfiles.com/starspace/fb15k.tgz'
30-
FILENAMES = {
31-
'train': 'FB15k/freebase_mtr100_mte100-train.txt',
32-
'valid': 'FB15k/freebase_mtr100_mte100-valid.txt',
33-
'test': 'FB15k/freebase_mtr100_mte100-test.txt',
34-
}
30+
FILENAMES = [
31+
"FB15k/freebase_mtr100_mte100-train.txt",
32+
"FB15k/freebase_mtr100_mte100-valid.txt",
33+
"FB15k/freebase_mtr100_mte100-test.txt",
34+
]
3535

3636
# Figure out the path where the sample config was installed by the package manager.
3737
# This can be overridden with --config.
@@ -68,33 +68,29 @@ def main():
6868
subprocess_init = SubprocessInitializer()
6969
subprocess_init.register(setup_logging, config.verbose)
7070
subprocess_init.register(add_to_sys_path, loader.config_dir.name)
71-
edge_paths = [data_dir / name for name in FILENAMES.values()]
71+
input_edge_paths = [data_dir / name for name in FILENAMES]
72+
output_train_path, output_valid_path, output_test_path = config.edge_paths
7273

7374
convert_input_data(
7475
config.entities,
7576
config.relations,
7677
config.entity_path,
77-
edge_paths,
78+
config.edge_paths,
79+
input_edge_paths,
7880
lhs_col=0,
7981
rhs_col=2,
8082
rel_col=1,
8183
dynamic_relations=config.dynamic_relations,
8284
)
8385

84-
train_path = [str(convert_path(data_dir / FILENAMES['train']))]
85-
train_config = attr.evolve(config, edge_paths=train_path)
86-
86+
train_config = attr.evolve(config, edge_paths=[output_train_path])
8787
train(train_config, subprocess_init=subprocess_init)
8888

89-
eval_path = [str(convert_path(data_dir / FILENAMES['test']))]
9089
relations = [attr.evolve(r, all_negs=True) for r in config.relations]
91-
eval_config = attr.evolve(config, edge_paths=eval_path, relations=relations, num_uniform_negs=0)
90+
eval_config = attr.evolve(
91+
config, edge_paths=[output_test_path], relations=relations, num_uniform_negs=0)
9292
if args.filtered:
93-
filter_paths = [
94-
str(convert_path(data_dir / FILENAMES['test'])),
95-
str(convert_path(data_dir / FILENAMES['valid'])),
96-
str(convert_path(data_dir / FILENAMES['train'])),
97-
]
93+
filter_paths = [output_test_path, output_valid_path, output_train_path]
9894
do_eval(
9995
eval_config,
10096
evaluator=FilteredRankingEvaluator(eval_config, filter_paths),

torchbiggraph/examples/livejournal.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616

1717
from torchbiggraph.config import add_to_sys_path, ConfigFileLoader
1818
from torchbiggraph.converters.import_from_tsv import convert_input_data
19-
from torchbiggraph.converters.utils import convert_path, download_url, extract_gzip
19+
from torchbiggraph.converters.utils import download_url, extract_gzip
2020
from torchbiggraph.eval import do_eval
2121
from torchbiggraph.train import train
2222
from torchbiggraph.util import (
@@ -27,10 +27,12 @@
2727

2828

2929
URL = 'https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz'
30-
FILENAMES = {
31-
'train': 'train.txt',
32-
'test': 'test.txt',
33-
}
30+
TRAIN_FILENAME = "train.txt"
31+
TEST_FILENAME = "test.txt"
32+
FILENAMES = [
33+
TRAIN_FILENAME,
34+
TEST_FILENAME,
35+
]
3436
TRAIN_FRACTION = 0.75
3537

3638
# Figure out the path where the sample config was installed by the package manager.
@@ -40,8 +42,8 @@
4042

4143

4244
def random_split_file(fpath: Path) -> None:
43-
train_file = fpath.parent / FILENAMES['train']
44-
test_file = fpath.parent / FILENAMES['test']
45+
train_file = fpath.parent / TRAIN_FILENAME
46+
test_file = fpath.parent / TEST_FILENAME
4547

4648
if train_file.exists() and test_file.exists():
4749
print("Found some files that indicate that the input data "
@@ -103,27 +105,25 @@ def main():
103105
subprocess_init = SubprocessInitializer()
104106
subprocess_init.register(setup_logging, config.verbose)
105107
subprocess_init.register(add_to_sys_path, loader.config_dir.name)
106-
edge_paths = [data_dir / name for name in FILENAMES.values()]
108+
input_edge_paths = [data_dir / name for name in FILENAMES]
109+
output_train_path, output_test_path = config.edge_paths
107110

108111
convert_input_data(
109112
config.entities,
110113
config.relations,
111114
config.entity_path,
112-
edge_paths,
115+
config.edge_paths,
116+
input_edge_paths,
113117
lhs_col=0,
114118
rhs_col=1,
115119
rel_col=None,
116120
dynamic_relations=config.dynamic_relations,
117121
)
118122

119-
train_path = [str(convert_path(data_dir / FILENAMES['train']))]
120-
train_config = attr.evolve(config, edge_paths=train_path)
121-
123+
train_config = attr.evolve(config, edge_paths=[output_train_path])
122124
train(train_config, subprocess_init=subprocess_init)
123125

124-
eval_path = [str(convert_path(data_dir / FILENAMES['test']))]
125-
eval_config = attr.evolve(config, edge_paths=eval_path)
126-
126+
eval_config = attr.evolve(config, edge_paths=[output_test_path])
127127
do_eval(eval_config, subprocess_init=subprocess_init)
128128

129129

0 commit comments

Comments (0)