Skip to content
This repository was archived by the owner on Mar 14, 2024. It is now read-only.

Commit 75885ed

Browse files
lw authored and facebook-github-bot committed
Fix things that bothered me while testing
Reviewed By: adamlerer
Differential Revision: D17226664
fbshipit-source-id: ee83bf7f498eb5da21f8d0c6f6b4fdc9cad44d62
1 parent f488504 commit 75885ed

9 files changed

Lines changed: 94 additions & 109 deletions

File tree

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,6 @@ During preprocessing, the entities and relation types had their identifiers conv
139139
```bash
140140
torchbiggraph_export_to_tsv \
141141
torchbiggraph/examples/configs/fb15k_config.py \
142-
--checkpoint model/fb15k \
143142
--entities-output entity_embeddings.tsv \
144143
--relation-types-output relation_types_parameters.tsv
145144
```

torchbiggraph/checkpoint_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class CheckpointManager:
209209
def __init__(
210210
self,
211211
url: str,
212-
rank: Rank = -1,
212+
rank: Rank = 0,
213213
num_machines: int = 1,
214214
background: bool = False,
215215
partition_client: Optional[PartitionClient] = None,

torchbiggraph/checkpoint_storage.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import os
1212
from abc import ABC, abstractmethod
1313
from pathlib import Path
14-
from typing import Any, Callable, Dict, NamedTuple, Optional, Tuple, Type
14+
from typing import Any, Dict, NamedTuple, Optional, Tuple
1515

1616
import h5py
1717
import numpy as np
@@ -411,8 +411,11 @@ def save_config(self, config_json: str) -> None:
411411
tf.write(config_json)
412412

413413
def load_config(self) -> str:
414-
with self.get_config_file().open("rt") as tf:
415-
return tf.read()
414+
try:
415+
with self.get_config_file().open("rt") as tf:
416+
return tf.read()
417+
except FileNotFoundError as err:
418+
raise CouldNotLoadData() from err
416419

417420
def prepare_snapshot(self, version: int, epoch_idx: int) -> None:
418421
self.get_snapshot_path(epoch_idx).mkdir(parents=True, exist_ok=True)

torchbiggraph/converters/export_to_tsv.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ def write(outf: TextIO, key: Iterable[str], value: Iterable[float]) -> None:
2727

2828
def make_tsv(
2929
config: ConfigSchema,
30-
checkpoint: str,
3130
entities_tf: TextIO,
3231
relation_types_tf: TextIO,
3332
) -> None:
@@ -39,7 +38,7 @@ def make_tsv(
3938
model = make_model(config)
4039

4140
print("Loading model check point...")
42-
checkpoint_manager = CheckpointManager(checkpoint)
41+
checkpoint_manager = CheckpointManager(config.checkpoint_path)
4342
state_dict, _ = checkpoint_manager.read_model()
4443
if state_dict is not None:
4544
model.load_state_dict(state_dict, strict=False)
@@ -136,7 +135,6 @@ def main():
136135
)
137136
parser.add_argument('config', help="Path to config file")
138137
parser.add_argument('-p', '--param', action='append', nargs='*')
139-
parser.add_argument('--checkpoint')
140138
parser.add_argument('--entities-output', required=True)
141139
parser.add_argument('--relation-types-output', required=True)
142140
opt = parser.parse_args()
@@ -152,7 +150,6 @@ def main():
152150
open(opt.relation_types_output, "xt") as relation_types_tf:
153151
make_tsv(
154152
config,
155-
opt.checkpoint,
156153
entities_tf,
157154
relation_types_tf,
158155
)

torchbiggraph/converters/import_from_tsv.py

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,9 @@
77
# LICENSE.txt file in the root directory of this source tree.
88

99
import argparse
10-
import os
11-
import os.path
1210
import random
1311
from itertools import chain
12+
from pathlib import Path
1413
from typing import Any, Counter, DefaultDict, Dict, List, Optional, Tuple
1514

1615
import torch
@@ -23,6 +22,7 @@
2322
override_config_dict,
2423
)
2524
from torchbiggraph.converters.dictionary import Dictionary
25+
from torchbiggraph.converters.utils import convert_path
2626
from torchbiggraph.edgelist import EdgeList
2727
from torchbiggraph.entitylist import EntityList
2828
from torchbiggraph.graph_storages import (
@@ -37,7 +37,7 @@
3737

3838
def collect_relation_types(
3939
relation_configs: List[RelationSchema],
40-
edge_paths: List[str],
40+
edge_paths: List[Path],
4141
dynamic_relations: bool,
4242
rel_col: Optional[int],
4343
relation_type_min_count: int,
@@ -49,30 +49,29 @@ def collect_relation_types(
4949
print("Looking up relation types in the edge files...")
5050
counter: Counter[str] = Counter()
5151
for edgepath in edge_paths:
52-
with open(edgepath, "rt") as tf:
52+
with edgepath.open("rt") as tf:
5353
for line_num, line in enumerate(tf, start=1):
5454
words = line.split()
5555
try:
5656
rel_word = words[rel_col]
5757
except IndexError:
5858
raise RuntimeError(
59-
"Line %d of %s has only %d words"
60-
% (line_num, edgepath, len(words))) from None
59+
f"Line {line_num} of {edgepath} has only {len(words)} words"
60+
) from None
6161
counter[rel_word] += 1
62-
print("- Found %d relation types" % len(counter))
62+
print(f"- Found {len(counter)} relation types")
6363
if relation_type_min_count > 0:
64-
print("- Removing the ones with fewer than %d occurrences..."
65-
% relation_type_min_count)
64+
print(f"- Removing the ones with fewer than {relation_type_min_count} occurrences...")
6665
counter = Counter({k: c for k, c in counter.items()
6766
if c >= relation_type_min_count})
68-
print("- Left with %d relation types" % len(counter))
67+
print(f"- Left with {len(counter)} relation types")
6968
print("- Shuffling them...")
7069
names = list(counter.keys())
7170
random.shuffle(names)
7271

7372
else:
7473
names = [rconfig.name for rconfig in relation_configs]
75-
print("Using the %d relation types given in the config" % len(names))
74+
print(f"Using the {len(names)} relation types given in the config")
7675

7776
return Dictionary(names)
7877

@@ -81,7 +80,7 @@ def collect_entities_by_type(
8180
relation_types: Dictionary,
8281
entity_configs: Dict[str, EntitySchema],
8382
relation_configs: List[RelationSchema],
84-
edge_paths: List[str],
83+
edge_paths: List[Path],
8584
dynamic_relations: bool,
8685
lhs_col: int,
8786
rhs_col: int,
@@ -95,7 +94,7 @@ def collect_entities_by_type(
9594

9695
print("Searching for the entities in the edge files...")
9796
for edgepath in edge_paths:
98-
with open(edgepath, "rt") as tf:
97+
with edgepath.open("rt") as tf:
9998
for line_num, line in enumerate(tf, start=1):
10099
words = line.split()
101100
try:
@@ -120,14 +119,13 @@ def collect_entities_by_type(
120119

121120
entities_by_type: Dict[str, Dictionary] = {}
122121
for entity_name, counter in counters.items():
123-
print("Entity type %s:" % entity_name)
124-
print("- Found %d entities" % len(counter))
122+
print(f"Entity type {entity_name}:")
123+
print(f"- Found {len(counter)} entities")
125124
if entity_min_count > 0:
126-
print("- Removing the ones with fewer than %d occurrences..."
127-
% entity_min_count)
125+
print(f"- Removing the ones with fewer than {entity_min_count} occurrences...")
128126
counter = Counter({k: c for k, c in counter.items()
129127
if c >= entity_min_count})
130-
print("- Left with %d entities" % len(counter))
128+
print(f"- Left with {len(counter)} entities")
131129
print("- Shuffling them...")
132130
names = list(counter.keys())
133131
random.shuffle(names)
@@ -162,8 +160,8 @@ def generate_entity_path_files(
162160

163161

164162
def generate_edge_path_files(
165-
edge_file_in: str,
166-
edge_path_out: str,
163+
edge_file_in: Path,
164+
edge_path_out: Path,
167165
edge_storage: AbstractEdgeStorage,
168166
entities_by_type: Dict[str, Dictionary],
169167
relation_types: Dictionary,
@@ -189,7 +187,7 @@ def generate_edge_path_files(
189187
processed = 0
190188
skipped = 0
191189

192-
with open(edge_file_in, "rt") as tf:
190+
with edge_file_in.open("rt") as tf:
193191
for line_num, line in enumerate(tf, start=1):
194192
words = line.split()
195193
try:
@@ -256,7 +254,7 @@ def convert_input_data(
256254
entity_configs: Dict[str, EntitySchema],
257255
relation_configs: List[RelationSchema],
258256
entity_path: str,
259-
edge_paths: List[str],
257+
edge_paths: List[Path],
260258
lhs_col: int,
261259
rhs_col: int,
262260
rel_col: Optional[int] = None,
@@ -266,8 +264,8 @@ def convert_input_data(
266264
) -> None:
267265
entity_storage = ENTITY_STORAGES.make_instance(entity_path)
268266
relation_type_storage = RELATION_TYPE_STORAGES.make_instance(entity_path)
269-
edge_paths_out = [os.path.splitext(ep)[0] + "_partitioned" for ep in edge_paths]
270-
edge_storages = [EDGE_STORAGES.make_instance(ep) for ep in edge_paths_out]
267+
edge_paths_out = [convert_path(ep) for ep in edge_paths]
268+
edge_storages = [EDGE_STORAGES.make_instance(str(ep)) for ep in edge_paths_out]
271269

272270
some_files_exists = []
273271
some_files_exists.extend(
@@ -287,7 +285,8 @@ def convert_input_data(
287285
if all(some_files_exists):
288286
print("Found some files that indicate that the input data "
289287
"has already been preprocessed, not doing it again.")
290-
print(f"These files are in {entity_path} and {edge_paths}")
288+
all_paths = ", ".join(str(p) for p in [entity_path] + edge_paths_out)
289+
print(f"These files are in: {all_paths}")
291290
return
292291

293292
relation_types = collect_relation_types(
@@ -371,7 +370,7 @@ def main():
371370
)
372371
parser.add_argument('config', help='Path to config file')
373372
parser.add_argument('-p', '--param', action='append', nargs='*')
374-
parser.add_argument('edge_paths', nargs='*', help='Input file paths')
373+
parser.add_argument('edge_paths', type=Path, nargs='*', help='Input file paths')
375374
parser.add_argument('-l', '--lhs-col', type=int, required=True,
376375
help='Column index for source entity')
377376
parser.add_argument('-r', '--rhs-col', type=int, required=True,

torchbiggraph/converters/utils.py

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,40 +7,44 @@
77
# LICENSE.txt file in the root directory of this source tree.
88

99
import gzip
10-
import os
1110
import shutil
1211
import tarfile
13-
import urllib.request
12+
from pathlib import Path
1413
from typing import Callable, Optional
14+
from urllib.parse import urlparse
15+
from urllib.request import urlretrieve
1516

1617
from tqdm import tqdm
1718

1819

19-
def extract_gzip(gzip_path: str, remove_finished: bool = False) -> str:
20-
print('Extracting %s' % gzip_path)
21-
fpath, ext = os.path.splitext(gzip_path)
22-
if ext != ".gz":
20+
def convert_path(fname: Path) -> Path:
21+
return fname.parent / f"{fname.stem}_partitioned"
22+
23+
24+
def extract_gzip(gzip_path: Path, remove_finished: bool = False) -> str:
25+
print(f"Extracting {gzip_path}")
26+
if gzip_path.suffix != ".gz":
2327
raise RuntimeError("Not a gzipped file")
28+
fpath = gzip_path.with_suffix("")
2429

25-
if os.path.exists(fpath):
30+
if fpath.exists():
2631
print("Found a file that indicates that the input data "
2732
"has already been extracted, not doing it again.")
28-
print("This file is: %s" % fpath)
33+
print(f"This file is: {fpath}")
2934
return fpath
3035

31-
with open(fpath, "wb") as out_bf, gzip.GzipFile(gzip_path) as zip_f:
36+
with fpath.open("wb") as out_bf, gzip.GzipFile(gzip_path) as zip_f:
3237
shutil.copyfileobj(zip_f, out_bf)
3338
if remove_finished:
34-
os.unlink(gzip_path)
39+
gzip_path.unlink()
3540

3641
return fpath
3742

3843

39-
def extract_tar(fpath: str) -> None:
44+
def extract_tar(fpath: Path) -> None:
4045
# extract file
41-
root = os.path.dirname(fpath)
4246
with tarfile.open(fpath, "r:gz") as tar:
43-
tar.extractall(path=root)
47+
tar.extractall(path=fpath.parent)
4448

4549

4650
def gen_bar_updater(pbar: tqdm) -> Callable[[int, int, int], None]:
@@ -53,7 +57,7 @@ def bar_update(count: int, block_size: int, total_size: int) -> None:
5357
return bar_update
5458

5559

56-
def download_url(url: str, root: str, filename: Optional[str] = None) -> str:
60+
def download_url(url: str, root: Path, filename: Optional[str] = None) -> str:
5761
"""Download a file from a url and place it in root.
5862
Args:
5963
url (str): URL to download file from
@@ -62,24 +66,24 @@ def download_url(url: str, root: str, filename: Optional[str] = None) -> str:
6266
If None, use the basename of the URL
6367
"""
6468

65-
root = os.path.expanduser(root)
66-
if not filename:
67-
filename = os.path.basename(url)
68-
fpath = os.path.join(root, filename)
69-
if not os.path.exists(root):
70-
os.makedirs(root)
69+
root = root.expanduser()
70+
if filename is None:
71+
filename = Path(urlparse(url).path).name
72+
fpath = root / filename
73+
if not root.exists():
74+
root.mkdir(parents=True, exist_ok=True)
7175

7276
# downloads file
73-
if os.path.isfile(fpath):
74-
print('Using downloaded and verified file: ' + fpath)
77+
if fpath.is_file():
78+
print(f"Using downloaded and verified file: {fpath}")
7579
else:
7680
try:
77-
print('Downloading ' + url + ' to ' + fpath)
78-
urllib.request.urlretrieve(
79-
url, fpath,
81+
print(f"Downloading {url} to {fpath}")
82+
urlretrieve(
83+
url, str(fpath),
8084
reporthook=gen_bar_updater(tqdm(unit='B', unit_scale=True))
8185
)
8286
except OSError:
83-
print('Failed to download from url: ' + url)
87+
print(f"Failed to download from url: {url}")
8488

8589
return fpath

0 commit comments

Comments (0)