Skip to content

Commit 9dfe704

Browse files
DanielYang59 and shyuep authored
Use new JSON for ptable with better value/unit handling (also slightly faster) (#4376)
* remove known differences * directly overwrite JSON in core * add todo * write unit to JSON * drop about or other comment from data * csv parser stop fill in no data * drop processing of electronic structure str * avoid hard-coded radii units * separate NMR quadrupole moment * move ground level inside * reduce indentation level * migrate data source to dev_script sources * remove specicial handling of refractive index * move unit for nmr_quadrupole_moment * add note * avoid hard-coded atomic mass unit * remove more comments from value * fix isotope handling in electronic affinity parser * remove almost all postprocessing code in getattr * use sci notation * convert superscript in unit to ^ notation * implement factor support * remove TODO tag * also write factor in yaml * use None over "no data" when data unavailable * filter fields without value * fix typo for specie * explicitly check none instead of falsiness * remove comment * migrate electronic structure reference to source * remove no data in test * save compare script and results but haven't been verified * remove data from property, and avoid deepdiff * update known diff * update known diff * also compare type * remove compare script * save and load JSON compressed * include gz in package * apply factor directly to YAML to be consistent with JSON * round when applying factor * fix type --------- Co-authored-by: Shyue Ping Ong <shyuep@users.noreply.github.com>
1 parent 160d699 commit 9dfe704

13 files changed

Lines changed: 1040 additions & 34020 deletions

dev_scripts/generate_periodic_table_yaml_json.py

Lines changed: 83 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
#!/usr/bin/env python3
22

33
"""Create `core.periodic_table.json` from source files, and as such
4-
you should NOT modify the JSON directly, but work on the data source
5-
and then run this script to regenerate the JSON/YAML.
4+
you should NOT modify the final JSON directly, but work on the data
5+
source and then run this script to generate the JSON/YAML.
66
77
Each source file may be parsed using a common or custom parser. In cases where
8-
a custom parser is required, it should return either a single `Property` or
8+
a custom parser is required, it should return a single `Property` or
99
a sequence of `Property`.
1010
11-
The YAML file is a readable aggregation of all properties in the following structure:
11+
The YAML file is a readable aggregation of all properties in the structure of:
1212
Property name -> {
1313
unit: <unit string or null>,
1414
reference: <reference string or null>,
@@ -17,26 +17,23 @@
1717
}
1818
}
1919
20-
The JSON file is a compact, production-format structure (no metadata):
20+
The JSON file is a compact, production-format structure without metadata:
2121
<element symbol> -> {
2222
<property name>: <value unit>
2323
}
2424
2525
Units are stored separately in a special top-level key:
26-
"_unit" -> {
27-
<property name>: <unit string>
28-
}
29-
30-
TODO: this script is in a transitional state, it would now:
31-
- Generate a duplicate JSON instead of overwriting the one in `core` dir
32-
- Append unit as string instead of as a separate field
33-
- Unit should be converted to pymatgen Unit
26+
"_unit" -> {
27+
<property name>: <unit string>
28+
}
3429
"""
3530

3631
from __future__ import annotations
3732

3833
import csv
34+
import gzip
3935
import json
36+
import math
4037
import os
4138
import warnings
4239
from collections import Counter, defaultdict
@@ -50,7 +47,7 @@
5047
import requests
5148
from ruamel.yaml import YAML
5249

53-
from pymatgen.core import Element
50+
from pymatgen.core import PKG_DIR, Element
5451

5552
if TYPE_CHECKING:
5653
from collections.abc import Callable, Iterable
@@ -59,38 +56,36 @@
5956
from pymatgen.util.typing import PathLike
6057

6158

62-
DEFAULT_VALUE: str = "no data" # The default value if not provided
63-
6459
RESOURCES_DIR: str = f"{Path(__file__).parent}/periodic_table_resources"
6560

61+
# The number of significant digits after applying `factor`
62+
SIGNIFICANT_DIGITS: int = 4
63+
6664

6765
@dataclass
6866
class ElemPropertyValue:
69-
value: Any = DEFAULT_VALUE
70-
reference: str | None = None # Parser not implemented
67+
value: Any = None
68+
reference: str | None = None # per-value ref parser not implemented
7169

7270

7371
@dataclass
7472
class Property:
7573
name: str
7674
data: dict[Element, ElemPropertyValue]
75+
factor: float | None = None
7776
unit: str | None = None
7877
reference: str | None = None
7978

80-
# TODO: don't do unit conversion for now
81-
# def __post_init__(self):
82-
# if self.unit is not None and isinstance(self.unit, str):
83-
# self.unit = Unit(self.unit)
84-
8579

8680
def parse_yaml(file: PathLike) -> list[Property]:
8781
"""Parse a YAML file.
8882
8983
Expected YAML format:
9084
We expect each YAML file to contain one or more properties.
9185
Each property should follow this structure:
92-
- `unit` (optional): The unit of measurement for the values.
86+
- `unit` (str, optional): The unit of measurement for the values.
9387
- `data`: Dict mapping each element symbol (e.g., "Fe") to its corresponding value.
88+
- `factor` (float, optional): A multiplier applied to each value in `data`.
9489
9590
Args:
9691
working_dir (PathLike): directory containing all YAMLs.
@@ -119,6 +114,7 @@ def parse_yaml(file: PathLike) -> list[Property]:
119114
name=prop_name,
120115
unit=prop_info.get("unit"),
121116
reference=prop_info.get("reference"),
117+
factor=prop_info.get("factor"),
122118
data=data,
123119
)
124120
)
@@ -172,16 +168,16 @@ def parse_csv(
172168
data: dict[Element, ElemPropertyValue] = {}
173169

174170
for symbol, value in data_df[prop].items():
175-
if pd.isna(value):
176-
value = DEFAULT_VALUE
177-
elif transform is not None:
178-
try:
179-
value = transform(value)
180-
except (ValueError, TypeError):
181-
warnings.warn(f"Cannot transform {value=}, keep as string", stacklevel=2)
182-
value = str(value)
171+
# NaN would be skipped instead of writing "no data"
172+
if not pd.isna(value):
173+
if transform is not None:
174+
try:
175+
value = transform(value)
176+
except (ValueError, TypeError):
177+
warnings.warn(f"Cannot transform {value=}, keep as string", stacklevel=2)
178+
value = str(value)
183179

184-
data[Element(symbol)] = ElemPropertyValue(value=value)
180+
data[Element(symbol)] = ElemPropertyValue(value=value)
185181

186182
result.append(Property(name=prop, unit=unit, reference=reference, data=data))
187183

@@ -275,31 +271,27 @@ def parse_shannon_radii(
275271
return Property(name="Shannon radii", data=data, unit=unit, reference=reference)
276272

277273

278-
def get_and_parse_electronic_affinities(prop_name: str = "Electron affinity") -> Property:
279-
"""Get electronic affinities from Wikipedia and save a local YAML copy.
274+
def get_and_parse_electronic_affinities(prop_name: str = "Electron affinity", unit: str = "eV") -> Property:
275+
"""Get electronic affinities from Wikipedia and save a local YAML copy."""
276+
yaml_path: str = f"{RESOURCES_DIR}/_electron_affinities.yaml"
280277

281-
TODO:
282-
current wikipedia crawler drops data for Deuterium.
283-
"""
284278
# Get data table from Wikipedia
285279
url: str = "https://en.wikipedia.org/wiki/Electron_affinity_(data_page)"
286280
tables = pd.read_html(StringIO(requests.get(url, timeout=5).text))
287281

288282
# Get the "Elements Electron affinity" table (with unit eV)
289-
ea_df = next(
283+
ea_df: pd.DataFrame = next(
290284
table
291285
for table in tables
292286
if f"{prop_name} (kJ/mol)" in table.columns and table["Name"].astype(str).str.contains("Hydrogen").any()
293287
)
294-
ea_df = ea_df.drop(columns=["References", f"{prop_name} (kJ/mol)", "Element"])
295288

296-
# Drop superheavy elements
289+
# Drop superheavy elements (currently Z > 118)
297290
max_z: int = max(Element(element).Z for element in Element.__members__)
298291
ea_df = ea_df[pd.to_numeric(ea_df["Z"], errors="coerce") <= max_z]
299292

300-
# Drop heavy isotopes
301-
# TODO: correctly handle Deuterium
302-
ea_df = ea_df.drop_duplicates(subset="Z", keep="first")
293+
# Drop heavy isotopes (except for Deuterium, which has a distinct "Name")
294+
ea_df = ea_df.drop_duplicates(subset="Name", keep="first")
303295

304296
# Ensure we cover all elements up to Uranium (Z=92)
305297
if not (z_values := set(ea_df["Z"])).issuperset(range(1, 93)):
@@ -324,20 +316,18 @@ def get_and_parse_electronic_affinities(prop_name: str = "Electron affinity") ->
324316
for name, value in zip(ea_df["Name"], ea_df[f"{prop_name} (eV)"], strict=True)
325317
}
326318

327-
data[Element.D] = 0.754674 # TODO: fix this
328-
329319
output_data = {
330320
prop_name: {
331-
"unit": "eV",
321+
"unit": unit,
332322
"reference": url,
333323
"data": {el.name: val for el, val in sorted(data.items(), key=lambda x: x[0].Z)},
334324
}
335325
}
336326

337-
with open(f"{RESOURCES_DIR}/_electron_affinities.yaml", "w", encoding="utf-8") as f:
327+
with open(yaml_path, "w", encoding="utf-8") as f:
338328
yaml.dump(output_data, f)
339329

340-
return parse_yaml(f"{RESOURCES_DIR}/_electron_affinitIES.yaml")[0]
330+
return parse_yaml(yaml_path)[0]
341331

342332

343333
def generate_iupac_ordering() -> Property:
@@ -399,6 +389,14 @@ def generate_yaml_and_json(
399389
Raises:
400390
ValueError: If duplicate property names are found in the input.
401391
"""
392+
393+
def apply_factor_and_round(value: float, factor: float, digits: int) -> float:
394+
"""Apply a factor to a value and round it to the given number of significant digits."""
395+
result = value * factor
396+
if result == 0:
397+
return 0.0
398+
return round(result, digits - math.floor(math.log10(abs(result))) - 1)
399+
402400
# Check for duplicate
403401
counter: Counter = Counter([prop.name for prop in properties])
404402
if duplicates := [name for name, count in counter.items() if count > 1]:
@@ -408,19 +406,29 @@ def generate_yaml_and_json(
408406
properties = sorted(properties, key=lambda prop: prop.name)
409407

410408
# Save a YAML copy for development
411-
yaml_data: dict[str, dict[Literal["unit", "reference", "data"], Any]] = {}
409+
yaml_data: dict[str, dict[Literal["unit", "reference", "data", "factor"], Any]] = {}
412410
for prop in properties:
413411
# Sort elements by atomic number (Z)
414412
sorted_data: dict[str, Any] = dict(
415413
sorted(((elem.name, val.value) for elem, val in prop.data.items()), key=lambda pair: Element(pair[0]).Z)
416414
)
417415

418-
yaml_data[prop.name] = {
416+
# Apply factor to data to be consistent with final JSON
417+
if prop.factor is not None:
418+
sorted_data = {
419+
k: apply_factor_and_round(v, prop.factor, SIGNIFICANT_DIGITS) for k, v in sorted_data.items()
420+
}
421+
422+
prop_dict = {
419423
"unit": str(prop.unit) if prop.unit is not None else None,
420424
"reference": prop.reference,
421425
"data": sorted_data,
426+
# "factor": prop.factor,
422427
}
423428

429+
# Filter out fields with None
430+
yaml_data[prop.name] = {k: v for k, v in prop_dict.items() if v is not None}
431+
424432
yaml = YAML()
425433
yaml.default_flow_style = None
426434
with open(yaml_file, "w", encoding="utf-8") as f:
@@ -432,24 +440,25 @@ def generate_yaml_and_json(
432440
# Output to JSON (element -> property -> value format, and drop metadata)
433441
element_to_props: dict[str, dict[str, Any]] = defaultdict(dict)
434442

435-
# # Insert units under a special `_unit` key
436-
# element_to_props["_unit"] = {}
443+
# Insert units under a special `_unit` key
444+
element_to_props["_unit"] = {}
437445

438446
for prop in properties:
439-
# # Store unit for this property if available
440-
# if prop.unit is not None:
441-
# element_to_props["_unit"][prop.name] = str(prop.unit)
447+
# Store unit for this property if available
448+
if prop.unit is not None:
449+
element_to_props["_unit"][prop.name] = str(prop.unit)
442450

451+
# Apply `factor`
443452
for elem, prop_val in prop.data.items():
444-
unit = prop.unit
445-
if unit is None:
446-
element_to_props[elem.name][prop.name] = prop_val.value
453+
if prop.factor is not None: # assume numeric if `factor` is given
454+
element_to_props[elem.name][prop.name] = apply_factor_and_round(
455+
prop_val.value, prop.factor, SIGNIFICANT_DIGITS
456+
)
447457
else:
448-
output = str(prop_val.value) + " " + str(unit)
449-
element_to_props[elem.name][prop.name] = output
458+
element_to_props[elem.name][prop.name] = prop_val.value
450459

451-
with open(json_file, "w", encoding="utf-8") as f:
452-
json.dump(element_to_props, f, indent=4)
460+
with gzip.open(json_file, "wt", encoding="utf-8") as f:
461+
json.dump(element_to_props, f)
453462

454463
print(f"Saved JSON to: {json_file}")
455464

@@ -459,21 +468,26 @@ def main():
459468
properties: tuple[Property, ...] = (
460469
*parse_yaml(f"{RESOURCES_DIR}/elemental_properties.yaml"),
461470
*parse_yaml(f"{RESOURCES_DIR}/oxidation_states.yaml"),
462-
*parse_yaml(f"{RESOURCES_DIR}/ionization_energies_nist.yaml"), # Parsed from HTML
471+
*parse_yaml(f"{RESOURCES_DIR}/nmr_quadrupole_moment.yaml"),
472+
*parse_yaml(f"{RESOURCES_DIR}/ground_level_and_ionization_energies_nist.yaml"), # Parsed from HTML
463473
get_and_parse_electronic_affinities(),
464-
*parse_csv(f"{RESOURCES_DIR}/radii.csv", transform=float, unit=None),
474+
*parse_csv(
475+
f"{RESOURCES_DIR}/radii.csv",
476+
transform=float,
477+
unit="ang",
478+
reference="https://wikipedia.org/wiki/Atomic_radii_of_the_elements_(data_page)",
479+
),
465480
*parse_ionic_radii(
466-
f"{RESOURCES_DIR}/ionic_radii.csv", unit=None, reference="https://en.wikipedia.org/wiki/Ionic_radius"
481+
f"{RESOURCES_DIR}/ionic_radii.csv", unit="ang", reference="https://en.wikipedia.org/wiki/Ionic_radius"
467482
),
468-
parse_shannon_radii(f"{RESOURCES_DIR}/Shannon_Radii.csv", unit=None),
483+
parse_shannon_radii(f"{RESOURCES_DIR}/Shannon_Radii.csv", unit="ang"),
469484
generate_iupac_ordering(),
470485
)
471486

472487
generate_yaml_and_json(
473488
properties,
474489
yaml_file=f"{RESOURCES_DIR}/_periodic_table.yaml",
475-
json_file=f"{RESOURCES_DIR}/_periodic_table.json",
476-
# json_file=f"{PKG_DIR}/core/periodic_table.json",
490+
json_file=f"{PKG_DIR}/core/periodic_table.json.gz",
477491
)
478492

479493

0 commit comments

Comments (0)