11#!/usr/bin/env python3
22
33"""Create `core/periodic_table.json.gz` from source files, and as such
4- you should NOT modify the JSON directly, but work on the data source
5- and then run this script to regenerate the JSON/YAML.
4+ you should NOT modify the final JSON directly, but work on the data
5+ source and then run this script to generate the JSON/YAML.
66
77Each source file may be parsed using a common or custom parser. In cases where
8- a custom parser is required, it should return either a single `Property` or
8+ a custom parser is required, it should return a single `Property` or
99a sequence of `Property`.
1010
11- The YAML file is a readable aggregation of all properties in the following structure:
11+ The YAML file is a readable aggregation of all properties in the following structure:
1212 Property name -> {
1313 unit: <unit string or null>,
1414 reference: <reference string or null>,
1717 }
1818 }
1919
20- The JSON file is a compact, production-format structure (no metadata) :
20+ The JSON file is a compact, production-format structure without metadata:
2121 <element symbol> -> {
2222 <property name>: <value>
2323 }
2424
2525 Units are stored separately in a special top-level key:
26- "_unit" -> {
27- <property name>: <unit string>
28- }
29-
30- TODO: this script is in a transitional state, it would now:
31- - Generate a duplicate JSON instead of overwriting the one in `core` dir
32- - Append unit as string instead of as a separate field
33- - Unit should be converted to pymatgen Unit
26+ "_unit" -> {
27+ <property name>: <unit string>
28+ }
3429"""
3530
3631from __future__ import annotations
3732
3833import csv
34+ import gzip
3935import json
36+ import math
4037import os
4138import warnings
4239from collections import Counter , defaultdict
5047import requests
5148from ruamel .yaml import YAML
5249
53- from pymatgen .core import Element
50+ from pymatgen .core import PKG_DIR , Element
5451
5552if TYPE_CHECKING :
5653 from collections .abc import Callable , Iterable
5956 from pymatgen .util .typing import PathLike
6057
6158
62- DEFAULT_VALUE : str = "no data" # The default value if not provided
63-
6459RESOURCES_DIR : str = f"{ Path (__file__ ).parent } /periodic_table_resources"
6560
61+ # The number of significant digits after applying `factor`
62+ SIGNIFICANT_DIGITS : int = 4
63+
6664
6765@dataclass
6866class ElemPropertyValue :
69- value : Any = DEFAULT_VALUE
70- reference : str | None = None # Parser not implemented
67+ value : Any = None
68+ reference : str | None = None # per-value ref parser not implemented
7169
7270
@dataclass
class Property:
    """Bundle a named property with its per-element values and metadata.

    Only `name` and `data` are required; `factor`, `unit` and `reference`
    default to None.
    """

    # Name of the property.
    name: str
    # Mapping from each element to its value for this property.
    data: dict[Element, ElemPropertyValue]
    # Optional multiplier applied to each value in `data` on output.
    factor: float | None = None
    # Optional unit string for the values.
    unit: str | None = None
    # Optional reference string for the property as a whole.
    reference: str | None = None
84-
8579
8680def parse_yaml (file : PathLike ) -> list [Property ]:
8781 """Parse a YAML file.
8882
8983 Expected YAML format:
9084 We expect each YAML file to contain one or more properties.
9185 Each property should follow this structure:
92- - `unit` (optional): The unit of measurement for the values.
86+ - `unit` (str, optional): The unit of measurement for the values.
9387 - `data`: Dict mapping each element symbol (e.g., "Fe") to its corresponding value.
88+ - `factor` (float, optional): A multiplier applied to each value in `data`.
9489
9590 Args:
9691 file (PathLike): path to the YAML file to parse.
@@ -119,6 +114,7 @@ def parse_yaml(file: PathLike) -> list[Property]:
119114 name = prop_name ,
120115 unit = prop_info .get ("unit" ),
121116 reference = prop_info .get ("reference" ),
117+ factor = prop_info .get ("factor" ),
122118 data = data ,
123119 )
124120 )
@@ -172,16 +168,16 @@ def parse_csv(
172168 data : dict [Element , ElemPropertyValue ] = {}
173169
174170 for symbol , value in data_df [prop ].items ():
175- if pd . isna ( value ):
176- value = DEFAULT_VALUE
177- elif transform is not None :
178- try :
179- value = transform (value )
180- except (ValueError , TypeError ):
181- warnings .warn (f"Cannot transform { value = } , keep as string" , stacklevel = 2 )
182- value = str (value )
171+ # Skip NaN values instead of writing a "no data" placeholder
172+ if not pd . isna ( value ):
173+ if transform is not None :
174+ try :
175+ value = transform (value )
176+ except (ValueError , TypeError ):
177+ warnings .warn (f"Cannot transform { value = } , keep as string" , stacklevel = 2 )
178+ value = str (value )
183179
184- data [Element (symbol )] = ElemPropertyValue (value = value )
180+ data [Element (symbol )] = ElemPropertyValue (value = value )
185181
186182 result .append (Property (name = prop , unit = unit , reference = reference , data = data ))
187183
@@ -275,31 +271,27 @@ def parse_shannon_radii(
275271 return Property (name = "Shannon radii" , data = data , unit = unit , reference = reference )
276272
277273
278- def get_and_parse_electronic_affinities (prop_name : str = "Electron affinity" ) -> Property :
279- """Get electronic affinities from Wikipedia and save a local YAML copy.
274+ def get_and_parse_electronic_affinities (prop_name : str = "Electron affinity" , unit : str = "eV" ) -> Property :
275+ """Get electron affinities from Wikipedia and save a local YAML copy."""
276+ yaml_path : str = f"{ RESOURCES_DIR } /_electron_affinities.yaml"
280277
281- TODO:
282- current wikipedia crawler drops data for Deuterium.
283- """
284278 # Get data table from Wikipedia
285279 url : str = "https://en.wikipedia.org/wiki/Electron_affinity_(data_page)"
286280 tables = pd .read_html (StringIO (requests .get (url , timeout = 5 ).text ))
287281
288282 # Get the "Elements Electron affinity" table (with unit eV)
289- ea_df = next (
283+ ea_df : pd . DataFrame = next (
290284 table
291285 for table in tables
292286 if f"{ prop_name } (kJ/mol)" in table .columns and table ["Name" ].astype (str ).str .contains ("Hydrogen" ).any ()
293287 )
294- ea_df = ea_df .drop (columns = ["References" , f"{ prop_name } (kJ/mol)" , "Element" ])
295288
296- # Drop superheavy elements
289+ # Drop superheavy elements (currently Z > 118)
297290 max_z : int = max (Element (element ).Z for element in Element .__members__ )
298291 ea_df = ea_df [pd .to_numeric (ea_df ["Z" ], errors = "coerce" ) <= max_z ]
299292
300- # Drop heavy isotopes
301- # TODO: correctly handle Deuterium
302- ea_df = ea_df .drop_duplicates (subset = "Z" , keep = "first" )
293+ # Drop heavy isotopes (except for Deuterium, which has a distinct "Name")
294+ ea_df = ea_df .drop_duplicates (subset = "Name" , keep = "first" )
303295
304296 # Ensure we cover all elements up to Uranium (Z=92)
305297 if not (z_values := set (ea_df ["Z" ])).issuperset (range (1 , 93 )):
@@ -324,20 +316,18 @@ def get_and_parse_electronic_affinities(prop_name: str = "Electron affinity") ->
324316 for name , value in zip (ea_df ["Name" ], ea_df [f"{ prop_name } (eV)" ], strict = True )
325317 }
326318
327- data [Element .D ] = 0.754674 # TODO: fix this
328-
329319 output_data = {
330320 prop_name : {
331- "unit" : "eV" ,
321+ "unit" : unit ,
332322 "reference" : url ,
333323 "data" : {el .name : val for el , val in sorted (data .items (), key = lambda x : x [0 ].Z )},
334324 }
335325 }
336326
337- with open (f" { RESOURCES_DIR } /_electron_affinities.yaml" , "w" , encoding = "utf-8" ) as f :
327+ with open (yaml_path , "w" , encoding = "utf-8" ) as f :
338328 yaml .dump (output_data , f )
339329
340- return parse_yaml (f" { RESOURCES_DIR } /_electron_affinitIES.yaml" )[0 ]
330+ return parse_yaml (yaml_path )[0 ]
341331
342332
343333def generate_iupac_ordering () -> Property :
@@ -399,6 +389,14 @@ def generate_yaml_and_json(
399389 Raises:
400390 ValueError: If duplicate property names are found in the input.
401391 """
392+
def apply_factor_and_round(value: float, factor: float, digits: int) -> float:
    """Scale a value by a factor and round it to a number of significant digits.

    Args:
        value (float): The raw numeric value.
        factor (float): Multiplier applied to `value` before rounding.
        digits (int): Number of significant digits to keep.

    Returns:
        float: `value * factor` rounded to `digits` significant digits.
            A zero product is returned as 0.0. A non-finite product
            (NaN or +/-inf) is returned unrounded.
    """
    result = value * factor

    # log10(0) is undefined, so zero is handled before computing a magnitude.
    if result == 0:
        return 0.0

    # NaN and +/-inf have no order of magnitude: math.floor(math.log10(...))
    # raises ValueError / OverflowError for them, so pass them through as-is.
    # Only floats can be non-finite; integers are left to the normal path.
    if isinstance(result, float) and not math.isfinite(result):
        return result

    # Exponent of the leading digit, e.g. 3 for 1234.5 and -3 for 0.0012.
    magnitude = math.floor(math.log10(abs(result)))

    # Shift the rounding position so that exactly `digits` digits survive.
    return round(result, digits - magnitude - 1)
399+
402400 # Check for duplicate
403401 counter : Counter = Counter ([prop .name for prop in properties ])
404402 if duplicates := [name for name , count in counter .items () if count > 1 ]:
@@ -408,19 +406,29 @@ def generate_yaml_and_json(
408406 properties = sorted (properties , key = lambda prop : prop .name )
409407
410408 # Save a YAML copy for development
411- yaml_data : dict [str , dict [Literal ["unit" , "reference" , "data" ], Any ]] = {}
409+ yaml_data : dict [str , dict [Literal ["unit" , "reference" , "data" , "factor" ], Any ]] = {}
412410 for prop in properties :
413411 # Sort elements by atomic number (Z)
414412 sorted_data : dict [str , Any ] = dict (
415413 sorted (((elem .name , val .value ) for elem , val in prop .data .items ()), key = lambda pair : Element (pair [0 ]).Z )
416414 )
417415
418- yaml_data [prop .name ] = {
416+ # Apply factor to data to be consistent with final JSON
417+ if prop .factor is not None :
418+ sorted_data = {
419+ k : apply_factor_and_round (v , prop .factor , SIGNIFICANT_DIGITS ) for k , v in sorted_data .items ()
420+ }
421+
422+ prop_dict = {
419423 "unit" : str (prop .unit ) if prop .unit is not None else None ,
420424 "reference" : prop .reference ,
421425 "data" : sorted_data ,
426+ # "factor": prop.factor,
422427 }
423428
429+ # Filter out fields with None
430+ yaml_data [prop .name ] = {k : v for k , v in prop_dict .items () if v is not None }
431+
424432 yaml = YAML ()
425433 yaml .default_flow_style = None
426434 with open (yaml_file , "w" , encoding = "utf-8" ) as f :
@@ -432,24 +440,25 @@ def generate_yaml_and_json(
432440 # Output to JSON (element -> property -> value format, and drop metadata)
433441 element_to_props : dict [str , dict [str , Any ]] = defaultdict (dict )
434442
435- # # Insert units under a special `_unit` key
436- # element_to_props["_unit"] = {}
443+ # Insert units under a special `_unit` key
444+ element_to_props ["_unit" ] = {}
437445
438446 for prop in properties :
439- # # Store unit for this property if available
440- # if prop.unit is not None:
441- # element_to_props["_unit"][prop.name] = str(prop.unit)
447+ # Store unit for this property if available
448+ if prop .unit is not None :
449+ element_to_props ["_unit" ][prop .name ] = str (prop .unit )
442450
451+ # Apply `factor`
443452 for elem , prop_val in prop .data .items ():
444- unit = prop .unit
445- if unit is None :
446- element_to_props [elem .name ][prop .name ] = prop_val .value
453+ if prop .factor is not None : # assume numeric if `factor` is given
454+ element_to_props [elem .name ][prop .name ] = apply_factor_and_round (
455+ prop_val .value , prop .factor , SIGNIFICANT_DIGITS
456+ )
447457 else :
448- output = str (prop_val .value ) + " " + str (unit )
449- element_to_props [elem .name ][prop .name ] = output
458+ element_to_props [elem .name ][prop .name ] = prop_val .value
450459
451- with open (json_file , "w " , encoding = "utf-8" ) as f :
452- json .dump (element_to_props , f , indent = 4 )
460+ with gzip . open (json_file , "wt " , encoding = "utf-8" ) as f :
461+ json .dump (element_to_props , f )
453462
454463 print (f"Saved JSON to: { json_file } " )
455464
@@ -459,21 +468,26 @@ def main():
459468 properties : tuple [Property , ...] = (
460469 * parse_yaml (f"{ RESOURCES_DIR } /elemental_properties.yaml" ),
461470 * parse_yaml (f"{ RESOURCES_DIR } /oxidation_states.yaml" ),
462- * parse_yaml (f"{ RESOURCES_DIR } /ionization_energies_nist.yaml" ), # Parsed from HTML
471+ * parse_yaml (f"{ RESOURCES_DIR } /nmr_quadrupole_moment.yaml" ),
472+ * parse_yaml (f"{ RESOURCES_DIR } /ground_level_and_ionization_energies_nist.yaml" ), # Parsed from HTML
463473 get_and_parse_electronic_affinities (),
464- * parse_csv (f"{ RESOURCES_DIR } /radii.csv" , transform = float , unit = None ),
474+ * parse_csv (
475+ f"{ RESOURCES_DIR } /radii.csv" ,
476+ transform = float ,
477+ unit = "ang" ,
478+ reference = "https://wikipedia.org/wiki/Atomic_radii_of_the_elements_(data_page)" ,
479+ ),
465480 * parse_ionic_radii (
466- f"{ RESOURCES_DIR } /ionic_radii.csv" , unit = None , reference = "https://en.wikipedia.org/wiki/Ionic_radius"
481+ f"{ RESOURCES_DIR } /ionic_radii.csv" , unit = "ang" , reference = "https://en.wikipedia.org/wiki/Ionic_radius"
467482 ),
468- parse_shannon_radii (f"{ RESOURCES_DIR } /Shannon_Radii.csv" , unit = None ),
483+ parse_shannon_radii (f"{ RESOURCES_DIR } /Shannon_Radii.csv" , unit = "ang" ),
469484 generate_iupac_ordering (),
470485 )
471486
472487 generate_yaml_and_json (
473488 properties ,
474489 yaml_file = f"{ RESOURCES_DIR } /_periodic_table.yaml" ,
475- json_file = f"{ RESOURCES_DIR } /_periodic_table.json" ,
476- # json_file=f"{PKG_DIR}/core/periodic_table.json",
490+ json_file = f"{ PKG_DIR } /core/periodic_table.json.gz" ,
477491 )
478492
479493
0 commit comments