66
77from __future__ import annotations
88
9+ import hashlib
910import logging
1011import os
1112from pathlib import Path
12- from typing import List
13+ from typing import List , Optional
1314
1415from datamaestro .context import ResolvablePath
1516from datamaestro .definitions import AbstractDataset
1920logger = logging .getLogger (__name__ )
2021
2122
class GlobChecker:
    """Verify folder contents with a combined MD5 over files matching a glob.

    Paths matching the glob pattern under the checked folder are sorted,
    directories are ignored, each remaining file's MD5 is computed, and the
    MD5 of the concatenated hex digests gives the overall checksum.

    If ``md5`` is ``None``, the computed checksum is logged so the user
    can record it for future verification.
    """

    # Number of bytes read at a time when hashing a file, so that large
    # files are never loaded into memory in one piece.
    _CHUNK_SIZE = 1 << 20

    def __init__(self, glob: str, md5: Optional[str] = None):
        """Store the glob pattern and the expected checksum.

        :param glob: pattern passed to :meth:`pathlib.Path.glob` on the
            folder being checked
        :param md5: expected combined hex digest, or ``None`` to only log
            the computed value
        """
        self.glob = glob
        self.md5 = md5

    @classmethod
    def _file_md5(cls, path: Path) -> str:
        """Return the MD5 hex digest of the file at *path*, read in chunks."""
        digest = hashlib.md5()
        with path.open("rb") as stream:
            for chunk in iter(lambda: stream.read(cls._CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def compute(self, path: Path) -> Optional[str]:
        """Compute the combined MD5 for files matching the glob under *path*.

        :param path: folder in which the glob pattern is evaluated
        :return: the combined hex digest, or ``None`` when no regular file
            matches the pattern
        """
        # Filter before the emptiness test: a pattern matching only
        # directories must count as "no files", not as an empty checksum.
        files = [f for f in sorted(path.glob(self.glob)) if f.is_file()]
        if not files:
            return None
        combined = hashlib.md5()
        for f in files:
            combined.update(self._file_md5(f).encode())
        return combined.hexdigest()

    def check(self, path: Path) -> bool:
        """Check the files matching the glob under *path*.

        :param path: folder to verify
        :return: ``False`` when no file matches or the checksum differs from
            the expected one; ``True`` when it matches or when no expected
            checksum was given (the computed value is then logged)
        """
        digest = self.compute(path)
        if digest is None:
            logger.warning("No files matching %s in %s", self.glob, path)
            return False
        if self.md5 is None:
            logger.info(
                "GlobChecker(%s): computed md5 = %s for %s", self.glob, digest, path
            )
            return True
        # hexdigest() is lowercase; normalise the expected value so that an
        # uppercase or whitespace-padded digest still compares equal.
        if digest != self.md5.strip().lower():
            logger.error(
                "GlobChecker(%s): md5 mismatch for %s: expected %s, got %s",
                self.glob,
                path,
                self.md5,
                digest,
            )
            return False
        return True
70+
2271class links (LocalResourceMixin , Resource ):
2372 """Link with another dataset path.
2473
@@ -150,6 +199,12 @@ class linkfolder(linkpath):
150199 @dataset(url="...")
151200 class MyDataset(Base):
152201 DATA = linkfolder("data", proposals=[...])
202+
203+ An optional ``checker`` (e.g. :class:`GlobChecker`) can be provided to
204+ verify the folder contents after linking::
205+
206+ DATA = linkfolder("data", proposals=[...],
207+ checker=GlobChecker("FB*", "a1b2c3..."))
153208 """
154209
155210 def __init__ (
@@ -158,11 +213,17 @@ def __init__(
158213 proposals ,
159214 * ,
160215 transient : bool = False ,
216+ checker : Optional [GlobChecker ] = None ,
161217 ):
162218 super ().__init__ (varname , proposals , transient = transient )
219+ self .checker = checker
163220
164221 def _check_path (self , path ):
165- return path .is_dir ()
222+ if not path .is_dir ():
223+ return False
224+ if self .checker is not None :
225+ return self .checker .check (path )
226+ return True
166227
167228
168229class linkfile (linkpath ):
0 commit comments