Skip to content

Commit 48f8793

Browse files
committed
feat: add GlobChecker to check the MD5 of a set of files
1 parent a0b6b62 commit 48f8793

1 file changed

Lines changed: 63 additions & 2 deletions

File tree

src/datamaestro/download/links.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66

77
from __future__ import annotations
88

9+
import hashlib
910
import logging
1011
import os
1112
from pathlib import Path
12-
from typing import List
13+
from typing import List, Optional
1314

1415
from datamaestro.context import ResolvablePath
1516
from datamaestro.definitions import AbstractDataset
@@ -19,6 +20,54 @@
1920
logger = logging.getLogger(__name__)
2021

2122

23+
class GlobChecker:
    """Verify folder contents through a combined MD5 over glob-matched files.

    Paths matching the glob pattern are sorted, the MD5 of every regular
    file among them is computed, and the MD5 of the concatenated hex digests
    gives the overall checksum. Matching directories are ignored.

    If ``md5`` is ``None``, the computed checksum is logged so the user
    can record it for future verification.
    """

    # Number of bytes read at a time when hashing a file, so that large
    # files are never loaded in memory as a whole
    _CHUNK_SIZE = 1 << 20

    def __init__(self, glob: str, md5: Optional[str] = None):
        """Store the glob pattern and the expected checksum.

        :param glob: pattern given to ``Path.glob`` on the checked folder
        :param md5: expected combined hex digest, or ``None`` to only log
            the computed one
        """
        self.glob = glob
        self.md5 = md5

    @classmethod
    def _file_md5(cls, file: Path) -> str:
        """Return the MD5 hex digest of *file*, read chunk by chunk."""
        digest = hashlib.md5()
        with file.open("rb") as stream:
            for chunk in iter(lambda: stream.read(cls._CHUNK_SIZE), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def compute(self, path: Path) -> Optional[str]:
        """Compute the combined MD5 for files matching the glob under *path*.

        :param path: folder in which the glob pattern is evaluated
        :return: the combined hex digest, or ``None`` when no regular file
            matches the pattern
        """
        # Directories are filtered out *before* the emptiness test: a glob
        # matching only directories must not yield the digest of nothing
        files = sorted(f for f in path.glob(self.glob) if f.is_file())
        if not files:
            return None
        combined = hashlib.md5()
        for f in files:
            combined.update(self._file_md5(f).encode())
        return combined.hexdigest()

    def check(self, path: Path) -> bool:
        """Check the files matching the glob under *path*.

        :param path: folder to verify
        :return: ``False`` when no file matches or when the checksum differs
            from the expected one; ``True`` when it matches, or when no
            expected checksum was given (the computed one is then logged)
        """
        digest = self.compute(path)
        if digest is None:
            logger.warning("No files matching %s in %s", self.glob, path)
            return False
        if self.md5 is None:
            logger.info(
                "GlobChecker(%s): computed md5 = %s for %s", self.glob, digest, path
            )
            return True
        # hexdigest() is lowercase: accept an expected value in any case
        if digest != self.md5.lower():
            logger.error(
                "GlobChecker(%s): md5 mismatch for %s: expected %s, got %s",
                self.glob,
                path,
                self.md5,
                digest,
            )
            return False
        return True
69+
70+
2271
class links(LocalResourceMixin, Resource):
2372
"""Link with another dataset path.
2473
@@ -150,6 +199,12 @@ class linkfolder(linkpath):
150199
@dataset(url="...")
151200
class MyDataset(Base):
152201
DATA = linkfolder("data", proposals=[...])
202+
203+
An optional ``checker`` (e.g. :class:`GlobChecker`) can be provided to
204+
verify the folder contents after linking::
205+
206+
DATA = linkfolder("data", proposals=[...],
207+
checker=GlobChecker("FB*", "a1b2c3..."))
153208
"""
154209

155210
def __init__(
@@ -158,11 +213,17 @@ def __init__(
158213
proposals,
159214
*,
160215
transient: bool = False,
216+
checker: Optional[GlobChecker] = None,
161217
):
162218
super().__init__(varname, proposals, transient=transient)
219+
self.checker = checker
163220

164221
def _check_path(self, path):
    """Tell whether *path* is an acceptable folder.

    The path must be a directory; when a checker was configured, the result
    of its ``check`` call on that directory decides the outcome.
    """
    if path.is_dir():
        # Without a checker, being a directory is enough
        return True if self.checker is None else self.checker.check(path)
    return False
166227

167228

168229
class linkfile(linkpath):

0 commit comments

Comments
 (0)