Skip to content

Commit 12982a1

Browse files
committed
Create utility for generating SBOM from artifacts
1 parent e09c9a7 commit 12982a1

1 file changed

Lines changed: 279 additions & 0 deletions

File tree

sbom.py

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
"""
2+
Utility which creates Software Bill-of-Materials (SBOM)
3+
for CPython release artifacts.
4+
"""
5+
6+
import datetime
7+
import hashlib
8+
import json
9+
import os
10+
import re
11+
import sys
12+
import tarfile
13+
14+
15+
def spdx_id(value: str) -> str:
    """Return *value* rewritten so it only uses characters valid in an SPDX ID.

    ASCII letters, digits, '.' and '-' are kept as-is; every run of one
    or more other characters is collapsed into a single '-'.
    """
    invalid_run = re.compile(r"[^a-zA-Z0-9.\-]+")
    return invalid_run.sub("-", value)
18+
19+
20+
def calculate_package_verification_codes(sbom) -> None:
    """
    Calculate SPDX 'PackageVerificationCode' values for
    each package whose files were analyzed.
    Mutates the values within the passed structure.

    A package is treated as analyzed when its 'filesAnalyzed' value is
    true or when the field is omitted (SPDX 2.3 defines the default as
    'true'). Missing 'packages', 'files' or 'relationships' arrays are
    treated as empty.

    The code is SHA1 of a concatenated and sorted list of file SHA1s.

    Raises ValueError if a file contained in an analyzed package has
    no SHA1 checksum.
    """

    # Find all packages which we need to calculate package verification codes for.
    # If 'filesAnalyzed' is 'false' we skip calculating.
    sbom_packages = sbom.get("packages", [])
    sbom_file_id_to_package_id: dict[str, str] = {}
    sbom_package_id_to_file_sha1s: dict[str, list[bytes]] = {
        sbom_package["SPDXID"]: []
        for sbom_package in sbom_packages
        if sbom_package.get("filesAnalyzed", True)
    }

    # Next pass we do is over relationships,
    # we need to find all files that belong to each package.
    for sbom_relationship in sbom.get("relationships", []):
        sbom_relationship_type = sbom_relationship["relationshipType"]
        sbom_element_id = sbom_relationship["spdxElementId"]
        sbom_related_element_id = sbom_relationship["relatedSpdxElement"]

        # We're looking for '<package> CONTAINS <file>' relationships
        if (
            sbom_relationship_type != "CONTAINS"
            or sbom_element_id not in sbom_package_id_to_file_sha1s
            or not sbom_related_element_id.startswith("SPDXRef-FILE-")
        ):
            continue

        # Found one! Add it to our mapping.
        sbom_file_id_to_package_id[sbom_related_element_id] = sbom_element_id

    # Now we do a single pass on files, appending all SHA1 values along the way.
    for sbom_file in sbom.get("files", []):
        # Attempt to match this file to a package.
        sbom_file_id = sbom_file["SPDXID"]
        if sbom_file_id not in sbom_file_id_to_package_id:
            continue
        sbom_package_id = sbom_file_id_to_package_id[sbom_file_id]

        # Find the SHA1 checksum for the file.
        for sbom_file_checksum in sbom_file["checksums"]:
            if sbom_file_checksum["algorithm"] == "SHA1":
                # We lowercase the value as that's what's required by the algorithm.
                sbom_file_checksum_sha1 = (
                    sbom_file_checksum["checksumValue"].lower().encode("ascii")
                )
                break
        else:
            raise ValueError(f"Can't find SHA1 checksum for '{sbom_file_id}'")

        sbom_package_id_to_file_sha1s[sbom_package_id].append(sbom_file_checksum_sha1)

    # Finally we iterate over the packages again and calculate the final package verification code values.
    for sbom_package in sbom_packages:
        sbom_package_id = sbom_package["SPDXID"]
        if sbom_package_id not in sbom_package_id_to_file_sha1s:
            continue

        # Package verification code is the SHA1 of ASCII values ascending-sorted.
        sbom_package_verification_code = hashlib.sha1(
            b"".join(sorted(sbom_package_id_to_file_sha1s[sbom_package_id]))
        ).hexdigest()

        sbom_package["packageVerificationCode"] = {
            "packageVerificationCodeValue": sbom_package_verification_code
        }
91+
92+
93+
def create_sbom_for_source_tarball(tarball_path: str):
    """Stitches together an SBOM for a source tarball

    Reads the SBOM shipped inside the tarball at
    'Python-{version}/Misc/sbom.spdx.json', verifies the SHA256 of every
    file it already lists against the tarball contents, registers every
    other tarball file under a new 'CPython' package and fills in the
    document-level metadata and package verification codes.

    'tarball_path' must point at a file named 'Python-{version}.tgz'
    or 'Python-{version}.tar.xz'.

    Returns the SBOM as a JSON-serializable dict.

    Raises ValueError if the tarball name or format isn't recognized,
    if the tarball has no embedded SBOM, if it contains an unexpected
    member, if a checksum doesn't match, or if a file listed in the
    embedded SBOM is missing from the tarball.
    """
    tarball_name = os.path.basename(tarball_path)

    # Pick the known compression settings from the file name.
    if tarball_name.endswith(".tgz"):
        tarball_mode = "r:gz"
    elif tarball_name.endswith(".tar.xz"):
        tarball_mode = "r:xz"
    else:
        raise ValueError(f"Unknown tarball format: '{tarball_name}'")

    # Parse the CPython version from the tarball.
    # Calculate the download locations from the CPython version and tarball name.
    version_match = re.match(r"^Python-([0-9abrc.]+)\.t", tarball_name)
    if version_match is None:
        raise ValueError(
            f"Can't parse CPython version from tarball name: '{tarball_name}'"
        )
    cpython_version = version_match.group(1)
    version_without_suffix_match = re.match(r"^([0-9.]+)", cpython_version)
    if version_without_suffix_match is None:
        raise ValueError(
            f"Can't parse CPython version from tarball name: '{tarball_name}'"
        )
    cpython_version_without_suffix = version_without_suffix_match.group(1)
    tarball_download_location = f"https://www.python.org/ftp/python/{cpython_version_without_suffix}/{tarball_name}"

    # Take some hashes of the tarball
    with open(tarball_path, mode="rb") as f:
        tarball_checksum_sha256 = hashlib.sha256(f.read()).hexdigest()

    # CPython prefixes all of its source code with 'Python-{version}/...'.
    tarball_prefix = f"Python-{cpython_version}/"
    sbom_member_name = f"{tarball_prefix}Misc/sbom.spdx.json"

    # The context manager guarantees the tarball is closed on every exit path.
    with tarfile.open(tarball_path, mode=tarball_mode) as tarball:
        # There should be an SBOM included in the tarball.
        # If there's not we can't create an SBOM.
        try:
            sbom_member = tarball.getmember(sbom_member_name)
        except KeyError:
            raise ValueError(
                f"Tarball doesn't contain an SBOM at '{sbom_member_name}'"
            ) from None
        sbom_bytes = tarball.extractfile(sbom_member).read()

        sbom = json.loads(sbom_bytes)
        sbom.update({
            "SPDXID": "SPDXRef-DOCUMENT",
            "spdxVersion": "SPDX-2.3",
            "name": "CPython SBOM",
            "dataLicense": "CC0-1.0",
            # Naming done according to OpenSSF SBOM WG recommendations.
            # See: https://github.com/ossf/sbom-everywhere/blob/main/reference/sbom_naming.md
            "documentNamespace": f"{tarball_download_location}.spdx.json",
            "creationInfo": {
                "created": (
                    datetime.datetime.now(tz=datetime.timezone.utc)
                    .strftime("%Y-%m-%dT%H:%M:%SZ")
                ),
                "creators": [
                    "Person: Python Release Managers",
                    "Tool: python/release-tools@f58cfa6611dd13f2fb4e4790a8c54f06dddab6bc",
                ],
                # Version of the SPDX License ID list.
                # This shouldn't need to be updated often, if ever.
                "licenseListVersion": "3.22",
            },
        })

        # Create the SBOM entry for the CPython package. We use
        # the SPDXID later on for creating relationships to files.
        sbom_cpython_package = {
            "SPDXID": "SPDXRef-PACKAGE-cpython",
            "name": "CPython",
            "versionInfo": cpython_version,
            "licenseConcluded": "PSF-2.0",
            "originator": "Organization: Python Software Foundation",
            "supplier": "Organization: Python Software Foundation",
            "packageFileName": tarball_name,
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:python:python:{cpython_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                }
            ],
            "primaryPackagePurpose": "SOURCE",
            "downloadLocation": tarball_download_location,
            "checksums": [{"algorithm": "SHA256", "checksumValue": tarball_checksum_sha256}],
        }
        sbom["packages"].append(sbom_cpython_package)

        # Extract all currently known files from the SBOM with their checksums.
        known_sbom_files = {}
        for sbom_file in sbom["files"]:
            sbom_filename = sbom_file["fileName"]

            # We use the name we're expecting in the tarball here
            # which is to prefix the name with 'Python-{version}/...'.
            expected_tar_filename = f"Python-{cpython_version}/{sbom_filename}"

            # We also want to update our SBOM to use the same filenames
            # as the ones in the tarball. We maintain the SPDXIDs though
            # to not need to rewrite SBOM relationships.
            sbom_file["fileName"] = expected_tar_filename

            # Look for the expected SHA256 checksum.
            for sbom_file_checksum in sbom_file["checksums"]:
                if sbom_file_checksum["algorithm"] == "SHA256":
                    known_sbom_files[expected_tar_filename] = (
                        sbom_file_checksum["checksumValue"]
                    )
                    break
            else:
                raise ValueError(
                    f"Couldn't find expected SHA256 checksum in SBOM for file '{sbom_filename}'"
                )

        # Now we walk the tarball and compare known files to our expected checksums in the SBOM.
        # All files that aren't already in the SBOM can be added as "CPython" files.
        for member in tarball.getmembers():
            if member.isdir():  # Skip directories!
                continue

            # Every remaining member must be a regular file below the
            # 'Python-{version}/' prefix. This is an explicit check rather
            # than an 'assert' so it still runs under 'python -O'.
            if not (member.isfile() and member.name.startswith(tarball_prefix)):
                raise ValueError(
                    f"Unexpected member in source tarball: '{member.name}'"
                )

            # Calculate the hashes, either for comparison with a known value
            # or to embed in the SBOM as a new file. SHA1 is only used because
            # SPDX requires it for all file entries.
            file_bytes = tarball.extractfile(member).read()
            actual_file_checksum_sha1 = hashlib.sha1(file_bytes).hexdigest()
            actual_file_checksum_sha256 = hashlib.sha256(file_bytes).hexdigest()

            # We've already seen this file, so we check it hasn't been modified and continue on.
            if member.name in known_sbom_files:
                # If there's a hash mismatch we raise an error, something isn't right!
                expected_file_checksum_sha256 = known_sbom_files.pop(member.name)
                if expected_file_checksum_sha256 != actual_file_checksum_sha256:
                    raise ValueError(f"Mismatched checksum for file '{member.name}'")

            # If this is a new file, then it's a part of the 'CPython' SBOM package.
            else:
                # Remove the 'Python-{version}/...' prefix for the SPDXID.
                sbom_file_spdx_id = spdx_id(f"SPDXRef-FILE-{member.name.split('/', 1)[1]}")
                sbom["files"].append(
                    {
                        "SPDXID": sbom_file_spdx_id,
                        "fileName": member.name,
                        "checksums": [
                            {
                                "algorithm": "SHA1",
                                "checksumValue": actual_file_checksum_sha1,
                            },
                            {
                                "algorithm": "SHA256",
                                "checksumValue": actual_file_checksum_sha256,
                            },
                        ],
                    }
                )
                sbom["relationships"].append(
                    {
                        "spdxElementId": sbom_cpython_package["SPDXID"],
                        "relatedSpdxElement": sbom_file_spdx_id,
                        "relationshipType": "CONTAINS",
                    }
                )

    # If there are any known files that weren't found in the
    # source tarball we want to raise an error.
    if known_sbom_files:
        raise ValueError(
            f"Some files from source SBOM aren't accounted for "
            f"in source tarball: {sorted(known_sbom_files)!r}"
        )

    # Final relationship, this SBOM describes the CPython package.
    sbom["relationships"].append(
        {
            "spdxElementId": "SPDXRef-DOCUMENT",
            "relatedSpdxElement": sbom_cpython_package["SPDXID"],
            "relationshipType": "DESCRIBES",
        }
    )

    # Apply the 'supplier' tag to every package since we're shipping
    # the package in the tarball itself. Originator field is used for maintainers.
    for sbom_package in sbom["packages"]:
        sbom_package["supplier"] = "Organization: Python Software Foundation"
        sbom_package["filesAnalyzed"] = True

    # Calculate the 'packageVerificationCode' values for files in packages.
    calculate_package_verification_codes(sbom)

    return sbom
271+
272+
273+
if __name__ == "__main__":
    # The only argument is the path to a CPython source tarball; the
    # resulting SBOM is written to stdout as indented, key-sorted JSON.
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit("usage: python sbom.py <path-to-source-tarball>")
    tarball_path = sys.argv[1]
    print(
        json.dumps(
            create_sbom_for_source_tarball(tarball_path), indent=2, sort_keys=True
        )
    )

0 commit comments

Comments
 (0)