Kelsidavis-WoWee/container/macos/sdk-fetcher.py
2026-03-30 21:15:41 +03:00

380 lines
13 KiB
Python

#!/usr/bin/env python3
"""Download and extract macOS SDK from Apple's Command Line Tools package.
Apple publishes Command Line Tools (CLT) packages via their publicly
accessible software update catalog. This script downloads the latest CLT,
extracts just the macOS SDK, and packages it as a .tar.gz tarball suitable
for osxcross.
No Apple ID or paid developer account required.
Usage:
python3 sdk-fetcher.py [output_dir]
The script prints the absolute path of the resulting tarball to stdout.
All progress / status messages go to stderr.
If a cached SDK tarball already exists in output_dir, it is reused.
Dependencies: python3 (>= 3.6), cpio, tar, gzip
Optional: bsdtar (libarchive-tools) or xar -- faster XAR extraction.
Falls back to a pure-Python XAR parser when neither is available.
"""
import glob
import gzip
import lzma
import os
import plistlib
import re
import shutil
import struct
import subprocess
import sys
import tempfile
import urllib.request
import zlib
try:
import defusedxml.ElementTree as ET
except ImportError as exc:
raise ImportError(
"defusedxml is required: pip install defusedxml"
) from exc
# -- Configuration -----------------------------------------------------------
CATALOG_URLS = [
# Try newest catalog first; first successful fetch wins.
"https://swscan.apple.com/content/catalogs/others/"
"index-16-15-14-13-12-10.16-10.15-10.14-10.13-10.12-10.11-10.10-10.9-"
"mountainlion-lion-snowleopard-leopard.merged-1.sucatalog.gz",
"https://swscan.apple.com/content/catalogs/others/"
"index-15-14-13-12-10.16-10.15-10.14-10.13-10.12-10.11-10.10-10.9-"
"mountainlion-lion-snowleopard-leopard.merged-1.sucatalog.gz",
"https://swscan.apple.com/content/catalogs/others/"
"index-14-13-12-10.16-10.15-10.14-10.13-10.12-10.11-10.10-10.9-"
"mountainlion-lion-snowleopard-leopard.merged-1.sucatalog.gz",
]
USER_AGENT = "Software%20Update"
# -- Helpers -----------------------------------------------------------------
def _validate_url(url):
"""Reject non-HTTPS URLs to prevent file:// and other scheme attacks."""
if not url.startswith("https://"):
raise ValueError(f"Refusing non-HTTPS URL: {url}")
def log(msg):
print(msg, file=sys.stderr, flush=True)
# -- 1) Catalog & URL discovery ----------------------------------------------
def find_sdk_pkg_url():
"""Search Apple catalogs for the latest CLTools_macOSNMOS_SDK.pkg URL."""
for cat_url in CATALOG_URLS:
short = cat_url.split("/index-")[1][:25] + "..."
log(f" Trying catalog: {short}")
try:
_validate_url(cat_url)
req = urllib.request.Request(cat_url, headers={"User-Agent": USER_AGENT})
with urllib.request.urlopen(req, timeout=60) as resp: # nosemgrep: python.lang.security.audit.dynamic-urllib-use-detected.dynamic-urllib-use-detected
raw = gzip.decompress(resp.read())
catalog = plistlib.loads(raw)
except Exception as exc:
log(f" -> fetch failed: {exc}")
continue
products = catalog.get("Products", {})
candidates = []
for pid, product in products.items():
post_date = str(product.get("PostDate", ""))
for pkg in product.get("Packages", []):
url = pkg.get("URL", "")
size = pkg.get("Size", 0)
if "CLTools_macOSNMOS_SDK" in url and url.endswith(".pkg"):
candidates.append((post_date, url, size, pid))
if not candidates:
log(f" -> no CLTools SDK packages in this catalog, trying next...")
continue
candidates.sort(reverse=True)
_date, url, size, pid = candidates[0]
log(f"==> Found: CLTools_macOSNMOS_SDK (product {pid}, {size // 1048576} MB)")
return url
log("ERROR: No CLTools SDK packages found in any Apple catalog.")
sys.exit(1)
# -- 2) Download -------------------------------------------------------------
def download(url, dest):
"""Download *url* to *dest* with a basic progress indicator."""
_validate_url(url)
req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
with urllib.request.urlopen(req, timeout=600) as resp: # nosemgrep: python.lang.security.audit.dynamic-urllib-use-detected.dynamic-urllib-use-detected
total = int(resp.headers.get("Content-Length", 0))
done = 0
with open(dest, "wb") as f:
while True:
chunk = resp.read(1 << 20)
if not chunk:
break
f.write(chunk)
done += len(chunk)
if total:
pct = done * 100 // total
log(f"\r {done // 1048576} / {total // 1048576} MB ({pct}%)")
log("")
# -- 3) XAR extraction -------------------------------------------------------
def extract_xar(pkg_path, dest_dir):
"""Extract a XAR (.pkg) archive -- external tool or pure-Python fallback."""
for tool in ("bsdtar", "xar"):
if shutil.which(tool):
log(f"==> Extracting .pkg with {tool}...")
r = subprocess.run([tool, "-xf", pkg_path, "-C", dest_dir],
capture_output=True)
if r.returncode == 0:
return
log(f" {tool} exited {r.returncode}, trying next method...")
log("==> Extracting .pkg with built-in Python XAR parser...")
_extract_xar_python(pkg_path, dest_dir)
def _extract_xar_python(pkg_path, dest_dir):
"""Pure-Python XAR extractor (no external dependencies)."""
with open(pkg_path, "rb") as f:
raw = f.read(28)
if len(raw) < 28:
raise ValueError("File too small to be a valid XAR archive")
magic, hdr_size, _ver, toc_clen, _toc_ulen, _ck = struct.unpack(
">4sHHQQI", raw,
)
if magic != b"xar!":
raise ValueError(f"Not a XAR file (magic: {magic!r})")
f.seek(hdr_size)
toc_xml = zlib.decompress(f.read(toc_clen))
heap_off = hdr_size + toc_clen
root = ET.fromstring(toc_xml)
toc = root.find("toc")
if toc is None:
raise ValueError("Malformed XAR: no <toc> element")
def _walk(elem, base):
for fe in elem.findall("file"):
name = fe.findtext("name", "")
ftype = fe.findtext("type", "file")
path = os.path.join(base, name)
if ftype == "directory":
os.makedirs(path, exist_ok=True)
_walk(fe, path)
continue
de = fe.find("data")
if de is None:
continue
offset = int(de.findtext("offset", "0"))
size = int(de.findtext("size", "0"))
enc_el = de.find("encoding")
enc = enc_el.get("style", "") if enc_el is not None else ""
os.makedirs(os.path.dirname(path), exist_ok=True)
f.seek(heap_off + offset)
if "gzip" in enc:
with open(path, "wb") as out:
out.write(zlib.decompress(f.read(size), 15 + 32))
elif "bzip2" in enc:
import bz2
with open(path, "wb") as out:
out.write(bz2.decompress(f.read(size)))
else:
with open(path, "wb") as out:
rem = size
while rem > 0:
blk = f.read(min(rem, 1 << 20))
if not blk:
break
out.write(blk)
rem -= len(blk)
_walk(toc, dest_dir)
# -- 4) Payload extraction (pbzx / gzip cpio) --------------------------------
def _pbzx_stream(path):
"""Yield decompressed chunks from a pbzx-compressed file."""
with open(path, "rb") as f:
if f.read(4) != b"pbzx":
raise ValueError("Not a pbzx file")
f.read(8)
while True:
hdr = f.read(16)
if len(hdr) < 16:
break
_usize, csize = struct.unpack(">QQ", hdr)
data = f.read(csize)
if len(data) < csize:
break
if csize == _usize:
yield data
else:
yield lzma.decompress(data)
def _gzip_stream(path):
"""Yield decompressed chunks from a gzip file."""
with gzip.open(path, "rb") as f:
while True:
chunk = f.read(1 << 20)
if not chunk:
break
yield chunk
def _raw_stream(path):
"""Yield raw 1 MiB chunks (last resort)."""
with open(path, "rb") as f:
while True:
chunk = f.read(1 << 20)
if not chunk:
break
yield chunk
def extract_payload(payload_path, out_dir):
"""Decompress a CLT Payload (pbzx or gzip cpio) into *out_dir*."""
with open(payload_path, "rb") as pf:
magic = pf.read(4)
if magic == b"pbzx":
log(" Payload format: pbzx (LZMA chunks)")
stream = _pbzx_stream(payload_path)
elif magic[:2] == b"\x1f\x8b":
log(" Payload format: gzip")
stream = _gzip_stream(payload_path)
else:
log(f" Payload format: unknown (magic: {magic.hex()}), trying raw cpio...")
stream = _raw_stream(payload_path)
proc = subprocess.Popen(
["cpio", "-id", "--quiet"],
stdin=subprocess.PIPE,
cwd=out_dir,
stderr=subprocess.PIPE,
)
for chunk in stream:
try:
proc.stdin.write(chunk)
except BrokenPipeError:
break
proc.stdin.close()
proc.wait()
# -- Main --------------------------------------------------------------------
def main():
output_dir = os.path.abspath(sys.argv[1]) if len(sys.argv) > 1 else os.getcwd()
os.makedirs(output_dir, exist_ok=True)
# Re-use a previously fetched SDK if present.
cached = glob.glob(os.path.join(output_dir, "MacOSX*.sdk.tar.*"))
if cached:
cached.sort()
result = os.path.realpath(cached[-1])
log(f"==> Using cached SDK: {os.path.basename(result)}")
print(result)
return
work = tempfile.mkdtemp(prefix="fetch-macos-sdk-")
try:
# 1 -- Locate SDK package URL from Apple's catalog
log("==> Searching Apple software-update catalogs...")
sdk_url = find_sdk_pkg_url()
# 2 -- Download (just the SDK component, ~55 MB)
pkg = os.path.join(work, "sdk.pkg")
log("==> Downloading CLTools SDK package...")
download(sdk_url, pkg)
# 3 -- Extract the flat .pkg (XAR format) to get the Payload
pkg_dir = os.path.join(work, "pkg")
os.makedirs(pkg_dir)
extract_xar(pkg, pkg_dir)
os.unlink(pkg)
# 4 -- Locate the Payload file
log("==> Locating SDK payload...")
sdk_payload = None
for dirpath, _dirs, files in os.walk(pkg_dir):
if "Payload" in files:
sdk_payload = os.path.join(dirpath, "Payload")
log(f" Found: {os.path.relpath(sdk_payload, pkg_dir)}")
break
if sdk_payload is None:
log("ERROR: No Payload found in extracted package")
sys.exit(1)
# 5 -- Decompress Payload -> cpio -> filesystem
sdk_root = os.path.join(work, "sdk")
os.makedirs(sdk_root)
log("==> Extracting SDK from payload (this may take a minute)...")
extract_payload(sdk_payload, sdk_root)
shutil.rmtree(pkg_dir)
# 6 -- Find MacOSX*.sdk directory
sdk_found = None
for dirpath, dirs, _files in os.walk(sdk_root):
for d in dirs:
if re.match(r"MacOSX\d+(\.\d+)?\.sdk$", d):
sdk_found = os.path.join(dirpath, d)
break
if sdk_found:
break
if not sdk_found:
log("ERROR: MacOSX*.sdk directory not found. Extracted contents:")
for dp, ds, fs in os.walk(sdk_root):
depth = dp.replace(sdk_root, "").count(os.sep)
if depth < 4:
log(f" {' ' * depth}{os.path.basename(dp)}/")
sys.exit(1)
sdk_name = os.path.basename(sdk_found)
log(f"==> Found: {sdk_name}")
# 7 -- Package as .tar.gz
tarball = os.path.join(output_dir, f"{sdk_name}.tar.gz")
log(f"==> Packaging: {sdk_name}.tar.gz ...")
subprocess.run(
["tar", "-czf", tarball, "-C", os.path.dirname(sdk_found), sdk_name],
check=True,
)
log(f"==> macOS SDK ready: {tarball}")
print(tarball)
finally:
shutil.rmtree(work, ignore_errors=True)
if __name__ == "__main__":
main()