Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,157 changes: 1,157 additions & 0 deletions ci/Jenkinsfile

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
268 changes: 268 additions & 0 deletions ci/README.md

Large diffs are not rendered by default.

121 changes: 121 additions & 0 deletions ci/common.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Shared helpers loaded by the build pipeline Jenkinsfile.

// Read build parameter `name` as a boolean. A missing (null) parameter is
// false, a Boolean is returned unchanged, and anything else is converted via
// its string form with toBoolean().
boolean paramBool(String name) {
    def raw = params.get(name)
    if (raw instanceof Boolean) {
        return raw
    }
    return raw == null ? false : raw.toString().toBoolean()
}

// Read build parameter `name` as a string. A missing (null) parameter yields
// the empty string; any other value is returned through toString().
String paramString(String name) {
    def raw = params.get(name)
    if (raw == null) {
        return ''
    }
    return raw.toString()
}

// Sole shell-quoting primitive. Returns the argument wrapped in single quotes,
// with every embedded single quote rewritten as '"'"' (close the quote, emit a
// double-quoted single quote, reopen). A null or empty argument yields ''.
String shellQuote(String s) {
    String text = s ? s : ''
    String escaped = text.replace("'", "'\"'\"'")
    String quote = "'"
    return quote + escaped + quote
}

// Run `command` through sh. When env.FINN_DOCKER_SHARED_IMAGE_DIR is set the
// command runs with FINN_DOCKER_PREBUILT=1 so non-builder agents load the
// image from NFS instead of rebuilding; otherwise it runs unchanged.
void runDockerCommand(String command) {
    if (!env.FINN_DOCKER_SHARED_IMAGE_DIR) {
        sh command
        return
    }
    withEnv(['FINN_DOCKER_PREBUILT=1']) {
        sh command
    }
}

// Best-effort unstash: a stash that was never published (stage skipped or
// failed early) is logged and ignored instead of failing the caller.
// A build abort or timeout is NOT ignored: FlowInterruptedException is an
// Exception subclass, so without the explicit rethrow the catch-all below
// would swallow the interruption and let the pipeline keep running.
void unstashIfPresent(String stashName) {
    try {
        unstash stashName
    } catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException interrupted) {
        throw interrupted
    } catch (Exception ignored) {
        echo "No stash '${stashName}' (stage skipped or failed before publishing)"
    }
}

// Single stash-with-catchError primitive. The stash runs inside catchError
// with buildResult and stageResult both null, and the message below is the
// one catchError reports on failure. When requireFile is given, the stash is
// skipped unless that file exists. allowEmpty is forwarded to the stash step.
void _stashReport(String stashName, String includes, boolean allowEmpty, String requireFile) {
    catchError(
        buildResult: null,
        stageResult: null,
        message: "safeStashReport(${stashName}) failed, aggregation may be partial"
    ) {
        if (!requireFile || fileExists(requireFile)) {
            stash name: stashName, includes: includes, allowEmpty: allowEmpty
        }
    }
}

// Stash the full per-shard report sidecar set under `stashName`. Every
// include is "<stashName>.<suffix>". Some sidecars are missing when a shard
// fails early, so allowEmpty is true and no gating file is required. The
// .coverage entry only exists on rows that opted into coverage in STAGES.
void safeStashShardReport(String stashName) {
    List<String> suffixes = [
        'xml', 'html', 'timings.json',
        'shardmap.txt', 'shardmap.json', 'stagemap',
        'empty-shard', 'coverage'
    ]
    List<String> patterns = []
    // Plain index loop (no closure) to stay on the simplest CPS-safe path.
    for (int i = 0; i < suffixes.size(); i++) {
        patterns.add("${stashName}.${suffixes[i]}".toString())
    }
    _stashReport(stashName, patterns.join(','), true, null)
}

// Hard-fail on root-owned residue. Factored out so the build forms below
// cannot diverge on the error message or detection logic.
//
// `caller` is the name printed in the message; `q` must already be
// shell-quoted (see shellQuote). If the directory still exists the shell
// prints the diagnostic, lists up to 40 lines of its contents and exits 1,
// which fails the sh step.
//
// The message is emitted with printf and the values are passed as separate
// quoted arguments. Splicing `q` into a double-quoted echo string would strip
// the protection of its single quotes: `$`, backticks or `"` in the path
// would be expanded by, or break, the shell.
void _assertNoResidue(String caller, String q) {
    String who = shellQuote(caller)
    sh """
        if [ -d ${q} ]; then
            printf "%s: %s still exists after rm. Likely root-owned residue. Ask an admin to 'sudo rm -rf' the directory on this agent.\\n" ${who} ${q}
            ls -la ${q} | head -40
            exit 1
        fi
    """
}

// Reset `buildDir`: tolerant rm -rf (errors silenced), hard-fail if the
// directory survived (root-owned residue), then re-create it as the
// unprivileged user so docker -v does not bind the mount as root.
// A null or empty buildDir is a no-op.
void cleanPreviousBuildFiles(String buildDir) {
    if (buildDir) {
        String quotedDir = shellQuote(buildDir)
        sh "rm -rf ${quotedDir} 2>/dev/null || true"
        _assertNoResidue('cleanPreviousBuildFiles', quotedDir)
        sh "mkdir -p ${quotedDir}"
    }
}

// All shared NFS subtrees derive from FINN_CI_NFS_ROOT. Returning '' from any
// resolver means "no NFS available". Callers must handle that as a fallback.
// This resolver returns the trimmed variable, or '' when it is unset or empty.
String finnCiNfsRoot() {
    String configured = env.FINN_CI_NFS_ROOT
    return configured ? configured.trim() : ''
}

// Join the NFS root and `segments` with '/'. Returns '' when the root is
// unavailable or when any segment is null or empty, so a missing component
// never produces a partial path.
String finnSubdir(String... segments) {
    String root = finnCiNfsRoot()
    if (!root) {
        return ''
    }
    List parts = [root]
    int idx = 0
    while (idx < segments.length) {
        String segment = segments[idx]
        if (!segment) {
            return ''
        }
        parts.add(segment)
        idx++
    }
    return parts.join('/')
}

// Per-purpose NFS path resolvers. Each delegates to finnSubdir and therefore
// returns '' whenever finnSubdir does (no NFS root, or an empty segment).

// <root>/agent_caches/<node>
String finnAgentCachesDir(String node) {
    return finnSubdir('agent_caches', node)
}

// <root>/docker_images
String finnDockerImagesRoot() {
    return finnSubdir('docker_images')
}

// <root>/docker_images/<jobKey>
String finnDockerImagesDir(String jobKey) {
    return finnSubdir('docker_images', jobKey)
}

// <root>/artifacts
String finnArtifactsRoot() {
    return finnSubdir('artifacts')
}

// <root>/_ci_state
String finnCiStateRoot() {
    return finnSubdir('_ci_state')
}

// <root>/_ci_state/<jobKey>
String finnCiStateDir(String jobKey) {
    return finnSubdir('_ci_state', jobKey)
}

// Append `value` to the list at `map[key]`, creating the list lazily.
// Replaces the inline `Map.computeIfAbsent` idiom because CPS does not
// reliably transform SAM closures to java.util.function.Function. Returns
// the (possibly newly created) list so callers can chain.
List mapAppend(Map map, Object key, Object value) {
    def bucket = map.get(key)
    if (bucket != null) {
        bucket << value
        return bucket
    }
    // First value for this key: start the list with it and register it.
    List created = [value]
    map.put(key, created)
    return created
}

return this
16 changes: 11 additions & 5 deletions ci/finn_ci/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,17 @@
"""FINN CI helpers.

A small package, importable without the finn package installed, that backs the
FINN Jenkins CI pipeline and the pytest sharding plugin.
FINN Jenkins CI pipeline and the pytest sharding plugin. The build pipeline
drives the CLI with ``PYTHONPATH=ci python3 -m finn_ci <subcommand>``.

Submodules:
config - the CI board and stage tables and the pure helpers over them
sharding - deterministic weight-balanced group-to-shard assignment
jsonio - the JSON read helper shared across the package
plugin - the pytest plugin that selects a shard and captures timings
config - the CI board and stage tables and the pure helpers over them
sharding - deterministic weight-balanced group-to-shard assignment
jsonio - the JSON read/write helpers shared across the package
plugin - the pytest plugin that selects a shard and captures timings
timing - the self-maintaining per-group timing master and summaries
retention - image/artifact/snapshot tree rotation and pip-cache pruning
lsf - bjobs orphan-job parsing for the build reaper
failures - the stdlib JUnit failure printer
__main__ - the CLI dispatched by python3 -m finn_ci
"""
158 changes: 158 additions & 0 deletions ci/finn_ci/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Copyright (C) 2026, Advanced Micro Devices, Inc.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

"""CLI for the FINN CI pipeline: python3 -m finn_ci <subcommand>.

Run from a checkout with ci/ on PYTHONPATH (the Jenkinsfile uses
PYTHONPATH=ci python3 -m finn_ci ...). Each subcommand is a thin wrapper over a
finn_ci submodule so the Groovy side never re-implements the config, timing,
retention, or LSF parsing logic.
"""

import argparse
import json
import sys
from finn_ci import config, failures, lsf, retention, timing


def main(argv=None):
    """Run the finn_ci CLI and return its exit code.

    ValueError and AssertionError raised while dispatching (for example a
    validate_* failure on a malformed STAGES row) are printed to stderr as a
    one-line "ci_sharding:" message and mapped to exit code 2, so the
    Validate Jenkins console shows that line instead of a Python traceback.
    """
    try:
        exit_code = _dispatch(argv)
    except (ValueError, AssertionError) as exc:
        print("ci_sharding: {}".format(exc), file=sys.stderr)
        exit_code = 2
    return exit_code


def _build_parser():
    """Build the finn_ci argument parser with one subparser per subcommand."""
    parser = argparse.ArgumentParser(prog="finn_ci", description=__doc__)
    commands = parser.add_subparsers(dest="cmd")

    commands.add_parser("stage-choices-json")

    # validate-config is the one entry point the Validate stage in Jenkins
    # delegates to. Folds enabled_params / job_key / shard_plan into a single
    # subprocess and runs validate_config() first so a malformed row or orphan
    # zipArtifact board fails Validate loudly.
    validate = commands.add_parser("validate-config")
    validate.add_argument("--choice", required=True)
    validate.add_argument("--job-name", required=True)
    validate.add_argument("--stage-filter", default="")

    job_key = commands.add_parser("job-key")
    job_key.add_argument("name")

    lsf_jobs = commands.add_parser("lsf-parse-jobs")
    lsf_jobs.add_argument("--prefix", required=True)

    pip_cache = commands.add_parser("prune-pip-cache")
    pip_cache.add_argument("root")
    pip_cache.add_argument("keep")
    pip_cache.add_argument("max_age_days", type=int)
    pip_cache.add_argument("--dry-run", action="store_true")

    prepare = commands.add_parser("prepare")
    prepare.add_argument("--master", required=True)
    prepare.add_argument("--snapshot", required=True)

    summarize = commands.add_parser("summarize")
    summarize.add_argument("reports_dir")

    update = commands.add_parser("update")
    update.add_argument("--reports", required=True)
    update.add_argument("--master", default="")
    update.add_argument("--out", required=True)
    update.add_argument("--job", default="")
    update.add_argument("--build", default="")
    update.add_argument("--update-master", action="store_true")

    merge_maps = commands.add_parser("merge-maps")
    merge_maps.add_argument("reports_dir")

    print_failures = commands.add_parser("print-failures")
    print_failures.add_argument("junit_xml")
    print_failures.add_argument("stash")
    print_failures.add_argument("lines_per", type=int)
    print_failures.add_argument("max_fails", type=int)

    # One numbered-tree rotation for the image / artifact / snapshot trees.
    # retain_n and max_age_days come from RETENTION[kind], so a caller cannot
    # pass a window that disagrees with the documented policy.
    prune = commands.add_parser("prune")
    prune.add_argument("--kind", required=True, choices=tuple(retention.RETENTION))
    prune.add_argument("root")
    prune.add_argument("job_key")
    prune.add_argument("current_build")
    prune.add_argument("--dry-run", action="store_true")

    return parser


def _dispatch(argv):
    """Parse *argv* and run the selected subcommand, returning its exit code.

    Subcommands that print a result or only perform a side effect return 0.
    The timing and print-failures subcommands return whatever the wrapped
    finn_ci function returns. With no subcommand the help text is printed and
    2 is returned.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    cmd = args.cmd

    if cmd == "stage-choices-json":
        print(json.dumps(config.jenkins_stage_choices()))
        return 0

    if cmd == "validate-config":
        # Validate first so a bad table fails before any JSON is emitted.
        config.validate_config()
        payload = {
            "enabled_params": config.enabled_params_for_choice(args.choice),
            "job_key": config.job_key(args.job_name),
            "shard_plan": config.shard_plan(args.choice, args.stage_filter),
        }
        print(json.dumps(payload))
        return 0

    if cmd == "job-key":
        print(config.job_key(args.name))
        return 0

    if cmd == "lsf-parse-jobs":
        # The job listing to parse is read from stdin.
        listing = sys.stdin.read()
        print(json.dumps(lsf.parse_lsf_jobs(args.prefix, listing)))
        return 0

    if cmd == "prune-pip-cache":
        retention.prune_pip_cache(args.root, args.keep, args.max_age_days, args.dry_run)
        return 0

    if cmd == "prepare":
        return timing.prepare_timing_snapshot(args.master, args.snapshot)

    if cmd == "summarize":
        return timing.summarize_timings(args.reports_dir)

    if cmd == "update":
        metadata = {"job": args.job, "build": args.build}
        return timing.update_master(
            args.reports,
            args.master,
            args.out,
            update_persistent=args.update_master,
            metadata=metadata,
        )

    if cmd == "merge-maps":
        return timing.merge_maps(args.reports_dir)

    if cmd == "print-failures":
        return failures.print_failures(
            args.junit_xml, args.stash, args.lines_per, args.max_fails
        )

    if cmd == "prune":
        policy = retention.RETENTION[args.kind]
        pruners = {
            "image": retention.prune_images,
            "artifact": retention.prune_artifacts,
            "snapshot": retention.prune_snapshots,
        }
        prune_tree = pruners[args.kind]
        prune_tree(
            args.root,
            config.job_key(args.job_key),
            args.current_build,
            policy["retain"],
            policy["ageDays"],
            args.dry_run,
        )
        return 0

    # No subcommand given.
    parser.print_help()
    return 2


if __name__ == "__main__":
sys.exit(main())
22 changes: 2 additions & 20 deletions ci/scripts/print_pytest_failures.py → ci/finn_ci/failures.py
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
# Copyright (C) 2026, Advanced Micro Devices, Inc.
# All rights reserved.
#
Expand All @@ -9,26 +8,13 @@
Used by Jenkins to surface per-test failure context when there is no tool
log to tail (notebook timeouts, asserts, fixture errors). Pure stdlib so
it runs on any agent.

Usage: print_pytest_failures.py <junit_xml> <stash> <lines_per_failure> <max_failures>
"""

import re
import sys
import xml.etree.ElementTree as ET


def main(argv):
if len(argv) != 5:
print(
"Usage: print_pytest_failures.py <junit_xml> <stash> "
"<lines_per_failure> <max_failures>",
file=sys.stderr,
)
return 2
xml_path = argv[1]
stash = argv[2]
lines_per = int(argv[3])
max_fails = int(argv[4])
def print_failures(xml_path, stash, lines_per, max_fails):
tag = "[pytest-failures %s]" % stash
try:
root = ET.parse(xml_path).getroot()
Expand Down Expand Up @@ -81,7 +67,3 @@ def main(argv):
for ln in body_lines:
print(" %s" % ln)
return 0


if __name__ == "__main__":
sys.exit(main(sys.argv))
23 changes: 22 additions & 1 deletion ci/finn_ci/jsonio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
#
# SPDX-License-Identifier: BSD-3-Clause

"""JSON read helper shared across the finn_ci package."""
"""JSON read/write helpers shared across the finn_ci package."""

import json
import os
import sys
import tempfile


def read_json(path, default=None):
Expand All @@ -25,3 +27,22 @@ def read_json(path, default=None):
file=sys.stderr,
)
return default


def write_json_atomic(path, data):
    """Write *data* to *path* as JSON via a temp file in the same directory.

    The payload is serialized with ``indent=2`` and sorted keys, followed by
    a trailing newline, into a ``.tmp-*.json`` file next to *path*; that file
    is then moved over *path* in a single replace, so readers never observe a
    partially written file. The parent directory is created when missing.

    Raises:
        TypeError / ValueError: *data* cannot be serialized by ``json``.
        OSError: the directory, temp file or replace step failed.
    In every failure case the temp file is removed before the exception
    propagates and an existing *path* is left untouched.
    """
    parent = os.path.dirname(os.path.abspath(path))
    # exist_ok=True so two concurrent first-time callers on a shared NFS root
    # cannot race on mkdir.
    os.makedirs(parent, exist_ok=True)
    # Same directory as the target so the final move stays on one filesystem.
    fd, tmp = tempfile.mkstemp(prefix=".tmp-", suffix=".json", dir=parent)
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(data, f, indent=2, sort_keys=True)
            f.write("\n")
        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows when the destination already exists.
        os.replace(tmp, path)
    except BaseException:
        # BaseException (not Exception) so an interrupt or SystemExit midway
        # does not leave a stray .tmp-*.json behind. Cleanup is best-effort.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
Loading
Loading