Initial nix-ota implementation
Self-hostable OTA update system for NixOS fleets: a control server, device agent, publisher CLI, and NixOS modules that ship prebuilt system closures from a binary cache to devices that don't have the flake.

- crates/common: signed manifest types (ed25519), store-path validator
- crates/server: axum + sqlite + HTMX dashboard, channel/device API
- crates/agent: poll, verify signature + revision, nix copy, switch, health check, magic-rollback on failure
- crates/publisher: keygen + sign + publish CLI for operators/CI
- nix/modules: NixOS modules for server and agent
- nix/tests/ota.nix: end-to-end VM test exercising publish A -> B -> broken C -> rollback to B (passes)

The control server never holds the signing key; manifests are signed offline and verified against a pinned public key on each device.
This commit is contained in:
commit
42b2ce4d1d
19 changed files with 4745 additions and 0 deletions
147
nix/tests/ota.nix
Normal file
147
nix/tests/ota.nix
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
{ pkgs, self, system }:
|
||||
# NixOS VM test for nix-ota.
|
||||
#
|
||||
# Builds three "system closure" stand-ins as ordinary derivations (each is a
|
||||
# directory containing a marker file and a `bin/switch-to-configuration`
|
||||
# stub), then drives the agent through three publishes:
|
||||
# 1. publish A -> device switches to A
|
||||
# 2. publish B -> device switches to B
|
||||
# 3. publish C (broken: agent's healthCmd will fail) -> device rolls back to B
|
||||
let
|
||||
mkClosure = label: extraScript:
  # Produce a stand-in "system closure" named sys-<label>:
  #   $out/marker                       contains the label
  #   $out/bin/switch-to-configuration  stub that logs "applied <label>" to
  #                                     stderr, runs `extraScript`, then exits 0
  pkgs.runCommand "sys-${label}" { } ''
    activate="$out/bin/switch-to-configuration"

    mkdir -p "$out/bin"
    echo "${label}" > "$out/marker"

    # The heredoc delimiter is quoted so the shell copies the body verbatim;
    # label and extraScript were already substituted by Nix.
    cat > "$activate" <<'EOF'
    #!/bin/sh
    set -eu
    echo "applied ${label}" >&2
    ${extraScript}
    exit 0
    EOF

    chmod +x "$activate"
  '';
|
||||
|
||||
closureA = mkClosure "a" "touch /run/nix-ota-applied-a";
|
||||
closureB = mkClosure "b" "touch /run/nix-ota-applied-b";
|
||||
# Closure C activates fine, but the healthCmd checks for /run/nix-ota-broken
|
||||
# which we create before publishing C, forcing rollback.
|
||||
closureC = mkClosure "c" "touch /run/nix-ota-applied-c";
|
||||
|
||||
# Binary cache keypair, generated when this derivation is built (test fixture; not secret).
|
||||
# Generated with: nix-store --generate-binary-cache-key cache.local sec pub
|
||||
cacheKeys = pkgs.runCommand "test-cache-keys" { } ''
  # Point HOME and Nix's state/store directories at scratch locations under
  # $TMPDIR before invoking nix-store.
  export HOME="$TMPDIR"
  export NIX_STATE_DIR="$TMPDIR/state"
  export NIX_STORE_DIR="$TMPDIR/store"
  mkdir -p "$out" "$NIX_STATE_DIR" "$NIX_STORE_DIR"

  # Key name is cache.local; the secret key lands in $out/secret and the
  # public key in $out/public.
  ${pkgs.nix}/bin/nix-store --generate-binary-cache-key cache.local "$out/secret" "$out/public"
'';
|
||||
|
||||
pubBin = "${self.packages.${system}.nix-ota-publisher}/bin/nix-ota";
|
||||
in
|
||||
pkgs.testers.runNixOSTest {
|
||||
name = "nix-ota";
|
||||
nodes = {
|
||||
server = { config, pkgs, lib, ... }: {
  imports = [ self.nixosModules.server ];

  nix.settings.experimental-features = [ "nix-command" "flakes" ];

  services = {
    # OTA control server on port 8080. The token file holds "test-token",
    # the same value the test script passes to `publish --token`.
    nix-ota-server = {
      enable = true;
      listen = "0.0.0.0:8080";
      openFirewall = true;
      publishTokenFile = pkgs.writeText "tok" "test-token";
    };

    # Binary cache on port 5000, configured with the fixture secret key.
    nix-serve = {
      enable = true;
      port = 5000;
      secretKeyFile = "${cacheKeys}/secret";
    };
  };

  # nix-serve's port is opened explicitly; the device uses http://server:5000
  # as its cache URL.
  networking.firewall.allowedTCPPorts = [ 5000 ];

  # Keep all three closures in the server's store so nix-serve can serve them.
  system.extraDependencies = [ closureA closureB closureC ];
};
|
||||
|
||||
device = { config, pkgs, lib, ... }: {
  imports = [ self.nixosModules.agent ];

  nix.settings = {
    experimental-features = [ "nix-command" "flakes" ];
    trusted-users = [ "root" ];
  };

  services.nix-ota-agent = {
    enable = true;
    deviceId = "vm-device-1";
    channel = "prod";
    server = "http://server:8080";

    # Manifest verification key; the test script writes this file at runtime.
    publicKeyFile = "/var/lib/nix-ota/public.key";

    # Binary cache served by nix-serve on the server node.
    cacheUrl = "http://server:5000";
    cachePublicKey = builtins.readFile "${cacheKeys}/public";

    # NOTE(review): unit of `interval` is not visible here (presumably seconds) — confirm in the agent module.
    interval = 5;

    # Fails exactly when /run/nix-ota-broken exists; the test script creates
    # that file before publishing closure C.
    healthCmd = "test ! -f /run/nix-ota-broken";
  };
};
|
||||
};
|
||||
|
||||
testScript = ''
|
||||
# Store paths of the three stand-in closures and the publisher CLI,
# substituted by Nix when the test script is generated.
closureA = "${closureA}"
closureB = "${closureB}"
closureC = "${closureC}"
pubBin = "${pubBin}"

start_all()

# Wait until both server-side services are up and listening.
for unit, port in (("nix-ota-server.service", 8080), ("nix-serve.service", 5000)):
    server.wait_for_unit(unit)
    server.wait_for_open_port(port)

# The test triggers the agent by hand, so stop its timer to keep each step
# deterministic ("|| true": a failing stop command is tolerated).
device.succeed("systemctl stop nix-ota-agent.timer || true")
|
||||
|
||||
# Sign every closure (recursively) with the binary cache secret key so the
# device's Nix will accept them.
sign_cmd = "nix store sign --extra-experimental-features nix-command --key-file ${cacheKeys}/secret --recursive"
for closure in (closureA, closureB, closureC):
    server.succeed(f"{sign_cmd} {closure}")
|
||||
|
||||
# The operator creates the manifest signing key on the server host; the
# stripped output of keygen is treated as the public key.
server.succeed("mkdir -p /root/keys")
manifest_pubkey = server.succeed(f"{pubBin} keygen --out /root/keys/sign.key").strip()

# Install the public key in the device's writable state directory, at the
# path the agent's publicKeyFile option points to.
device.succeed("mkdir -p /var/lib/nix-ota")
device.succeed(f"echo '{manifest_pubkey}' > /var/lib/nix-ota/public.key")
|
||||
|
||||
def publish(store_path, rev):
    """Run the publisher CLI on the server to publish store_path to channel "prod" as revision rev."""
    argv = [
        f"{pubBin} publish",
        "--server http://localhost:8080",
        "--token test-token",
        "--key /root/keys/sign.key",
        "--channel prod",
        f"--store-path {store_path}",
        "--substituter http://server:5000",
        f"--revision {rev}",
    ]
    # Joined with single spaces, this is the same command line as before.
    server.succeed(" ".join(argv))
|
||||
|
||||
def poll_agent():
    """Run the agent's oneshot service once and block until it has finished.

    The trailing "|| true" means a failed agent run does not fail the test
    here; callers assert on the resulting device state instead.
    """
    device.succeed("systemctl start --wait nix-ota-agent.service || true")
|
||||
|
||||
def assert_active_system(store_path):
    """Fail unless the device's system profile resolves to store_path."""
    device.succeed(f"readlink -f /nix/var/nix/profiles/system | grep -qF {store_path}")


def assert_switch_ran(label):
    """Fail unless the switch stub of closure <label> left its marker in /run."""
    device.succeed(f"test -f /run/nix-ota-applied-{label}")


# --- Steps 1 and 2: publish A, then B; each must become the active system.
for store_path, revision, label in ((closureA, 1, "a"), (closureB, 2, "b")):
    publish(store_path, revision)
    poll_agent()
    assert_active_system(store_path)
    assert_switch_ran(label)

# --- Step 3: publish C with the health check rigged to fail.
device.succeed("touch /run/nix-ota-broken")
publish(closureC, 3)
poll_agent()
# The agent should have applied C, failed the health check, and rolled back to B.
assert_active_system(closureB)
# C's switch stub ran before the health check did.
assert_switch_ran("c")
|
||||
|
||||
# The dashboard page should mention the outcome of the bad update: either
# "rolled_back" or "failed" is accepted. Retry for up to 30 seconds.
dashboard_probe = "curl -fsS http://localhost:8080/ | grep -Eq 'rolled_back|failed'"
server.wait_until_succeeds(dashboard_probe, timeout=30)
|
||||
'';
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue