nix-ota/nix/tests/ota.nix

148 lines
5.5 KiB
Nix
Raw Normal View History

{ pkgs, self, system }:
# NixOS VM test for nix-ota.
#
# Builds three "system closure" stand-ins as ordinary derivations ahead of the
# test run (each is a directory containing a marker file and a
# `bin/switch-to-configuration` stub), then drives the agent through three publishes:
# 1. publish A -> device switches to A
# 2. publish B -> device switches to B
# 3. publish C (broken: agent's healthCmd will fail) -> device rolls back to B
let
# mkClosure label activationHook
#
# Builds a fake "system closure" derivation named sys-<label> that contains:
#   marker                       - the label followed by a newline
#   bin/switch-to-configuration  - executable /bin/sh stub that prints
#                                  "applied <label>" on stderr, runs
#                                  activationHook under `set -eu`, then exits 0
# Both arguments are spliced into the builder script by Nix before the shell
# sees it; the quoted heredoc delimiter only suppresses shell expansion while
# the stub is being written.
mkClosure = label: activationHook:
  pkgs.runCommand ("sys-" + label) { } ''
    mkdir -p $out/bin
    echo "${label}" > $out/marker
    stub=$out/bin/switch-to-configuration
    cat > "$stub" <<'EOF'
    #!/bin/sh
    set -eu
    echo "applied ${label}" >&2
    ${activationHook}
    exit 0
    EOF
    chmod +x "$stub"
  '';
# Every stand-in records its own activation by touching
# /run/nix-ota-applied-<label>; the test script checks for those files.
mkAppliedClosure = label: mkClosure label "touch /run/nix-ota-applied-${label}";
closureA = mkAppliedClosure "a";
closureB = mkAppliedClosure "b";
# Closure C itself activates without error. The rollback is triggered by the
# agent's healthCmd, which tests for /run/nix-ota-broken; the test script
# creates that file just before publishing C.
closureC = mkAppliedClosure "c";
# Binary cache signing keypair for the test (fixture only; not secret).
# The pair is created when this derivation is built, the same way as running:
#   nix-store --generate-binary-cache-key cache.local sec pub
# Outputs: $out/secret and $out/public, with key name "cache.local".
cacheKeys = pkgs.runCommand "test-cache-keys" { } ''
  # Point HOME and the Nix state/store directories at scratch space under TMPDIR.
  export HOME=$TMPDIR
  export NIX_STATE_DIR=$TMPDIR/state
  export NIX_STORE_DIR=$TMPDIR/store
  mkdir -p $out $NIX_STATE_DIR $NIX_STORE_DIR
  ${pkgs.nix}/bin/nix-store --generate-binary-cache-key cache.local $out/secret $out/public
'';
# Absolute path of the publisher CLI (`nix-ota`) taken from this flake's
# nix-ota-publisher package; the test script runs it on the server node.
pubBin =
  let
    publisher = self.packages.${system}.nix-ota-publisher;
  in
  "${publisher}/bin/nix-ota";
in
pkgs.testers.runNixOSTest {
name = "nix-ota";
nodes = {
# Server node: runs the nix-ota server on port 8080 and a nix-serve binary
# cache on port 5000, both reachable from the device node.
server = { pkgs, ... }: {
  imports = [ self.nixosModules.server ];

  nix.settings.experimental-features = [ "nix-command" "flakes" ];

  services = {
    nix-ota-server = {
      enable = true;
      listen = "0.0.0.0:8080";
      openFirewall = true;
      # Test-only token; the test script passes the same value via --token.
      publishTokenFile = pkgs.writeText "tok" "test-token";
    };

    nix-serve = {
      enable = true;
      port = 5000;
      # Secret half of the fixture keypair built by cacheKeys.
      secretKeyFile = "${cacheKeys}/secret";
    };
  };

  # Expose the nix-serve port to the device node.
  networking.firewall.allowedTCPPorts = [ 5000 ];

  # The closures need to be in the server's store so nix-serve can serve them.
  system.extraDependencies = [ closureA closureB closureC ];
};
# Device node: runs the nix-ota agent against the server node and uses the
# server's nix-serve instance as its cache.
device = { ... }: {
  imports = [ self.nixosModules.agent ];

  services.nix-ota-agent = {
    enable = true;
    server = "http://server:8080";
    channel = "prod";
    deviceId = "vm-device-1";
    # Not present at boot: the test script writes this file after generating
    # the manifest signing key on the server.
    publicKeyFile = "/var/lib/nix-ota/public.key";
    cacheUrl = "http://server:5000";
    # Read from the cacheKeys output while this expression is evaluated, so
    # cacheKeys has to be built before evaluation can finish.
    cachePublicKey = builtins.readFile "${cacheKeys}/public";
    # NOTE(review): unit is defined by the agent module (presumably seconds);
    # the test script stops the timer and triggers the agent by hand anyway.
    interval = 5;
    # Fails once /run/nix-ota-broken exists; step 3 of the test script uses
    # this to force a rollback.
    healthCmd = "test ! -f /run/nix-ota-broken";
  };

  nix.settings = {
    experimental-features = [ "nix-command" "flakes" ];
    trusted-users = [ "root" ];
  };
};
};
# Python test script run by the NixOS test driver. Nix interpolates the store
# paths below before Python sees the text; single-brace placeholders belong to
# Python f-strings.
testScript = ''
  closureA = "${closureA}"
  closureB = "${closureB}"
  closureC = "${closureC}"
  pubBin = "${pubBin}"

  start_all()
  server.wait_for_unit("nix-ota-server.service")
  server.wait_for_open_port(8080)
  server.wait_for_unit("nix-serve.service")
  server.wait_for_open_port(5000)

  # Drive the agent ourselves; disable the timer for deterministic stepping.
  device.succeed("systemctl stop nix-ota-agent.timer || true")

  # Sign the closures with the binary cache key so the device's Nix will accept them.
  for c in [closureA, closureB, closureC]:
      server.succeed(f"nix store sign --extra-experimental-features nix-command --key-file ${cacheKeys}/secret --recursive {c}")

  # Operator generates a manifest signing key on the server host.
  server.succeed("mkdir -p /root/keys")
  pub = server.succeed(f"{pubBin} keygen --out /root/keys/sign.key").strip()

  # Push pubkey onto the device's writable state dir.
  device.succeed("mkdir -p /var/lib/nix-ota")
  device.succeed(f"echo '{pub}' > /var/lib/nix-ota/public.key")


  def publish(store_path, rev):
      """Publish store_path on the prod channel as revision rev, signed with the operator key."""
      server.succeed(
          f"{pubBin} publish "
          f"--server http://localhost:8080 --token test-token "
          f"--key /root/keys/sign.key --channel prod "
          f"--store-path {store_path} --substituter http://server:5000 --revision {rev}"
      )


  def poll_agent():
      """Run one agent cycle and wait for it to finish, whether it succeeds or fails."""
      # oneshot service: start and wait for it to finish (success or failure).
      device.succeed("systemctl start --wait nix-ota-agent.service || true")


  def assert_current_system(store_path):
      """Fail unless the device's system profile resolves to store_path."""
      device.succeed(f"readlink -f /nix/var/nix/profiles/system | grep -qF {store_path}")


  # --- Step 1: publish A
  publish(closureA, 1)
  poll_agent()
  assert_current_system(closureA)
  device.succeed("test -f /run/nix-ota-applied-a")

  # --- Step 2: publish B
  publish(closureB, 2)
  poll_agent()
  assert_current_system(closureB)
  device.succeed("test -f /run/nix-ota-applied-b")

  # --- Step 3: publish C with health check rigged to fail
  device.succeed("touch /run/nix-ota-broken")
  publish(closureC, 3)
  poll_agent()
  # Agent should have applied C, failed the health check, and rolled back to B.
  assert_current_system(closureB)
  # The activation script for C did run before health check.
  device.succeed("test -f /run/nix-ota-applied-c")

  # The dashboard should reflect the rolled_back state.
  # grep runs without -q on purpose: the test driver executes commands with
  # pipefail, and a grep that exits at the first match can make curl fail on
  # a closed pipe even though the page matched. Discarding the output keeps
  # grep reading until curl is done.
  server.wait_until_succeeds(
      "curl -fsS http://localhost:8080/ | grep -E 'rolled_back|failed' >/dev/null",
      timeout=30,
  )
'';
}