148 lines
5.5 KiB
Nix
148 lines
5.5 KiB
Nix
|
|
{ pkgs, self, system }:
|
||
|
|
# NixOS VM test for nix-ota.
|
||
|
|
#
|
||
|
|
# Builds three "system closure" stand-ins at build time (each is a
|
||
|
|
# directory containing a marker file and a `bin/switch-to-configuration`
|
||
|
|
# stub), then drives the agent through three publishes:
|
||
|
|
# 1. publish A -> device switches to A
|
||
|
|
# 2. publish B -> device switches to B
|
||
|
|
# 3. publish C (broken: agent's healthCmd will fail) -> device rolls back to B
|
||
|
|
let
|
||
|
|
# mkClosure label extraScript
#
# Produces a derivation named `sys-<label>` that stands in for a NixOS system
# closure. Its output contains:
#   - `marker`: a file holding `label`
#   - `bin/switch-to-configuration`: an executable /bin/sh stub that logs
#     "applied <label>" to stderr, runs `extraScript`, and exits 0.
#
# `extraScript` is spliced verbatim into the stub, so it must be valid sh.
mkClosure = label: extraScript:
  let
    # Full text of the activation stub. It ends with a newline, which is what
    # lets the heredoc terminator below sit directly after the interpolation.
    activationStub = ''
      #!/bin/sh
      set -eu
      echo "applied ${label}" >&2
      ${extraScript}
      exit 0
    '';
  in
  pkgs.runCommand "sys-${label}" { } ''
    mkdir -p $out/bin
    echo "${label}" > $out/marker
    cat > $out/bin/switch-to-configuration <<'EOF'
    ${activationStub}EOF
    chmod +x $out/bin/switch-to-configuration
  '';
|
||
|
|
|
||
|
|
# Every test closure's activation stub just drops a per-label marker file in
# /run, which the test script later checks to prove the stub actually ran.
mkMarkerClosure = label: mkClosure label "touch /run/nix-ota-applied-${label}";

closureA = mkMarkerClosure "a";
closureB = mkMarkerClosure "b";
# Closure C itself activates cleanly. The rollback is triggered by the agent's
# healthCmd, which fails once /run/nix-ota-broken exists; the test script
# creates that file right before publishing C.
closureC = mkMarkerClosure "c";
|
||
|
|
|
||
|
|
# Binary cache signing keypair used only by this test (fixture; not secret).
#
# The keypair is NOT checked in: it is generated inside this derivation by
# `nix-store --generate-binary-cache-key`, so a fresh pair is produced whenever
# the derivation is (re)built. Outputs:
#   $out/secret - used by nix-serve on the server and by `nix store sign`
#   $out/public - read into the device's `cachePublicKey`
#
# HOME / NIX_STATE_DIR / NIX_STORE_DIR are redirected to scratch directories
# under $TMPDIR; presumably so nix-store does not try to use the real store or
# state directory from inside the build sandbox.
cacheKeys = pkgs.runCommand "test-cache-keys" { } ''
  mkdir -p "$out"
  export HOME="$TMPDIR"
  export NIX_STATE_DIR="$TMPDIR/state"
  export NIX_STORE_DIR="$TMPDIR/store"
  mkdir -p "$NIX_STATE_DIR" "$NIX_STORE_DIR"
  ${pkgs.nix}/bin/nix-store --generate-binary-cache-key cache.local "$out/secret" "$out/public"
'';
|
||
|
|
|
||
|
|
# Path to the `nix-ota` executable from `self`'s `nix-ota-publisher` package;
# the test script runs it on the server for `keygen` and `publish`.
pubBin = "${self.packages.${system}.nix-ota-publisher}/bin/nix-ota";
|
||
|
|
in
|
||
|
|
pkgs.testers.runNixOSTest {
|
||
|
|
name = "nix-ota";
|
||
|
|
nodes = {
|
||
|
|
# Server VM: runs the OTA server (port 8080) and a nix-serve binary cache
# (port 5000) that serves the three test closures.
server = { config, pkgs, lib, ... }: {
  imports = [ self.nixosModules.server ];

  nix.settings.experimental-features = [ "nix-command" "flakes" ];

  # The closures need to be in the server's store so nix-serve can serve them.
  system.extraDependencies = [ closureA closureB closureC ];

  services = {
    nix-ota-server = {
      enable = true;
      # Listen on all interfaces; the device VM connects via http://server:8080.
      listen = "0.0.0.0:8080";
      openFirewall = true;
      # Must match the `--token` value the test script passes to `publish`.
      publishTokenFile = pkgs.writeText "tok" "test-token";
    };

    nix-serve = {
      enable = true;
      port = 5000;
      secretKeyFile = "${cacheKeys}/secret";
    };
  };

  # Let the device reach the binary cache on the nix-serve port set above.
  networking.firewall.allowedTCPPorts = [ config.services.nix-serve.port ];
};
|
||
|
|
|
||
|
|
# Device VM: runs the OTA agent, pointed at the server VM for both the update
# channel and the binary cache.
device = { config, pkgs, lib, ... }: {
  imports = [ self.nixosModules.agent ];

  nix.settings = {
    experimental-features = [ "nix-command" "flakes" ];
    trusted-users = [ "root" ];
  };

  services.nix-ota-agent = {
    enable = true;

    # Identity and polling.
    deviceId = "vm-device-1";
    channel = "prod";
    interval = 5; # NOTE(review): presumably seconds; confirm against the agent module

    # Where updates come from.
    server = "http://server:8080";
    cacheUrl = "http://server:5000";

    # Manifest verification key: the file does not exist at boot; the test
    # script writes it after running `keygen` on the server.
    publicKeyFile = "/var/lib/nix-ota/public.key";
    # Read from the cacheKeys derivation output while evaluating this
    # configuration, so cacheKeys must be built during evaluation.
    cachePublicKey = builtins.readFile "${cacheKeys}/public";

    # Fails once /run/nix-ota-broken exists; the test script creates that file
    # before publishing closure C to force a rollback.
    healthCmd = "test ! -f /run/nix-ota-broken";
  };
};
|
||
|
|
};
|
||
|
|
|
||
|
|
# Python test-driver script. Nix interpolates the store paths below before the
# script runs; everything else is plain Python using the `server` and `device`
# machine objects.
testScript = ''
  closureA = "${closureA}"
  closureB = "${closureB}"
  closureC = "${closureC}"
  pubBin = "${pubBin}"


  def publish(store_path, rev):
      """Sign and publish a manifest for store_path on channel prod as revision rev."""
      command = " ".join(
          [
              f"{pubBin} publish",
              "--server http://localhost:8080",
              "--token test-token",
              "--key /root/keys/sign.key",
              "--channel prod",
              f"--store-path {store_path}",
              "--substituter http://server:5000",
              f"--revision {rev}",
          ]
      )
      server.succeed(command)


  def poll_agent():
      """Run one agent cycle and wait for it to finish."""
      # oneshot service: the unit's own exit status is deliberately ignored
      # (success or failure); the assertions after each poll decide the outcome.
      device.succeed("systemctl start --wait nix-ota-agent.service || true")


  def assert_active(store_path):
      """Assert that the device's system profile resolves to store_path."""
      device.succeed(f"readlink -f /nix/var/nix/profiles/system | grep -qF {store_path}")


  def assert_activated(label):
      """Assert that the activation stub of the closure with this label has run."""
      device.succeed(f"test -f /run/nix-ota-applied-{label}")


  start_all()
  server.wait_for_unit("nix-ota-server.service")
  server.wait_for_open_port(8080)
  server.wait_for_unit("nix-serve.service")
  server.wait_for_open_port(5000)
  # Drive the agent ourselves; disable the timer for deterministic stepping.
  device.succeed("systemctl stop nix-ota-agent.timer || true")

  # Sign the closures with the binary cache key so the device's Nix will accept them.
  for closure in (closureA, closureB, closureC):
      server.succeed(f"nix store sign --extra-experimental-features nix-command --key-file ${cacheKeys}/secret --recursive {closure}")

  # Operator generates a manifest signing key on the server host; keygen's
  # stdout is taken to be the public key.
  server.succeed("mkdir -p /root/keys")
  manifest_pubkey = server.succeed(f"{pubBin} keygen --out /root/keys/sign.key").strip()

  # Push pubkey onto the device's writable state dir.
  device.succeed("mkdir -p /var/lib/nix-ota")
  device.succeed(f"echo '{manifest_pubkey}' > /var/lib/nix-ota/public.key")

  # --- Steps 1 and 2: publish A, then B; each must become the active system.
  for rev, (label, store_path) in enumerate([("a", closureA), ("b", closureB)], start=1):
      publish(store_path, rev)
      poll_agent()
      assert_active(store_path)
      assert_activated(label)

  # --- Step 3: publish C with health check rigged to fail
  device.succeed("touch /run/nix-ota-broken")
  publish(closureC, 3)
  poll_agent()
  # Agent should have applied C, failed the health check, and rolled back to B.
  assert_active(closureB)
  # The activation script for C did run before health check.
  assert_activated("c")

  # The dashboard should reflect the rolled_back state.
  server.wait_until_succeeds(
      "curl -fsS http://localhost:8080/ | grep -Eq 'rolled_back|failed'", timeout=30
  )
'';
|
||
|
|
}
|