Initial nix-ota implementation
Self-hostable OTA update system for NixOS fleets: a control server, device
agent, publisher CLI, and NixOS modules that ship prebuilt system closures
from a binary cache to devices that don't have the flake.

- crates/common: signed manifest types (ed25519), store-path validator
- crates/server: axum + sqlite + HTMX dashboard, channel/device API
- crates/agent: poll, verify signature + revision, nix copy, switch,
  health check, magic-rollback on failure
- crates/publisher: keygen + sign + publish CLI for operators/CI
- nix/modules: NixOS modules for server and agent
- nix/tests/ota.nix: end-to-end VM test exercising publish A -> B ->
  broken C -> rollback to B (passes)

The control server never holds the signing key; manifests are signed
offline and verified against a pinned public key on each device.
This commit is contained in:
commit
42b2ce4d1d
19 changed files with 4745 additions and 0 deletions
87
nix/modules/agent.nix
Normal file
87
nix/modules/agent.nix
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
# NixOS module for the nix-ota device agent.
#
# The module is a function of the flake's `self` so that the default package
# can be taken from the flake's own outputs. It declares
# `services.nix-ota-agent.*`, registers the binary cache with Nix, and runs
# the agent as a timer-driven oneshot systemd service.
self: { config, lib, pkgs, ... }:
let
  cfg = config.services.nix-ota-agent;
  inherit (lib) literalExpression mkEnableOption mkIf mkOption optionalAttrs types;
in {
  options.services.nix-ota-agent = {
    enable = mkEnableOption "nix-ota device agent";

    package = mkOption {
      type = types.package;
      # `pkgs.system` is a deprecated alias; read the platform from stdenv.
      default = self.packages.${pkgs.stdenv.hostPlatform.system}.nix-ota-agent;
      defaultText = literalExpression "self.packages.\${pkgs.stdenv.hostPlatform.system}.nix-ota-agent";
      description = "Package providing the `bin/nix-ota-agent` executable.";
    };

    server = mkOption {
      type = types.str;
      example = "https://ota.example.com";
      description = "URL of the control server, passed to the agent as NIX_OTA_SERVER.";
    };

    channel = mkOption {
      type = types.str;
      default = "prod";
      description = "Channel name, passed to the agent as NIX_OTA_CHANNEL.";
    };

    deviceId = mkOption {
      type = types.str;
      example = "device-001";
      description = "Identifier of this device, passed to the agent as NIX_OTA_DEVICE_ID.";
    };

    publicKey = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Base64-encoded ed25519 verifying key. The agent will reject manifests not signed by the matching private key. Mutually exclusive with publicKeyFile.";
    };

    publicKeyFile = mkOption {
      type = types.nullOr types.path;
      default = null;
      description = "Path to a file containing the base64-encoded verifying key. Use this if you need to write the key at runtime (e.g. from an orchestration system).";
    };

    interval = mkOption {
      # Must be positive: the value becomes the timer's OnUnitActiveSec period.
      type = types.ints.positive;
      default = 60;
      description = "Seconds between timer-triggered runs of the agent.";
    };

    healthCmd = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Health check command, passed to the agent as NIX_OTA_HEALTH_CMD. The variable is left unset when this is null.";
    };

    cacheUrl = mkOption {
      type = types.str;
      description = "Substituter URL added to nix.settings.substituters so `nix copy` can fetch from it.";
    };

    cachePublicKey = mkOption {
      type = types.str;
      description = "Trusted public key of the binary cache (the one that signs store paths).";
    };
  };

  config = mkIf cfg.enable {
    # Exactly one key source must be configured (logical XOR).
    assertions = [{
      assertion = (cfg.publicKey != null) != (cfg.publicKeyFile != null);
      message = "services.nix-ota-agent: set exactly one of publicKey or publicKeyFile.";
    }];

    nix.settings = {
      substituters = [ cfg.cacheUrl ];
      trusted-public-keys = [ cfg.cachePublicKey ];
      experimental-features = [ "nix-command" "flakes" ];
    };

    # An inline key is materialised at the path the agent reads by default.
    environment.etc."nix-ota/public.key" = mkIf (cfg.publicKey != null) {
      text = cfg.publicKey;
    };

    systemd.services.nix-ota-agent = {
      description = "nix-ota device agent (oneshot)";
      after = [ "network-online.target" ];
      wants = [ "network-online.target" ];

      # The agent activates new system generations from inside this unit.
      # Without these two settings, activating a generation in which this
      # unit changed (or was removed) would stop or restart the agent in the
      # middle of its own update. NixOS's own nixos-upgrade service sets the
      # same pair for the same reason.
      restartIfChanged = false;
      unitConfig.X-StopOnRemoval = false;

      serviceConfig = {
        Type = "oneshot";
        # Creates /var/lib/nix-ota, which NIX_OTA_STATE_DIR points at.
        StateDirectory = "nix-ota";
      };

      environment = {
        NIX_OTA_SERVER = cfg.server;
        NIX_OTA_CHANNEL = cfg.channel;
        NIX_OTA_DEVICE_ID = cfg.deviceId;
        NIX_OTA_PUBLIC_KEY_FILE =
          if cfg.publicKeyFile != null
          then toString cfg.publicKeyFile
          else "/etc/nix-ota/public.key";
        NIX_OTA_STATE_DIR = "/var/lib/nix-ota";
      } // optionalAttrs (cfg.healthCmd != null) {
        NIX_OTA_HEALTH_CMD = cfg.healthCmd;
      };

      script = ''
        export PATH=${lib.makeBinPath [ pkgs.nix pkgs.systemd pkgs.coreutils pkgs.bash ]}:$PATH
        exec ${cfg.package}/bin/nix-ota-agent --once
      '';
    };

    systemd.timers.nix-ota-agent = {
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnBootSec = "1min";
        OnUnitActiveSec = "${toString cfg.interval}s";
        Unit = "nix-ota-agent.service";
      };
    };
  };
}
|
||||
53
nix/modules/server.nix
Normal file
53
nix/modules/server.nix
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# NixOS module for the nix-ota control server.
#
# The module is a function of the flake's `self` so that the default package
# can be taken from the flake's own outputs. It declares
# `services.nix-ota-server.*`, creates a dedicated `nix-ota` system user and
# group, and runs the server as a systemd service.
self: { config, lib, pkgs, ... }:
let
  cfg = config.services.nix-ota-server;
  inherit (lib) escapeShellArg literalExpression mkEnableOption mkIf mkOption optionalString types;
in {
  options.services.nix-ota-server = {
    enable = mkEnableOption "nix-ota control server";

    package = mkOption {
      type = types.package;
      # `pkgs.system` is a deprecated alias; read the platform from stdenv.
      default = self.packages.${pkgs.stdenv.hostPlatform.system}.nix-ota-server;
      defaultText = literalExpression "self.packages.\${pkgs.stdenv.hostPlatform.system}.nix-ota-server";
      description = "Package providing the `bin/nix-ota-server` executable.";
    };

    listen = mkOption {
      type = types.str;
      default = "0.0.0.0:8080";
      description = "Address passed to the server's `--listen` flag. When openFirewall is set, the text after the last `:` is used as the TCP port to open.";
    };

    dataDir = mkOption {
      type = types.path;
      default = "/var/lib/nix-ota-server";
      description = "Home and working directory of the service user; the database is stored here as `nix-ota.db`.";
    };

    publishTokenFile = mkOption {
      type = types.nullOr types.path;
      default = null;
      description = "Path to a file containing the bearer token for /publish.";
    };

    openFirewall = mkOption {
      type = types.bool;
      default = false;
      description = "Whether to open the TCP port of the listen address in the firewall.";
    };
  };

  config = mkIf cfg.enable {
    users.users.nix-ota = {
      isSystemUser = true;
      group = "nix-ota";
      home = cfg.dataDir;
      createHome = true;
    };
    users.groups.nix-ota = { };

    systemd.services.nix-ota-server = {
      description = "nix-ota control server";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];

      serviceConfig = {
        User = "nix-ota";
        Group = "nix-ota";
        WorkingDirectory = cfg.dataDir;
        Restart = "on-failure";
        StateDirectory = "nix-ota-server";
      };

      # The token is assigned first and exported afterwards on purpose:
      # `export VAR="$(cmd)"` returns the exit status of `export`, which would
      # hide a failing `cat` and start the server with an empty token. With a
      # plain assignment the script aborts (NixOS runs unit scripts with
      # `set -e`) when the token file cannot be read.
      #
      # All interpolated values are shell-quoted so that unusual characters in
      # paths or in the listen address cannot split or alter the command line.
      script = ''
        ${optionalString (cfg.publishTokenFile != null) ''
          NIX_OTA_PUBLISH_TOKEN="$(cat ${escapeShellArg "${cfg.publishTokenFile}"})"
          export NIX_OTA_PUBLISH_TOKEN
        ''}
        exec ${cfg.package}/bin/nix-ota-server \
          --listen ${escapeShellArg cfg.listen} \
          --db ${escapeShellArg "${cfg.dataDir}/nix-ota.db"}
      '';
    };

    networking.firewall = mkIf cfg.openFirewall {
      # The port is whatever follows the last ':' of the listen address.
      allowedTCPPorts = [ (lib.toInt (lib.last (lib.splitString ":" cfg.listen))) ];
    };
  };
}
|
||||
147
nix/tests/ota.nix
Normal file
147
nix/tests/ota.nix
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
{ pkgs, self, system }:
# End-to-end NixOS VM test for nix-ota.
#
# Three stand-in "system closures" are built. Each is a directory holding a
# marker file and a stub `bin/switch-to-configuration`. The agent is then
# stepped through three publishes:
#   1. publish A -> device switches to A
#   2. publish B -> device switches to B
#   3. publish C (broken: the agent's healthCmd will fail) -> device rolls back to B
let
  # Builds one stand-in closure. `extraScript` is spliced into the activation
  # stub so the test can observe that the stub ran.
  mkClosure = label: extraScript: pkgs.runCommand "sys-${label}" {} ''
    mkdir -p $out/bin
    echo "${label}" > $out/marker
    cat > $out/bin/switch-to-configuration <<'EOF'
    #!/bin/sh
    set -eu
    echo "applied ${label}" >&2
    ${extraScript}
    exit 0
    EOF
    chmod +x $out/bin/switch-to-configuration
  '';

  # Every closure's stub drops /run/nix-ota-applied-<label> when it runs.
  mkMarkedClosure = label: mkClosure label "touch /run/nix-ota-applied-${label}";

  closureA = mkMarkedClosure "a";
  closureB = mkMarkedClosure "b";
  # Closure C activates fine; the rollback is forced by the device's healthCmd,
  # which fails once the test script has created /run/nix-ota-broken.
  closureC = mkMarkedClosure "c";

  # Binary cache keypair generated at build time by this derivation (test
  # fixture; not secret). Equivalent to running:
  #   nix-store --generate-binary-cache-key cache.local sec pub
  cacheKeys = pkgs.runCommand "test-cache-keys" {} ''
    mkdir -p $out
    export HOME=$TMPDIR
    export NIX_STATE_DIR=$TMPDIR/state
    export NIX_STORE_DIR=$TMPDIR/store
    mkdir -p $NIX_STATE_DIR $NIX_STORE_DIR
    ${pkgs.nix}/bin/nix-store --generate-binary-cache-key cache.local $out/secret $out/public
  '';

  pubBin = "${self.packages.${system}.nix-ota-publisher}/bin/nix-ota";
in
pkgs.testers.runNixOSTest {
  name = "nix-ota";

  # Control server plus a nix-serve binary cache on the same host.
  nodes.server = { pkgs, ... }: {
    imports = [ self.nixosModules.server ];

    nix.settings.experimental-features = [ "nix-command" "flakes" ];

    services.nix-ota-server.enable = true;
    services.nix-ota-server.listen = "0.0.0.0:8080";
    services.nix-ota-server.openFirewall = true;
    services.nix-ota-server.publishTokenFile = pkgs.writeText "tok" "test-token";

    services.nix-serve.enable = true;
    services.nix-serve.port = 5000;
    services.nix-serve.secretKeyFile = "${cacheKeys}/secret";

    networking.firewall.allowedTCPPorts = [ 5000 ];

    # nix-serve can only serve what is in the server's store, so pull the
    # closures into the server's system closure.
    system.extraDependencies = [ closureA closureB closureC ];
  };

  # The device under test, running the agent.
  nodes.device = { ... }: {
    imports = [ self.nixosModules.agent ];

    services.nix-ota-agent = {
      enable = true;
      server = "http://server:8080";
      channel = "prod";
      deviceId = "vm-device-1";
      # Written by the test script at runtime, after the signing key exists.
      publicKeyFile = "/var/lib/nix-ota/public.key";
      cacheUrl = "http://server:5000";
      # Read from the cacheKeys build output at evaluation time.
      cachePublicKey = builtins.readFile "${cacheKeys}/public";
      interval = 5;
      healthCmd = "test ! -f /run/nix-ota-broken";
    };

    nix.settings = {
      experimental-features = [ "nix-command" "flakes" ];
      trusted-users = [ "root" ];
    };
  };

  testScript = ''
    closure_a = "${closureA}"
    closure_b = "${closureB}"
    closure_c = "${closureC}"
    publisher = "${pubBin}"
    cache_secret = "${cacheKeys}/secret"


    def publish(store_path, rev):
        # Sign and upload a manifest for store_path at the given revision.
        server.succeed(
            f"{publisher} publish "
            f"--server http://localhost:8080 --token test-token "
            f"--key /root/keys/sign.key --channel prod "
            f"--store-path {store_path} --substituter http://server:5000 --revision {rev}"
        )


    def poll_agent():
        # Oneshot service: start it and wait until it finishes, whether it
        # succeeded or failed.
        device.succeed("systemctl start --wait nix-ota-agent.service || true")


    def expect_current_system(store_path):
        # The system profile must resolve to the given closure.
        device.succeed(f"readlink -f /nix/var/nix/profiles/system | grep -qF {store_path}")


    start_all()
    server.wait_for_unit("nix-ota-server.service")
    server.wait_for_open_port(8080)
    server.wait_for_unit("nix-serve.service")
    server.wait_for_open_port(5000)

    # The test triggers the agent by hand; stop the timer so each step is
    # deterministic.
    device.succeed("systemctl stop nix-ota-agent.timer || true")

    # Sign the closures with the binary cache key so the device's Nix accepts them.
    for path in (closure_a, closure_b, closure_c):
        server.succeed(f"nix store sign --extra-experimental-features nix-command --key-file {cache_secret} --recursive {path}")

    # The operator generates a manifest signing key on the server host.
    server.succeed("mkdir -p /root/keys")
    pub = server.succeed(f"{publisher} keygen --out /root/keys/sign.key").strip()

    # Put the verifying key into the device's writable state directory.
    device.succeed("mkdir -p /var/lib/nix-ota")
    device.succeed(f"echo '{pub}' > /var/lib/nix-ota/public.key")

    # --- Step 1: publish A
    publish(closure_a, 1)
    poll_agent()
    expect_current_system(closure_a)
    device.succeed("test -f /run/nix-ota-applied-a")

    # --- Step 2: publish B
    publish(closure_b, 2)
    poll_agent()
    expect_current_system(closure_b)
    device.succeed("test -f /run/nix-ota-applied-b")

    # --- Step 3: publish C with the health check rigged to fail
    device.succeed("touch /run/nix-ota-broken")
    publish(closure_c, 3)
    poll_agent()
    # The agent should have applied C, failed the health check, and rolled back to B.
    expect_current_system(closure_b)
    # C's activation stub did run before the health check.
    device.succeed("test -f /run/nix-ota-applied-c")

    # The dashboard should report the rolled_back (or failed) state.
    server.wait_until_succeeds(
        "curl -fsS http://localhost:8080/ | grep -Eq 'rolled_back|failed'", timeout=30
    )
  '';
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue