Initial nix-ota implementation

Self-hostable OTA update system for NixOS fleets: a control server,
device agent, publisher CLI, and NixOS modules that ship prebuilt
system closures from a binary cache to devices that don't have the
flake.

- crates/common: signed manifest types (ed25519), store-path validator
- crates/server: axum + sqlite + HTMX dashboard, channel/device API
- crates/agent: poll, verify signature + revision, nix copy, switch,
  health check, magic-rollback on failure
- crates/publisher: keygen + sign + publish CLI for operators/CI
- nix/modules: NixOS modules for server and agent
- nix/tests/ota.nix: end-to-end VM test exercising publish A -> B ->
  broken C -> rollback to B (passes)

The control server never holds the signing key; manifests are signed
offline and verified against a pinned public key on each device.
This commit is contained in:
0m.ax 2026-05-25 14:58:42 +02:00
commit 42b2ce4d1d
19 changed files with 4745 additions and 0 deletions

87
nix/modules/agent.nix Normal file
View file

@ -0,0 +1,87 @@
# NixOS module for the nix-ota device agent.
#
# The module is curried over the flake's `self` so the default package can be
# taken from the flake's own outputs. When enabled it:
#   * registers the binary cache (substituter + trusted key) with Nix,
#   * installs the manifest verifying key under /etc when given inline,
#   * defines a oneshot service that runs `nix-ota-agent --once`, and
#   * defines a timer that re-runs that service every `interval` seconds.
self: { config, lib, pkgs, ... }:
let
  cfg = config.services.nix-ota-agent;
  inherit (lib) mkEnableOption mkOption mkIf optionalAttrs types;

  # Where the agent reads the verifying key from: the operator-supplied file
  # if set, otherwise the /etc entry generated from `publicKey`.
  # `toString` keeps a path-literal value from being copied into the store.
  publicKeyPath =
    if cfg.publicKeyFile != null
    then toString cfg.publicKeyFile
    else "/etc/nix-ota/public.key";
in
{
  options.services.nix-ota-agent = {
    enable = mkEnableOption "nix-ota device agent";

    package = mkOption {
      type = types.package;
      # `pkgs.system` is a deprecated alias; use the host platform directly.
      default = self.packages.${pkgs.stdenv.hostPlatform.system}.nix-ota-agent;
      description = "Package providing the `nix-ota-agent` binary.";
    };

    server = mkOption {
      type = types.str;
      example = "https://ota.example.com";
      description = "Base URL of the nix-ota control server (exported to the agent as NIX_OTA_SERVER).";
    };

    channel = mkOption {
      type = types.str;
      default = "prod";
      description = "Channel this device follows (exported to the agent as NIX_OTA_CHANNEL).";
    };

    deviceId = mkOption {
      type = types.str;
      example = "device-001";
      description = "Identifier for this device (exported to the agent as NIX_OTA_DEVICE_ID).";
    };

    publicKey = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Base64-encoded ed25519 verifying key. The agent will reject manifests not signed by the matching private key. Mutually exclusive with publicKeyFile.";
    };

    publicKeyFile = mkOption {
      type = types.nullOr types.path;
      default = null;
      description = "Path to a file containing the base64-encoded verifying key. Use this if you need to write the key at runtime (e.g. from an orchestration system).";
    };

    interval = mkOption {
      # Must be strictly positive: it is rendered into the timer's
      # OnUnitActiveSec, where zero or a negative value is not a usable period.
      type = types.ints.positive;
      default = 60;
      description = "Seconds between agent runs, measured from the previous activation of the service.";
    };

    healthCmd = mkOption {
      type = types.nullOr types.str;
      default = null;
      description = "Optional health-check command exported to the agent as NIX_OTA_HEALTH_CMD. When null the variable is not set.";
    };

    cacheUrl = mkOption {
      type = types.str;
      description = "Substituter URL added to nix.settings.substituters so `nix copy` can fetch from it.";
    };

    cachePublicKey = mkOption {
      type = types.str;
      description = "Trusted public key of the binary cache (the one that signs store paths).";
    };
  };

  config = mkIf cfg.enable {
    # Exactly one key source must be configured (boolean XOR).
    assertions = [
      {
        assertion = (cfg.publicKey != null) != (cfg.publicKeyFile != null);
        message = "services.nix-ota-agent: set exactly one of publicKey or publicKeyFile.";
      }
    ];

    nix.settings = {
      substituters = [ cfg.cacheUrl ];
      trusted-public-keys = [ cfg.cachePublicKey ];
      experimental-features = [ "nix-command" "flakes" ];
    };

    # Only materialise the /etc key when it was given inline.
    environment.etc."nix-ota/public.key" = mkIf (cfg.publicKey != null) {
      text = cfg.publicKey;
    };

    systemd.services.nix-ota-agent = {
      description = "nix-ota device agent (oneshot)";
      after = [ "network-online.target" ];
      wants = [ "network-online.target" ];

      # Tools made available on the agent's PATH. Using the system's configured
      # Nix (rather than a pinned pkgs.nix) keeps the client in step with the
      # daemon it talks to.
      path = [ config.nix.package pkgs.systemd pkgs.coreutils pkgs.bash ];

      serviceConfig = {
        Type = "oneshot";
        # Creates /var/lib/nix-ota, matching NIX_OTA_STATE_DIR below.
        StateDirectory = "nix-ota";
        ExecStart = "${cfg.package}/bin/nix-ota-agent --once";
      };

      environment = {
        NIX_OTA_SERVER = cfg.server;
        NIX_OTA_CHANNEL = cfg.channel;
        NIX_OTA_DEVICE_ID = cfg.deviceId;
        NIX_OTA_PUBLIC_KEY_FILE = publicKeyPath;
        NIX_OTA_STATE_DIR = "/var/lib/nix-ota";
      } // optionalAttrs (cfg.healthCmd != null) {
        NIX_OTA_HEALTH_CMD = cfg.healthCmd;
      };
    };

    systemd.timers.nix-ota-agent = {
      wantedBy = [ "timers.target" ];
      timerConfig = {
        # First run one minute after boot, then every `interval` seconds.
        OnBootSec = "1min";
        OnUnitActiveSec = "${toString cfg.interval}s";
        Unit = "nix-ota-agent.service";
      };
    };
  };
}

53
nix/modules/server.nix Normal file
View file

@ -0,0 +1,53 @@
# NixOS module for the nix-ota control server.
#
# The module is curried over the flake's `self` so the default package can be
# taken from the flake's own outputs. When enabled it creates a dedicated
# `nix-ota` system user/group, runs the server under it with its SQLite
# database inside `dataDir`, and optionally opens the listen port.
self: { config, lib, pkgs, ... }:
let
  cfg = config.services.nix-ota-server;
  inherit (lib) mkEnableOption mkOption mkIf optionalAttrs optionalString types;

  # `types.path` also accepts Nix path literals; interpolating those directly
  # would copy the target into the (world-readable) store. `toString` keeps
  # them as plain filesystem paths.
  dataDir = toString cfg.dataDir;
  hasToken = cfg.publishTokenFile != null;

  # systemd's StateDirectory= is relative to /var/lib, so it can only manage
  # dataDir when dataDir lives there.
  stateRoot = "/var/lib/";
  dataDirIsState = lib.hasPrefix stateRoot dataDir;

  # Name under which the token is exposed in $CREDENTIALS_DIRECTORY.
  tokenCredential = "publish-token";
in
{
  options.services.nix-ota-server = {
    enable = mkEnableOption "nix-ota control server";

    package = mkOption {
      type = types.package;
      # `pkgs.system` is a deprecated alias; use the host platform directly.
      default = self.packages.${pkgs.stdenv.hostPlatform.system}.nix-ota-server;
      description = "Package providing the `nix-ota-server` binary.";
    };

    listen = mkOption {
      type = types.str;
      default = "0.0.0.0:8080";
      description = "Address passed to `--listen`, in `host:port` form. The port after the last colon is what `openFirewall` opens.";
    };

    dataDir = mkOption {
      type = types.path;
      default = "/var/lib/nix-ota-server";
      description = "Working directory of the server and home of the `nix-ota` user; the database is stored here as `nix-ota.db`.";
    };

    publishTokenFile = mkOption {
      type = types.nullOr types.path;
      default = null;
      description = "Path to a file containing the bearer token for /publish.";
    };

    openFirewall = mkOption {
      type = types.bool;
      default = false;
      description = "Whether to open the TCP port from `listen` in the firewall.";
    };
  };

  config = mkIf cfg.enable {
    users.users.nix-ota = {
      isSystemUser = true;
      group = "nix-ota";
      home = dataDir;
      createHome = true;
    };
    users.groups.nix-ota = { };

    systemd.services.nix-ota-server = {
      description = "nix-ota control server";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];

      serviceConfig = {
        User = "nix-ota";
        Group = "nix-ota";
        WorkingDirectory = dataDir;
        Restart = "on-failure";
      } // optionalAttrs dataDirIsState {
        # Follows dataDir instead of a hard-coded name, so a custom location
        # under /var/lib is the directory systemd actually prepares.
        StateDirectory = lib.removePrefix stateRoot dataDir;
      } // optionalAttrs hasToken {
        # Let systemd hand the token to the service, so the file itself does
        # not need to be readable by the unprivileged `nix-ota` user.
        LoadCredential = "${tokenCredential}:${toString cfg.publishTokenFile}";
      };

      script = ''
        ${optionalString hasToken ''
          # Assign first, export second: a failed read then aborts the script
          # instead of being masked by `export`'s own exit status.
          NIX_OTA_PUBLISH_TOKEN="$(cat "$CREDENTIALS_DIRECTORY/${tokenCredential}")"
          export NIX_OTA_PUBLISH_TOKEN
        ''}
        exec ${cfg.package}/bin/nix-ota-server \
          --listen ${lib.escapeShellArg cfg.listen} \
          --db ${lib.escapeShellArg "${dataDir}/nix-ota.db"}
      '';
    };

    networking.firewall = mkIf cfg.openFirewall {
      # The port is whatever follows the last ':' in `listen`.
      allowedTCPPorts = [ (lib.toInt (lib.last (lib.splitString ":" cfg.listen))) ];
    };
  };
}

147
nix/tests/ota.nix Normal file
View file

@ -0,0 +1,147 @@
{ pkgs, self, system }:
# NixOS VM test for nix-ota.
#
# Builds three "system closure" stand-ins (each is a directory containing a
# marker file and a `bin/switch-to-configuration` stub), then drives the agent
# through three publishes:
#   1. publish A -> device switches to A
#   2. publish B -> device switches to B
#   3. publish C (broken: agent's healthCmd will fail) -> device rolls back to B
let
  # Build a fake system closure. `label` names it; `extraScript` is shell that
  # the stub `switch-to-configuration` runs when the closure is activated.
  # The heredoc delimiter is quoted, so the build shell does not expand the
  # stub's contents; `label` and `extraScript` are substituted by Nix.
  mkClosure = label: extraScript: pkgs.runCommand "sys-${label}" { } ''
    mkdir -p $out/bin
    echo "${label}" > $out/marker
    cat > $out/bin/switch-to-configuration <<'EOF'
    #!/bin/sh
    set -eu
    echo "applied ${label}" >&2
    ${extraScript}
    exit 0
    EOF
    chmod +x $out/bin/switch-to-configuration
  '';

  closureA = mkClosure "a" "touch /run/nix-ota-applied-a";
  closureB = mkClosure "b" "touch /run/nix-ota-applied-b";
  # Closure C activates fine, but the healthCmd checks for /run/nix-ota-broken
  # which we create before publishing C, forcing rollback.
  closureC = mkClosure "c" "touch /run/nix-ota-applied-c";

  # Binary cache keypair used only by this test (fixture; not secret).
  # It is generated when this derivation is built, not checked in, so the key
  # differs between builds. The public half is read back during evaluation
  # below (import-from-derivation).
  cacheKeys = pkgs.runCommand "test-cache-keys" { } ''
    mkdir -p $out
    export HOME=$TMPDIR
    export NIX_STATE_DIR=$TMPDIR/state
    export NIX_STORE_DIR=$TMPDIR/store
    mkdir -p $NIX_STATE_DIR $NIX_STORE_DIR
    ${pkgs.nix}/bin/nix-store --generate-binary-cache-key cache.local $out/secret $out/public
  '';

  # Publisher CLI used by the test script for keygen and publish.
  pubBin = "${self.packages.${system}.nix-ota-publisher}/bin/nix-ota";
in
pkgs.testers.runNixOSTest {
  name = "nix-ota";

  nodes = {
    # Control server plus the binary cache (nix-serve) the device pulls from.
    server = { config, pkgs, lib, ... }: {
      imports = [ self.nixosModules.server ];
      nix.settings.experimental-features = [ "nix-command" "flakes" ];
      services.nix-ota-server = {
        enable = true;
        listen = "0.0.0.0:8080";
        openFirewall = true;
        publishTokenFile = pkgs.writeText "tok" "test-token";
      };
      services.nix-serve = {
        enable = true;
        port = 5000;
        secretKeyFile = "${cacheKeys}/secret";
      };
      networking.firewall.allowedTCPPorts = [ 5000 ];
      # The closures need to be in the server's store so nix-serve can serve them.
      system.extraDependencies = [ closureA closureB closureC ];
    };

    # Device under test, running the agent against the server node.
    device = { config, pkgs, lib, ... }: {
      imports = [ self.nixosModules.agent ];
      services.nix-ota-agent = {
        enable = true;
        server = "http://server:8080";
        channel = "prod";
        deviceId = "vm-device-1";
        # Written by the test script once the signing key has been generated.
        publicKeyFile = "/var/lib/nix-ota/public.key";
        cacheUrl = "http://server:5000";
        cachePublicKey = builtins.readFile "${cacheKeys}/public";
        interval = 5;
        healthCmd = "test ! -f /run/nix-ota-broken";
      };
      # The test starts the agent by hand. Detach the timer from timers.target
      # so it can never fire between boot and the first scripted step.
      systemd.timers.nix-ota-agent.wantedBy = lib.mkForce [ ];
      nix.settings.experimental-features = [ "nix-command" "flakes" ];
      nix.settings.trusted-users = [ "root" ];
    };
  };

  testScript = ''
    closureA = "${closureA}"
    closureB = "${closureB}"
    closureC = "${closureC}"
    pubBin = "${pubBin}"

    start_all()
    server.wait_for_unit("nix-ota-server.service")
    server.wait_for_open_port(8080)
    server.wait_for_unit("nix-serve.service")
    server.wait_for_open_port(5000)

    # Drive the agent ourselves; disable the timer for deterministic stepping.
    device.succeed("systemctl stop nix-ota-agent.timer || true")

    # Sign the closures with the binary cache key so the device's Nix will accept them.
    for c in [closureA, closureB, closureC]:
        server.succeed(f"nix store sign --extra-experimental-features nix-command --key-file ${cacheKeys}/secret --recursive {c}")

    # Operator generates a manifest signing key on the server host.
    server.succeed("mkdir -p /root/keys")
    pub = server.succeed(f"{pubBin} keygen --out /root/keys/sign.key").strip()

    # Push pubkey onto the device's writable state dir.
    device.succeed("mkdir -p /var/lib/nix-ota")
    device.succeed(f"echo '{pub}' > /var/lib/nix-ota/public.key")

    def publish(store_path, rev):
        server.succeed(
            f"{pubBin} publish "
            f"--server http://localhost:8080 --token test-token "
            f"--key /root/keys/sign.key --channel prod "
            f"--store-path {store_path} --substituter http://server:5000 --revision {rev}"
        )

    def poll_agent():
        # oneshot service: start and wait for it to finish (success or failure).
        device.succeed("systemctl start --wait nix-ota-agent.service || true")

    # --- Step 1: publish A
    publish(closureA, 1)
    poll_agent()
    device.succeed(f"readlink -f /nix/var/nix/profiles/system | grep -qF {closureA}")
    device.succeed("test -f /run/nix-ota-applied-a")

    # --- Step 2: publish B
    publish(closureB, 2)
    poll_agent()
    device.succeed(f"readlink -f /nix/var/nix/profiles/system | grep -qF {closureB}")
    device.succeed("test -f /run/nix-ota-applied-b")

    # --- Step 3: publish C with health check rigged to fail
    device.succeed("touch /run/nix-ota-broken")
    publish(closureC, 3)
    poll_agent()
    # Agent should have applied C, failed the health check, and rolled back to B.
    device.succeed(f"readlink -f /nix/var/nix/profiles/system | grep -qF {closureB}")
    # The activation script for C did run before health check.
    device.succeed("test -f /run/nix-ota-applied-c")
    # The dashboard should reflect the rolled_back state.
    server.wait_until_succeeds(
        "curl -fsS http://localhost:8080/ | grep -Eq 'rolled_back|failed'", timeout=30
    )
  '';
}