Initial nix-ota implementation

Self-hostable OTA update system for NixOS fleets: a control server,
device agent, publisher CLI, and NixOS modules that ship prebuilt
system closures from a binary cache to devices that don't have the
flake.

- crates/common: signed manifest types (ed25519), store-path validator
- crates/server: axum + sqlite + HTMX dashboard, channel/device API
- crates/agent: poll, verify signature + revision, nix copy, switch,
  health check, magic-rollback on failure
- crates/publisher: keygen + sign + publish CLI for operators/CI
- nix/modules: NixOS modules for server and agent
- nix/tests/ota.nix: end-to-end VM test exercising publish A -> B ->
  broken C -> rollback to B (passes)

The control server never holds the signing key; manifests are signed
offline and verified against a pinned public key on each device.
This commit is contained in:
0m.ax 2026-05-25 14:58:42 +02:00
commit 42b2ce4d1d
19 changed files with 4745 additions and 0 deletions

22
crates/agent/Cargo.toml Normal file
View file

@ -0,0 +1,22 @@
[package]
name = "nix-ota-agent"
version.workspace = true
edition.workspace = true
license.workspace = true
[[bin]]
name = "nix-ota-agent"
path = "src/main.rs"
[dependencies]
nix-ota-common = { path = "../common" }
anyhow.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
clap.workspace = true
reqwest.workspace = true
ed25519-dalek.workspace = true
time.workspace = true

261
crates/agent/src/main.rs Normal file
View file

@ -0,0 +1,261 @@
//! `nix-ota-agent` — runs on each device.
//!
//! Lifecycle on every poll:
//! 1. Fetch `/channels/<channel>/current`.
//! 2. Verify ed25519 signature against the device's pinned public key.
//! 3. Reject manifests with a revision <= last applied (replay defense).
//! 4. `nix copy --from <substituter> <storePath>` — Nix itself verifies
//! the per-path signatures against the cache's public key, so a
//! compromised control server cannot inject store contents.
//! 5. `nix-env -p /nix/var/nix/profiles/system --set <storePath>`
//! then `<storePath>/bin/switch-to-configuration switch`.
//! 6. Run the configured health check. On failure, roll back by
//! re-pointing the system profile at the previously applied store path
//! and re-running that path's `switch-to-configuration switch`.
//! 7. Check in with the control server.
//!
//! The agent stores small bits of state (the last processed revision,
//! the currently applied store path, and the one applied before it) in
//! `state.json` under `--state-dir`, which defaults to /var/lib/nix-ota.
use anyhow::{anyhow, bail, Context, Result};
use clap::Parser;
use nix_ota_common as common;
use serde::{Deserialize, Serialize};
use std::{path::{Path, PathBuf}, time::Duration};
use tokio::process::Command;
/// Version of this crate, captured from Cargo at compile time. It is sent in
/// the HTTP `User-Agent` header and in the `agent_version` field of check-ins.
const AGENT_VERSION: &str = env!("CARGO_PKG_VERSION");
// Command-line configuration of the agent, parsed with clap's derive API.
// Each option can also be supplied through the `NIX_OTA_*` environment
// variable named in its `env = "..."` attribute.
//
// NOTE(review): clap's derive turns the `///` comments on the fields into the
// `--help` text, so they are user-facing strings; maintainer notes in this
// struct are therefore written as plain `//` comments.
#[derive(Parser, Debug, Clone)]
#[command(version, about = "nix-ota device agent")]
struct Args {
/// Control server base URL, e.g. https://ota.example.com
// A trailing `/` is tolerated: callers strip it before appending a route.
#[arg(long, env = "NIX_OTA_SERVER")]
server: String,
/// Channel name to follow (e.g. prod, canary).
// Interpolated verbatim into the `/channels/<channel>/current` request path.
#[arg(long, env = "NIX_OTA_CHANNEL", default_value = "prod")]
channel: String,
/// Device identifier (must be unique within a deployment).
// Interpolated verbatim into the `/devices/<id>/checkin` request path.
#[arg(long, env = "NIX_OTA_DEVICE_ID")]
device_id: String,
/// Path to a file containing the base64-encoded ed25519 public key
/// used to verify manifest signatures.
// Read once at startup; surrounding whitespace is trimmed before decoding.
#[arg(long, env = "NIX_OTA_PUBLIC_KEY_FILE")]
public_key_file: PathBuf,
/// Poll interval seconds. If `--once` is set, this is ignored.
#[arg(long, env = "NIX_OTA_INTERVAL", default_value_t = 60)]
interval: u64,
/// Run a single poll and exit (used by systemd timer).
#[arg(long, env = "NIX_OTA_ONCE")]
once: bool,
/// Persistent state directory.
// Holds `state.json` (see `State`).
#[arg(long, env = "NIX_OTA_STATE_DIR", default_value = "/var/lib/nix-ota")]
state_dir: PathBuf,
/// Optional health-check command. If exit code != 0 after switch,
/// the agent rolls back.
// Executed through `sh -c`, so shell syntax is available in the command.
#[arg(long, env = "NIX_OTA_HEALTH_CMD")]
health_cmd: Option<String>,
/// Dry-run: log what would happen, don't execute nix or switch.
#[arg(long, env = "NIX_OTA_DRY_RUN")]
dry_run: bool,
}
/// Agent state persisted as JSON between polls (see `State::load` / `State::save`).
///
/// `Default` (revision 0, no store paths) is what a device starts from when no
/// state file exists yet.
#[derive(Debug, Default, Serialize, Deserialize)]
struct State {
/// Revision of the most recent manifest the agent has processed. It is
/// advanced after a successful apply, after a same-path revision bump, and
/// also after a failed health check, so a bad revision is not retried.
/// Manifests with a revision less than or equal to this value are ignored.
last_revision: u64,
/// Store path of the last successfully applied system. Reported in
/// check-ins and used as the rollback target for the next update.
last_store_path: Option<String>,
/// Store path that was applied before `last_store_path`. Recorded on each
/// successful apply; nothing in the visible part of this file reads it back.
previous_store_path: Option<String>,
}
impl State {
fn path(dir: &Path) -> PathBuf { dir.join("state.json") }
fn load(dir: &Path) -> Result<Self> {
let p = Self::path(dir);
if !p.exists() { return Ok(Self::default()); }
Ok(serde_json::from_slice(&std::fs::read(p)?)?)
}
fn save(&self, dir: &Path) -> Result<()> {
std::fs::create_dir_all(dir).ok();
let tmp = dir.join("state.json.tmp");
std::fs::write(&tmp, serde_json::to_vec_pretty(self)?)?;
std::fs::rename(tmp, Self::path(dir))?;
Ok(())
}
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
.with_env_filter(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| "info".into()),
)
.init();
let args = Args::parse();
let vk_b64 = std::fs::read_to_string(&args.public_key_file)
.with_context(|| format!("reading public key {}", args.public_key_file.display()))?;
let vk = common::decode_verifying_key(vk_b64.trim())?;
let client = reqwest::Client::builder()
.user_agent(format!("nix-ota-agent/{AGENT_VERSION}"))
.timeout(Duration::from_secs(30))
.build()?;
loop {
match run_once(&args, &vk, &client).await {
Ok(_) => {}
Err(e) => tracing::error!("poll failed: {e:#}"),
}
if args.once { break; }
tokio::time::sleep(Duration::from_secs(args.interval)).await;
}
Ok(())
}
async fn run_once(args: &Args, vk: &ed25519_dalek::VerifyingKey, client: &reqwest::Client) -> Result<()>
where
ed25519_dalek::VerifyingKey: Sized,
{
let mut state = State::load(&args.state_dir)?;
let url = format!("{}/channels/{}/current", args.server.trim_end_matches('/'), args.channel);
let resp = client.get(&url).send().await?;
if !resp.status().is_success() {
// Still report a check-in so the dashboard knows we're alive.
checkin(args, client, &state, common::Health::Ok, Some(format!("no manifest: {}", resp.status()))).await.ok();
bail!("server returned {}", resp.status());
}
let manifest: common::Manifest = resp.json().await?;
common::verify_manifest(vk, &manifest)
.context("manifest signature verification failed")?;
if manifest.body.revision <= state.last_revision {
tracing::debug!(rev = manifest.body.revision, "no new revision");
checkin(args, client, &state, common::Health::Ok, None).await.ok();
return Ok(());
}
if Some(&manifest.body.store_path) == state.last_store_path.as_ref() {
// Same path, bumped revision (e.g. publish-rollback). Just record.
state.last_revision = manifest.body.revision;
state.save(&args.state_dir)?;
checkin(args, client, &state, common::Health::Ok, None).await.ok();
return Ok(());
}
tracing::info!(target = %manifest.body.store_path, rev = manifest.body.revision, "applying new closure");
checkin(args, client, &state, common::Health::Updating,
Some(format!("copying {}", manifest.body.store_path))).await.ok();
// 1. Copy from cache.
if !args.dry_run {
nix_copy(&manifest.body.substituter, &manifest.body.store_path).await?;
}
// 2. Switch.
let previous = state.last_store_path.clone();
if !args.dry_run {
nix_set_profile(&manifest.body.store_path).await?;
switch_to_configuration(&manifest.body.store_path, "switch").await?;
}
// 3. Health check.
let healthy = run_health_check(args.health_cmd.as_deref()).await;
if !healthy {
tracing::error!("health check failed, rolling back");
if let Some(prev) = previous.as_deref() {
if !args.dry_run {
if let Err(e) = rollback(prev).await {
tracing::error!("rollback failed: {e:#}");
checkin(args, client, &state, common::Health::Failed,
Some(format!("rollback failed: {e}"))).await.ok();
bail!("rollback failed");
}
}
checkin(args, client, &state, common::Health::RolledBack,
Some(format!("rolled back to {prev}"))).await.ok();
} else {
checkin(args, client, &state, common::Health::Failed,
Some("no previous generation to roll back to".into())).await.ok();
}
// Do NOT record success; intentionally leave last_revision so we
// retry the next poll only if a *new* revision is published.
state.last_revision = manifest.body.revision;
state.save(&args.state_dir)?;
bail!("health check failed");
}
state.previous_store_path = previous;
state.last_store_path = Some(manifest.body.store_path.clone());
state.last_revision = manifest.body.revision;
state.save(&args.state_dir)?;
checkin(args, client, &state, common::Health::Ok, Some("applied".into())).await.ok();
Ok(())
}
async fn nix_copy(substituter: &str, path: &str) -> Result<()> {
let status = Command::new("nix")
.args(["copy", "--from", substituter, path])
.status()
.await
.context("running `nix copy`")?;
if !status.success() { bail!("nix copy exited {status}"); }
Ok(())
}
async fn nix_set_profile(path: &str) -> Result<()> {
let status = Command::new("nix-env")
.args(["-p", "/nix/var/nix/profiles/system", "--set", path])
.status()
.await
.context("running `nix-env --set`")?;
if !status.success() { bail!("nix-env exited {status}"); }
Ok(())
}
async fn switch_to_configuration(store_path: &str, action: &str) -> Result<()> {
let bin = format!("{store_path}/bin/switch-to-configuration");
let status = Command::new(&bin).arg(action).status().await
.with_context(|| format!("running {bin}"))?;
if !status.success() { bail!("switch-to-configuration exited {status}"); }
Ok(())
}
/// Reverts the device to `previous_store_path`: re-points the system profile
/// at it, then activates it with `switch-to-configuration switch`.
///
/// # Errors
///
/// Returns the first error from either step; activation is not attempted if
/// re-pointing the profile fails.
async fn rollback(previous_store_path: &str) -> Result<()> {
    let target = previous_store_path;
    nix_set_profile(target).await?;
    switch_to_configuration(target, "switch").await
}
async fn run_health_check(cmd: Option<&str>) -> bool {
let Some(cmd) = cmd else { return true; };
match Command::new("sh").arg("-c").arg(cmd).status().await {
Ok(s) => s.success(),
Err(e) => {
tracing::error!("health check exec failed: {e}");
false
}
}
}
/// Reports this device's status to the control server by POSTing a
/// `CheckIn` as JSON to `<server>/devices/<device_id>/checkin`.
///
/// Parameters:
/// * `args` - supplies the server URL, device id and channel.
/// * `client` - HTTP client used for the request.
/// * `state` - its `last_store_path` is reported as the device's store path.
/// * `health` - health status to report.
/// * `message` - optional free-form detail for the dashboard.
///
/// # Errors
///
/// Fails if the request cannot be sent or the server answers with a
/// non-success status.
async fn checkin(
    args: &Args,
    client: &reqwest::Client,
    state: &State,
    health: common::Health,
    message: Option<String>,
) -> Result<()> {
    let base = args.server.trim_end_matches('/');
    let endpoint = format!("{base}/devices/{}/checkin", args.device_id);

    // NOTE(review): current and target are both filled from `last_store_path`,
    // so an in-flight update is not visible as a distinct target. Confirm
    // whether that is intended.
    let applied = state.last_store_path.clone();
    let payload = common::CheckIn {
        device_id: args.device_id.clone(),
        channel: args.channel.clone(),
        current_store_path: applied.clone(),
        target_store_path: applied,
        health,
        agent_version: AGENT_VERSION.into(),
        message,
    };

    let response = client.post(&endpoint).json(&payload).send().await?;
    match response.status() {
        code if code.is_success() => Ok(()),
        code => Err(anyhow!("checkin status {code}")),
    }
}