Initial nix-ota implementation
Self-hostable OTA update system for NixOS fleets: a control server, device agent, publisher CLI, and NixOS modules that ship prebuilt system closures from a binary cache to devices that don't have the flake.

- crates/common: signed manifest types (ed25519), store-path validator
- crates/server: axum + sqlite + HTMX dashboard, channel/device API
- crates/agent: poll, verify signature + revision, nix copy, switch, health check, magic-rollback on failure
- crates/publisher: keygen + sign + publish CLI for operators/CI
- nix/modules: NixOS modules for server and agent
- nix/tests/ota.nix: end-to-end VM test exercising publish A -> B -> broken C -> rollback to B (passes)

The control server never holds the signing key; manifests are signed offline and verified against a pinned public key on each device.
This commit is contained in:
commit
42b2ce4d1d
19 changed files with 4745 additions and 0 deletions
22
crates/agent/Cargo.toml
Normal file
22
crates/agent/Cargo.toml
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
[package]
|
||||
name = "nix-ota-agent"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[[bin]]
|
||||
name = "nix-ota-agent"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
nix-ota-common = { path = "../common" }
|
||||
anyhow.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
clap.workspace = true
|
||||
reqwest.workspace = true
|
||||
ed25519-dalek.workspace = true
|
||||
time.workspace = true
|
||||
261
crates/agent/src/main.rs
Normal file
261
crates/agent/src/main.rs
Normal file
|
|
@ -0,0 +1,261 @@
|
|||
//! `nix-ota-agent` — runs on each device.
|
||||
//!
|
||||
//! Lifecycle on every poll:
|
||||
//! 1. Fetch `/channels/<channel>/current`.
|
||||
//! 2. Verify ed25519 signature against the device's pinned public key.
|
||||
//! 3. Reject manifests with a revision <= last applied (replay defense).
|
||||
//! 4. `nix copy --from <substituter> <storePath>` — Nix itself verifies
|
||||
//! the per-path signatures against the cache's public key, so a
|
||||
//! compromised control server cannot inject store contents.
|
||||
//! 5. `nix-env -p /nix/var/nix/profiles/system --set <storePath>`
|
||||
//! then `<storePath>/bin/switch-to-configuration switch`.
|
||||
//! 6. Run the configured health check. On failure, roll back by
|
||||
//! switching to the previous system profile generation.
|
||||
//! 7. Check in with the control server.
|
||||
//!
|
||||
//! The agent stores small bits of state (last applied revision and
|
||||
//! previous store path for rollback) under `--state-dir`, defaulting
|
||||
//! to /var/lib/nix-ota.
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::Parser;
|
||||
use nix_ota_common as common;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{path::{Path, PathBuf}, time::Duration};
|
||||
use tokio::process::Command;
|
||||
|
||||
/// Agent version string, taken from this crate's `CARGO_PKG_VERSION` at
/// compile time. Sent in the HTTP `User-Agent` header and in every check-in.
const AGENT_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
#[derive(Parser, Debug, Clone)]
|
||||
#[command(version, about = "nix-ota device agent")]
|
||||
struct Args {
|
||||
/// Control server base URL, e.g. https://ota.example.com
|
||||
#[arg(long, env = "NIX_OTA_SERVER")]
|
||||
server: String,
|
||||
/// Channel name to follow (e.g. prod, canary).
|
||||
#[arg(long, env = "NIX_OTA_CHANNEL", default_value = "prod")]
|
||||
channel: String,
|
||||
/// Device identifier (must be unique within a deployment).
|
||||
#[arg(long, env = "NIX_OTA_DEVICE_ID")]
|
||||
device_id: String,
|
||||
/// Path to a file containing the base64-encoded ed25519 public key
|
||||
/// used to verify manifest signatures.
|
||||
#[arg(long, env = "NIX_OTA_PUBLIC_KEY_FILE")]
|
||||
public_key_file: PathBuf,
|
||||
/// Poll interval seconds. If `--once` is set, this is ignored.
|
||||
#[arg(long, env = "NIX_OTA_INTERVAL", default_value_t = 60)]
|
||||
interval: u64,
|
||||
/// Run a single poll and exit (used by systemd timer).
|
||||
#[arg(long, env = "NIX_OTA_ONCE")]
|
||||
once: bool,
|
||||
/// Persistent state directory.
|
||||
#[arg(long, env = "NIX_OTA_STATE_DIR", default_value = "/var/lib/nix-ota")]
|
||||
state_dir: PathBuf,
|
||||
/// Optional health-check command. If exit code != 0 after switch,
|
||||
/// the agent rolls back.
|
||||
#[arg(long, env = "NIX_OTA_HEALTH_CMD")]
|
||||
health_cmd: Option<String>,
|
||||
/// Dry-run: log what would happen, don't execute nix or switch.
|
||||
#[arg(long, env = "NIX_OTA_DRY_RUN")]
|
||||
dry_run: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize)]
|
||||
struct State {
|
||||
last_revision: u64,
|
||||
last_store_path: Option<String>,
|
||||
previous_store_path: Option<String>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
fn path(dir: &Path) -> PathBuf { dir.join("state.json") }
|
||||
fn load(dir: &Path) -> Result<Self> {
|
||||
let p = Self::path(dir);
|
||||
if !p.exists() { return Ok(Self::default()); }
|
||||
Ok(serde_json::from_slice(&std::fs::read(p)?)?)
|
||||
}
|
||||
fn save(&self, dir: &Path) -> Result<()> {
|
||||
std::fs::create_dir_all(dir).ok();
|
||||
let tmp = dir.join("state.json.tmp");
|
||||
std::fs::write(&tmp, serde_json::to_vec_pretty(self)?)?;
|
||||
std::fs::rename(tmp, Self::path(dir))?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| "info".into()),
|
||||
)
|
||||
.init();
|
||||
let args = Args::parse();
|
||||
let vk_b64 = std::fs::read_to_string(&args.public_key_file)
|
||||
.with_context(|| format!("reading public key {}", args.public_key_file.display()))?;
|
||||
let vk = common::decode_verifying_key(vk_b64.trim())?;
|
||||
let client = reqwest::Client::builder()
|
||||
.user_agent(format!("nix-ota-agent/{AGENT_VERSION}"))
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()?;
|
||||
|
||||
loop {
|
||||
match run_once(&args, &vk, &client).await {
|
||||
Ok(_) => {}
|
||||
Err(e) => tracing::error!("poll failed: {e:#}"),
|
||||
}
|
||||
if args.once { break; }
|
||||
tokio::time::sleep(Duration::from_secs(args.interval)).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_once(args: &Args, vk: &ed25519_dalek::VerifyingKey, client: &reqwest::Client) -> Result<()>
|
||||
where
|
||||
ed25519_dalek::VerifyingKey: Sized,
|
||||
{
|
||||
let mut state = State::load(&args.state_dir)?;
|
||||
let url = format!("{}/channels/{}/current", args.server.trim_end_matches('/'), args.channel);
|
||||
let resp = client.get(&url).send().await?;
|
||||
if !resp.status().is_success() {
|
||||
// Still report a check-in so the dashboard knows we're alive.
|
||||
checkin(args, client, &state, common::Health::Ok, Some(format!("no manifest: {}", resp.status()))).await.ok();
|
||||
bail!("server returned {}", resp.status());
|
||||
}
|
||||
let manifest: common::Manifest = resp.json().await?;
|
||||
common::verify_manifest(vk, &manifest)
|
||||
.context("manifest signature verification failed")?;
|
||||
|
||||
if manifest.body.revision <= state.last_revision {
|
||||
tracing::debug!(rev = manifest.body.revision, "no new revision");
|
||||
checkin(args, client, &state, common::Health::Ok, None).await.ok();
|
||||
return Ok(());
|
||||
}
|
||||
if Some(&manifest.body.store_path) == state.last_store_path.as_ref() {
|
||||
// Same path, bumped revision (e.g. publish-rollback). Just record.
|
||||
state.last_revision = manifest.body.revision;
|
||||
state.save(&args.state_dir)?;
|
||||
checkin(args, client, &state, common::Health::Ok, None).await.ok();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
tracing::info!(target = %manifest.body.store_path, rev = manifest.body.revision, "applying new closure");
|
||||
checkin(args, client, &state, common::Health::Updating,
|
||||
Some(format!("copying {}", manifest.body.store_path))).await.ok();
|
||||
|
||||
// 1. Copy from cache.
|
||||
if !args.dry_run {
|
||||
nix_copy(&manifest.body.substituter, &manifest.body.store_path).await?;
|
||||
}
|
||||
|
||||
// 2. Switch.
|
||||
let previous = state.last_store_path.clone();
|
||||
if !args.dry_run {
|
||||
nix_set_profile(&manifest.body.store_path).await?;
|
||||
switch_to_configuration(&manifest.body.store_path, "switch").await?;
|
||||
}
|
||||
|
||||
// 3. Health check.
|
||||
let healthy = run_health_check(args.health_cmd.as_deref()).await;
|
||||
if !healthy {
|
||||
tracing::error!("health check failed, rolling back");
|
||||
if let Some(prev) = previous.as_deref() {
|
||||
if !args.dry_run {
|
||||
if let Err(e) = rollback(prev).await {
|
||||
tracing::error!("rollback failed: {e:#}");
|
||||
checkin(args, client, &state, common::Health::Failed,
|
||||
Some(format!("rollback failed: {e}"))).await.ok();
|
||||
bail!("rollback failed");
|
||||
}
|
||||
}
|
||||
checkin(args, client, &state, common::Health::RolledBack,
|
||||
Some(format!("rolled back to {prev}"))).await.ok();
|
||||
} else {
|
||||
checkin(args, client, &state, common::Health::Failed,
|
||||
Some("no previous generation to roll back to".into())).await.ok();
|
||||
}
|
||||
// Do NOT record success; intentionally leave last_revision so we
|
||||
// retry the next poll only if a *new* revision is published.
|
||||
state.last_revision = manifest.body.revision;
|
||||
state.save(&args.state_dir)?;
|
||||
bail!("health check failed");
|
||||
}
|
||||
|
||||
state.previous_store_path = previous;
|
||||
state.last_store_path = Some(manifest.body.store_path.clone());
|
||||
state.last_revision = manifest.body.revision;
|
||||
state.save(&args.state_dir)?;
|
||||
checkin(args, client, &state, common::Health::Ok, Some("applied".into())).await.ok();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn nix_copy(substituter: &str, path: &str) -> Result<()> {
|
||||
let status = Command::new("nix")
|
||||
.args(["copy", "--from", substituter, path])
|
||||
.status()
|
||||
.await
|
||||
.context("running `nix copy`")?;
|
||||
if !status.success() { bail!("nix copy exited {status}"); }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn nix_set_profile(path: &str) -> Result<()> {
|
||||
let status = Command::new("nix-env")
|
||||
.args(["-p", "/nix/var/nix/profiles/system", "--set", path])
|
||||
.status()
|
||||
.await
|
||||
.context("running `nix-env --set`")?;
|
||||
if !status.success() { bail!("nix-env exited {status}"); }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn switch_to_configuration(store_path: &str, action: &str) -> Result<()> {
|
||||
let bin = format!("{store_path}/bin/switch-to-configuration");
|
||||
let status = Command::new(&bin).arg(action).status().await
|
||||
.with_context(|| format!("running {bin}"))?;
|
||||
if !status.success() { bail!("switch-to-configuration exited {status}"); }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn rollback(previous_store_path: &str) -> Result<()> {
|
||||
nix_set_profile(previous_store_path).await?;
|
||||
switch_to_configuration(previous_store_path, "switch").await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn run_health_check(cmd: Option<&str>) -> bool {
|
||||
let Some(cmd) = cmd else { return true; };
|
||||
match Command::new("sh").arg("-c").arg(cmd).status().await {
|
||||
Ok(s) => s.success(),
|
||||
Err(e) => {
|
||||
tracing::error!("health check exec failed: {e}");
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn checkin(
|
||||
args: &Args,
|
||||
client: &reqwest::Client,
|
||||
state: &State,
|
||||
health: common::Health,
|
||||
message: Option<String>,
|
||||
) -> Result<()> {
|
||||
let ci = common::CheckIn {
|
||||
device_id: args.device_id.clone(),
|
||||
channel: args.channel.clone(),
|
||||
current_store_path: state.last_store_path.clone(),
|
||||
target_store_path: state.last_store_path.clone(),
|
||||
health,
|
||||
agent_version: AGENT_VERSION.into(),
|
||||
message,
|
||||
};
|
||||
let url = format!("{}/devices/{}/checkin",
|
||||
args.server.trim_end_matches('/'), args.device_id);
|
||||
let r = client.post(&url).json(&ci).send().await?;
|
||||
if !r.status().is_success() {
|
||||
return Err(anyhow!("checkin status {}", r.status()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue