diff --git a/cmd/stepsecurity-dev-machine-guard/main.go b/cmd/stepsecurity-dev-machine-guard/main.go index 4782e69..ac82fff 100644 --- a/cmd/stepsecurity-dev-machine-guard/main.go +++ b/cmd/stepsecurity-dev-machine-guard/main.go @@ -21,6 +21,7 @@ import ( "github.com/step-security/dev-machine-guard/internal/device" "github.com/step-security/dev-machine-guard/internal/executor" "github.com/step-security/dev-machine-guard/internal/featuregate" + "github.com/step-security/dev-machine-guard/internal/heartbeat" "github.com/step-security/dev-machine-guard/internal/launchd" "github.com/step-security/dev-machine-guard/internal/output" "github.com/step-security/dev-machine-guard/internal/paths" @@ -240,6 +241,10 @@ func main() { config.ShowConfigure() case "send-telemetry": + // Stamp the local heartbeat first — before the enterprise gate and + // the singleton lock inside telemetry.Run — so even runs that bail at + // the gate or die during startup leave an on-disk "I started" record. + writeHeartbeat("send-telemetry", log) if !config.IsEnterpriseMode() { log.Error("Enterprise configuration not found. Run '%s configure' or download the script from your StepSecurity dashboard.", os.Args[0]) os.Exit(1) @@ -597,6 +602,17 @@ func findLegacyLeftovers(legacy string) []string { // state and reconciles local hook installation to match. Silent no-op // in community mode (enterprise config missing) — the existing scan // path stays unaffected. Failures are logged but never crash main. +// writeHeartbeat stamps last-run.json with this run's start metadata. Wholly +// best-effort: a write failure (read-only home, disabled install dir) is +// logged at debug and never affects the run. The invocation method reuses the +// scheduler-footprint detection telemetry already does, so the heartbeat +// distinguishes a scheduled fire from a manual run. 
+func writeHeartbeat(command string, log *progress.Logger) {
+	if err := heartbeat.Write(paths.HeartbeatFile(), command, telemetry.DetectInvocationMethod()); err != nil {
+		log.Debug("heartbeat: failed to write %s: %v", paths.HeartbeatFile(), err)
+	}
+}
+
 func runHookStateReconcile(exec executor.Executor, log *progress.Logger) {
 	if !featuregate.IsEnabled(featuregate.FeatureAIAgentHooks) {
 		log.Debug("hook-state reconcile: skipped (feature gated)")
diff --git a/internal/heartbeat/heartbeat.go b/internal/heartbeat/heartbeat.go
new file mode 100644
index 0000000..46a9bd6
--- /dev/null
+++ b/internal/heartbeat/heartbeat.go
@@ -0,0 +1,137 @@
+// Package heartbeat writes a small last-run.json "I started" breadcrumb to
+// the install dir at the very top of a telemetry run — before the
+// enterprise-config gate and before the singleton lock is acquired.
+//
+// Why this exists, separate from agent.error.log and scan-state.json: those
+// only appear once a run gets far enough to log a line or finish an upload.
+// Several failure modes never reach that point — a process killed mid-startup
+// (e.g. the Windows GUI-launcher teardown), a run that fails the enterprise
+// gate, a lock it can never acquire. The heartbeat captures "this binary
+// started at time T, pid P, triggered by X" independent of any of that, so a
+// stale file means "the agent isn't being invoked at all" (scheduler not
+// firing — battery policy, missing task) while a fresh file alongside missing
+// server-side telemetry means "the agent runs but dies/fails before upload."
+//
+// The write is built to survive the abrupt termination it is meant to record:
+// marshal to a temp sibling, fsync, then rename it over last-run.json (same
+// pattern as internal/state). A kill mid-write never leaves a truncated
+// last-run.json — a reader finds a complete record or no file at all.
+package heartbeat
+
+import (
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"runtime"
+	"time"
+
+	"github.com/step-security/dev-machine-guard/internal/buildinfo"
+)
+
+// SchemaVersion is the on-disk format version for last-run.json. Bump when
+// the Record shape changes incompatibly; readers treat a mismatch as "no
+// usable heartbeat" rather than failing.
+const SchemaVersion = 1
+
+// Filename is the basename written into the install dir. Exported so callers
+// and diagnostics can reference it without duplicating the literal.
+const Filename = "last-run.json"
+
+// Record is the last-run.json envelope: a point-in-time stamp that a run
+// began. It deliberately carries only start-of-run facts — outcome lives in
+// scan-state.json (LastSuccessfulExecutionID) and agent.error.log.
+type Record struct {
+	SchemaVersion    int       `json:"schema_version"`
+	WrittenAt        time.Time `json:"written_at"`
+	PID              int       `json:"pid"`
+	AgentVersion     string    `json:"agent_version"`
+	Command          string    `json:"command"`           // subcommand that started the run, e.g. "send-telemetry"
+	InvocationMethod string    `json:"invocation_method"` // scheduler footprint vs manual; see telemetry.DetectInvocationMethod
+	OS               string    `json:"os"`
+}
+
+// Write stamps last-run.json at path with this run's start metadata. An empty
+// path is a no-op returning nil — callers pass paths.HeartbeatFile(), which is
+// "" when the install dir is disabled (--install-dir=""), and treat the
+// heartbeat as off in that case. Best-effort: callers should log a write error
+// at debug/warn and continue, never fail the run on it.
+func Write(path, command, invocationMethod string) error {
+	if path == "" {
+		return nil
+	}
+	rec := Record{
+		SchemaVersion:    SchemaVersion,
+		WrittenAt:        time.Now().UTC(),
+		PID:              os.Getpid(),
+		AgentVersion:     buildinfo.Version,
+		Command:          command,
+		InvocationMethod: invocationMethod,
+		OS:               runtime.GOOS,
+	}
+	return writeRecord(path, rec)
+}
+
+// Load reads last-run.json. An empty path, a missing file, or a schema
+// mismatch returns (nil, nil) — expected fall-throughs, so callers can treat a
+// nil record as "no usable heartbeat". Any other read failure or a JSON parse
+// error returns (nil, err). Exposed for diagnostics and any future fleet-view
+// that folds the last-run summary into the telemetry payload.
+func Load(path string) (*Record, error) {
+	if path == "" {
+		return nil, nil
+	}
+	data, err := os.ReadFile(filepath.Clean(path))
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, nil
+		}
+		return nil, err
+	}
+	var r Record
+	if err := json.Unmarshal(data, &r); err != nil {
+		return nil, err
+	}
+	if r.SchemaVersion != SchemaVersion {
+		return nil, nil
+	}
+	return &r, nil
+}
+
+// writeRecord persists rec to path atomically: temp sibling, fsync, rename.
+// The temp file is created in path's own directory so the rename stays on one
+// filesystem; every failure path removes the temp file before returning.
+func writeRecord(path string, rec Record) error {
+	data, err := json.MarshalIndent(rec, "", " ")
+	if err != nil {
+		return err
+	}
+	dir := filepath.Dir(path)
+	if err := os.MkdirAll(dir, 0o750); err != nil {
+		return err
+	}
+	tmp, err := os.CreateTemp(dir, ".last-run-*.tmp")
+	if err != nil {
+		return err
+	}
+	tmpPath := tmp.Name()
+	if _, err := tmp.Write(data); err != nil {
+		_ = tmp.Close()
+		_ = os.Remove(tmpPath)
+		return err
+	}
+	if err := tmp.Sync(); err != nil {
+		_ = tmp.Close()
+		_ = os.Remove(tmpPath)
+		return err
+	}
+	if err := tmp.Close(); err != nil {
+		_ = os.Remove(tmpPath)
+		return err
+	}
+	// os.Rename replaces an existing file on Windows too; a pre-remove would open a no-file window.
+	if err := os.Rename(tmpPath, path); err != nil {
+		_ = os.Remove(tmpPath)
+		return err
+	}
+	return nil
+}
diff --git a/internal/heartbeat/heartbeat_test.go b/internal/heartbeat/heartbeat_test.go
new file mode 100644
index 0000000..ef3405e
--- /dev/null
+++ b/internal/heartbeat/heartbeat_test.go
@@ -0,0 +1,117 @@
+package heartbeat
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"github.com/step-security/dev-machine-guard/internal/buildinfo"
+)
+
+func TestWriteThenLoadRoundTrips(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "last-run.json")
+
+	before := time.Now().Add(-time.Second)
+	if err := Write(path, "send-telemetry", "install"); err != nil {
+		t.Fatalf("Write: %v", err)
+	}
+
+	rec, err := Load(path)
+	if err != nil {
+		t.Fatalf("Load: %v", err)
+	}
+	if rec == nil {
+		t.Fatal("Load returned nil record after Write")
+	}
+	if rec.SchemaVersion != SchemaVersion {
+		t.Errorf("SchemaVersion = %d, want %d", rec.SchemaVersion, SchemaVersion)
+	}
+	if rec.PID != os.Getpid() {
+		t.Errorf("PID = %d, want %d", rec.PID, os.Getpid())
+	}
+	if rec.Command != "send-telemetry" {
+		t.Errorf("Command = %q, want send-telemetry", rec.Command)
+	}
+	if rec.InvocationMethod != "install" {
+		t.Errorf("InvocationMethod = %q, want install", rec.InvocationMethod)
+	}
+	if rec.AgentVersion != buildinfo.Version {
+		t.Errorf("AgentVersion = %q, want %q", rec.AgentVersion, buildinfo.Version)
+	}
+	if rec.OS == "" {
+		t.Error("OS is empty")
+	}
+	if rec.WrittenAt.Before(before) || rec.WrittenAt.After(time.Now().Add(time.Second)) {
+		t.Errorf("WrittenAt %v not within the test window", rec.WrittenAt)
+	}
+}
+
+func TestWriteEmptyPathIsNoop(t *testing.T) {
+	if err := Write("", "send-telemetry", "one_time"); err != nil {
+		t.Fatalf("Write(\"\") should be a no-op, got %v", err)
+	}
+}
+
+func TestWriteOverwritesPreviousRun(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "last-run.json")
+
+	if err := Write(path, "send-telemetry", "one_time"); err != nil {
+		t.Fatalf("first Write: %v", err)
+	}
+	// A second write must replace the first: os.Rename has to land on an
+	// existing last-run.json, on Windows as well as on Unix.
+	if err := Write(path, "install", "install"); err != nil {
+		t.Fatalf("second Write: %v", err)
+	}
+
+	rec, err := Load(path)
+	if err != nil {
+		t.Fatalf("Load: %v", err)
+	}
+	if rec == nil || rec.Command != "install" {
+		t.Fatalf("second Write did not take; got %+v", rec)
+	}
+
+	// No leftover temp siblings from the atomic-rename dance.
+	entries, err := os.ReadDir(filepath.Dir(path))
+	if err != nil {
+		t.Fatalf("ReadDir: %v", err)
+	}
+	for _, e := range entries {
+		if filepath.Ext(e.Name()) == ".tmp" {
+			t.Errorf("leftover temp file: %s", e.Name())
+		}
+	}
+}
+
+func TestLoadMissingFileReturnsNilNil(t *testing.T) {
+	rec, err := Load(filepath.Join(t.TempDir(), "does-not-exist.json"))
+	if err != nil {
+		t.Fatalf("Load of missing file should not error, got %v", err)
+	}
+	if rec != nil {
+		t.Errorf("expected nil record for missing file, got %+v", rec)
+	}
+}
+
+func TestLoadSchemaMismatchReturnsNil(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "last-run.json")
+	if err := os.WriteFile(path, []byte(`{"schema_version":999,"pid":1}`), 0o600); err != nil {
+		t.Fatal(err)
+	}
+	rec, err := Load(path)
+	if err != nil {
+		t.Fatalf("Load: %v", err)
+	}
+	if rec != nil {
+		t.Errorf("expected nil for schema mismatch, got %+v", rec)
+	}
+}
+
+func TestLoadEmptyPathReturnsNilNil(t *testing.T) {
+	rec, err := Load("")
+	if err != nil || rec != nil {
+		t.Errorf("Load(\"\") = (%+v, %v), want (nil, nil)", rec, err)
+	}
+}
diff --git a/internal/paths/paths.go b/internal/paths/paths.go
index c7e4d12..fef6648 100644
--- a/internal/paths/paths.go
+++ b/internal/paths/paths.go
@@ -150,3 +150,14 @@ func ScanStateFile() string {
 	}
 	return filepath.Join(home, "scan-state.json")
 }
+
+// HeartbeatFile returns the absolute path to last-run.json, or "" when
+// Home() is disabled. Callers must treat "" as "heartbeat unavailable" and
+// skip writing it (same contract as ScanStateFile).
+func HeartbeatFile() string {
+	dir := Home()
+	if dir != "" {
+		return filepath.Join(dir, "last-run.json")
+	}
+	return ""
+}