diff --git a/internal/service/assets/alloy.config.tmpl b/internal/service/assets/alloy.config.tmpl index da274f0..90004ca 100644 --- a/internal/service/assets/alloy.config.tmpl +++ b/internal/service/assets/alloy.config.tmpl @@ -58,3 +58,42 @@ prometheus.scrape "node" { scrape_interval = "15s" } +prometheus.exporter.blackbox "services" { + config = `{ + modules: { + tcp: { prober: tcp }, + http_2xx: { prober: http } + } + }` + target { + name = "traefik" + address = "traefik:443" + module = "tcp" + } + target { + name = "finch" + address = "http://finch:3003/healthz" + module = "http_2xx" + } + target { + name = "mimir" + address = "http://mimir:8080/ready" + module = "http_2xx" + } + target { + name = "loki" + address = "http://loki:3100/ready" + module = "http_2xx" + } + target { + name = "pyroscope" + address = "http://pyroscope:4040/ready" + module = "http_2xx" + } +} + +prometheus.scrape "services" { + targets = prometheus.exporter.blackbox.services.targets + forward_to = [prometheus.remote_write.default.receiver] + scrape_interval = "15s" +} diff --git a/internal/service/assets/docker-compose.yaml.tmpl b/internal/service/assets/docker-compose.yaml.tmpl index 95c4afd..19c8fbd 100644 --- a/internal/service/assets/docker-compose.yaml.tmpl +++ b/internal/service/assets/docker-compose.yaml.tmpl @@ -8,6 +8,7 @@ services: - GF_SERVER_SERVE_FROM_SUB_PATH=true volumes: - /var/lib/finch/grafana:/var/lib/grafana + - /var/lib/finch/grafana/alerting:/etc/grafana/provisioning/alerting entrypoint: - sh - -euc @@ -61,6 +62,8 @@ services: EOF /run.sh restart: always + labels: + finch.config-hash: "{{ .GrafanaConfigHash }}" loki: container_name: loki @@ -73,6 +76,8 @@ services: - /var/lib/finch/loki/etc:/etc/loki - /var/lib/finch/loki/data:/var/lib/loki restart: always + labels: + finch.config-hash: "{{ .LokiConfigHash }}" traefik: container_name: traefik @@ -104,6 +109,8 @@ services: - "--disable-reporting=true" - "/etc/alloy/alloy.config" restart: always + labels: + 
finch.config-hash: "{{ .AlloyConfigHash }}" finch: container_name: finch @@ -122,6 +129,8 @@ services: - "--config.file=/etc/mimir/mimir.yaml" user: "10001:10001" restart: always + labels: + finch.config-hash: "{{ .MimirConfigHash }}" pyroscope: container_name: pyroscope @@ -133,11 +142,106 @@ services: - "--compactor.blocks-retention-period=72h" - "--querier.max-query-lookback=72h" - "--querier.max-query-length=72h" - - "--querier.max-query-lookback=72h" - "--server.log-source-ips-enabled=true" user: "10001:10001" restart: always + hc-traefik: + container_name: hc-traefik + image: curlimages/curl:8.17.0 + entrypoint: + - sleep + - infinity + healthcheck: + test: ["CMD-SHELL", "curl -so /dev/null -w '%{http_code}' http://traefik:80 | grep -qE '^[234][0-9]{2}$' && curl -sko /dev/null -w '%{http_code}' https://traefik:443 | grep -qE '^[234][0-9]{2}$'"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 10s + depends_on: + - traefik + restart: always + + hc-finch: + container_name: hc-finch + image: curlimages/curl:8.17.0 + entrypoint: + - sleep + - infinity + healthcheck: + test: ["CMD-SHELL", "curl -fs http://finch:3003/healthz"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 10s + depends_on: + - finch + restart: always + + hc-grafana: + container_name: hc-grafana + image: curlimages/curl:8.17.0 + entrypoint: + - sleep + - infinity + healthcheck: + test: ["CMD-SHELL", "curl -fs http://grafana:3000/api/health"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 10s + depends_on: + - grafana + restart: always + + hc-loki: + container_name: hc-loki + image: curlimages/curl:8.17.0 + entrypoint: + - sleep + - infinity + healthcheck: + test: ["CMD-SHELL", "curl -fs http://loki:3100/ready"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 10s + depends_on: + - loki + restart: always + + hc-mimir: + container_name: hc-mimir + image: curlimages/curl:8.17.0 + entrypoint: + - sleep + - infinity + healthcheck: + test: ["CMD-SHELL", "curl 
-fs http://mimir:8080/ready"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 10s + depends_on: + - mimir + restart: always + + hc-pyroscope: + container_name: hc-pyroscope + image: curlimages/curl:8.17.0 + entrypoint: + - sleep + - infinity + healthcheck: + test: ["CMD-SHELL", "curl -fs http://pyroscope:4040/ready"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 10s + depends_on: + - pyroscope + restart: always + networks: default: name: finch diff --git a/internal/service/assets/grafana-alerts.yaml b/internal/service/assets/grafana-alerts.yaml new file mode 100644 index 0000000..b93bf5e --- /dev/null +++ b/internal/service/assets/grafana-alerts.yaml @@ -0,0 +1,212 @@ +--- +apiVersion: 1 +groups: + - orgId: 1 + name: Finch + folder: Finch + interval: 1m + rules: + - uid: finch-alert-traefik-down + title: Traefik Down + condition: B + data: + - refId: A + datasourceUid: finch-mimir + relativeTimeRange: + from: 300 + to: 0 + model: + expr: probe_success{job="integrations/blackbox/traefik"} + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + relativeTimeRange: + from: 300 + to: 0 + model: + type: classic_conditions + refId: B + conditions: + - evaluator: + type: lt + params: [1] + operator: + type: and + query: + params: [A] + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + for: 2m + labels: + severity: critical + annotations: + summary: Traefik is down — remote agents cannot deliver metrics, logs or profiles + + - uid: finch-alert-finch-down + title: Finch Down + condition: B + data: + - refId: A + datasourceUid: finch-mimir + relativeTimeRange: + from: 300 + to: 0 + model: + expr: probe_success{job="integrations/blackbox/finch"} + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + relativeTimeRange: + from: 300 + to: 0 + model: + type: classic_conditions + refId: B + conditions: + - evaluator: + type: lt + params: [1] + 
operator: + type: and + query: + params: [A] + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + for: 2m + labels: + severity: critical + annotations: + summary: Finch is down — agents cannot authenticate and data delivery will fail + + - uid: finch-alert-mimir-down + title: Mimir Down + condition: B + data: + - refId: A + datasourceUid: finch-mimir + relativeTimeRange: + from: 300 + to: 0 + model: + expr: probe_success{job="integrations/blackbox/mimir"} + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + relativeTimeRange: + from: 300 + to: 0 + model: + type: classic_conditions + refId: B + conditions: + - evaluator: + type: lt + params: [1] + operator: + type: and + query: + params: [A] + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + for: 2m + labels: + severity: critical + annotations: + summary: Mimir is down — metrics storage and querying unavailable + + - uid: finch-alert-loki-down + title: Loki Down + condition: B + data: + - refId: A + datasourceUid: finch-mimir + relativeTimeRange: + from: 300 + to: 0 + model: + expr: probe_success{job="integrations/blackbox/loki"} + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + relativeTimeRange: + from: 300 + to: 0 + model: + type: classic_conditions + refId: B + conditions: + - evaluator: + type: lt + params: [1] + operator: + type: and + query: + params: [A] + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + for: 2m + labels: + severity: critical + annotations: + summary: Loki is down — log storage and querying unavailable + + - uid: finch-alert-pyroscope-down + title: Pyroscope Down + condition: B + data: + - refId: A + datasourceUid: finch-mimir + relativeTimeRange: + from: 300 + to: 0 + model: + expr: probe_success{job="integrations/blackbox/pyroscope"} + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + - refId: 
B + datasourceUid: __expr__ + relativeTimeRange: + from: 300 + to: 0 + model: + type: classic_conditions + refId: B + conditions: + - evaluator: + type: lt + params: [1] + operator: + type: and + query: + params: [A] + reducer: + type: last + type: query + noDataState: Alerting + execErrState: Alerting + for: 2m + labels: + severity: critical + annotations: + summary: Pyroscope is down — profiling storage and querying unavailable diff --git a/internal/service/assets/loki.yaml b/internal/service/assets/loki.yaml index 103a17b..f42d029 100644 --- a/internal/service/assets/loki.yaml +++ b/internal/service/assets/loki.yaml @@ -4,6 +4,7 @@ auth_enabled: false limits_config: allow_structured_metadata: true volume_enabled: true + retention_period: 72h server: http_listen_port: 3100 @@ -32,3 +33,8 @@ storage_config: cache_location: /var/lib/loki/index_cache filesystem: directory: /var/lib/loki/chunks + +compactor: + working_directory: /var/lib/loki/compactor + retention_enabled: true + delete_request_store: filesystem diff --git a/internal/service/assets/mimir.yaml b/internal/service/assets/mimir.yaml index b672ffc..17df6ab 100644 --- a/internal/service/assets/mimir.yaml +++ b/internal/service/assets/mimir.yaml @@ -1,4 +1,9 @@ --- +limits: + compactor_blocks_retention_period: 72h + ingestion_rate: 0 + ingestion_burst_size: 0 + multitenancy_enabled: false activity_tracker: diff --git a/internal/service/deploy.go b/internal/service/deploy.go index 0a5d890..c20f764 100644 --- a/internal/service/deploy.go +++ b/internal/service/deploy.go @@ -14,11 +14,13 @@ import ( "io/fs" "os" "path" + "strings" "text/template" "time" "github.com/tschaefer/finchctl/internal/config" "github.com/tschaefer/finchctl/internal/mtls" + "github.com/tschaefer/finchctl/internal/target" "github.com/tschaefer/finchctl/internal/version" ) @@ -32,10 +34,15 @@ const ( func (s *Service) __deployMakeDirHierarchy() error { directories := []string{ "grafana/dashboards", - "loki/{data,etc}", - 
"alloy/{data,etc}", - "traefik/etc/{certs.d,conf.d}", - "mimir/{data,etc}", + "grafana/alerting", + "loki/data", + "loki/etc", + "alloy/data", + "alloy/etc", + "traefik/etc/certs.d", + "traefik/etc/conf.d", + "mimir/data", + "mimir/etc", "pyroscope/data", } for _, dir := range directories { @@ -50,19 +57,24 @@ func (s *Service) __deployMakeDirHierarchy() error { func (s *Service) __deploySetDirHierarchyPermission() error { ownership := map[string]string{ - "grafana": "472:472", - "grafana/dashboards": "472:472", - "loki": "10001:10001", - "loki/{data,etc}": "10001:10001", - "alloy": "0:0", - "alloy/{data,etc}": "0:0", - "traefik": "0:0", - "traefik/etc": "0:0", - "traefik/etc/{certs.d,conf.d}": "0:0", - "mimir": "10001:10001", - "mimir/{data,etc}": "10001:10001", - "pyroscope": "10001:10001", - "pyroscope/data": "10001:10001", + "grafana": "472:472", + "grafana/dashboards": "472:472", + "grafana/alerting": "472:472", + "loki": "10001:10001", + "loki/data": "10001:10001", + "loki/etc": "10001:10001", + "alloy": "0:0", + "alloy/data": "0:0", + "alloy/etc": "0:0", + "traefik": "0:0", + "traefik/etc": "0:0", + "traefik/etc/certs.d": "0:0", + "traefik/etc/conf.d": "0:0", + "mimir": "10001:10001", + "mimir/data": "10001:10001", + "mimir/etc": "10001:10001", + "pyroscope": "10001:10001", + "pyroscope/data": "10001:10001", } for dir, owner := range ownership { @@ -236,6 +248,11 @@ func (s *Service) __deployCopyGrafanaDashboards() error { return nil } +func (s *Service) __deployCopyGrafanaAlerts() error { + path := path.Join(s.libDir(), "grafana/alerting/grafana-alerts.yaml") + return s.__helperCopyConfig(path, "400", "472:472") +} + func (s *Service) __deployCopyMimirConfig() error { path := path.Join(s.libDir(), "mimir/etc/mimir.yaml") return s.__helperCopyConfig(path, "400", "10001:10001") @@ -299,13 +316,68 @@ func (s *Service) __helperCopyTemplate(filePath, mode, owner string, data any) e return nil } +func (s *Service) __configHash(data ...[]byte) string { + h := 
sha256.New() + for _, d := range data { + h.Write(d) + } + return hex.EncodeToString(h.Sum(nil))[:16] +} + func (s *Service) __deployCopyComposeFile() error { filePath := path.Join(s.libDir(), "docker-compose.yaml") + alloyTmplBytes, err := fs.ReadFile(Assets, "alloy.config.tmpl") + if err != nil { + return &DeployServiceError{Message: err.Error(), Reason: ""} + } + alloyTmpl, err := template.New("alloy").Parse(string(alloyTmplBytes)) + if err != nil { + return &DeployServiceError{Message: err.Error(), Reason: ""} + } + var alloyRendered bytes.Buffer + if err = alloyTmpl.Execute(&alloyRendered, struct{ Hostname string }{s.config.Hostname}); err != nil { + return &DeployServiceError{Message: err.Error(), Reason: ""} + } + + lokiBytes, err := fs.ReadFile(Assets, "loki.yaml") + if err != nil { + return &DeployServiceError{Message: err.Error(), Reason: ""} + } + mimirBytes, err := fs.ReadFile(Assets, "mimir.yaml") + if err != nil { + return &DeployServiceError{Message: err.Error(), Reason: ""} + } + + grafanaAssets := []string{ + "grafana-alerts.yaml", + "grafana-dashboard-logs-docker.json", + "grafana-dashboard-logs-journal.json", + "grafana-dashboard-logs-file.json", + "grafana-dashboard-metrics.json", + "grafana-dashboard-profiles-finch.json", + } + var grafanaChunks [][]byte + for _, name := range grafanaAssets { + b, err := fs.ReadFile(Assets, name) + if err != nil { + return &DeployServiceError{Message: err.Error(), Reason: ""} + } + grafanaChunks = append(grafanaChunks, b) + } + data := struct { - RootUrl string + RootUrl string + AlloyConfigHash string + GrafanaConfigHash string + LokiConfigHash string + MimirConfigHash string }{ - RootUrl: fmt.Sprintf("https://%s", s.config.Hostname), + RootUrl: fmt.Sprintf("https://%s", s.config.Hostname), + AlloyConfigHash: s.__configHash(alloyRendered.Bytes()), + GrafanaConfigHash: s.__configHash(grafanaChunks...), + LokiConfigHash: s.__configHash(lokiBytes), + MimirConfigHash: s.__configHash(mimirBytes), } return 
s.__helperCopyTemplate(filePath, "400", "0:0", data) @@ -321,14 +393,97 @@ func (s *Service) __deployComposeUp() error { } func (s *Service) __deployComposeReady() error { - cmd := `timeout 180 bash -c 'until curl -fs -o /dev/null -w "%{http_code}" http://localhost | grep -qE "^[234][0-9]{2}$"; do sleep 2; done'` + if s.dryRun { + target.PrintProgress("Skipping readiness check due to dry-run mode", s.format) + return nil + } - out, err := s.target.Run(s.ctx, cmd) - if err != nil { - return &DeployServiceError{Message: err.Error(), Reason: string(out)} + containers := []string{ + "hc-traefik", + "hc-finch", + "hc-grafana", + "hc-loki", + "hc-mimir", + "hc-pyroscope", + } + + const ( + maxWait = 180 * time.Second + interval = 2 * time.Second + unhealthyLimit = 5 + ) + + unhealthyCount := make(map[string]int) + deadline := time.Now().Add(maxWait) + for time.Now().Before(deadline) { + allHealthy := true + var failed []string + + for _, c := range containers { + cmd := fmt.Sprintf("sudo docker inspect --format '{{.State.Health.Status}}' %s", c) + out, err := s.target.Run(s.ctx, cmd) + status := strings.TrimSpace(string(out)) + switch { + case err == nil && status == "healthy": + unhealthyCount[c] = 0 + case err != nil || status == "unhealthy": + unhealthyCount[c]++ + allHealthy = false + if unhealthyCount[c] >= unhealthyLimit { + failed = append(failed, c) + } + default: + allHealthy = false + } + } + + if len(failed) > 0 { + var services []string + for _, c := range failed { + services = append(services, strings.TrimPrefix(c, "hc-")) + } + logParts := s.__deployCollectHealthLogs(services) + + return &DeployServiceError{ + Message: fmt.Sprintf("readiness check failed for: %s", strings.Join(failed, ", ")), + Reason: strings.Join(*logParts, "\n"), + } + } + + if allHealthy { + return nil + } + + time.Sleep(interval) } - return nil + var timedOut []string + for _, c := range containers { + cmd := fmt.Sprintf("sudo docker inspect --format '{{.State.Health.Status}}' %s", c) 
+ out, err := s.target.Run(s.ctx, cmd) + status := strings.TrimSpace(string(out)) + if err != nil || status != "healthy" { + timedOut = append(timedOut, strings.TrimPrefix(c, "hc-")) + } + } + + logParts := s.__deployCollectHealthLogs(timedOut) + + return &DeployServiceError{ + Message: fmt.Sprintf("readiness check failed for: %s", strings.Join(timedOut, ", ")), + Reason: strings.Join(*logParts, "\n"), + } +} + +func (s *Service) __deployCollectHealthLogs(containers []string) *[]string { + var logParts []string + for _, c := range containers { + cmd := fmt.Sprintf("sudo docker logs --tail 50 %s 2>&1", c) + out, _ := s.target.Run(s.ctx, cmd) + logParts = append(logParts, strings.TrimSpace(string(out))) + } + + return &logParts } func (s *Service) deployService() error { @@ -368,6 +523,10 @@ func (s *Service) deployService() error { return err } + if err := s.__deployCopyGrafanaAlerts(); err != nil { + return err + } + if err := s.__deployCopyFinchConfig(); err != nil { return err } diff --git a/internal/service/service_test.go b/internal/service/service_test.go index cc4d4c8..c4827c3 100644 --- a/internal/service/service_test.go +++ b/internal/service/service_test.go @@ -36,12 +36,12 @@ func Test_Deploy(t *testing.T) { assert.NoError(t, err, "deploy service") tracks := strings.Split(record, "\n") - assert.Len(t, tracks, 43, "number of log lines") + assert.Len(t, tracks, 54, "number of log lines") wanted := "Running 'command -v sudo' as .+@localhost" assert.Regexp(t, wanted, tracks[0], "first log line") - wanted = "Running 'timeout 180 bash -c 'until curl -fs -o /dev/null -w \"%{http_code}\" http://localhost | grep -qE \"^[234][0-9]{2}$\"; do sleep 2; done'' as .+@localhost" + wanted = "Skipping readiness check due to dry-run mode" assert.Regexp(t, wanted, tracks[len(tracks)-2], "last log line") s, err = New(context.Background(), Options{ @@ -128,7 +128,7 @@ func Test_Update(t *testing.T) { assert.NoError(t, err, "update service") tracks := strings.Split(record, 
"\n") - assert.Len(t, tracks, 39, "number of log lines") + assert.Len(t, tracks, 50, "number of log lines") wanted := "Running 'command -v sudo' as .+@localhost" assert.Regexp(t, wanted, tracks[0], "first log line") diff --git a/internal/service/update.go b/internal/service/update.go index a263a14..2b519fa 100644 --- a/internal/service/update.go +++ b/internal/service/update.go @@ -107,6 +107,10 @@ func (s *Service) updateService() error { return convertError(err, &UpdateServiceError{}) } + if err := s.__deployCopyGrafanaAlerts(); err != nil { + return convertError(err, &UpdateServiceError{}) + } + if err := s.__updateRecomposeDockerServices(); err != nil { return err }