dockstat/main.go

231 lines
5.6 KiB
Go

package main
import (
"context"
"errors"
"flag"
"fmt"
"log"
"net/http"
"os"
"strconv"
"strings"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/client"
)
// Settings for the per-container downtime alert threshold.
const (
// alertThresholdLabel is the container label that getAlertThreshold looks up
// to override the threshold for an individual container.
alertThresholdLabel = "io.prometheus.alert.downtime"
// defaultAlertThreshold is the value used when the label is absent or cannot
// be parsed as a float. The gauge it feeds is described as being in seconds.
defaultAlertThreshold = 3600
)
// metrics bundles the gauge vectors exported by this program. NewMetrics
// creates every vector with the label set {"id", "name"}.
type metrics struct {
// health backs container_inspect_state_health_status: 1 when the healthcheck
// status is healthy, 0 otherwise. Only set for containers whose inspect
// result carries health information.
health *prometheus.GaugeVec
// status backs container_inspect_state_running: 1 when the listed state is
// running, 0 otherwise.
status *prometheus.GaugeVec
// startedAt backs container_inspect_state_started_at (UNIX timestamp).
startedAt *prometheus.GaugeVec
// exitCode backs container_inspect_state_exit_code.
exitCode *prometheus.GaugeVec
// alertThreshold backs container_inspect_downtime_alert_threshold, the value
// returned by getAlertThreshold for the container's labels.
alertThreshold *prometheus.GaugeVec
}
// reset calls Reset on every gauge vector held by m, in field order, so that
// series recorded by an earlier collection are dropped before the next one.
func (m *metrics) reset() {
	vectors := []*prometheus.GaugeVec{
		m.health,
		m.status,
		m.startedAt,
		m.exitCode,
		m.alertThreshold,
	}
	for _, vec := range vectors {
		vec.Reset()
	}
}
// boolToFloat maps a boolean onto the 0/1 encoding used for gauge values:
// true becomes 1 and false becomes 0.
func boolToFloat(b bool) float64 {
	var value float64
	if b {
		value = 1
	}
	return value
}
// extractContainerName returns the first entry of names with one leading "/"
// removed. When names is empty it returns "unknown".
func extractContainerName(names []string) string {
	name := "unknown"
	if len(names) > 0 {
		name = strings.TrimPrefix(names[0], "/")
	}
	return name
}
// getAlertThreshold returns the downtime alert threshold configured through
// the alertThresholdLabel entry of labels. It falls back to
// defaultAlertThreshold when the label is missing, and also when its value
// does not parse as a float64; the latter case is logged.
func getAlertThreshold(labels map[string]string) float64 {
	if raw, found := labels[alertThresholdLabel]; found {
		parsed, parseErr := strconv.ParseFloat(raw, 64)
		if parseErr == nil {
			return parsed
		}
		log.Printf("Invalid alert threshold value %q, using default: %v", raw, parseErr)
	}
	return defaultAlertThreshold
}
// collectContainerMetrics records the gauges for one container.
//
// The running state and alert threshold are taken from the list entry ctr and
// are set before the container is inspected, so they are published even when
// the inspect call fails. Start time, exit code and (when present) health are
// taken from the inspect result.
//
// It returns a wrapped error when the inspect call fails, when the inspect
// result has no state, or when the start timestamp is not valid RFC 3339.
func collectContainerMetrics(ctx context.Context, apiClient *client.Client, ctr container.Summary, metrics *metrics) error {
	labels := prometheus.Labels{"id": ctr.ID, "name": extractContainerName(ctr.Names)}

	metrics.status.With(labels).Set(boolToFloat(ctr.State == container.StateRunning))
	metrics.alertThreshold.With(labels).Set(getAlertThreshold(ctr.Labels))

	inspectResult, err := apiClient.ContainerInspect(ctx, ctr.ID, client.ContainerInspectOptions{})
	if err != nil {
		return fmt.Errorf("inspecting container: %w", err)
	}
	state := inspectResult.Container.State
	if state == nil {
		return fmt.Errorf("Container state was empty for %s", ctr.ID)
	}

	// A missing start time is exported as 0.
	var startedAt float64
	if state.StartedAt != "" {
		parsed, err := time.Parse(time.RFC3339, state.StartedAt)
		if err != nil {
			return fmt.Errorf("parsing start time %q: %w", state.StartedAt, err)
		}
		// A container that has never been started is reported with the zero
		// timestamp (0001-01-01T00:00:00Z). Its Unix value is a large negative
		// number, so treat it like the empty case and keep 0.
		if !parsed.IsZero() {
			startedAt = float64(parsed.Unix())
		}
	}
	metrics.startedAt.With(labels).Set(startedAt)

	metrics.exitCode.With(labels).Set(float64(state.ExitCode))

	// Containers without health information get no health series at all.
	if state.Health != nil {
		metrics.health.With(labels).Set(boolToFloat(state.Health.Status == container.Healthy))
	}
	return nil
}
// collect lists all containers (including stopped ones) and records metrics
// for each of them.
//
// A failure to list containers is returned immediately. Failures for
// individual containers do not stop the loop; they are gathered and returned
// together via errors.Join, so the result is nil only when every container
// was collected successfully.
func collect(ctx context.Context, apiClient *client.Client, metrics *metrics) error {
	containers, err := apiClient.ContainerList(ctx, client.ContainerListOptions{All: true})
	if err != nil {
		return fmt.Errorf("listing containers: %w", err)
	}

	var errs []error
	for _, ctr := range containers.Items {
		if err := collectContainerMetrics(ctx, apiClient, ctr, metrics); err != nil {
			// Abbreviate the ID for the message, but never slice past the end:
			// an unexpectedly short ID must not panic while reporting an error.
			shortID := ctr.ID
			if len(shortID) > 12 {
				shortID = shortID[:12]
			}
			errs = append(errs, fmt.Errorf("Error collecting metrics for container %s: %w", shortID, err))
		}
	}
	return errors.Join(errs...)
}
// NewMetrics builds the five container gauge vectors, each labelled by
// container "id" and "name", registers them with reg via MustRegister, and
// returns them bundled in a metrics value.
func NewMetrics(reg prometheus.Registerer) *metrics {
	// newGauge creates one gauge vector with the shared label set.
	newGauge := func(name, help string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(
			prometheus.GaugeOpts{Name: name, Help: help},
			[]string{"id", "name"},
		)
	}

	m := &metrics{
		health: newGauge(
			"container_inspect_state_health_status",
			"Container's healthcheck value (bool)",
		),
		status: newGauge(
			"container_inspect_state_running",
			"Container's running state (bool)",
		),
		startedAt: newGauge(
			"container_inspect_state_started_at",
			"Container's start time (UNIX timestamp)",
		),
		exitCode: newGauge(
			"container_inspect_state_exit_code",
			"Container's exit code (int)",
		),
		alertThreshold: newGauge(
			"container_inspect_downtime_alert_threshold",
			"Container's downtime alert threshold in seconds (int)",
		),
	}

	reg.MustRegister(m.health, m.status, m.startedAt, m.exitCode, m.alertThreshold)
	return m
}
// main runs the exporter in one of two modes.
//
// With -health-check it requests http://127.0.0.1:8080/health and exits with
// status 0 when the response is 200 OK, or 1 otherwise. This lets the same
// binary serve as a container health probe.
//
// Without the flag it serves two endpoints on :8080:
//
//	/health   pings the Docker daemon; 200 "OK" or 503 with the error text
//	/metrics  re-collects container state and serves it in Prometheus format
func main() {
	healthCheck := flag.Bool("health-check", false, "Perform health check and exit")
	flag.Parse()

	if *healthCheck {
		// Bound the probe so a hung server fails the check instead of
		// blocking it forever (the default client has no timeout).
		probe := &http.Client{Timeout: 5 * time.Second}
		resp, err := probe.Get("http://127.0.0.1:8080/health")
		if err != nil {
			os.Exit(1)
		}
		// os.Exit skips deferred calls, so close the body explicitly.
		resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			os.Exit(1)
		}
		os.Exit(0)
	}

	registry := prometheus.NewRegistry()
	m := NewMetrics(registry)

	apiClient, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
	if err != nil {
		log.Fatal(err)
	}

	promHandler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{})

	// A dedicated mux keeps the exposed routes limited to the two below.
	mux := http.NewServeMux()

	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		// Derive from the request so the ping is abandoned if the caller
		// goes away before the 2s budget is spent.
		ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
		defer cancel()

		if _, err := apiClient.Ping(ctx, client.PingOptions{}); err != nil {
			w.WriteHeader(http.StatusServiceUnavailable)
			fmt.Fprintf(w, "Docker unavailable: %v", err)
			return
		}
		fmt.Fprint(w, "OK")
	})

	// scrapeSlot serialises scrapes. The gauges are shared and each scrape
	// resets them before refilling, so two overlapping scrapes could wipe
	// each other's series and serve a partial result. The slot is held until
	// the response has been written.
	scrapeSlot := make(chan struct{}, 1)

	mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		log.Println("Metrics queried")

		select {
		case scrapeSlot <- struct{}{}:
			defer func() { <-scrapeSlot }()
		case <-r.Context().Done():
			// The caller gave up while waiting for its turn.
			return
		}

		m.reset()
		ctx, cancel := context.WithTimeout(r.Context(), 10*time.Second)
		defer cancel()

		// Best effort: whatever was collected is still served on error.
		if err := collect(ctx, apiClient, m); err != nil {
			log.Printf("Error collecting metrics: %v", err)
		}
		promHandler.ServeHTTP(w, r)
	})

	// Explicit timeouts stop slow or stalled clients from pinning
	// connections. WriteTimeout leaves room for the 10s collection budget.
	srv := &http.Server{
		Addr:              ":8080",
		Handler:           mux,
		ReadHeaderTimeout: 5 * time.Second,
		ReadTimeout:       10 * time.Second,
		WriteTimeout:      30 * time.Second,
		IdleTimeout:       60 * time.Second,
	}

	log.Println("Starting server on :8080")
	err = srv.ListenAndServe()
	// log.Fatal exits without running deferred calls, so release the Docker
	// client before reporting why the server stopped.
	apiClient.Close()
	log.Fatal(err)
}