// dockstat/main.go

package main

import (
	"context"
	"errors"
	"flag"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/moby/moby/api/types/container"
	"github.com/moby/moby/client"
)

// Downtime alert threshold configuration.
//
// alertThresholdLabel is the container label that getAlertThreshold looks up
// to obtain a per-container threshold. defaultAlertThreshold is the value
// getAlertThreshold returns when that label is missing or cannot be parsed as
// a float. The help text of the gauge exporting the value describes it as
// seconds.
const (
	alertThresholdLabel   = "io.prometheus.alert.downtime"
	defaultAlertThreshold = 3600
)

// metrics bundles the gauge vectors this exporter publishes. NewMetrics
// creates each vector with the label names "id" and "name", and
// collectContainerMetrics fills them in per container.
type metrics struct {
	health         *prometheus.GaugeVec // 1 if the health status is healthy, 0 otherwise; only set when health data exists
	status         *prometheus.GaugeVec // 1 if the container state is running, 0 otherwise
	startedAt      *prometheus.GaugeVec // start time as a UNIX timestamp; 0 when StartedAt is empty
	exitCode       *prometheus.GaugeVec // exit code taken from the inspected container state
	alertThreshold *prometheus.GaugeVec // downtime alert threshold as returned by getAlertThreshold
}

// reset drops every recorded label combination from all gauge vectors, in
// field declaration order, so that values from a previous collection are not
// carried over into the next one.
func (m *metrics) reset() {
	vectors := []*prometheus.GaugeVec{
		m.health,
		m.status,
		m.startedAt,
		m.exitCode,
		m.alertThreshold,
	}
	for _, vec := range vectors {
		vec.Reset()
	}
}

// boolToFloat maps a boolean onto the gauge values used for flag-style
// metrics: 1 for true and 0 for false.
func boolToFloat(b bool) float64 {
	var value float64 // zero value covers the false case
	if b {
		value = 1
	}
	return value
}

// extractContainerName returns the first entry of names with a single leading
// "/" removed. When names is empty it returns the placeholder "unknown".
func extractContainerName(names []string) string {
	if len(names) > 0 {
		primary := names[0]
		return strings.TrimPrefix(primary, "/")
	}
	return "unknown"
}

// getAlertThreshold returns the downtime alert threshold configured through
// the alertThresholdLabel entry of labels. If the label is absent, or its
// value cannot be parsed as a 64-bit float, defaultAlertThreshold is returned;
// the parse failure is logged in the latter case.
func getAlertThreshold(labels map[string]string) float64 {
	raw, present := labels[alertThresholdLabel]
	if present {
		parsed, parseErr := strconv.ParseFloat(raw, 64)
		if parseErr == nil {
			return parsed
		}
		log.Printf("Invalid alert threshold value %q, using default: %v", raw, parseErr)
	}
	return defaultAlertThreshold
}

// collectContainerMetrics records the gauges for a single container.
//
// The running flag and the alert threshold are derived from the list summary
// ctr and are set before the container is inspected, so they are exported even
// if the inspect call fails. Start time, exit code and health come from the
// inspect result. The health gauge is only set when the inspected state
// carries health information.
//
// The start time gauge is 0 when the container reports no start time, either
// as an empty string or as the zero timestamp (0001-01-01T00:00:00Z).
//
// It returns an error when the inspect call fails, when the inspect result has
// no state, or when the start timestamp cannot be parsed; in those cases the
// gauges that would have been set afterwards are left untouched.
func collectContainerMetrics(ctx context.Context, apiClient *client.Client, ctr container.Summary, metrics *metrics) error {
	labels := prometheus.Labels{"id": ctr.ID, "name": extractContainerName(ctr.Names)}

	metrics.status.With(labels).Set(boolToFloat(ctr.State == container.StateRunning))
	metrics.alertThreshold.With(labels).Set(getAlertThreshold(ctr.Labels))

	inspectResult, err := apiClient.ContainerInspect(ctx, ctr.ID, client.ContainerInspectOptions{})
	if err != nil {
		return fmt.Errorf("inspecting container: %w", err)
	}

	state := inspectResult.Container.State
	if state == nil {
		return fmt.Errorf("container state was empty for %s", ctr.ID)
	}

	// started stays 0 unless the container reports a real start time.
	var started float64
	if state.StartedAt != "" {
		var startedAt time.Time
		startedAt, err = time.Parse(time.RFC3339, state.StartedAt)
		if err != nil {
			return fmt.Errorf("parsing start time %q: %w", state.StartedAt, err)
		}
		// A never-started container reports the zero timestamp; exporting its
		// Unix() value would yield a large negative number instead of 0.
		if !startedAt.IsZero() {
			started = float64(startedAt.Unix())
		}
	}
	metrics.startedAt.With(labels).Set(started)

	metrics.exitCode.With(labels).Set(float64(state.ExitCode))

	if state.Health != nil {
		metrics.health.With(labels).Set(boolToFloat(state.Health.Status == container.Healthy))
	}

	return nil
}

// collect lists all containers (including stopped ones) and records metrics
// for each of them via collectContainerMetrics.
//
// A failure for one container does not stop collection for the others: the
// per-container errors are gathered and returned together through
// errors.Join, so the result is nil only when every container succeeded. A
// failure to list containers is returned immediately.
func collect(ctx context.Context, apiClient *client.Client, metrics *metrics) error {
	containers, err := apiClient.ContainerList(ctx, client.ContainerListOptions{All: true})
	if err != nil {
		return fmt.Errorf("listing containers: %w", err)
	}

	var errs []error
	for _, ctr := range containers.Items {
		if err := collectContainerMetrics(ctx, apiClient, ctr, metrics); err != nil {
			// Shorten the ID for readability, but never slice past its end:
			// an unexpectedly short ID must not panic the error path.
			shortID := ctr.ID
			if len(shortID) > 12 {
				shortID = shortID[:12]
			}
			errs = append(errs, fmt.Errorf("collecting metrics for container %s: %w", shortID, err))
		}
	}

	return errors.Join(errs...)
}

// NewMetrics builds the exporter's gauge vectors, each labelled by container
// "id" and "name", registers all of them with reg and returns the bundle.
// Registration uses MustRegister, so a registration failure panics.
func NewMetrics(reg prometheus.Registerer) *metrics {
	// perContainerGauge builds one gauge vector keyed by container id and name.
	perContainerGauge := func(name, help string) *prometheus.GaugeVec {
		opts := prometheus.GaugeOpts{Name: name, Help: help}
		return prometheus.NewGaugeVec(opts, []string{"id", "name"})
	}

	m := &metrics{
		health: perContainerGauge(
			"container_inspect_state_health_status",
			"Container's healthcheck value (bool)",
		),
		status: perContainerGauge(
			"container_inspect_state_running",
			"Container's running state (bool)",
		),
		startedAt: perContainerGauge(
			"container_inspect_state_started_at",
			"Container's start time (UNIX timestamp)",
		),
		exitCode: perContainerGauge(
			"container_inspect_state_exit_code",
			"Container's exit code (int)",
		),
		alertThreshold: perContainerGauge(
			"container_inspect_downtime_alert_threshold",
			"Container's downtime alert threshold in seconds (int)",
		),
	}

	reg.MustRegister(m.health, m.status, m.startedAt, m.exitCode, m.alertThreshold)
	return m
}

// main runs the exporter in one of two modes.
//
// With -health-check it probes the local /health endpoint and exits with
// status 0 when the endpoint answers 200, or 1 otherwise.
//
// Without the flag it serves on :8080:
//   - /health pings the Docker daemon and answers 503 when the ping fails;
//   - /metrics resets the gauges, collects fresh values for every container
//     and serves the registry contents.
func main() {
	healthCheck := flag.Bool("health-check", false, "Perform health check and exit")

	flag.Parse()

	if *healthCheck {
		// Bound the probe so that a hung server fails the check instead of
		// blocking it indefinitely.
		probe := &http.Client{Timeout: 5 * time.Second}
		resp, err := probe.Get("http://127.0.0.1:8080/health")
		if err != nil {
			os.Exit(1)
		}
		// os.Exit does not run deferred calls, so close the body explicitly.
		resp.Body.Close()

		if resp.StatusCode != http.StatusOK {
			os.Exit(1)
		}
		os.Exit(0)
	}

	registry := prometheus.NewRegistry()
	m := NewMetrics(registry)

	apiClient, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
	if err != nil {
		log.Fatal(err)
	}

	promHandler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{})

	http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		// Derive from the request context so the ping is abandoned as soon as
		// the caller goes away, in addition to the 2s cap.
		ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
		defer cancel()

		if _, err := apiClient.Ping(ctx, client.PingOptions{}); err != nil {
			w.WriteHeader(http.StatusServiceUnavailable)
			fmt.Fprintf(w, "Docker unavailable: %v", err)
			return
		}
		fmt.Fprint(w, "OK")
	})

	// scrapeSlot serializes scrapes. Handlers run concurrently, and the
	// reset/collect/serve sequence below works on shared gauges: without this
	// guard one scrape's reset could wipe values another scrape is about to
	// serve. A one-slot channel is used so that waiting can be abandoned when
	// the request is cancelled.
	scrapeSlot := make(chan struct{}, 1)

	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		select {
		case scrapeSlot <- struct{}{}:
			defer func() { <-scrapeSlot }()
		case <-r.Context().Done():
			// The caller gave up while waiting for its turn.
			return
		}

		log.Println("Metrics queried")
		m.reset()

		ctx, cancel := context.WithTimeout(r.Context(), 10*time.Second)
		defer cancel()

		// Collection errors are logged only; whatever was collected is still served.
		if err := collect(ctx, apiClient, m); err != nil {
			log.Printf("Error collecting metrics: %v", err)
		}

		promHandler.ServeHTTP(w, r)
	})

	// A nil Handler makes the server use http.DefaultServeMux, where the
	// handlers above are registered. ReadHeaderTimeout keeps clients that
	// never finish sending their request headers from holding connections open.
	srv := &http.Server{
		Addr:              ":8080",
		ReadHeaderTimeout: 5 * time.Second,
	}

	log.Println("Starting server on :8080")
	serveErr := srv.ListenAndServe()

	// log.Fatal exits without running deferred calls, so release the Docker
	// client explicitly before reporting why the server stopped.
	apiClient.Close()
	log.Fatal(serveErr)
}