231 lines
5.6 KiB
Go
231 lines
5.6 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
|
|
"github.com/moby/moby/api/types/container"
|
|
"github.com/moby/moby/client"
|
|
)
|
|
|
|
// Settings for the per-container downtime alert threshold.
const (
	// defaultAlertThreshold is the downtime alert threshold, in seconds (one
	// hour), used when a container has no usable alertThresholdLabel value.
	defaultAlertThreshold = 60 * 60

	// alertThresholdLabel is the container label key whose value overrides
	// defaultAlertThreshold for that container.
	alertThresholdLabel = "io.prometheus.alert.downtime"
)
|
|
|
|
type metrics struct {
|
|
health *prometheus.GaugeVec
|
|
status *prometheus.GaugeVec
|
|
startedAt *prometheus.GaugeVec
|
|
exitCode *prometheus.GaugeVec
|
|
alertThreshold *prometheus.GaugeVec
|
|
}
|
|
|
|
func (m *metrics) reset() {
|
|
m.health.Reset()
|
|
m.status.Reset()
|
|
m.startedAt.Reset()
|
|
m.exitCode.Reset()
|
|
m.alertThreshold.Reset()
|
|
}
|
|
|
|
// boolToFloat converts a boolean into a gauge value: 1 for true, 0 for
// false.
func boolToFloat(b bool) float64 {
	var value float64
	if b {
		value = 1
	}
	return value
}
|
|
|
|
// extractContainerName returns the first entry of names with a single
// leading "/" removed. When names is empty it returns "unknown".
func extractContainerName(names []string) string {
	if len(names) > 0 {
		first := names[0]
		return strings.TrimPrefix(first, "/")
	}
	return "unknown"
}
|
|
|
|
func getAlertThreshold(labels map[string]string) float64 {
|
|
thresholdStr, ok := labels[alertThresholdLabel]
|
|
if !ok {
|
|
return defaultAlertThreshold
|
|
}
|
|
|
|
threshold, err := strconv.ParseFloat(thresholdStr, 64)
|
|
if err != nil {
|
|
log.Printf("Invalid alert threshold value %q, using default: %v", thresholdStr, err)
|
|
return defaultAlertThreshold
|
|
}
|
|
|
|
return threshold
|
|
}
|
|
|
|
func collectContainerMetrics(ctx context.Context, apiClient *client.Client, ctr container.Summary, metrics *metrics) error {
|
|
name := extractContainerName(ctr.Names)
|
|
|
|
labels := prometheus.Labels{"id": ctr.ID, "name": name}
|
|
|
|
running := boolToFloat(ctr.State == container.StateRunning)
|
|
metrics.status.With(labels).Set(running)
|
|
|
|
threshold := getAlertThreshold(ctr.Labels)
|
|
metrics.alertThreshold.With(labels).Set(threshold)
|
|
|
|
inspectResult, err := apiClient.ContainerInspect(ctx, ctr.ID, client.ContainerInspectOptions{})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if inspectResult.Container.State == nil {
|
|
return fmt.Errorf("Container state was empty for %s", ctr.ID)
|
|
}
|
|
|
|
if inspectResult.Container.State.StartedAt == "" {
|
|
metrics.startedAt.With(labels).Set(0)
|
|
} else {
|
|
startedAt, err := time.Parse(time.RFC3339, inspectResult.Container.State.StartedAt)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
metrics.startedAt.With(labels).Set(float64(startedAt.Unix()))
|
|
}
|
|
|
|
metrics.exitCode.With(labels).Set(float64(inspectResult.Container.State.ExitCode))
|
|
|
|
if inspectResult.Container.State.Health != nil {
|
|
healthy := boolToFloat(inspectResult.Container.State.Health.Status == container.Healthy)
|
|
metrics.health.With(labels).Set(healthy)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func collect(ctx context.Context, apiClient *client.Client, metrics *metrics) error {
|
|
containers, err := apiClient.ContainerList(ctx, client.ContainerListOptions{All: true})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var errs []error
|
|
for _, ctr := range containers.Items {
|
|
err = collectContainerMetrics(ctx, apiClient, ctr, metrics)
|
|
if err != nil {
|
|
errs = append(errs, fmt.Errorf("Error collecting metrics for container %s: %w", ctr.ID[:12], err))
|
|
}
|
|
}
|
|
|
|
return errors.Join(errs...)
|
|
}
|
|
|
|
func NewMetrics(reg prometheus.Registerer) *metrics {
|
|
metrics := &metrics{
|
|
health: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "container_inspect_state_health_status",
|
|
Help: "Container's healthcheck value (bool)",
|
|
},
|
|
[]string{"id", "name"},
|
|
),
|
|
status: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "container_inspect_state_running",
|
|
Help: "Container's running state (bool)",
|
|
},
|
|
[]string{"id", "name"},
|
|
),
|
|
startedAt: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "container_inspect_state_started_at",
|
|
Help: "Container's start time (UNIX timestamp)",
|
|
},
|
|
[]string{"id", "name"},
|
|
),
|
|
exitCode: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "container_inspect_state_exit_code",
|
|
Help: "Container's exit code (int)",
|
|
},
|
|
[]string{"id", "name"},
|
|
),
|
|
alertThreshold: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Name: "container_inspect_downtime_alert_threshold",
|
|
Help: "Container's downtime alert threshold in seconds (int)",
|
|
},
|
|
[]string{"id", "name"},
|
|
),
|
|
}
|
|
reg.MustRegister(metrics.health, metrics.status, metrics.startedAt, metrics.exitCode, metrics.alertThreshold)
|
|
|
|
return metrics
|
|
}
|
|
|
|
func main() {
|
|
var healthCheck = flag.Bool("health-check", false, "Perform health check and exit")
|
|
|
|
flag.Parse()
|
|
|
|
if *healthCheck {
|
|
resp, err := http.Get("http://127.0.0.1:8080/health")
|
|
if err != nil {
|
|
os.Exit(1)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
os.Exit(1)
|
|
}
|
|
os.Exit(0)
|
|
}
|
|
|
|
registry := prometheus.NewRegistry()
|
|
m := NewMetrics(registry)
|
|
|
|
apiClient, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
defer apiClient.Close()
|
|
|
|
promHandler := promhttp.HandlerFor(registry, promhttp.HandlerOpts{})
|
|
|
|
http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
|
defer cancel()
|
|
|
|
_, err := apiClient.Ping(ctx, client.PingOptions{})
|
|
if err != nil {
|
|
w.WriteHeader(http.StatusServiceUnavailable)
|
|
fmt.Fprintf(w, "Docker unavailable: %v", err)
|
|
return
|
|
}
|
|
fmt.Fprint(w, "OK")
|
|
})
|
|
|
|
http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
|
|
log.Println("Metrics queried")
|
|
m.reset()
|
|
|
|
ctx, cancel := context.WithTimeout(r.Context(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
err := collect(ctx, apiClient, m)
|
|
if err != nil {
|
|
log.Printf("Error collecting metrics: %v", err)
|
|
}
|
|
|
|
promHandler.ServeHTTP(w, r)
|
|
})
|
|
|
|
log.Println("Starting server on :8080")
|
|
log.Fatal(http.ListenAndServe(":8080", nil))
|
|
}
|