dockstat/dockstat.py

212 lines
6 KiB
Python
Raw Permalink Normal View History

2020-10-10 14:11:43 +01:00
#!/usr/bin/env python3
"""
Module to act as a Prometheus Exporter for Docker containers with a
healthcheck configured
"""
import argparse
import os
2020-10-10 14:11:43 +01:00
import os.path
import sys
2020-10-10 16:24:44 +01:00
from http.server import HTTPServer
2021-12-08 17:31:08 +00:00
from typing import Any
2020-10-10 14:11:43 +01:00
2024-11-15 09:18:51 +00:00
import docker
import docker.errors
import numpy
2024-11-15 09:18:51 +00:00
from docker.models.containers import Container
from prometheus_client import CollectorRegistry, Gauge, MetricsHandler, generate_latest
2020-10-10 14:11:43 +01:00
2023-01-12 09:00:47 +00:00
# TCP port the exporter's web server binds to; the DOCKSTAT_LISTEN_PORT
# environment variable overrides the default of 8080
LISTEN_PORT = int(os.getenv("DOCKSTAT_LISTEN_PORT", "8080"))
# Health status string that is reported as 1 on the health gauge
HEALTHY_STR = "healthy"
2020-10-10 14:11:43 +01:00
2024-11-15 09:18:51 +00:00
class HTTPHandler(MetricsHandler):
    """
    Class to encompass the requirements of a Prometheus Exporter
    for Docker containers with a healthcheck configured

    Endpoints:
        /metrics: Per-container gauges built from the container inspect data
        /healthcheck: "OK" (200) when healthy() passes, otherwise "ERR" (500)
    Any other path is answered with a 404.
    """

    def __init__(self, *args: Any, **kwargs: Any):
        """
        Create the Docker clients and hand the request to the base handler

        The base request handler serves the request from inside its own
        constructor, so the clients have to exist before super().__init__()
        is called and can be released as soon as it returns. Closing them
        here stops every request from leaking a client session.

        Args:
            *args (Any): Positional arguments passed through to the base handler
            **kwargs (Any): Keyword arguments passed through to the base handler
        """
        self.docker_api: docker.APIClient = docker.APIClient()
        try:
            self.docker_client = docker.from_env()
            try:
                super().__init__(*args, **kwargs)
            finally:
                self.docker_client.close()
        finally:
            self.docker_api.close()

    # Override built-in method
    def do_GET(self) -> None:
        """
        Handle GET requests

        Exactly one HTTP response is sent for every request.
        """
        if self.path == "/metrics":
            try:
                self._metrics()
            except docker.errors.NotFound as err:
                # Presumably a container vanished while the list was being
                # built; report the failed scrape instead of sending nothing
                print(f"ERROR: Docker resource not found during scrape: {err}")
                self._respond(500, "ERR")
            return

        if self.path == "/healthcheck":
            if healthy():
                self._respond(200, "OK")
            else:
                print("ERROR: Check requirements")
                self._respond(500, "ERR")
            return

        self._respond(404, "Not Found")

    def _respond(self, status: int, message: str) -> None:
        """
        Output a simple HTTP status and string to the client
        e.g. 200 OK

        Args:
            status (int): HTTP status to output (a value of 0 becomes 500)
            message (str): String to output
        """
        self.send_response(int(status) or 500)
        self.send_header("content-type", "text/plain")
        self.end_headers()
        try:
            self.wfile.write(str(message).encode())
        except BrokenPipeError:
            # The client went away before the body was written; there is
            # nobody left to report the failure to
            pass

    def _metrics(self) -> None:
        """
        Handle the request for metrics

        Builds a fresh registry on every call, fills it with one sample per
        container and gauge, and writes the rendered registry to the client.
        Responds with a 500 instead when healthy() fails.
        """
        # healthy must be *called*; the bare function object is always truthy
        if not healthy():
            print("ERROR: Check requirements")
            self._respond(500, "Server not configured correctly")
            return

        registry = CollectorRegistry()
        health_gauge = Gauge(
            "container_inspect_state_health_status",
            "Container's healthcheck value (binary)",
            labelnames=["id", "name", "value"],
            registry=registry,
        )
        status_gauge = Gauge(
            "container_inspect_state_running",
            "Container's running state (binary)",
            labelnames=["id", "name"],
            registry=registry,
        )
        started_at_gauge = Gauge(
            "container_inspect_state_started_at",
            "Container's start time (int)",
            labelnames=["id", "name"],
            registry=registry,
        )
        exit_code_gauge = Gauge(
            "container_inspect_state_exit_code",
            "Container's exit code (int)",
            labelnames=["id", "name"],
            registry=registry,
        )
        alert_threshold_gauge = Gauge(
            "container_inspect_downtime_alert_threshold",
            "Container's downtime alert threshold in seconds (int)",
            labelnames=["id", "name"],
            registry=registry,
        )

        container: Container
        for container in self.docker_client.containers.list(all=True):
            try:
                data = self.docker_api.inspect_container(container.id)
            except docker.errors.NotFound:
                print(f"WARNING: Container {container.id} does not exist. Skipping.")
                continue

            state = data["State"]
            running = bool(state["Running"])
            started_at = state["StartedAt"]
            exit_code = int(state["ExitCode"])

            # Guard against a null or missing Labels entry so that one
            # container cannot abort the whole scrape
            container_labels = data["Config"].get("Labels") or {}
            try:
                alert_threshold = int(
                    container_labels.get("io.prometheus.alert.downtime", 3600)
                )
            except (TypeError, ValueError):
                print(
                    f"WARNING: Container {container.id} has an invalid "
                    "io.prometheus.alert.downtime label. Using 3600."
                )
                alert_threshold = 3600

            # Convert the start timestamp to whole seconds as an integer
            starttime = numpy.datetime64(started_at, "s").astype("long")

            status_gauge.labels(
                container.id,
                container.name,
            ).set(int(running))
            started_at_gauge.labels(
                container.id,
                container.name,
            ).set(starttime)
            exit_code_gauge.labels(
                container.id,
                container.name,
            ).set(exit_code)
            alert_threshold_gauge.labels(
                container.id,
                container.name,
            ).set(alert_threshold)

            try:
                health_str = state["Health"]["Status"]
            except KeyError:
                # No health information in the inspect data (presumably no
                # healthcheck is configured), so no health sample is emitted
                pass
            else:
                health_gauge.labels(
                    container.id,
                    container.name,
                    health_str,
                ).set(int(health_str == HEALTHY_STR))

        self._respond(200, generate_latest(registry).decode())
2020-10-10 14:11:43 +01:00
def healthy() -> bool:
    """
    Report whether every runtime requirement is satisfied

    The only requirement checked is that /var/run/docker.sock exists.

    Returns:
        bool: True if healthy or False if unhealthy
    """
    requirements = (os.path.exists("/var/run/docker.sock"),)
    return all(requirements)
2020-10-10 14:11:43 +01:00
2023-01-12 09:00:47 +00:00
if __name__ == "__main__":

    def cli_parse() -> argparse.Namespace:
        """
        Read the command-line options

        Returns:
            argparse.Namespace: The parsed command-line arguments
        """
        cli = argparse.ArgumentParser()
        cli.add_argument(
            "-H",
            "--healthcheck",
            help="Simply exit with 0 for healthy or 1 when unhealthy",
            action="store_true",
        )
        return cli.parse_args()

    def main() -> int:
        """
        Run a one-shot health probe, or start the exporter web server

        Returns:
            int: Exit status; 0 for healthy or a clean shutdown, 1 for unhealthy
        """
        options: argparse.Namespace = cli_parse()

        if options.healthcheck:
            # Exit statuses are inverted relative to truthiness: 0 means
            # success, so healthy maps to 0 and unhealthy to 1
            return 0 if healthy() else 1

        print(f"Starting web server on port {LISTEN_PORT}")
        try:
            HTTPServer(("", LISTEN_PORT), HTTPHandler).serve_forever()
        except KeyboardInterrupt:
            print("Exiting")

        return 0

    sys.exit(main())