209 lines
5.9 KiB
Python
209 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
Module to act as a Prometheus Exporter for Docker containers with a
healthcheck configured
"""
|
|
|
|
import argparse
import os
import os.path
import sys
from http.server import HTTPServer
from typing import Any

import docker  # type: ignore[import]
import numpy
from prometheus_client import CollectorRegistry  # type: ignore[import]
from prometheus_client import (
    CONTENT_TYPE_LATEST,
    Gauge,
    MetricsHandler,
    generate_latest,
)
|
|
|
|
# TCP port the web server binds to; the DOCKSTAT_LISTEN_PORT environment
# variable overrides the default of 8080.
LISTEN_PORT = int(os.getenv("DOCKSTAT_LISTEN_PORT", "8080"))
# Health status string that is reported as 1 by the health gauge.
HEALTHY_STR = "healthy"
|
|
|
|
|
|
class HTTPHandler(MetricsHandler):  # type: ignore[misc]
    """
    Class to encompass the requirements of a Prometheus Exporter
    for Docker containers with a healthcheck configured

    Serves two paths:

    * ``/metrics``     - per-container gauges in Prometheus text format
    * ``/healthcheck`` - ``200 OK`` or ``500 ERR`` depending on healthy()

    Any other path is answered with ``404``.
    """

    # Container label holding the downtime alert threshold, and the value
    # (in seconds) used when the label is missing or unusable.
    _ALERT_LABEL = "io.prometheus.alert.downtime"
    _DEFAULT_ALERT_THRESHOLD = 3600

    def __init__(self, *args: Any, **kwargs: Any):
        # NOTE: the Docker clients are assigned *before* delegating to the
        # base constructor because socketserver-style request handlers
        # process the request (and so call do_GET) from inside __init__.
        self.docker_api: docker.APIClient = docker.APIClient()
        self.docker_client = docker.from_env()
        super().__init__(*args, **kwargs)

    # Override built-in method
    def do_GET(self) -> None:
        """
        Handle GET requests

        Exactly one response is sent per request.
        """
        if self.path == "/metrics":
            try:
                self._metrics()
            except docker.errors.NotFound as err:
                # _metrics() only responds once all data is collected, so
                # nothing has been sent yet; report the failure instead of
                # leaving the client without any response.
                print(f"ERROR: Docker resource disappeared during scrape: {err}")
                self._respond(500, "ERR")
            return

        if self.path == "/healthcheck":
            if healthy():
                self._respond(200, "OK")
            else:
                print("ERROR: Check requirements")
                self._respond(500, "ERR")
            return

        self._respond(404, "Not Found")

    def _respond(
        self,
        status: int,
        message: str,
        content_type: str = "text/plain; charset=utf-8",
    ) -> None:
        """
        Output a simple HTTP status and string to the client

        e.g. 200 OK

        Args:
            status (int): HTTP status to output (a falsy status becomes 500)
            message (str): String to output
            content_type (str): Value of the Content-Type response header
        """
        body = str(message).encode()
        self.send_response(int(status) or 500)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        try:
            self.wfile.write(body)
        except BrokenPipeError:
            # The client hung up before reading the body; nothing to do.
            pass

    @classmethod
    def _alert_threshold(cls, data: Any) -> int:
        """
        Extract the downtime alert threshold from container inspect data

        Args:
            data: Result of inspect_container() for one container

        Returns:
            int: Threshold in seconds; the default when the label is
            missing or cannot be converted to an integer
        """
        # The labels map may be reported as null rather than an empty dict.
        labels = data["Config"]["Labels"] or {}
        raw = labels.get(cls._ALERT_LABEL, cls._DEFAULT_ALERT_THRESHOLD)
        try:
            return int(raw)
        except (TypeError, ValueError):
            # One badly labelled container must not break the whole scrape.
            print(
                f"WARNING: Invalid {cls._ALERT_LABEL} value {raw!r}. "
                f"Using {cls._DEFAULT_ALERT_THRESHOLD}."
            )
            return cls._DEFAULT_ALERT_THRESHOLD

    def _metrics(self) -> None:
        """
        Handle the request for metrics

        Builds a fresh registry per request, fills it from the inspect
        data of every container (running or not) and writes it out in the
        Prometheus text exposition format.
        """
        # healthy is a function: it must be *called*, otherwise the test is
        # on the (always truthy) function object and never triggers.
        if not healthy():
            print("ERROR: Check requirements")
            self._respond(500, "Server not configured correctly")
            return

        registry = CollectorRegistry()

        health_gauge = Gauge(
            "container_inspect_state_health_status",
            "Container's healthcheck value (binary)",
            labelnames=["id", "name", "value"],
            registry=registry,
        )
        status_gauge = Gauge(
            "container_inspect_state_running",
            "Container's running state (binary)",
            labelnames=["id", "name"],
            registry=registry,
        )
        started_at_gauge = Gauge(
            "container_inspect_state_started_at",
            "Container's start time (int)",
            labelnames=["id", "name"],
            registry=registry,
        )
        exit_code_gauge = Gauge(
            "container_inspect_state_exit_code",
            "Container's exit code (int)",
            labelnames=["id", "name"],
            registry=registry,
        )
        alert_threshold_gauge = Gauge(
            "container_inspect_downtime_alert_threshold",
            "Container's downtime alert threshold in seconds (int)",
            labelnames=["id", "name"],
            registry=registry,
        )

        for container in self.docker_client.containers.list(all=True):
            try:
                data = self.docker_api.inspect_container(container.id)
            except docker.errors.NotFound:
                # Container was removed between listing and inspecting.
                print(f"WARNING: Container {container.id} does not exist. Skipping.")
                continue

            state = data["State"]
            running = bool(state["Running"])
            exit_code = int(state["ExitCode"])
            alert_threshold = self._alert_threshold(data)
            # Start timestamp converted to whole seconds since the epoch.
            starttime: numpy.longlong = numpy.datetime64(
                state["StartedAt"], "s"
            ).astype("long")

            status_gauge.labels(container.id, container.name).set(int(running))
            started_at_gauge.labels(container.id, container.name).set(starttime)
            exit_code_gauge.labels(container.id, container.name).set(exit_code)
            alert_threshold_gauge.labels(container.id, container.name).set(
                alert_threshold
            )

            try:
                health_str = state["Health"]["Status"]
            except KeyError:
                # No health information in the inspect data (presumably no
                # healthcheck configured): emit no health sample.
                continue

            health_gauge.labels(
                container.id,
                container.name,
                health_str,
            ).set(int(health_str == HEALTHY_STR))

        self._respond(
            200,
            generate_latest(registry).decode(),
            CONTENT_TYPE_LATEST,
        )
|
|
|
|
|
|
def healthy(socket_path: str = "/var/run/docker.sock") -> bool:
    """
    Simple function to return if all the requirements are met

    The only requirement checked is that the Docker socket path exists.

    Args:
        socket_path (str): Filesystem path of the Docker socket to look
            for. Defaults to "/var/run/docker.sock".

    Returns:
        bool: True if healthy or False if unhealthy
    """
    return os.path.exists(socket_path)
|
|
|
|
|
|
if __name__ == "__main__":

    def cli_parse() -> argparse.Namespace:
        """
        Parse the CLI

        Returns:
            argparse.Namespace: Arguments from the CLI
        """
        parser = argparse.ArgumentParser()

        parser.add_argument(
            "-H",
            "--healthcheck",
            action="store_true",
            help="Simply exit with 0 for healthy or 1 when unhealthy",
        )

        return parser.parse_args()

    def main() -> int:
        """
        Run a one-shot healthcheck or start the web server

        Returns:
            int: Process exit code (0 for success/healthy, 1 for unhealthy)
        """
        args: argparse.Namespace = cli_parse()

        if args.healthcheck:
            # Unix convention: exit code 0 means healthy, 1 means unhealthy.
            # Return a real int so the value matches the annotation.
            return 0 if healthy() else 1

        print(f"Starting web server on port {LISTEN_PORT}")
        try:
            # The context manager closes the listening socket on the way out.
            with HTTPServer(("", LISTEN_PORT), HTTPHandler) as server:
                server.serve_forever()
        except KeyboardInterrupt:
            print("Exiting")

        return 0

    sys.exit(main())
|