dockstat/dockstat.py

203 lines
5.8 KiB
Python

#!/usr/bin/env python3
"""
Module to act as a Prometheus Exporter for Docker containers with a
healthcheck configured
"""
import argparse
import os
import os.path
import sys
from http.server import HTTPServer
from typing import Any
import docker # type: ignore[import]
import numpy
from prometheus_client import CollectorRegistry # type: ignore[import]
from prometheus_client import Gauge, MetricsHandler, generate_latest
# TCP port the web server binds to; taken from the DOCKSTAT_LISTEN_PORT
# environment variable when set, otherwise 8080
LISTEN_PORT = int(os.getenv('DOCKSTAT_LISTEN_PORT', '8080'))
# Health status string that is reported as healthy (gauge value 1)
HEALTHY_STR = 'healthy'
class HTTPHandler(MetricsHandler):  # type: ignore[misc]
    """
    Class to encompass the requirements of a Prometheus Exporter
    for Docker containers with a healthcheck configured

    Routes:
        /metrics      Gauges built from the inspect data of every container
        /healthcheck  200 OK when healthy() holds, otherwise 500 ERR
    Every other path is answered with 404.
    """

    # Threshold in seconds used when a container has no usable
    # io.prometheus.alert.downtime label
    DEFAULT_ALERT_THRESHOLD = 3600
    # Content type of the Prometheus text exposition format
    METRICS_CONTENT_TYPE = 'text/plain; version=0.0.4; charset=utf-8'

    def __init__(self, *args: Any, **kwargs: Any):
        # The Docker clients are created before calling the base constructor
        # because the standard library request handler services the request
        # from inside its own __init__.
        # NOTE(review): this creates a fresh pair of clients per request and
        # never closes them explicitly - confirm that is acceptable.
        self.docker_api: docker.APIClient = docker.APIClient()
        self.docker_client = docker.from_env()
        super().__init__(*args, **kwargs)

    # Override built-in method
    def do_GET(self) -> None:
        """
        Handle GET requests

        Exactly one response is written per request.
        """
        if self.path == '/metrics':
            self._metrics()
        elif self.path == '/healthcheck':
            if healthy():
                self._respond(200, 'OK')
            else:
                print('ERROR: Check requirements')
                self._respond(500, 'ERR')
        else:
            # Unknown paths previously got no response at all
            self._respond(404, 'Not Found')

    def _respond(
        self,
        status: int,
        message: str,
        content_type: str = 'text/plain; charset=utf-8',
    ) -> None:
        """
        Output a simple HTTP status and string to the client
        e.g. 200 OK

        Args:
            status (int): HTTP status to output; a status of 0 is sent as 500
            message (str): String to output
            content_type (str): Value of the Content-Type response header
        """
        self.send_response(int(status) or 500)
        self.send_header('Content-Type', content_type)
        self.end_headers()
        self.wfile.write(str(message).encode())

    def _metrics(self) -> None:
        """
        Handle the request for metrics

        Responds with 500 when healthy() is false. Otherwise inspects every
        container (running or not) and responds with the rendered gauges.
        """
        # healthy must be *called*; the bare function object is always truthy
        if not healthy():
            print('ERROR: Check requirements')
            self._respond(500, 'Server not configured correctly')
            return

        # A fresh registry per request so removed containers do not linger
        registry = CollectorRegistry()
        health_gauge = Gauge(
            'container_inspect_state_health_status',
            "Container's healthcheck value (binary)",
            labelnames=['id', 'name', 'value'],
            registry=registry,
        )
        status_gauge = Gauge(
            'container_inspect_state_running',
            "Container's running state (binary)",
            labelnames=['id', 'name'],
            registry=registry,
        )
        started_at_gauge = Gauge(
            'container_inspect_state_started_at',
            "Container's start time (int)",
            labelnames=['id', 'name'],
            registry=registry,
        )
        exit_code_gauge = Gauge(
            'container_inspect_state_exit_code',
            "Container's exit code (int)",
            labelnames=['id', 'name'],
            registry=registry,
        )
        alert_threshold_gauge = Gauge(
            'container_inspect_downtime_alert_threshold',
            "Container's downtime alert threshold in seconds (int)",
            labelnames=['id', 'name'],
            registry=registry,
        )

        for container in self.docker_client.containers.list(all=True):
            try:
                data = self.docker_api.inspect_container(container.id)
            except docker.errors.NotFound:
                # The container vanished between listing and inspecting
                print(f'WARNING: Container {container.id} does not exist. Skipping.')
                continue

            state = data['State']
            running = bool(state['Running'])
            exit_code = int(state['ExitCode'])
            # Start time converted to whole seconds since the Unix epoch
            starttime: numpy.longlong = numpy.datetime64(
                state['StartedAt'], 's'
            ).astype('long')

            # Labels may be missing or null; treat both as "no labels"
            container_labels = data['Config'].get('Labels') or {}
            raw_threshold = container_labels.get(
                'io.prometheus.alert.downtime', self.DEFAULT_ALERT_THRESHOLD
            )
            try:
                alert_threshold = int(raw_threshold)
            except (TypeError, ValueError):
                # One badly labelled container must not break the whole scrape
                print(
                    f'WARNING: Container {container.id} has an invalid '
                    f'io.prometheus.alert.downtime label. Using default.'
                )
                alert_threshold = self.DEFAULT_ALERT_THRESHOLD

            identity = (container.id, container.name)
            status_gauge.labels(*identity).set(int(running))
            started_at_gauge.labels(*identity).set(starttime)
            exit_code_gauge.labels(*identity).set(exit_code)
            alert_threshold_gauge.labels(*identity).set(alert_threshold)

            # Containers whose state has no Health/Status entry get no
            # health sample
            health_str = (state.get('Health') or {}).get('Status')
            if health_str is not None:
                health_gauge.labels(*identity, health_str).set(
                    int(health_str == HEALTHY_STR)
                )

        self._respond(
            200,
            generate_latest(registry).decode(),
            content_type=self.METRICS_CONTENT_TYPE,
        )
def healthy() -> bool:
    """
    Report whether every runtime requirement of the exporter is met

    The only requirement checked is that the path /var/run/docker.sock
    exists.

    Returns:
        bool: True if healthy or False if unhealthy
    """
    required_paths = ('/var/run/docker.sock',)
    for required in required_paths:
        if not os.path.exists(required):
            return False
    return True
if __name__ == '__main__':

    def cli_parse() -> argparse.Namespace:
        """
        Build the argument parser and parse the command line

        Returns:
            argparse.Namespace: Parsed arguments; `healthcheck` is True when
            -H/--healthcheck was given
        """
        cli = argparse.ArgumentParser()
        cli.add_argument(
            '-H',
            '--healthcheck',
            help='Simply exit with 0 for healthy or 1 when unhealthy',
            action='store_true',
        )
        return cli.parse_args()

    def main() -> int:
        """
        Run a one-shot healthcheck or serve HTTP until interrupted

        Returns:
            int: Process exit status
        """
        options: argparse.Namespace = cli_parse()

        if options.healthcheck:
            # Exit status 0 means success, so the health flag is negated
            return not healthy()

        print(f'Starting web server on port {LISTEN_PORT}')
        try:
            # An empty host string binds to all interfaces
            server = HTTPServer(('', LISTEN_PORT), HTTPHandler)
            server.serve_forever()
        except KeyboardInterrupt:
            print('Exiting')

        return 0

    sys.exit(main())