Add stats to enable monitoring for non-running containers

This commit is contained in:
Scott Wallace 2021-09-10 13:21:44 +01:00
parent cbf4bd2c7b
commit 15efadbb5d
Signed by: scott
GPG key ID: AA742FDC5AFE2A72
3 changed files with 77 additions and 23 deletions

2
.gitignore vendored
View file

@ -1 +1,3 @@
.mypy_cache/
.pyenv/ .pyenv/
.vscode/

View file

@ -5,19 +5,21 @@ Module to act as a Prometheus Exporter for Docker containers with a
""" """
import argparse import argparse
import os
import os.path import os.path
import sys import sys
from http.server import HTTPServer from http.server import HTTPServer
import docker import docker # type: ignore
from prometheus_client import ( import numpy
from prometheus_client import ( # type: ignore
CollectorRegistry, CollectorRegistry,
Gauge, Gauge,
generate_latest,
MetricsHandler, MetricsHandler,
generate_latest,
) )
LISTEN_PORT = 8080 LISTEN_PORT = int(os.environ.get('LISTEN_PORT', 8080))
HEALTHY_STR = 'healthy' HEALTHY_STR = 'healthy'
@ -28,12 +30,11 @@ class HTTPHandler(MetricsHandler):
""" """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.docker_api = docker.APIClient() self.docker_api: docker.APIClient = docker.APIClient()
self.docker_client = docker.from_env() self.docker_client = docker.from_env()
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# Override built-in method # Override built-in method
# pylint: disable=invalid-name
def do_GET(self): def do_GET(self):
""" """
Method to handle GET requests Method to handle GET requests
@ -48,7 +49,7 @@ class HTTPHandler(MetricsHandler):
self._respond(200, 'OK') self._respond(200, 'OK')
def _respond(self, status, message): def _respond(self, status: int, message: str):
""" """
Method to output a simple HTTP status and string to the client Method to output a simple HTTP status and string to the client
""" """
@ -67,60 +68,110 @@ class HTTPHandler(MetricsHandler):
registry = CollectorRegistry() registry = CollectorRegistry()
gauge = Gauge( health_gauge = Gauge(
'container_inspect_state_health_status', 'container_inspect_state_health_status',
"Container's healthcheck value (binary)", "Container's healthcheck value (binary)",
labelnames=['id', 'name', 'value'], labelnames=['id', 'name', 'value'],
registry=registry registry=registry,
)
status_gauge = Gauge(
'container_inspect_state_running',
"Container's running state (binary)",
labelnames=['id', 'name'],
registry=registry,
)
started_at_gauge = Gauge(
'container_inspect_state_started_at',
"Container's start time (int)",
labelnames=['id', 'name'],
registry=registry,
)
exit_code_gauge = Gauge(
'container_inspect_state_exit_code',
"Container's exit code (int)",
labelnames=['id', 'name'],
registry=registry,
)
alert_threshold_gauge = Gauge(
'container_inspect_downtime_alert_threshold',
"Container's downtime alert threshold in seconds (int)",
labelnames=['id', 'name'],
registry=registry,
) )
for container in self.docker_client.containers.list(): for container in self.docker_client.containers.list(all=True):
data = self.docker_api.inspect_container(container.id) data = self.docker_api.inspect_container(container.id)
running: str = bool(data['State']['Running'])
started_at: int = data['State']['StartedAt']
exit_code: int = int(data['State']['ExitCode'])
alert_threshold = int(
data['Config']['Labels'].get('io.prometheus.alert.downtime', 3600)
)
starttime = numpy.datetime64(started_at)
status_gauge.labels(
container.id,
container.name,
).set(int(running))
started_at_gauge.labels(container.id, container.name,).set(
int(int(starttime) / 1000000000) # strip nanoseconds
)
exit_code_gauge.labels(
container.id,
container.name,
).set(int(exit_code))
alert_threshold_gauge.labels(
container.id,
container.name,
).set(alert_threshold)
try: try:
health_str = data["State"]["Health"]["Status"] health_str: str = data['State']['Health']['Status']
label_values = [ health_gauge.labels(
container.id, container.id,
container.name, container.name,
health_str, health_str,
] ).set(int(health_str == HEALTHY_STR))
except KeyError: except KeyError:
pass pass
else:
gauge.labels(*label_values).set(int(health_str == HEALTHY_STR))
self._respond(200, generate_latest(registry).decode()) self._respond(200, generate_latest(registry).decode())
def healthy(): def healthy() -> bool:
""" """
Simple function to return if all the requirements are met Simple function to return if all the requirements are met
""" """
return all([ return all(
[
os.path.exists('/var/run/docker.sock'), os.path.exists('/var/run/docker.sock'),
]) ]
)
if __name__ == '__main__': if __name__ == '__main__':
def cli_parse():
def cli_parse() -> argparse.Namespace:
""" """
Function to parse the CLI Function to parse the CLI
""" """
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
'-H', '--healthcheck', '-H',
'--healthcheck',
action='store_true', action='store_true',
help='Simply exit with 0 for healthy or 1 when unhealthy', help='Simply exit with 0 for healthy or 1 when unhealthy',
) )
return parser.parse_args() return parser.parse_args()
def main(): def main() -> int:
""" """
main() main()
""" """
args = cli_parse() args: argparse.Namespace = cli_parse()
if args.healthcheck: if args.healthcheck:
# Invert the sense of 'healthy' for Unix CLI usage # Invert the sense of 'healthy' for Unix CLI usage

View file

@ -1,3 +1,4 @@
# To ensure app dependencies are ported from your virtual environment/host machine into your container, run 'pip freeze > requirements.txt' in the terminal to overwrite this file # To ensure app dependencies are ported from your virtual environment/host machine into your container, run 'pip freeze > requirements.txt' in the terminal to overwrite this file
docker docker
numpy
prometheus_client prometheus_client