isengard-bot/data.py

260 lines
8.7 KiB
Python
Raw Normal View History

2022-02-08 15:46:34 +01:00
import datetime
import logging
from systemd.journal import JournalHandler
# Logging: module-level logger, named after this module, with a
# JournalHandler (from systemd.journal) attached as handler.
log = logging.getLogger(__name__)
log.addHandler(JournalHandler())
2022-02-08 20:25:51 +01:00
2022-02-08 15:46:34 +01:00
class HostData:
    """
    Data related to notifications related to a given host.

    Holds the last known host-level state plus, for each service seen on
    that host, its last status and notification type.
    """

    def __init__(self, name):
        """
        Create the record for host *name* with default (healthy) values.
        """
        self.name = name
        # Concerning host
        self.type = ""            # type of the last host-level notification
        self.status = "OK"        # last host-level status
        self.downtime = False     # True while a downtime is in progress
        # Concerning services (both dicts are keyed by service name)
        self.statuses = {}
        self.types = {}
        # Tools
        # Number of services currently in each status
        self.counts = {"CRITICAL": 0, "WARNING": 0, "OK": 0}
        # Name prepended to every notification sent about this host
        self.maintainer = "Tout le monde"

    def __repr__(self):
        """
        Return a readable summary, so that logging a collection of hosts
        shows their state instead of bare object addresses.
        """
        return "%s(name=%r, status=%r, downtime=%r, statuses=%r, counts=%r)" % (
            self.__class__.__name__, self.name, self.status,
            self.downtime, self.statuses, self.counts)
2022-02-08 20:25:51 +01:00
2022-02-08 15:46:34 +01:00
class DataStore:
    """
    In-memory store of the last known state of every host and service.

    Each message given to push() updates the state of the host it is
    about, is appended to a per-maintainer history, and produces at most
    one notification sent through the linked bot.
    """

    def __init__(self, linkedBot):
        """
        linkedBot: object whose push(destination, text) method is called
        to deliver notifications.
        """
        log.info("Created DataStore")
        self.knownHosts = {}
        self.knownMaintainers = {}
        self.linkedBot = linkedBot

    def push(self, msg):
        """
        Process messages like
        DESTMUC|TYPE|HOST/SERVICE|STATE|OUTPUT|SENDER|COMMENT

        The "/SERVICE" part is absent for host-level notifications.
        Raises ValueError when msg does not hold exactly seven
        "|"-separated fields.
        """
        # Get current time
        curtime = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        # Get all params
        destmuc, notif_type, location, status, text, sender, comment = \
            msg.split("|")
        # Check if message is about a service or host. Only the first "/"
        # separates host from service, so service names that themselves
        # contain "/" are kept whole instead of being mistaken for a
        # host-level notification.
        host, _, service = location.partition("/")
        # Create raw text from notification
        raw = "%s [%s/%s]: %s (%s %s)" % (curtime, notif_type, status, text,
                                          sender, comment)
        log.info("Datastore received: %s", msg)
        log.info("Datastore understood: %s", raw)

        cur = self._update_state(host, service, status, notif_type)

        # Update history
        history = self.knownMaintainers.setdefault(cur.maintainer, {})
        history.setdefault(host, []).append(raw)

        # Is there only one service or more problems for this host?
        problemCount = sum(count for state, count in cur.counts.items()
                           if "OK" not in state)

        # If this notification is a problem
        if "OK" not in status and "UP" not in status:
            message = self._problem_message(cur, host, service, status,
                                            text, problemCount)
        # We have a recovery
        else:
            message = self._handle_recovery(cur, host, service, notif_type,
                                            text, problemCount)
        if message is not None:
            self._notify(destmuc, cur.maintainer, message)

        log.info("Datastore known hosts: %s", self.knownHosts)
        log.info("Datastore known maintainers: %s", self.knownMaintainers)

    def _update_state(self, host, service, status, notif_type):
        """Record the new status for host (or host/service); return its HostData."""
        is_new = host not in self.knownHosts
        if is_new:
            self.knownHosts[host] = HostData(host)
        cur = self.knownHosts[host]

        # It's not a service (so general)
        if service == "":
            # A freshly created host always takes the incoming values; a
            # known host is only touched when its status changes.
            if is_new or cur.status != status:
                cur.status = status
                cur.type = notif_type
            return cur

        # It's a service: nothing to do unless the status changes
        previous = cur.statuses.get(service)
        if previous == status:
            return cur
        if previous is not None:
            # The service leaves its previous status (never below zero)
            cur.counts[previous] = max(cur.counts.get(previous, 0) - 1, 0)
        cur.counts[status] = cur.counts.get(status, 0) + 1
        cur.statuses[service] = status
        cur.types[service] = notif_type
        return cur

    def _problem_message(self, cur, host, service, status, text,
                         problemCount):
        """Build the text announcing a problem, or None to stay silent."""
        # Unknown state
        # NOTE(review): this tests the host notification *type*, not the
        # status; kept as in the original -- confirm the intent.
        if "UNKNOWN" in cur.type:
            if problemCount == 0:
                return "état inconnu sur %s (%s)" % (host, text)
            return None
        # General problem
        if "OK" not in cur.status and problemCount == 0:
            return "je détecte un problème général (%s)" \
                   " sur %s (%s)" % (status, host, text)
        # Only one service has a problem
        if service and problemCount == 1:
            return "je détecte un problème (%s) sur le service %s de" \
                   " la machine %s" \
                   " (%s)" % (status, service, host, text)
        # Multiple problems: recap from statuses that are not OK
        return "je détecte de multiples problèmes " \
               "sur la machine %s\n" % (host) \
               + self._recap(cur.counts, problems_only=True)

    def _handle_recovery(self, cur, host, service, notif_type, text,
                         problemCount):
        """
        Handle an OK/UP notification: track downtime start/end on cur and
        build the text to send, or return None to stay silent.
        """
        # NOTE(review): downtime notifications are only handled here, i.e.
        # when the status contains OK or UP -- confirm this is intended.
        if "DOWNTIME" in notif_type:
            if "DOWNTIMESTART" in notif_type and not cur.downtime:
                cur.downtime = True
                return "début de downtime sur %s (%s)" % (host, text)
            if "DOWNTIMEEND" in notif_type and cur.downtime:
                cur.downtime = False
                return "fin de downtime sur %s (%s)" % (host, text)
            return None
        # General problem
        if not service and problemCount == 0:
            return "fin d'alerte générale sur %s (%s)" % (host, text)
        # Only one service had a problem
        if service and problemCount == 0:
            return "résolution du problème sur le service %s de" \
                   " la machine %s" \
                   " (%s)\n" % (service, host, text) \
                   + self._recap(cur.counts)
        # Resolution but multiple problems
        return "résolution d'alertes en cours " \
               "sur la machine %s\n" % (host) \
               + self._recap(cur.counts)

    @staticmethod
    def _recap(counts, problems_only=False):
        """Format counts as "N STATUS(s), ..."; optionally skip OK statuses."""
        return ", ".join(
            "%s %s(s)" % (count, state)
            for state, count in counts.items()
            if not (problems_only and "OK" in state)
        )

    def _notify(self, destmuc, maintainer, message):
        """Log the message and send it to destmuc, addressed to maintainer."""
        log.info("Sending to %s: %s", destmuc, message)
        self.linkedBot.push(destmuc, maintainer + ", " + message)
2022-02-08 15:46:34 +01:00