isengard-bot/data.py

260 lines
8.7 KiB
Python

import datetime
import logging
from systemd.journal import JournalHandler
# Logging
# Module-level logger named after this module; every record it handles is
# also handed to a systemd JournalHandler attached just below.
log = logging.getLogger(__name__)
log.addHandler(JournalHandler())
class HostData:
    """
    Data related to the notifications received for a given host.

    The object is a plain mutable record: every attribute is public and is
    read and written directly by the code that processes notifications.
    """

    def __init__(self, name):
        """
        Create the record for host `name` with an initial "OK" state,
        no known service and no downtime in progress.
        """
        self.name = name

        # Concerning host
        self.type = ""          # type of the last host-level notification
        self.status = "OK"      # last host-level status
        self.downtime = False   # True while a downtime is in progress

        # Concerning services (both dicts are keyed by service name)
        self.statuses = {}
        self.types = {}

        # Tools
        # Number of services currently in each status.
        self.counts = {"CRITICAL": 0, "WARNING": 0, "OK": 0}
        self.maintainer = "Tout le monde"

    def __repr__(self):
        """
        Return a readable summary, so that logging a dict of hosts shows
        their state instead of bare object addresses.
        """
        return (
            "HostData(name=%r, status=%r, type=%r, downtime=%r, "
            "statuses=%r, counts=%r, maintainer=%r)" % (
                self.name, self.status, self.type, self.downtime,
                self.statuses, self.counts, self.maintainer,
            )
        )
class DataStore:
    """
    In-memory store of monitoring notifications, grouped by host.

    Each message given to push() updates the per-host state kept in
    `knownHosts`, is appended to the per-maintainer history kept in
    `knownMaintainers`, and may produce a human-readable message that is
    forwarded to `linkedBot`.
    """

    def __init__(self, linkedBot):
        """
        linkedBot: object exposing push(destmuc, message); it receives every
        message generated by this store.
        """
        log.info("Created DataStore")
        self.knownHosts = {}        # host name -> HostData
        self.knownMaintainers = {}  # maintainer -> {host name -> [raw lines]}
        self.linkedBot = linkedBot

    def push(self, msg):
        """
        Process messages like
        DESTMUC|TYPE|HOST/SERVICE|STATE|OUTPUT|SENDER|COMMENT

        The "/SERVICE" part is optional; without it the message is handled
        as a host-level notification.

        Raises ValueError when msg does not hold exactly seven
        "|"-separated fields.
        """
        # Get current time
        curtime = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

        # Get all params
        (destmuc, notif_type, location, status,
         text, sender, comment) = msg.split("|")

        # Only the first "/" separates host and service, so a service name
        # may itself contain slashes (e.g. "Disk /").
        host, _, service = location.partition("/")

        # Create raw text from notification
        raw = "%s [%s/%s]: %s (%s %s)" % (curtime, notif_type, status, text,
                                          sender, comment)
        log.info("Datastore received: %s", msg)
        log.info("Datastore understood: %s", raw)

        # Look for host, create it when it is not known yet
        is_new_host = host not in self.knownHosts
        if is_new_host:
            self.knownHosts[host] = HostData(host)
        cur = self.knownHosts[host]

        if service:
            self._record_service(cur, service, status, notif_type)
        elif is_new_host or cur.status != status:
            # Host-level state is only refreshed when the status changes
            cur.status = status
            cur.type = notif_type

        # Update history
        per_host = self.knownMaintainers.setdefault(cur.maintainer, {})
        per_host.setdefault(host, []).append(raw)

        # Is there only one service or more problems for this host?
        problem_count = sum(count for state, count in cur.counts.items()
                            if "OK" not in state)

        if "OK" not in status and "UP" not in status:
            message = self._problem_message(cur, host, service, status,
                                            text, problem_count)
        else:
            message = self._recovery_message(cur, host, service, notif_type,
                                             text, problem_count)

        # send notification
        if message is not None:
            log.info("Sending to %s: %s", destmuc, message)
            self.linkedBot.push(destmuc, cur.maintainer + ", " + message)

        log.info("Datastore known hosts: %s", self.knownHosts)
        log.info("Datastore known maintainers: %s", self.knownMaintainers)

    @staticmethod
    def _record_service(cur, service, status, notif_type):
        """Store the new state of one service and keep cur.counts in sync."""
        previous = cur.statuses.get(service)
        if previous == status:
            # Nothing changed for this service
            return
        if previous is not None:
            # The service leaves its previous status
            cur.counts[previous] = cur.counts.get(previous, 0) - 1
        cur.counts[status] = cur.counts.get(status, 0) + 1
        cur.statuses[service] = status
        cur.types[service] = notif_type

    @staticmethod
    def _recap(counts, problems_only=False):
        """Return "N STATUS(s), ..." for counts, optionally skipping OK ones."""
        return ", ".join(
            "%s %s(s)" % (count, state)
            for state, count in counts.items()
            if not (problems_only and "OK" in state)
        )

    @staticmethod
    def _problem_message(cur, host, service, status, text, problem_count):
        """Return the text announcing a problem, or None to stay silent."""
        # NOTE(review): cur.type is the type of the last host-level
        # notification, not the state of the service; confirm that this is
        # the field meant to carry "UNKNOWN".
        if "UNKNOWN" in cur.type:
            if problem_count:
                return None
            return "état inconnu sur %s (%s)" % (host, text)

        # General problem
        if "OK" not in cur.status and problem_count == 0:
            return "je détecte un problème général (%s) sur %s (%s)" % (
                status, host, text)

        # Only one service has a problem
        if service and problem_count == 1:
            return ("je détecte un problème (%s) sur le service %s de"
                    " la machine %s (%s)" % (status, service, host, text))

        # Multiple problems: recap of the statuses that are not OK
        return ("je détecte de multiples problèmes sur la machine %s\n" % host
                + DataStore._recap(cur.counts, problems_only=True))

    @staticmethod
    def _recovery_message(cur, host, service, notif_type, text,
                          problem_count):
        """
        Return the text announcing a recovery or a downtime change, or None
        to stay silent. Downtime notifications also toggle cur.downtime.
        """
        if "DOWNTIME" in notif_type:
            if "DOWNTIMESTART" in notif_type and not cur.downtime:
                cur.downtime = True
                return "début de downtime sur %s (%s)" % (host, text)
            if "DOWNTIMEEND" in notif_type and cur.downtime:
                cur.downtime = False
                return "fin de downtime sur %s (%s)" % (host, text)
            # Repeated start/end, or another downtime event: nothing to say
            return None

        if problem_count == 0:
            if not service:
                # End of a general problem
                return "fin d'alerte générale sur %s (%s)" % (host, text)
            # The only failing service is back
            return ("résolution du problème sur le service %s de"
                    " la machine %s (%s)\n" % (service, host, text)
                    + DataStore._recap(cur.counts))

        # Resolution but other problems remain
        return ("résolution d'alertes en cours sur la machine %s\n" % host
                + DataStore._recap(cur.counts))