Nouvelle présentation des alertes

This commit is contained in:
Adrien Bourmault 2023-03-15 19:41:29 +01:00 committed by admin666
parent 630ad6948c
commit 81ac8af5cc
7 changed files with 170 additions and 329 deletions

14
bot.py
View File

@ -135,13 +135,13 @@ class MUCBot(slixmpp.ClientXMPP):
the user's nickname the user's nickname
""" """
if presence['muc']['nick'] in self.datastore.knownMaintainers: # if presence['muc']['nick'] in self.datastore.knownMaintainers:
self.send_message(mto=presence['from'].bare, # self.send_message(mto=presence['from'].bare,
mbody="Salut %s, vos services ont produit des " \ # mbody="Salut %s, vos services ont produit des " \
% (presence['muc']['nick']) + # % (presence['muc']['nick']) +
"alertes en votre absence !\nUtilisez la commande"+ # "alertes en votre absence !\nUtilisez la commande"+
" `hist` pour consulter l'historique", # " `hist` pour consulter l'historique",
mtype='groupchat') # mtype='groupchat')
if presence['muc']['affiliation'] == "owner": if presence['muc']['affiliation'] == "owner":
if not presence['from'].bare in self.owners: if not presence['from'].bare in self.owners:

View File

@ -34,114 +34,19 @@ def cmdhelp(owners, nick, text, store):
return msg return msg
def cmdmainteneur(owners, nick, text, store): def cmdstatus(owners, nick, text, store):
""" """
Change maintainer for an host Status informations command.
""" """
if not nick in owners:
return "désolé mais vous n'êtes pas autorisé à utiliser cette commande."
try:
splittedtext = text.split(" ")
# print maintainer
if len(splittedtext) == 2:
host = splittedtext[1]
for realhost in store.knownHosts:
if host in realhost:
return "le responsable de cette machine est " \
+ store.knownHosts[realhost].maintainer
else:
return "machine inconnue (tout le monde est son mainteneur ou bien elle n'existe pas)"
if len(splittedtext) == 3:
host = splittedtext[1]
maintainer = splittedtext[2]
if not host in store.knownHosts:
store.knownHosts[host] = HostData(host)
store.knownHosts[host].maintainer = maintainer return "\n"+store.get_status()
return "le responsable est à présent " + maintainer
except KeyError as e:
log.error(repr(e))
return "machine inconnue (tout le monde est son mainteneur)"
# except Exception as e:
# log.error(repr(e))
# return "erreur à l'exécution"
return "Syntaxe invalide"
def cmdsave(owners, nick, text, store):
"""
Save
"""
if not nick in owners:
return "désolé mais vous n'êtes pas autorisé à utiliser cette commande."
try:
with open('current_buffer', 'wb') as current_buffer_file:
pickle.dump(store.knownHosts, current_buffer_file)
return "OK"
except Exception as e:
log.error(repr(e))
return "erreur à l'exécution"
def cmdload(owners, nick, text, store):
"""
Save
"""
if not nick in owners:
return "désolé mais vous n'êtes pas autorisé à utiliser cette commande."
try:
with open('current_buffer', 'rb') as current_buffer_file:
store.knownHosts = pickle.load(current_buffer_file)
return "OK"
except Exception as e:
log.error(repr(e))
return "erreur à l'exécution"
def cmdhist(owners, nick, text, store):
"""
Check history for a maintainer
"""
if not nick in store.knownMaintainers or len(store.knownMaintainers[nick]) == 0:
return "pas d'historique disponible pour vous."
msg = "voici les cinq derniers évènements pour chaque hôte notifié:\n"
for host in store.knownMaintainers[nick]:
msg += "\nHôte %s:\n" % host
count = 0
for serviceline in store.knownMaintainers[nick][host]:
msg += "- %s" % serviceline + "\n"
count += 1
if count >= 5:
break
del store.knownMaintainers[nick]
store.knownMaintainers[nick] = {}
return msg
# Commands # Commands
commandtable = { commandtable = {
"help" : cmdhelp, "help" : cmdhelp,
"hist" : cmdhist, "ping" : cmdping,
"load" : cmdload, "status": cmdstatus
"mainteneur" : cmdmainteneur,
"ping" : cmdping,
"save" : cmdsave,
} }

Binary file not shown.

356
data.py
View File

@ -3,11 +3,30 @@ import logging
from systemd.journal import JournalHandler from systemd.journal import JournalHandler
# Logging # Logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
log.addHandler(JournalHandler()) log.addHandler(JournalHandler())
# Severity rank of each monitoring state: a higher number is a worse state.
# States ranked 0 ("OK", "UP") are the healthy ones; hosts in such a state are
# dropped from the tracked set once they have been displayed.
# States ranked 2 or more trigger a notification when they are received.
PRIORITY = {
"OK":0,
"UP":0,
"WARNING":1,
"CRITICAL":2,
"UNKNOWN": 2,
"DOWN": 3,
}
class ProblemData:
    """
    Data related to notifications about a given problem (one service).

    Attributes:
        name: identifier of the problem, as given by the caller.
        status: last known state, "OK" until updated.
        last_update: time of the last update; starts at the epoch so
            that any real update compares as more recent.
    """

    def __init__(self, name):
        self.name = name
        self.status = "OK"
        self.last_update = datetime.datetime.fromtimestamp(0)

    def __repr__(self):
        # Instances are interpolated into log messages with %s; without
        # this they would only show up as an opaque object address.
        return "%s(name=%r, status=%r, last_update=%s)" % (
            type(self).__name__,
            self.name,
            self.status,
            self.last_update.isoformat(),
        )
class HostData: class HostData:
""" """
@ -16,19 +35,11 @@ class HostData:
def __init__(self, name): def __init__(self, name):
self.name = name self.name = name
# Concerning host
self.type = ""
self.status = "OK" self.status = "OK"
self.downtime = False self.downtime = False
self.last_update = datetime.datetime.fromtimestamp(0)
# Concerning services self.worst = None
self.statuses = {} self.problems = set()
self.types = {}
# Tools
self.counts = {"CRITICAL":0, "WARNING":0, "OK":0}
self.maintainer = "Tout le monde"
class DataStore: class DataStore:
@ -37,225 +48,150 @@ class DataStore:
log.info("Created DataStore") log.info("Created DataStore")
self.knownHosts = {} self.hosts = set()
self.knownMaintainers = {}
self.linkedBot = linkedBot self.linkedBot = linkedBot
def notify(self, destmuc):
    """
    Send the current status table to a chat room.

    The table is the one built by ``get_status``; as a consequence,
    hosts whose status has priority 0 are removed from ``self.hosts``
    once they have been rendered.

    Args:
        destmuc: destination handed to the linked bot's ``push``.
    """
    # The table layout lives in get_status only, so both the pushed
    # notification and the `status` command always show the same thing.
    msg = self.get_status()

    # Send notification
    log.info("Sending to %s: %s", destmuc, msg)
    self.linkedBot.push(destmuc, msg)
def get_status(self):
    """
    Build the plain-text status table of every tracked host.

    One fixed-width row is produced per host in ``self.hosts`` with its
    name, status, worst service and last update time; each row is 80
    characters wide, like the separator rule.  Hosts whose status has
    priority 0 in ``PRIORITY`` are removed from ``self.hosts`` after
    their row has been rendered.

    Returns:
        The table as a single string, wrapped in a ``` fence.
    """
    rule = "-"*80 + "\n"
    parts = [
        "```\n",
        "*** Isengard - Statut des services ***\n\n",
        rule,
        "| Hôte | Statut | Pire service | Dernière maj |\n",
        rule,
    ]

    # Iterate over a snapshot: the set is modified inside the loop.
    for host in list(self.hosts):
        stamp = host.last_update.strftime("%m/%d/%Y, %H:%M:%S")
        # Every cell is truncated first and padded afterwards, so an
        # over-long value can never shift the columns.
        parts.append(
            "*"
            + " " + str(host.name)[:22].ljust(23) + "|"
            + " " + str(host.status)[:8].ljust(9) + "|"
            + " " + str(host.worst)[:16].ljust(17) + "|"
            + " " + stamp[:21].ljust(22) + "|\n"
        )
        parts.append(rule)

        # A healthy host is shown one last time, then forgotten.
        if PRIORITY[host.status] == 0:
            self.hosts.discard(host)

    parts.append("```")
    return "".join(parts)
def push(self, msg): def push(self, msg):
""" """
Process messages like TYPE|HOST/SERVICE|STATE|OUTPUT|SENDER|COMMENT Process messages like TYPE|HOST/SERVICE|STATE|OUTPUT|SENDER|COMMENT
""" """
# Get current time # Get current time
curtime = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S") curtime = datetime.datetime.now() #.strftime("%m/%d/%Y, %H:%M:%S")
# Get all params # Get all params
destmuc, type, location, status, text, sender, comment = msg.split("|") destmuc, mtype, location, status, text, sender, comment = msg.split("|")
mtype = mtype.replace(" ", "")
status = status.replace(" ", "")
# Check if message is about a service or host # Check if message is about a service or host
try: try:
host, service = location.split("/") hostname, service = location.split("/")
except ValueError: except ValueError:
host = location.split("/")[0] hostname = location.split("/")[0]
service = '' service = None
# Create raw text from notification
raw = "%s [%s/%s]: %s (%s %s)" % (curtime, type, status, text,
sender, comment)
cur = None cur = None
log.info("Datastore received: %s" % msg)
log.info("Datastore understood: %s" % raw)
# Look for host # Look for host
if host in self.knownHosts: for host in self.hosts:
cur = self.knownHosts[host] if host.name == hostname:
cur = host
# It's a service
if service != "": # Host not found
# is it known ? if not(cur):
if service in cur.statuses: cur = HostData(hostname)
# does the status change ? self.hosts.add(cur)
if cur.statuses[service] != status: log.info("CREATED : %s\n" % cur)
# update
if not cur.statuses[service] in cur.counts:
cur.counts[status] = 0 # Retrieve informations and update
cur.counts[cur.statuses[service]] -= 1 log.info("RECEIVED : status %s; mtype %s; location %s; sender %s; comment %s; text %s\n"
if not status in cur.counts: % (status, mtype, location, sender, comment, text))
cur.counts[status] = 0
cur.counts[status] += 1 cur.last_update = curtime
cur.statuses[service] = status
cur.types[service] = status # If that's global
else: if not(service):
# create status entry
cur.statuses[service] = status # Host is now down
if not status in cur.counts: if PRIORITY[status] > PRIORITY[cur.status]:
cur.counts[status] = 0
cur.counts[status] += 1
# It's not a service (so general)
else:
# does the status change ?
if cur.status != status:
cur.status = status
cur.type = type
# Host is not known
else:
# create host
self.knownHosts[host] = HostData(host)
cur = self.knownHosts[host]
# It's a service
if service != "":
# create status entry
cur.statuses[service] = status
if not status in cur.counts:
cur.counts[status] = 0
cur.counts[status] += 1
# It's not a service (so general)
else:
# create status entry
cur.status = status cur.status = status
cur.type = type cur.worst = "DOWN"
# Update history # DOWNTIME
if not cur.maintainer in self.knownMaintainers: elif "DOWNTIME" in mtype:
self.knownMaintainers[cur.maintainer] = {}
if not host in self.knownMaintainers[cur.maintainer]:
self.knownMaintainers[cur.maintainer][host] = []
self.knownMaintainers[cur.maintainer][host].append(raw)
# Is there only one service or more problems for this host?
problemCount = 0
for cstatus in cur.counts:
if not "OK" in cstatus:
problemCount += cur.counts[cstatus]
# If this notification is a problem
if not "OK" in status and not "UP" in status:
# Unknown state
if "UNKNOWN" in cur.type and problemCount == 0:
message = "état inconnu sur (%s)" % (host, text)
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
# Unknown state
elif "UNKNOWN" in cur.type:
pass pass
# General problem # Host is no more down and has no more problems
elif not "OK" in cur.status and problemCount == 0: elif not len(cur.problems):
message = "je détecte un problème général (%s)" \ cur.status = "OK"
" sur %s (%s)" % (status, host, text) cur.worst = None
# send notification
log.info("Sending to %s: %s" % (destmuc, message)) # Service problem
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
# Only one service has a problem
elif service and problemCount == 1:
message = "je détecte un problème (%s) sur le service %s de" \
" la machine %s" \
" (%s)" % (status, service, host, text)
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
# Multiple problems
else:
message = "je détecte de multiples problèmes " \
"sur la machine %s\n" % (host)
# create recap from statuses that are not OK
for cstatus in cur.counts:
if not "OK" in cstatus:
message += "%s %s(s), " % \
(str(cur.counts[cstatus]), cstatus)
message = message[:-2]
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
# We have a recovery
else: else:
cur_problem = None
# Look for existing problem
for problem in cur.problems:
if problem.name == service:
cur_problem = problem
# Problem not found, create it
if not(cur_problem):
cur_problem = ProblemData(service)
cur.problems.add(cur_problem)
log.info("CREATED PROBLEM in %s : %s\n" % (cur, cur_problem))
cur_problem.last_update = curtime
cur_problem.status = status
if PRIORITY[status] == 0 and cur.worst == cur_problem:
cur.worst = None
cur.status = "OK"
if "DOWNTIME" in type: worst_problem = ProblemData(None)
if "DOWNTIMESTART" in type and not cur.downtime:
cur.downtime = True
message = "début de downtime sur" \
" sur %s (%s)" % (host, comment)
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
elif "DOWNTIMEEND" in type and cur.downtime:
cur.downtime = False
message = "fin de downtime sur" \
" sur %s (%s)" % (host, comment)
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
else:
pass
# General problem
elif not service and problemCount == 0:
message = "fin d'alerte générale sur" \
" sur %s (%s)" % (host, text)
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
# Only one service has a problem
elif service and problemCount == 0:
message = "résolution du problème sur le service %s de" \
" la machine %s" \
" (%s)\n" % (service, host, text)
# create recap from statuses that are not OK
for cstatus in cur.counts:
message += "%s %s(s), " % \
(str(cur.counts[cstatus]), cstatus)
message = message[:-2]
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
# Resolution but multiple problems
else:
message = "résolution d'alertes en cours " \
"sur la machine %s\n" % (host)
# create recap from statuses
for cstatus in cur.counts:
message += "%s %s(s), " % \
(str(cur.counts[cstatus]), cstatus)
message = message[:-2]
# send notification
log.info("Sending to %s: %s" % (destmuc, message))
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
log.info("Datastore known hosts: %s" % str(self.knownHosts))
log.info("Datastore known maintainers: %s" % str(self.knownHosts))
# Find the worst current problem
for problem in cur.problems:
if PRIORITY[problem.status] > PRIORITY[worst_problem.status]:
if problem.last_update > worst_problem.last_update:
worst_problem = problem
if worst_problem.name != None:
cur.status = worst_problem.status
cur.worst = worst_problem.name
if PRIORITY[status] >= 2 or ("RECOVERY" in mtype and cur.worst == None):
self.notify(destmuc)
return

View File

@ -10,9 +10,9 @@ log.addHandler(JournalHandler())
class LocalServer(threading.Thread): class LocalServer(threading.Thread):
def __init__(self, sharedBuffer): def __init__(self, datastore):
threading.Thread.__init__(self) threading.Thread.__init__(self)
self.sharedBuffer = sharedBuffer self.datastore = datastore
self.pleaseStop = False self.pleaseStop = False
def run(self): def run(self):
@ -32,7 +32,7 @@ class LocalServer(threading.Thread):
log.debug(rcvStr) log.debug(rcvStr)
if rcvStr != '': if rcvStr != '':
self.sharedBuffer.push(rcvStr) self.datastore.push(rcvStr)
client.send(b'') # Send a message to the client client.send(b'') # Send a message to the client
client.close() client.close()

10
main.py
View File

@ -85,7 +85,7 @@ if __name__ == '__main__':
nick = "Isengard" nick = "Isengard"
ERRORS = 0 ERRORS = 0
store = None datastore = None
localservthread = None localservthread = None
while True: while True:
@ -96,13 +96,13 @@ if __name__ == '__main__':
xmpp.register_plugin('xep_0199') # XMPP Ping xmpp.register_plugin('xep_0199') # XMPP Ping
# Create buffer # Create buffer
if not store: if not datastore:
store = DataStore(xmpp) datastore = DataStore(xmpp)
xmpp.datastore = store xmpp.datastore = datastore
# Launch local server # Launch local server
if not localservthread: if not localservthread:
localservthread = LocalServer(store) localservthread = LocalServer(datastore)
localservthread.start() localservthread.start()
time.sleep(1) time.sleep(1)

View File

@ -3,8 +3,8 @@ Description=SlixXMPP service bot
After=network.target ejabberd.service After=network.target ejabberd.service
[Service] [Service]
WorkingDirectory=/home/isengard_xmpp/xmpp_bot WorkingDirectory=/var/isengard_xmpp/repo
ExecStart=/bin/bash -c 'python3 main.py --jid isengard@a-lec.org --nick Isengard --password $(cat /etc/xmpp_bot/password.conf) --room cominfra@salons.a-lec.org' ExecStart=/bin/bash -c 'python3 main.py --jid isengard@a-lec.org --nick Isengard --password $(cat /etc/xmpp_bot/password.conf) --room cominfra-supervision@salons.a-lec.org'
Restart=on-failure Restart=on-failure
RestartSec=60s RestartSec=60s
User=isengard_xmpp User=isengard_xmpp