Nouvelle présentation des alertes
This commit is contained in:
parent
630ad6948c
commit
81ac8af5cc
14
bot.py
14
bot.py
|
@ -135,13 +135,13 @@ class MUCBot(slixmpp.ClientXMPP):
|
|||
the user's nickname
|
||||
"""
|
||||
|
||||
if presence['muc']['nick'] in self.datastore.knownMaintainers:
|
||||
self.send_message(mto=presence['from'].bare,
|
||||
mbody="Salut %s, vos services ont produit des " \
|
||||
% (presence['muc']['nick']) +
|
||||
"alertes en votre absence !\nUtilisez la commande"+
|
||||
" `hist` pour consulter l'historique",
|
||||
mtype='groupchat')
|
||||
# if presence['muc']['nick'] in self.datastore.knownMaintainers:
|
||||
# self.send_message(mto=presence['from'].bare,
|
||||
# mbody="Salut %s, vos services ont produit des " \
|
||||
# % (presence['muc']['nick']) +
|
||||
# "alertes en votre absence !\nUtilisez la commande"+
|
||||
# " `hist` pour consulter l'historique",
|
||||
# mtype='groupchat')
|
||||
|
||||
if presence['muc']['affiliation'] == "owner":
|
||||
if not presence['from'].bare in self.owners:
|
||||
|
|
109
commands.py
109
commands.py
|
@ -34,114 +34,19 @@ def cmdhelp(owners, nick, text, store):
|
|||
|
||||
return msg
|
||||
|
||||
def cmdmainteneur(owners, nick, text, store):
|
||||
def cmdstatus(owners, nick, text, store):
|
||||
"""
|
||||
Change maintainer for a host
|
||||
"""
|
||||
|
||||
if not nick in owners:
|
||||
return "désolé mais vous n'êtes pas autorisé à utiliser cette commande."
|
||||
|
||||
try:
|
||||
splittedtext = text.split(" ")
|
||||
|
||||
# print maintainer
|
||||
if len(splittedtext) == 2:
|
||||
host = splittedtext[1]
|
||||
|
||||
for realhost in store.knownHosts:
|
||||
if host in realhost:
|
||||
return "le responsable de cette machine est " \
|
||||
+ store.knownHosts[realhost].maintainer
|
||||
else:
|
||||
return "machine inconnue (tout le monde est son mainteneur ou bien elle n'existe pas)"
|
||||
|
||||
if len(splittedtext) == 3:
|
||||
host = splittedtext[1]
|
||||
maintainer = splittedtext[2]
|
||||
|
||||
if not host in store.knownHosts:
|
||||
store.knownHosts[host] = HostData(host)
|
||||
Status information command.
|
||||
"""
|
||||
|
||||
store.knownHosts[host].maintainer = maintainer
|
||||
return "le responsable est à présent " + maintainer
|
||||
|
||||
except KeyError as e:
|
||||
log.error(repr(e))
|
||||
return "machine inconnue (tout le monde est son mainteneur)"
|
||||
|
||||
# except Exception as e:
|
||||
# log.error(repr(e))
|
||||
# return "erreur à l'exécution"
|
||||
|
||||
return "Syntaxe invalide"
|
||||
|
||||
def cmdsave(owners, nick, text, store):
    """
    Persist the known hosts of the store to disk.

    Only nicknames listed in ``owners`` may run this command. The
    ``store.knownHosts`` object is pickled into the ``current_buffer`` file
    (relative to the current working directory). ``text`` is not used.

    Returns "OK" on success, or a French error message when the caller is
    not an owner or when writing fails (the error is logged).
    """
    if nick in owners:
        try:
            with open('current_buffer', 'wb') as dump_file:
                pickle.dump(store.knownHosts, dump_file)
        except Exception as err:
            # Any failure is logged and reported to the caller as a message.
            log.error(repr(err))
            return "erreur à l'exécution"
        return "OK"

    return "désolé mais vous n'êtes pas autorisé à utiliser cette commande."
|
||||
|
||||
def cmdload(owners, nick, text, store):
    """
    Load the known hosts of the store from disk.

    Only nicknames listed in ``owners`` may run this command. The content of
    the ``current_buffer`` file (relative to the current working directory)
    is unpickled and assigned to ``store.knownHosts``. ``text`` is not used.

    Returns "OK" on success, or a French error message when the caller is
    not an owner or when reading fails (the error is logged).
    """
    if nick not in owners:
        return "désolé mais vous n'êtes pas autorisé à utiliser cette commande."

    try:
        # SECURITY: unpickling can execute arbitrary code. The file must come
        # from a trusted source only; never load a buffer of unknown origin.
        with open('current_buffer', 'rb') as current_buffer_file:
            store.knownHosts = pickle.load(current_buffer_file)
        return "OK"

    except Exception as e:
        # Any failure (missing file, corrupt pickle, ...) is logged and
        # reported to the caller as a message.
        log.error(repr(e))
        return "erreur à l'exécution"
|
||||
|
||||
def cmdhist(owners, nick, text, store):
    """
    Return the pending alert history of the maintainer ``nick``.

    ``store.knownMaintainers`` maps a nickname to a dict of
    ``host -> list of event lines``. For every host recorded for ``nick``,
    the five most recent lines (the last five of the list) are reported.
    The history of ``nick`` is then reset to an empty dict, so a second call
    reports nothing until new events are recorded.

    ``owners`` and ``text`` are not used.

    Returns the formatted history, or a French "no history" message when
    nothing is recorded for ``nick``.
    """
    history = store.knownMaintainers.get(nick)
    if not history:
        return "pas d'historique disponible pour vous."

    parts = ["voici les cinq derniers évènements pour chaque hôte notifié:\n"]
    for host, events in history.items():
        parts.append("\nHôte %s:\n" % host)
        # The header promises the *last* five events: take the tail of the
        # list, not its head.
        parts.extend("- %s\n" % (line,) for line in events[-5:])

    # Reading the history consumes it.
    store.knownMaintainers[nick] = {}
    return "".join(parts)
|
||||
return "\n"+store.get_status()
|
||||
|
||||
# Commands
|
||||
|
||||
commandtable = {
|
||||
"help" : cmdhelp,
|
||||
"hist" : cmdhist,
|
||||
"load" : cmdload,
|
||||
"mainteneur" : cmdmainteneur,
|
||||
"ping" : cmdping,
|
||||
"save" : cmdsave,
|
||||
"help" : cmdhelp,
|
||||
"ping" : cmdping,
|
||||
"status": cmdstatus
|
||||
}
|
||||
|
||||
|
||||
|
|
BIN
current_buffer
BIN
current_buffer
Binary file not shown.
356
data.py
356
data.py
|
@ -3,11 +3,30 @@ import logging
|
|||
|
||||
from systemd.journal import JournalHandler
|
||||
|
||||
|
||||
# Logging
|
||||
log = logging.getLogger(__name__)
|
||||
log.addHandler(JournalHandler())
|
||||
|
||||
# Numeric rank of each status name, used to compare statuses with each other.
# A higher rank is treated as a worse state; rank 0 ("OK"/"UP") is the
# healthy rank.
PRIORITY = dict(
    OK=0,
    UP=0,
    WARNING=1,
    CRITICAL=2,
    UNKNOWN=2,
    DOWN=3,
)
|
||||
|
||||
|
||||
class ProblemData:
    """
    State of one named problem: its current status and last update time.

    A new problem starts with the status "OK" and a last-update time set to
    timestamp 0 (converted to a naive local datetime).
    """

    def __init__(self, name):
        epoch = datetime.datetime.fromtimestamp(0)
        self.name, self.status, self.last_update = name, "OK", epoch
|
||||
|
||||
|
||||
class HostData:
|
||||
"""
|
||||
|
@ -16,19 +35,11 @@ class HostData:
|
|||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
|
||||
# Concerning host
|
||||
self.type = ""
|
||||
self.status = "OK"
|
||||
self.downtime = False
|
||||
|
||||
# Concerning services
|
||||
self.statuses = {}
|
||||
self.types = {}
|
||||
|
||||
# Tools
|
||||
self.counts = {"CRITICAL":0, "WARNING":0, "OK":0}
|
||||
self.maintainer = "Tout le monde"
|
||||
self.last_update = datetime.datetime.fromtimestamp(0)
|
||||
self.worst = None
|
||||
self.problems = set()
|
||||
|
||||
|
||||
class DataStore:
|
||||
|
@ -37,225 +48,150 @@ class DataStore:
|
|||
|
||||
log.info("Created DataStore")
|
||||
|
||||
self.knownHosts = {}
|
||||
self.knownMaintainers = {}
|
||||
self.hosts = set()
|
||||
self.linkedBot = linkedBot
|
||||
|
||||
def notify(self, destmuc):
    """
    Send the current host status table to a chat room.

    The table is rendered by ``get_status()``, which also removes from
    ``self.hosts`` every host whose status has priority 0 once it has been
    listed. The message is logged and handed to ``self.linkedBot.push``.

    ``destmuc`` is the destination passed through to the bot.
    """
    # Single source of truth for the table layout: reuse get_status()
    # instead of keeping a second copy of the rendering code here.
    msg = self.get_status()

    # Send notification
    log.info("Sending to %s: %s" % (destmuc, msg))
    self.linkedBot.push(destmuc, msg)
|
||||
|
||||
def get_status(self):
    """
    Render the tracked hosts as a fenced text table and return it.

    One row is produced per host in ``self.hosts`` (name, status, worst
    service, last update time). After its row has been added, a host whose
    status has priority 0 is removed from ``self.hosts``.
    """
    rule = "-" * 80 + "\n"
    stamp_format = "%m/%d/%Y, %H:%M:%S"
    chunks = [
        "```\n",
        "*** Isengard - Statut des services ***\n\n",
        rule,
        # NOTE(review): header cells are not padded to the row column
        # widths used below -- confirm this is the intended layout.
        "| Hôte | Statut | Pire service | Dernière maj |\n",
        rule,
    ]

    # Iterate over a snapshot: hosts may be removed from the set below.
    for entry in list(self.hosts):
        name_cell = str(entry.name)[:22]
        status_text = str(entry.status)
        worst_cell = str(entry.worst)[:16]
        stamp = entry.last_update.strftime(stamp_format)

        chunks.append("* " + name_cell + " " * (23 - len(name_cell)) + "|")
        # Padding here is computed from the untruncated status length.
        chunks.append(" " + status_text[:8] + " " * (9 - len(status_text)) + "|")
        chunks.append(" " + worst_cell + " " * (17 - len(worst_cell)) + "|")
        chunks.append(" " + stamp[:21] + " " * (22 - len(stamp)) + "|\n")
        chunks.append(rule)

        if PRIORITY[entry.status] == 0:
            self.hosts.discard(entry)

    chunks.append("```")
    return "".join(chunks)
|
||||
|
||||
|
||||
def push(self, msg):
|
||||
"""
|
||||
Process messages like DESTMUC|TYPE|HOST/SERVICE|STATE|OUTPUT|SENDER|COMMENT
|
||||
"""
|
||||
|
||||
# Get current time
|
||||
curtime = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
|
||||
curtime = datetime.datetime.now() #.strftime("%m/%d/%Y, %H:%M:%S")
|
||||
# Get all params
|
||||
destmuc, type, location, status, text, sender, comment = msg.split("|")
|
||||
destmuc, mtype, location, status, text, sender, comment = msg.split("|")
|
||||
mtype = mtype.replace(" ", "")
|
||||
status = status.replace(" ", "")
|
||||
|
||||
# Check if message is about a service or host
|
||||
try:
|
||||
host, service = location.split("/")
|
||||
hostname, service = location.split("/")
|
||||
except ValueError:
|
||||
host = location.split("/")[0]
|
||||
service = ''
|
||||
|
||||
# Create raw text from notification
|
||||
raw = "%s [%s/%s]: %s (%s %s)" % (curtime, type, status, text,
|
||||
sender, comment)
|
||||
hostname = location.split("/")[0]
|
||||
service = None
|
||||
|
||||
cur = None
|
||||
|
||||
log.info("Datastore received: %s" % msg)
|
||||
log.info("Datastore understood: %s" % raw)
|
||||
|
||||
# Look for host
|
||||
if host in self.knownHosts:
|
||||
cur = self.knownHosts[host]
|
||||
|
||||
# It's a service
|
||||
if service != "":
|
||||
# is it known ?
|
||||
if service in cur.statuses:
|
||||
# does the status change ?
|
||||
if cur.statuses[service] != status:
|
||||
# update
|
||||
if not cur.statuses[service] in cur.counts:
|
||||
cur.counts[status] = 0
|
||||
cur.counts[cur.statuses[service]] -= 1
|
||||
if not status in cur.counts:
|
||||
cur.counts[status] = 0
|
||||
cur.counts[status] += 1
|
||||
cur.statuses[service] = status
|
||||
cur.types[service] = status
|
||||
else:
|
||||
# create status entry
|
||||
cur.statuses[service] = status
|
||||
if not status in cur.counts:
|
||||
cur.counts[status] = 0
|
||||
cur.counts[status] += 1
|
||||
|
||||
# It's not a service (so general)
|
||||
else:
|
||||
# does the status change ?
|
||||
if cur.status != status:
|
||||
cur.status = status
|
||||
cur.type = type
|
||||
|
||||
# Host is not known
|
||||
else:
|
||||
# create host
|
||||
self.knownHosts[host] = HostData(host)
|
||||
cur = self.knownHosts[host]
|
||||
|
||||
# It's a service
|
||||
if service != "":
|
||||
# create status entry
|
||||
cur.statuses[service] = status
|
||||
if not status in cur.counts:
|
||||
cur.counts[status] = 0
|
||||
cur.counts[status] += 1
|
||||
|
||||
# It's not a service (so general)
|
||||
else:
|
||||
# create status entry
|
||||
for host in self.hosts:
|
||||
if host.name == hostname:
|
||||
cur = host
|
||||
|
||||
# Host not found
|
||||
if not(cur):
|
||||
cur = HostData(hostname)
|
||||
self.hosts.add(cur)
|
||||
log.info("CREATED : %s\n" % cur)
|
||||
|
||||
|
||||
# Retrieve informations and update
|
||||
log.info("RECEIVED : status %s; mtype %s; location %s; sender %s; comment %s; text %s\n"
|
||||
% (status, mtype, location, sender, comment, text))
|
||||
|
||||
cur.last_update = curtime
|
||||
|
||||
# If that's global
|
||||
if not(service):
|
||||
|
||||
# Host is now down
|
||||
if PRIORITY[status] > PRIORITY[cur.status]:
|
||||
cur.status = status
|
||||
cur.type = type
|
||||
|
||||
# Update history
|
||||
if not cur.maintainer in self.knownMaintainers:
|
||||
self.knownMaintainers[cur.maintainer] = {}
|
||||
|
||||
if not host in self.knownMaintainers[cur.maintainer]:
|
||||
self.knownMaintainers[cur.maintainer][host] = []
|
||||
|
||||
self.knownMaintainers[cur.maintainer][host].append(raw)
|
||||
|
||||
# Is there only one service or more problems for this host?
|
||||
problemCount = 0
|
||||
for cstatus in cur.counts:
|
||||
if not "OK" in cstatus:
|
||||
problemCount += cur.counts[cstatus]
|
||||
|
||||
# If this notification is a problem
|
||||
if not "OK" in status and not "UP" in status:
|
||||
# Unknown state
|
||||
if "UNKNOWN" in cur.type and problemCount == 0:
|
||||
message = "état inconnu sur (%s)" % (host, text)
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
# Unknown state
|
||||
elif "UNKNOWN" in cur.type:
|
||||
cur.worst = "DOWN"
|
||||
|
||||
# DOWNTIME
|
||||
elif "DOWNTIME" in mtype:
|
||||
pass
|
||||
|
||||
# General problem
|
||||
elif not "OK" in cur.status and problemCount == 0:
|
||||
message = "je détecte un problème général (%s)" \
|
||||
" sur %s (%s)" % (status, host, text)
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
|
||||
# Only one service has a problem
|
||||
elif service and problemCount == 1:
|
||||
message = "je détecte un problème (%s) sur le service %s de" \
|
||||
" la machine %s" \
|
||||
" (%s)" % (status, service, host, text)
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
|
||||
# Multiple problems
|
||||
else:
|
||||
message = "je détecte de multiples problèmes " \
|
||||
"sur la machine %s\n" % (host)
|
||||
# create recap from statuses that are not OK
|
||||
for cstatus in cur.counts:
|
||||
if not "OK" in cstatus:
|
||||
message += "%s %s(s), " % \
|
||||
(str(cur.counts[cstatus]), cstatus)
|
||||
message = message[:-2]
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
|
||||
# We have a recovery
|
||||
# Host is no more down and has no more problems
|
||||
elif not len(cur.problems):
|
||||
cur.status = "OK"
|
||||
cur.worst = None
|
||||
|
||||
# Service problem
|
||||
else:
|
||||
cur_problem = None
|
||||
|
||||
# Look for existing problem
|
||||
for problem in cur.problems:
|
||||
if problem.name == service:
|
||||
cur_problem = problem
|
||||
|
||||
# Problem not found, create it
|
||||
if not(cur_problem):
|
||||
cur_problem = ProblemData(service)
|
||||
cur.problems.add(cur_problem)
|
||||
log.info("CREATED PROBLEM in %s : %s\n" % (cur, cur_problem))
|
||||
|
||||
cur_problem.last_update = curtime
|
||||
cur_problem.status = status
|
||||
|
||||
if PRIORITY[status] == 0 and cur.worst == cur_problem:
|
||||
cur.worst = None
|
||||
cur.status = "OK"
|
||||
|
||||
|
||||
if "DOWNTIME" in type:
|
||||
if "DOWNTIMESTART" in type and not cur.downtime:
|
||||
cur.downtime = True
|
||||
message = "début de downtime sur" \
|
||||
" sur %s (%s)" % (host, comment)
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
|
||||
elif "DOWNTIMEEND" in type and cur.downtime:
|
||||
cur.downtime = False
|
||||
message = "fin de downtime sur" \
|
||||
" sur %s (%s)" % (host, comment)
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
else:
|
||||
pass
|
||||
|
||||
# General problem
|
||||
elif not service and problemCount == 0:
|
||||
message = "fin d'alerte générale sur" \
|
||||
" sur %s (%s)" % (host, text)
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
|
||||
# Only one service has a problem
|
||||
elif service and problemCount == 0:
|
||||
message = "résolution du problème sur le service %s de" \
|
||||
" la machine %s" \
|
||||
" (%s)\n" % (service, host, text)
|
||||
# create recap from statuses that are not OK
|
||||
for cstatus in cur.counts:
|
||||
message += "%s %s(s), " % \
|
||||
(str(cur.counts[cstatus]), cstatus)
|
||||
message = message[:-2]
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
|
||||
# Resolution but multiple problems
|
||||
else:
|
||||
message = "résolution d'alertes en cours " \
|
||||
"sur la machine %s\n" % (host)
|
||||
# create recap from statuses
|
||||
for cstatus in cur.counts:
|
||||
message += "%s %s(s), " % \
|
||||
(str(cur.counts[cstatus]), cstatus)
|
||||
message = message[:-2]
|
||||
# send notification
|
||||
log.info("Sending to %s: %s" % (destmuc, message))
|
||||
self.linkedBot.push(destmuc, cur.maintainer+", "+message)
|
||||
|
||||
|
||||
log.info("Datastore known hosts: %s" % str(self.knownHosts))
|
||||
log.info("Datastore known maintainers: %s" % str(self.knownHosts))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
worst_problem = ProblemData(None)
|
||||
|
||||
# Find the worst current problem
|
||||
for problem in cur.problems:
|
||||
if PRIORITY[problem.status] > PRIORITY[worst_problem.status]:
|
||||
if problem.last_update > worst_problem.last_update:
|
||||
worst_problem = problem
|
||||
|
||||
if worst_problem.name != None:
|
||||
cur.status = worst_problem.status
|
||||
cur.worst = worst_problem.name
|
||||
|
||||
if PRIORITY[status] >= 2 or ("RECOVERY" in mtype and cur.worst == None):
|
||||
self.notify(destmuc)
|
||||
|
||||
return
|
||||
|
|
|
@ -10,9 +10,9 @@ log.addHandler(JournalHandler())
|
|||
|
||||
|
||||
class LocalServer(threading.Thread):
|
||||
def __init__(self, sharedBuffer):
|
||||
def __init__(self, datastore):
|
||||
threading.Thread.__init__(self)
|
||||
self.sharedBuffer = sharedBuffer
|
||||
self.datastore = datastore
|
||||
self.pleaseStop = False
|
||||
|
||||
def run(self):
|
||||
|
@ -32,7 +32,7 @@ class LocalServer(threading.Thread):
|
|||
log.debug(rcvStr)
|
||||
|
||||
if rcvStr != '':
|
||||
self.sharedBuffer.push(rcvStr)
|
||||
self.datastore.push(rcvStr)
|
||||
|
||||
client.send(b'') # Send a message to the client
|
||||
client.close()
|
||||
|
|
10
main.py
10
main.py
|
@ -85,7 +85,7 @@ if __name__ == '__main__':
|
|||
nick = "Isengard"
|
||||
|
||||
ERRORS = 0
|
||||
store = None
|
||||
datastore = None
|
||||
localservthread = None
|
||||
|
||||
while True:
|
||||
|
@ -96,13 +96,13 @@ if __name__ == '__main__':
|
|||
xmpp.register_plugin('xep_0199') # XMPP Ping
|
||||
|
||||
# Create buffer
|
||||
if not store:
|
||||
store = DataStore(xmpp)
|
||||
xmpp.datastore = store
|
||||
if not datastore:
|
||||
datastore = DataStore(xmpp)
|
||||
xmpp.datastore = datastore
|
||||
|
||||
# Launch local server
|
||||
if not localservthread:
|
||||
localservthread = LocalServer(store)
|
||||
localservthread = LocalServer(datastore)
|
||||
localservthread.start()
|
||||
time.sleep(1)
|
||||
|
||||
|
|
|
@ -3,8 +3,8 @@ Description=SlixXMPP service bot
|
|||
After=network.target ejabberd.service
|
||||
|
||||
[Service]
|
||||
WorkingDirectory=/home/isengard_xmpp/xmpp_bot
|
||||
ExecStart=/bin/bash -c 'python3 main.py --jid isengard@a-lec.org --nick Isengard --password $(cat /etc/xmpp_bot/password.conf) --room cominfra@salons.a-lec.org'
|
||||
WorkingDirectory=/var/isengard_xmpp/repo
|
||||
ExecStart=/bin/bash -c 'python3 main.py --jid isengard@a-lec.org --nick Isengard --password $(cat /etc/xmpp_bot/password.conf) --room cominfra-supervision@salons.a-lec.org'
|
||||
Restart=on-failure
|
||||
RestartSec=60s
|
||||
User=isengard_xmpp
|
||||
|
|
Loading…
Reference in New Issue