diff --git a/bot.py b/bot.py index fad2e31..142d07f 100644 --- a/bot.py +++ b/bot.py @@ -135,13 +135,13 @@ class MUCBot(slixmpp.ClientXMPP): the user's nickname """ - if presence['muc']['nick'] in self.datastore.knownMaintainers: - self.send_message(mto=presence['from'].bare, - mbody="Salut %s, vos services ont produit des " \ - % (presence['muc']['nick']) + - "alertes en votre absence !\nUtilisez la commande"+ - " `hist` pour consulter l'historique", - mtype='groupchat') + # if presence['muc']['nick'] in self.datastore.knownMaintainers: + # self.send_message(mto=presence['from'].bare, + # mbody="Salut %s, vos services ont produit des " \ + # % (presence['muc']['nick']) + + # "alertes en votre absence !\nUtilisez la commande"+ + # " `hist` pour consulter l'historique", + # mtype='groupchat') if presence['muc']['affiliation'] == "owner": if not presence['from'].bare in self.owners: diff --git a/commands.py b/commands.py index 7299b39..b2a7e03 100644 --- a/commands.py +++ b/commands.py @@ -34,114 +34,19 @@ def cmdhelp(owners, nick, text, store): return msg -def cmdmainteneur(owners, nick, text, store): +def cmdstatus(owners, nick, text, store): """ - Change maintainer for an host - """ - - if not nick in owners: - return "désolé mais vous n'êtes pas autorisé à utiliser cette commande." - - try: - splittedtext = text.split(" ") - - # print maintainer - if len(splittedtext) == 2: - host = splittedtext[1] - - for realhost in store.knownHosts: - if host in realhost: - return "le responsable de cette machine est " \ - + store.knownHosts[realhost].maintainer - else: - return "machine inconnue (tout le monde est son mainteneur ou bien elle n'existe pas)" - - if len(splittedtext) == 3: - host = splittedtext[1] - maintainer = splittedtext[2] - - if not host in store.knownHosts: - store.knownHosts[host] = HostData(host) + Status informations command. + """ - store.knownHosts[host].maintainer = maintainer - return "le responsable est à présent " + maintainer - - except KeyError as e: - log.error(repr(e)) - return "machine inconnue (tout le monde est son mainteneur)" - -# except Exception as e: -# log.error(repr(e)) -# return "erreur à l'exécution" - - return "Syntaxe invalide" - -def cmdsave(owners, nick, text, store): - """ - Save - """ - - if not nick in owners: - return "désolé mais vous n'êtes pas autorisé à utiliser cette commande." - - try: - with open('current_buffer', 'wb') as current_buffer_file: - pickle.dump(store.knownHosts, current_buffer_file) - return "OK" - - except Exception as e: - log.error(repr(e)) - return "erreur à l'exécution" - -def cmdload(owners, nick, text, store): - """ - Save - """ - - if not nick in owners: - return "désolé mais vous n'êtes pas autorisé à utiliser cette commande." - - try: - with open('current_buffer', 'rb') as current_buffer_file: - store.knownHosts = pickle.load(current_buffer_file) - return "OK" - - except Exception as e: - log.error(repr(e)) - return "erreur à l'exécution" - -def cmdhist(owners, nick, text, store): - """ - Check history for a maintainer - """ - - if not nick in store.knownMaintainers or len(store.knownMaintainers[nick]) == 0: - return "pas d'historique disponible pour vous." - - msg = "voici les cinq derniers évènements pour chaque hôte notifié:\n" - for host in store.knownMaintainers[nick]: - msg += "\nHôte %s:\n" % host - - count = 0 - for serviceline in store.knownMaintainers[nick][host]: - msg += "- %s" % serviceline + "\n" - count += 1 - if count >= 5: - break - - del store.knownMaintainers[nick] - store.knownMaintainers[nick] = {} - return msg + return "\n"+store.get_status() # Commands commandtable = { -"help" : cmdhelp, -"hist" : cmdhist, -"load" : cmdload, -"mainteneur" : cmdmainteneur, -"ping" : cmdping, -"save" : cmdsave, +"help" : cmdhelp, +"ping" : cmdping, +"status": cmdstatus } diff --git a/current_buffer b/current_buffer deleted file mode 100644 index 163b0cc..0000000 Binary files a/current_buffer and /dev/null differ diff --git a/data.py b/data.py index 4a55274..cffb09f 100644 --- a/data.py +++ b/data.py @@ -3,11 +3,30 @@ import logging from systemd.journal import JournalHandler - # Logging log = logging.getLogger(__name__) log.addHandler(JournalHandler()) +PRIORITY = { + "OK":0, + "UP":0, + "WARNING":1, + "CRITICAL":2, + "UNKNOWN": 2, + "DOWN": 3, +} + + +class ProblemData: + """ + Data related to notifications related to a given problem + """ + + def __init__(self, name): + self.name = name + self.status = "OK" + self.last_update = datetime.datetime.fromtimestamp(0) + class HostData: """ @@ -16,19 +35,11 @@ class HostData: def __init__(self, name): self.name = name - - # Concerning host - self.type = "" self.status = "OK" self.downtime = False - - # Concerning services - self.statuses = {} - self.types = {} - - # Tools - self.counts = {"CRITICAL":0, "WARNING":0, "OK":0} - self.maintainer = "Tout le monde" + self.last_update = datetime.datetime.fromtimestamp(0) + self.worst = None + self.problems = set() class DataStore: @@ -37,225 +48,150 @@ class DataStore: log.info("Created DataStore") - self.knownHosts = {} - self.knownMaintainers = {} + self.hosts = set() self.linkedBot = linkedBot + def notify(self, destmuc): + msg = "```\n" + msg += "*** Isengard - Statut des services ***\n\n" + msg += "-"*80 + "\n" + msg += "| Hôte | Statut | Pire service | Dernière maj |\n" + msg += "-"*80 + "\n" + + for host in [x for x in self.hosts]: + msg += "*" + msg += " " + str(host.name)[:22] + " "*(23 - len(str(host.name)[:22])) + "|" + msg += " " + str(host.status)[:8] + " "*(9 - len(str(host.status))) + "|" + msg += " " + str(host.worst)[:16] + " "*(17 - len(str(host.worst)[:16])) + "|" + msg += " " + host.last_update.strftime("%m/%d/%Y, %H:%M:%S")[:21] +\ + " "*(22 - len(host.last_update.strftime("%m/%d/%Y, %H:%M:%S"))) +\ + "|\n" + msg += "-"*80 + "\n" + + if PRIORITY[host.status] == 0: + self.hosts.discard(host) + msg += "```" + + # Send notification + log.info("Sending to %s: %s" % (destmuc, msg)) + self.linkedBot.push(destmuc, msg) + + def get_status(self): + msg = "```\n" + msg += "*** Isengard - Statut des services ***\n\n" + msg += "-"*80 + "\n" + msg += "| Hôte | Statut | Pire service | Dernière maj |\n" + msg += "-"*80 + "\n" + + for host in [x for x in self.hosts]: + msg += "*" + msg += " " + str(host.name)[:22] + " "*(23 - len(str(host.name)[:22])) + "|" + msg += " " + str(host.status)[:8] + " "*(9 - len(str(host.status))) + "|" + msg += " " + str(host.worst)[:16] + " "*(17 - len(str(host.worst)[:16])) + "|" + msg += " " + host.last_update.strftime("%m/%d/%Y, %H:%M:%S")[:21] +\ + " "*(22 - len(host.last_update.strftime("%m/%d/%Y, %H:%M:%S"))) +\ + "|\n" + msg += "-"*80 + "\n" + + if PRIORITY[host.status] == 0: + self.hosts.discard(host) + msg += "```" + + return msg + + def push(self, msg): """ Process messages like TYPE|HOST/SERVICE|STATE|OUTPUT|SENDER|COMMENT """ # Get current time - curtime = datetime.datetime.now().strftime("%m/%d/%Y, %H:%M:%S") + curtime = datetime.datetime.now() #.strftime("%m/%d/%Y, %H:%M:%S") # Get all params - destmuc, type, location, status, text, sender, comment = msg.split("|") + destmuc, mtype, location, status, text, sender, comment = msg.split("|") + mtype = mtype.replace(" ", "") + status = status.replace(" ", "") + # Check if message is about a service or host try: - host, service = location.split("/") + hostname, service = location.split("/") except ValueError: - host = location.split("/")[0] - service = '' - - # Create raw text from notification - raw = "%s [%s/%s]: %s (%s %s)" % (curtime, type, status, text, - sender, comment) + hostname = location.split("/")[0] + service = None cur = None - log.info("Datastore received: %s" % msg) - log.info("Datastore understood: %s" % raw) - # Look for host - if host in self.knownHosts: - cur = self.knownHosts[host] - - # It's a service - if service != "": - # is it known ? - if service in cur.statuses: - # does the status change ? - if cur.statuses[service] != status: - # update - if not cur.statuses[service] in cur.counts: - cur.counts[status] = 0 - cur.counts[cur.statuses[service]] -= 1 - if not status in cur.counts: - cur.counts[status] = 0 - cur.counts[status] += 1 - cur.statuses[service] = status - cur.types[service] = status - else: - # create status entry - cur.statuses[service] = status - if not status in cur.counts: - cur.counts[status] = 0 - cur.counts[status] += 1 - - # It's not a service (so general) - else: - # does the status change ? - if cur.status != status: - cur.status = status - cur.type = type - - # Host is not known - else: - # create host - self.knownHosts[host] = HostData(host) - cur = self.knownHosts[host] - - # It's a service - if service != "": - # create status entry - cur.statuses[service] = status - if not status in cur.counts: - cur.counts[status] = 0 - cur.counts[status] += 1 - - # It's not a service (so general) - else: - # create status entry + for host in self.hosts: + if host.name == hostname: + cur = host + + # Host not found + if not(cur): + cur = HostData(hostname) + self.hosts.add(cur) + log.info("CREATED : %s\n" % cur) + + + # Retrieve informations and update + log.info("RECEIVED : status %s; mtype %s; location %s; sender %s; comment %s; text %s\n" + % (status, mtype, location, sender, comment, text)) + + cur.last_update = curtime + + # If that's global + if not(service): + + # Host is now down + if PRIORITY[status] > PRIORITY[cur.status]: cur.status = status - cur.type = type - - # Update history - if not cur.maintainer in self.knownMaintainers: - self.knownMaintainers[cur.maintainer] = {} - - if not host in self.knownMaintainers[cur.maintainer]: - self.knownMaintainers[cur.maintainer][host] = [] - - self.knownMaintainers[cur.maintainer][host].append(raw) - - # Is there only one service or more problems for this host? - problemCount = 0 - for cstatus in cur.counts: - if not "OK" in cstatus: - problemCount += cur.counts[cstatus] - - # If this notification is a problem - if not "OK" in status and not "UP" in status: - # Unknown state - if "UNKNOWN" in cur.type and problemCount == 0: - message = "état inconnu sur (%s)" % (host, text) - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - # Unknown state - elif "UNKNOWN" in cur.type: + cur.worst = "DOWN" + + # DOWNTIME + elif "DOWNTIME" in mtype: pass - # General problem - elif not "OK" in cur.status and problemCount == 0: - message = "je détecte un problème général (%s)" \ - " sur %s (%s)" % (status, host, text) - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - - # Only one service has a problem - elif service and problemCount == 1: - message = "je détecte un problème (%s) sur le service %s de" \ - " la machine %s" \ - " (%s)" % (status, service, host, text) - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - - # Multiple problems - else: - message = "je détecte de multiples problèmes " \ - "sur la machine %s\n" % (host) - # create recap from statuses that are not OK - for cstatus in cur.counts: - if not "OK" in cstatus: - message += "%s %s(s), " % \ - (str(cur.counts[cstatus]), cstatus) - message = message[:-2] - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - - # We have a recovery + # Host is no more down and has no more problems + elif not len(cur.problems): + cur.status = "OK" + cur.worst = None + + # Service problem else: + cur_problem = None + + # Look for existing problem + for problem in cur.problems: + if problem.name == service: + cur_problem = problem + + # Problem not found, create it + if not(cur_problem): + cur_problem = ProblemData(service) + cur.problems.add(cur_problem) + log.info("CREATED PROBLEM in %s : %s\n" % (cur, cur_problem)) + + cur_problem.last_update = curtime + cur_problem.status = status + + if PRIORITY[status] == 0 and cur.worst == cur_problem: + cur.worst = None + cur.status = "OK" + - if "DOWNTIME" in type: - if "DOWNTIMESTART" in type and not cur.downtime: - cur.downtime = True - message = "début de downtime sur" \ - " sur %s (%s)" % (host, comment) - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - - elif "DOWNTIMEEND" in type and cur.downtime: - cur.downtime = False - message = "fin de downtime sur" \ - " sur %s (%s)" % (host, comment) - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - else: - pass - - # General problem - elif not service and problemCount == 0: - message = "fin d'alerte générale sur" \ - " sur %s (%s)" % (host, text) - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - - # Only one service has a problem - elif service and problemCount == 0: - message = "résolution du problème sur le service %s de" \ - " la machine %s" \ - " (%s)\n" % (service, host, text) - # create recap from statuses that are not OK - for cstatus in cur.counts: - message += "%s %s(s), " % \ - (str(cur.counts[cstatus]), cstatus) - message = message[:-2] - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - - # Resolution but multiple problems - else: - message = "résolution d'alertes en cours " \ - "sur la machine %s\n" % (host) - # create recap from statuses - for cstatus in cur.counts: - message += "%s %s(s), " % \ - (str(cur.counts[cstatus]), cstatus) - message = message[:-2] - # send notification - log.info("Sending to %s: %s" % (destmuc, message)) - self.linkedBot.push(destmuc, cur.maintainer+", "+message) - - - log.info("Datastore known hosts: %s" % str(self.knownHosts)) - log.info("Datastore known maintainers: %s" % str(self.knownHosts)) - - - - - - - - - - - - - - - - - - - + worst_problem = ProblemData(None) + # Find the worst current problem + for problem in cur.problems: + if PRIORITY[problem.status] > PRIORITY[worst_problem.status]: + if problem.last_update > worst_problem.last_update: + worst_problem = problem + if worst_problem.name != None: + cur.status = worst_problem.status + cur.worst = worst_problem.name + if PRIORITY[status] >= 2 or ("RECOVERY" in mtype and cur.worst == None): + self.notify(destmuc) + + return diff --git a/localServer.py b/localServer.py index 3ff7282..26d09f9 100644 --- a/localServer.py +++ b/localServer.py @@ -10,9 +10,9 @@ log.addHandler(JournalHandler()) class LocalServer(threading.Thread): - def __init__(self, sharedBuffer): + def __init__(self, datastore): threading.Thread.__init__(self) - self.sharedBuffer = sharedBuffer + self.datastore = datastore self.pleaseStop = False def run(self): @@ -32,7 +32,7 @@ class LocalServer(threading.Thread): log.debug(rcvStr) if rcvStr != '': - self.sharedBuffer.push(rcvStr) + self.datastore.push(rcvStr) client.send(b'') # Send a message to the client client.close() diff --git a/main.py b/main.py index 5f9bd9c..7495fd5 100755 --- a/main.py +++ b/main.py @@ -85,7 +85,7 @@ if __name__ == '__main__': nick = "Isengard" ERRORS = 0 - store = None + datastore = None localservthread = None while True: @@ -96,13 +96,13 @@ if __name__ == '__main__': xmpp.register_plugin('xep_0199') # XMPP Ping # Create buffer - if not store: - store = DataStore(xmpp) - xmpp.datastore = store + if not datastore: + datastore = DataStore(xmpp) + xmpp.datastore = datastore # Launch local server if not localservthread: - localservthread = LocalServer(store) + localservthread = LocalServer(datastore) localservthread.start() time.sleep(1) diff --git a/xmpp_bot.service b/xmpp_bot.service index cf41f6f..05e23b5 100644 --- a/xmpp_bot.service +++ b/xmpp_bot.service @@ -3,8 +3,8 @@ Description=SlixXMPP service bot After=network.target ejabberd.service [Service] -WorkingDirectory=/home/isengard_xmpp/xmpp_bot -ExecStart=/bin/bash -c 'python3 main.py --jid isengard@a-lec.org --nick Isengard --password $(cat /etc/xmpp_bot/password.conf) --room cominfra@salons.a-lec.org' +WorkingDirectory=/var/isengard_xmpp/repo +ExecStart=/bin/bash -c 'python3 main.py --jid isengard@a-lec.org --nick Isengard --password $(cat /etc/xmpp_bot/password.conf) --room cominfra-supervision@salons.a-lec.org' Restart=on-failure RestartSec=60s User=isengard_xmpp