188 lines
7.0 KiB
Python
188 lines
7.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright (C) 2011, 2014, 2020 kaliko <kaliko@azylum.org>
|
|
# Copyright (C) 2020 Adrien Bourmault <neox@os-k.eu>
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, version 3 only.
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
import datetime
|
|
import threading
|
|
import bs4
|
|
import time
|
|
import traceback
|
|
|
|
from urllib.error import URLError
|
|
|
|
from feedparser import parse as feed_parse
|
|
|
|
from sid.plugin import Plugin, botcmd
|
|
|
|
|
|
# Characters that must be replaced by an entity before being embedded in the
# XHTML part of a message, mapped to their named entity.
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
    ">": "&gt;",
    "<": "&lt;",
}


def html_escape(text):
    """Produce entities within text.

    Every character of ``text`` that has an entry in ``html_escape_table``
    is replaced by its entity; all other characters are kept unchanged.

    :param str text: raw text to escape
    :return: the escaped text
    :rtype: str
    """
    return ''.join(html_escape_table.get(c, c) for c in text)
|
|
|
|
|
|
def strtm_to_dtm(struc_time):
    """Build a naive ``datetime.datetime`` from a time tuple.

    Only the first six items of ``struc_time`` are used; they are passed
    positionally to the ``datetime.datetime`` constructor.
    """
    leading_fields = struc_time[:6]
    return datetime.datetime(*leading_fields)
|
|
|
|
|
|
class FeedMonitor(threading.Thread):
    """Thread polling the plugin's feeds and announcing new posts.

    The feed list and the polling period are read from the plugin
    (``plugin.FEEDS`` and ``plugin.TEMPO``).  ``self.seen`` maps each feed
    URL to a dict holding:

    * ``'ids'``: the set of entry ids found at the last successful fetch,
    * ``'cache'``: ``None`` or the ``etag``/``modified`` values handed back
      to the feed parser as keyword arguments on the next fetch.
    """

    def __init__(self, plugin):
        """Bind the monitor to ``plugin``; the thread is not started here."""
        super().__init__()
        self.feeds_list = plugin.FEEDS
        self.tempo = plugin.TEMPO
        self.plugin = plugin
        self.last_check = datetime.datetime.utcnow()
        self.seen = dict()
        # Set to True (by the plugin or on a fatal error) to stop run()
        self.thread_killed = False

    def _update_cache(self, feed, parsed):
        """Record entry ids and HTTP cache headers of ``parsed`` for ``feed``.

        ``self.seen[feed]`` must already exist.
        """
        # Always store a real set: an empty feed must not turn 'ids' into a
        # dict, the value is used as a set by new_posts().
        self.seen[feed]['ids'] = {p.id for p in parsed.entries}
        # Common HTTP caching: keep etag AND modified when both are sent
        cache = {}
        if parsed.get('etag'):
            cache['etag'] = parsed.etag
        if parsed.get('modified'):
            cache['modified'] = parsed.modified
        if cache:
            self.seen[feed]['cache'] = cache

    def new_posts(self, feed):
        """Send new posts in feed

        Fetches ``feed`` (reusing stored cache headers when available) and
        sends one groupchat message listing the entries whose id was not
        seen at the previous fetch.  The first successful fetch of a feed
        only records its entries, nothing is sent.
        """
        self.plugin.log.debug('feed: : "%s"', feed)
        known = self.seen.get(feed)
        if known and known.get('cache'):
            parsed_feed = feed_parse(feed, **known['cache'])
        else:
            if known:
                self.plugin.log.debug('No cache headers set (etag/modified)')
            parsed_feed = feed_parse(feed)
        # Cannot resolve address
        if 'status' not in parsed_feed:
            self.plugin.log.error('Error from "%s": %s.',
                                  feed,
                                  repr(parsed_feed.get('bozo_exception')))
            return
        # http caching
        if parsed_feed.status == 304:
            self.plugin.log.debug('Got 304 not modified')
            return
        # unusual return http code
        if parsed_feed.status != 200:
            self.plugin.log.warning(
                'Got code %(status)d from "%(href)s" (please update).',
                parsed_feed)
            return
        if not known:
            # Fills with post id when first started (prevent from posting all
            # entries at startup)
            self.seen[feed] = {'cache': None}
            self._update_cache(feed, parsed_feed)
            return
        feed_title = parsed_feed.feed.get('title', 'n/a')
        text = ['[%s]' % feed_title]
        xhtml = ['<strong>%s</strong>:' % html_escape(feed_title)]

        # Detecting new post
        seen_ids = self.seen[feed].get('ids') or set()
        new_entries = [p for p in parsed_feed.entries if p.id not in seen_ids]
        for post in new_entries:
            # Missing fields must not crash the monitoring thread
            post_title = post.get('title', 'n/a')
            post_link = post.get('link', '')
            self.plugin.log.info(post_title)
            body = '%s ' % post_title
            # Explicit parser: same result whatever parsers are installed
            quotes = bs4.BeautifulSoup(post.get('summary', ''),
                                       'html.parser').select(".blockquote")
            if quotes:
                body += "\nSummary: " + quotes[0].get_text()
            body += '\n%s' % post_link
            # Plain text body: strip any markup left in title/summary
            text.append(bs4.BeautifulSoup(body, 'html.parser').get_text())
            xhtml.append('<a href="{link}">{title}</a>'.format(
                link=html_escape(post_link),
                title=html_escape(post_title)))
        # Updating self.seen, entries and cache headers
        self._update_cache(feed, parsed_feed)
        if new_entries:
            self.plugin.send(self.plugin.bot.room,
                             {'mhtml': '<br />'.join(xhtml),
                              'mbody': '\n'.join(text)},
                             mtype='groupchat')

    def run(self):
        """Check every feed, then wait ``self.tempo`` seconds, until killed."""
        while not self.thread_killed:
            self.plugin.log.debug('feeds check')
            for feed in self.feeds_list:
                try:
                    self.new_posts(feed)
                except ConnectionError as err:  # Non fatal exception
                    self.plugin.log.error('connection error on %s: %s',
                                          feed, err)
                except URLError as err:  # Non fatal exception
                    self.plugin.log.error('error for "%s": %s',
                                          feed, err.reason)
                except Exception as err:  # Unknown exception, killing thread anyway
                    self.plugin.log.error('feeds thread crashed: %s', err)
                    self.plugin.log.error(traceback.format_exc())
                    self.thread_killed = True
            self.last_check = datetime.datetime.utcnow()
            # Sleep one second at a time so a kill request is honored quickly
            for _ in range(self.tempo):
                time.sleep(1)
                if self.thread_killed:
                    return
|
|
|
|
|
|
class Feeds(Plugin):
    """
    .. note::
      Feeds plugin depends on external modules: **feedparser** and **bs4**
    """

    # Number of one-second sleeps FeedMonitor.run() performs between two
    # rounds of feed checks.
    # NOTE(review): 1 means every feed is fetched about once per second;
    # confirm this is intended.
    TEMPO = 1
    # NOTE(review): these URLs embed a feed_token and are posted verbatim by
    # the ``!feeds`` command; consider moving them to private configuration.
    FEEDS = ['https://gitlab.os-k.eu/os-k-team/os-k.atom?feed_token=En-15azki1VriEgkCQHN',
             'https://gitlab.os-k.eu/os-k-team/kvisc.atom?feed_token=En-15azki1VriEgkCQHN',
             'https://gitlab.os-k.eu/os-k-team/sid-xmpp-bot.atom?feed_token=En-15azki1VriEgkCQHN',
             'https://gitlab.os-k.eu/os-k-team/cross-cc-builder.atom?feed_token=En-15azki1VriEgkCQHN'
             ]

    def __init__(self, bot):
        """Register the plugin on ``bot`` and start the monitoring thread."""
        Plugin.__init__(self, bot)
        self.last_check = None
        self.th_mon = FeedMonitor(self)
        self.th_mon.start()

    def shutdown(self):
        """Ask the monitoring thread to stop (it exits at its next check)."""
        self.th_mon.thread_killed = True

    @botcmd
    def feeds(self, rcv, args):
        """Monitors project related feeds.

        * ``!feeds``      : registered feeds list
        * ``!feeds last`` : last check time"""
        if 'last' in args:
            date = '{:%Y-%m-%d %H:%M} (utc)'.format(self.th_mon.last_check)
            self.reply(rcv, f'Last feeds check: {date}')
            return
        # Link text is the URL without its scheme, whatever the scheme is
        # ("http://" or "https://").
        html = ['<a href="{0}">{1}</a>'.format(html_escape(u),
                                               html_escape(u.split('://', 1)[-1])
                                               ) for u in Feeds.FEEDS]
        msg = {'mbody': 'Feeds:\n' + '\n'.join(Feeds.FEEDS),
               'mhtml': 'Feeds:<br />' + '<br />'.join(html)}
        self.reply(rcv, msg)
|