From 30ef187e810cb25894b864bc464c129c429ac2cf Mon Sep 17 00:00:00 2001 From: foormea Date: Thu, 23 Jan 2020 19:14:50 +0100 Subject: [PATCH] new initial commit --- .gitignore | 4 + README.md | 21 +++ dock/docker-compose.yml.template | 19 +++ dock/wordpress_webhook/Dockerfile | 9 + dock/wordpress_webhook/converter.py | 158 ++++++++++++++++++ .../credentials.json.template | 6 + dock/wordpress_webhook/hfr.py | 52 ++++++ dock/wordpress_webhook/wp_wh.py | 82 +++++++++ 8 files changed, 351 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 dock/docker-compose.yml.template create mode 100644 dock/wordpress_webhook/Dockerfile create mode 100644 dock/wordpress_webhook/converter.py create mode 100644 dock/wordpress_webhook/credentials.json.template create mode 100644 dock/wordpress_webhook/hfr.py create mode 100644 dock/wordpress_webhook/wp_wh.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8f39185 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +credentials.json +__pycache__ +.vscode +docker-compose.yml diff --git a/README.md b/README.md new file mode 100644 index 0000000..3e7240c --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# WordPress to HFR (forum.hardware.fr) converter +## Still plenty of work to do, really ugly in some places + +A `flask` route that does the following: + +- Receive webhooks from WordPress. +- Convert the WordPress blogpost's HTML into BBcode. +- Post the resulting BBcode to an HFR topic. + +## Setup +- WordPress must be set up to send the hook `publish_post` with fields `post_content`, `post_name`, and `post_url` to the URL defined by your network setup. Go to `https://<your-wordpress-site>/wp-admin` `> Settings > Webhooks`. +- This is packaged as a Docker container meant to be used together with the [Letsencrypt nginx proxy companion](https://github.com/JrCs/docker-letsencrypt-nginx-proxy-companion). 
This could easily be adjusted to run on a [GCP Cloud Function](https://cloud.google.com/functions/). WordPress doesn't require `https` so this could also easily run as a standalone service at home without much setup. +- Look through source files for extra info. + +## What works, what doesn't +- Simple HTML tags should all work (note: this is HFR-centric, HFR doesn't support a number of BBcode tags, hence the questionable translation table compared with other BBcode implementations). +- The code is ugly in some places and might explode anytime. +- Colours are not supported yet. + +## Be careful of... +- HFR's aggressive anti-spam. Possibly a good idea to adjust the code to not post when doing tests, or edit a post (instead of posting). To edit: modify `POST` to `/bdd.php` and pass a `numreponse` arg to `payload` with a post that belongs to you. diff --git a/dock/docker-compose.yml.template b/dock/docker-compose.yml.template new file mode 100644 index 0000000..1e4f621 --- /dev/null +++ b/dock/docker-compose.yml.template @@ -0,0 +1,19 @@ +version: '3' + +services: + listener: + build: ./wordpress_webhook + container_name: wordpress_webhook_listener + environment: + - VIRTUAL_HOST= + - LETSENCRYPT_HOST= + - LETSENCRYPT_EMAIL= + expose: + - 54321 + restart: unless-stopped + networks: + - letsencrypt_proxy + +networks: + letsencrypt_proxy: + external: true diff --git a/dock/wordpress_webhook/Dockerfile b/dock/wordpress_webhook/Dockerfile new file mode 100644 index 0000000..29e4dfb --- /dev/null +++ b/dock/wordpress_webhook/Dockerfile @@ -0,0 +1,9 @@ +FROM python:slim + +RUN pip install gunicorn flask requests bs4 + +COPY wp_wh.py converter.py hfr.py credentials.json ./ + +EXPOSE 54321 + +ENTRYPOINT ["/usr/local/bin/gunicorn", "-b", ":54321", "wp_wh:app"] diff --git a/dock/wordpress_webhook/converter.py b/dock/wordpress_webhook/converter.py new file mode 100644 index 0000000..e73a53a --- /dev/null +++ b/dock/wordpress_webhook/converter.py @@ -0,0 +1,158 @@ +"""A simple 
"""A simple HTML to BBcode converter.

Aimed to be used to convert WordPress HTML into `forum.hardware.fr` BBcode.
Colours are not translated by default (the colour table in
`HTMLtoBBcode.attr_translate` is empty).
"""

import html.parser
from collections import namedtuple
import types  # noqa: F401  (kept: part of the module's original namespace)

# Translation rule for one HTML tag:
#   tags       -- (opening BBcode, closing BBcode); the opening part may
#                 contain `{attribute}` placeholders filled from the HTML tag
#   txt_action -- optional key of `HTMLtoBBcode.text_translate` applied to
#                 the text found inside the tag
TRtuple = namedtuple('TRtuple',
                     ['tags', 'txt_action'],
                     defaults=[('', ''), None])

# One open HTML element on the parser stack:
#   tag        -- HTML tag name, used to match the end tag
#   closing    -- BBcode emitted when the element is closed
#   txt_action -- `TRtuple.txt_action` of the element (or None)
_Frame = namedtuple('_Frame', ['tag', 'closing', 'txt_action'])


class HTMLtoBBcode(html.parser.HTMLParser):
    """Subclass of `html.parser.HTMLParser` that does the format conversion.

    Usage: call `feed(html)` with a complete HTML fragment, then read the
    resulting BBcode from `output`. Every `feed` call is a fresh, one-shot
    conversion.
    """

    # Elements that never get an end tag; nothing is kept on the stack for
    # them, so a bare `<img ...>` cannot desynchronise later end tags.
    VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
        'meta', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        # Stack of `_Frame`s, one per currently open translated element
        self.tree = []
        self.output = ''

        self.tag_translate = {
            'h2': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'h3': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'h4': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'p': TRtuple(('\n', '')),
            'strong': TRtuple(('[b]', '[/b]')),
            'em': TRtuple(('[i]', '[/i]')),
            's': TRtuple(('[strike]', '[/strike]')),
            'blockquote': TRtuple(('[quote]', '[/quote]')),
            'code': TRtuple(('[cpp]', '[/cpp]')),
            'ul': TRtuple(('\n', '')),
            'ol': TRtuple(('\n', '')),
            'li': TRtuple(('[*]', '\n')),
            'a': TRtuple(('[url={href}]', '[/url]')),
            # `img` tags are not always closed, leading to issues,
            # therefore using a single tag for images is simpler
            'img': TRtuple(('[img]{src}[/img]', '')),
            # `span` is here for underlines to be processed
            'span': TRtuple(('', ''))
        }

        # attribute -> CSS property -> CSS value -> (opening, closing).
        # The `color` table is intentionally empty: colours are not
        # translated unless a caller fills it in.
        self.attr_translate = {
            'style': {'text-decoration': {'underline': ('[u]', '[/u]')},
                      'color': {}}
        }

        self.text_translate = {
            'capitalise': str.upper
        }

    def _style_tags(self, attr_map):
        """Return the (opening, closing) BBcode for translatable attributes.

        Every `property: value` declaration of each attribute listed in
        `self.attr_translate` is looked up; unknown properties/values are
        ignored. Closing tags are built in reverse order so that the
        resulting BBcode nests properly.
        """
        opening, closing = '', ''
        for attr, properties in self.attr_translate.items():
            raw = attr_map.get(attr)
            if not raw:
                continue
            # e.g. raw = 'text-decoration: underline; color:#4ac456;'
            for declaration in raw.split(';'):
                property_, separator, value = declaration.partition(':')
                if not separator:
                    continue
                pair = properties.get(property_.strip().lower(), {}).get(
                    value.strip().lower())
                if pair:
                    opening += pair[0]
                    closing = pair[1] + closing
        return opening, closing

    def handle_starttag(self, tag, attrs):
        rule = self.tag_translate.get(tag)
        # Nothing to do here?
        if not rule:
            return

        # Value-less attributes (e.g. `<a href>`) are reported as None
        attr_map = {name: value for name, value in attrs
                    if value is not None}

        opening, closing = rule.tags
        try:
            opening = opening.format_map(attr_map)
        except KeyError:
            # A required attribute is missing (`<a>` without `href`,
            # `<img>` without `src`): keep the text, emit no BBcode tag.
            opening, closing = '', ''

        if tag in self.VOID_TAGS:
            # No end tag will ever come: emit everything right away
            self.output += opening + closing
            return

        style_open, style_close = self._style_tags(attr_map)
        self.output += opening + style_open
        self.tree.append(_Frame(tag, style_close + closing, rule.txt_action))

    def handle_endtag(self, tag):
        # Void elements were fully emitted by `handle_starttag` (this also
        # covers `<img ... />`, which triggers both handlers).
        if tag in self.VOID_TAGS or not self.tag_translate.get(tag):
            return

        # Find the innermost open element with that name
        for depth in range(len(self.tree) - 1, -1, -1):
            if self.tree[depth].tag == tag:
                break
        else:
            # Stray end tag without matching start tag: ignore it
            return

        # Close the element and anything left open inside of it
        while len(self.tree) > depth:
            self.output += self.tree.pop().closing

    def handle_data(self, data):
        # In case the incoming data is just a series of new lines, discard
        # them
        if set(data) == {'\n'}:
            return

        # Apply the text action of every enclosing element, outermost first
        for frame in self.tree:
            action = self.text_translate.get(frame.txt_action)
            if action:
                data = action(data)
        self.output += data

    def feed(self, data):
        """Convert the complete HTML fragment `data` into `self.output`.

        All state from a previous call (output, element stack, parser
        buffer) is discarded first, so one instance can safely be reused.
        Elements still open at the end of `data` are closed so the BBcode
        stays balanced.
        """
        self.reset()
        self.output = ''
        self.tree = []
        super().feed(data)
        # Flush whatever the base parser is still buffering
        super().close()
        while self.tree:
            self.output += self.tree.pop().closing
"""Provides the `post_HFR` function that posts messages to HFR.

You should get those ready:
    user (str): username
    user_hash (str): hashlib.md5(<secret>.encode()).hexdigest()
        NOTE(review): the hashed value was lost from the original
        docstring; presumably it is the account password -- confirm.
    posturl (str): the post URL you are responding to (any post in the
                   topic will do)
    newpost (str): BBcode of the new post that's being posted

Beware of throttling/anti-flood limitations.
"""

import urllib.parse
import requests
from bs4 import BeautifulSoup

BASE = 'https://forum.hardware.fr'
POST = '/bddpost.php'


def _hidden_value(soup, name):
    """Return the `value` of the `<input name=...>` element called `name`.

    Raises:
        ValueError: the page has no such input.
    """
    field = soup.find('input', {'name': name})
    if field is None or not field.has_attr('value'):
        raise ValueError(
            'input {!r} not found in the retrieved page'.format(name))
    return field['value']


def post_HFR(user, user_hash, posturl, newpost, timeout=30):
    """Based on inputs, posts to HFR. Returns the resulting `requests` object.

    Args:
        user: username, sent as the `md_user` cookie and `pseudo` field.
        user_hash: sent as the `md_passs` cookie.
        posturl: URL of a post in the target topic; its query string must
            contain the `post` and `cat` parameters.
        newpost: BBcode of the message to post.
        timeout: seconds to wait for each HTTP request.

    Raises:
        ValueError: `posturl` lacks `post`/`cat`, or the retrieved page
            lacks the `hash_check`/`sujet` inputs.
        requests.RequestException: network failure or timeout.
    """
    # Parse post URL, we'll need to extract a bunch of values from it.
    # Only the query component is parsed: running `parse_qs` on the whole
    # URL glues scheme, host and path onto the first parameter name.
    query = urllib.parse.parse_qs(urllib.parse.urlsplit(posturl).query)
    try:
        post_id = query['post'][0]
        category = query['cat'][0]
    except KeyError as err:
        raise ValueError(
            'posturl has no {} query parameter: {!r}'.format(err, posturl)
        ) from err

    # Cookie preparation. 3 Ss for md_passs, yes.
    cookie = {'md_passs': user_hash,
              'md_user': user}

    # Create HFR session (closed automatically once we are done)
    with requests.Session() as HFR:
        HFR.cookies.update(cookie)

        # Retrieve page, we'll need the hash_check from it
        req = HFR.get(posturl, timeout=timeout)
        soup = BeautifulSoup(req.text, 'html.parser')

        # Prepare payload
        payload = {'hash_check': _hidden_value(soup, 'hash_check'),
                   'post': post_id,
                   'cat': category,
                   'verifrequet': 1100,
                   'sujet': _hidden_value(soup, 'sujet'),
                   'content_form': newpost,
                   'pseudo': user}

        # Post payload
        post_req = HFR.post(BASE+POST, data=payload, timeout=timeout)

    return post_req
"""Provides a flask route that receives webhooks from WordPress (expecting
`post_content`, `post_name`, and `post_url`), attempts to convert into BBcode,
then posts it to a `forum.hardware.fr` topic.

HFR credentials are stored in a separate `credentials.json` JSON file. See
`hfr.py` for details on how to generate the `user_hash`. `pre_title` can be
anything, just add as many trailing newlines as you wish at the end of the
string.

{
    "user": "",
    "user_hash": "",
    "posturl": "",
    "pre_title": ""
}
"""

import logging
import os
import urllib.parse
import json
from flask import Flask, request
from converter import HTMLtoBBcode
from hfr import post_HFR

# Fields the WordPress webhook must provide
REQUIRED_FIELDS = ('post_content', 'post_name', 'post_url')


def prepare_logger(logger_name=__name__):
    """Simple logger preparation function.

    Returns the logger called `logger_name`, set to INFO level and writing
    through a stream handler. The handler is only attached once, so calling
    this function repeatedly does not duplicate log lines.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '{asctime} {name} {levelname:8s} {message}',
            style='{')
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    return logger


logger = prepare_logger()

app = Flask(__name__)
CREDENTIALS_FILE = 'credentials.json'
# Kept for backward compatibility; `webhook_handler` builds its own parser
# for every request so that requests never share parsing state.
parser = HTMLtoBBcode()

with open(CREDENTIALS_FILE, 'r', encoding='utf-8') as fp:
    config = json.load(fp)


@app.route('/wordpress_webhook', methods=['POST'])
def webhook_handler():
    """Convert the WordPress post carried by the webhook and post it to HFR.

    Returns an empty body (HTTP 200) once the post request has been sent,
    HTTP 400 when the webhook lacks a required field, and HTTP 502 when
    posting to the forum raised an exception.
    """
    logger.info('got called, %s', request)

    webhook_data = urllib.parse.parse_qs(request.get_data(as_text=True))
    logger.info(webhook_data)

    missing = [field for field in REQUIRED_FIELDS
               if field not in webhook_data]
    if missing:
        logger.warning('missing webhook field(s): %s', ', '.join(missing))
        return 'missing field(s): {}'.format(', '.join(missing)), 400

    # One parser per request: nothing leaks from one post to the next
    converter = HTMLtoBBcode()
    converter.feed(webhook_data['post_content'][0])
    content = converter.output

    title = webhook_data['post_name'][0].replace('-', ' ')
    wordpress_url = webhook_data['post_url'][0]
    title_BB = '[url={url}][b]{title}[/b][/url]\n\n'.format(
        url=wordpress_url,
        title=title)

    post_BB = config['pre_title'] + title_BB + content

    logger.info(post_BB)

    try:
        outcome = post_HFR(config['user'],
                           config['user_hash'],
                           config['posturl'],
                           post_BB)
    except Exception:  # boundary of the request: log and report upstream
        logger.exception('posting to HFR failed')
        return 'posting to HFR failed', 502

    logger.info(outcome)
    logger.info(outcome.text)

    return ''


if __name__ == '__main__':
    # The Werkzeug debugger allows arbitrary code execution; since the
    # server listens on every interface it is only enabled on request
    # (WP_WH_DEBUG=1).
    app.run(host='0.0.0.0', port=54321,
            debug=os.environ.get('WP_WH_DEBUG') == '1')