"""A simple HTML to BBcode converter. Aimed to be used to convert WordPress
HTML into `forum.hardware.fr` BBcode. Currently missing support for colours,
and everything else might explode anytime.
"""
|
|
|
|
import html.parser
|
|
from collections import namedtuple
|
|
import types
|
|
|
|
# Translation rule for one HTML tag.
#   `tags`:       `(opening, closing)` BBcode pair. The opening string may hold
#                 `{attribute}` placeholders filled from the HTML attributes.
#   `txt_action`: key of `HTMLtoBBcode.text_translate` naming a transformation
#                 to apply to the text inside the element, or `None`.
TRtuple = namedtuple('TRtuple',
                     ['tags', 'txt_action'],
                     defaults=[('', ''), None])

# One entry of `HTMLtoBBcode.tree`: an element that has been opened and not
# closed yet.
#   `tag`:        HTML tag name, used to match the end tag.
#   `closing`:    BBcode to emit when the element closes.
#   `txt_action`: callable applied to text inside the element, or `None`.
_OpenTag = namedtuple('_OpenTag', ['tag', 'closing', 'txt_action'])


class HTMLtoBBcode(html.parser.HTMLParser):
    """Subclass of `html.parser.HTMLParser` that does the format conversion.

    Feed HTML with `feed` and read the resulting BBcode from `output`.
    `output` is reset at every `feed` call, but the stack of open elements
    (`tree`) is not, so a document can be fed in several chunks as long as
    `output` is collected after each call.

    Attributes:
        output: BBcode produced by the last `feed` call.
        tree: stack of `_OpenTag` entries, innermost element last.
        tag_translate: maps an HTML tag name to its `TRtuple` rule. Tags
            missing from this table are dropped, their text is kept.
        attr_translate: maps attribute name -> property -> value to an
            `(opening, closing)` BBcode pair, e.g. for `style` declarations.
            The `color` table is empty: colours are not translated unless
            entries are added to it.
        text_translate: maps a `txt_action` name to a callable taking and
            returning a `str`.
        void_tags: translated tags that never have content nor a reliable
            end tag; their BBcode is emitted in one go.
    """

    def __init__(self):
        super().__init__()
        self.tree = []
        self.output = ''

        self.tag_translate = {
            'h2': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'h3': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'h4': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'p': TRtuple(('\n', '')),
            'strong': TRtuple(('[b]', '[/b]')),
            'em': TRtuple(('[i]', '[/i]')),
            's': TRtuple(('[strike]', '[/strike]')),
            'blockquote': TRtuple(('[quote]', '[/quote]')),
            'code': TRtuple(('[cpp]', '[/cpp]')),
            'ul': TRtuple(('\n', '')),
            'ol': TRtuple(('\n', '')),
            'li': TRtuple(('[*]', '\n')),
            'a': TRtuple(('[url={href}]', '[/url]')),
            # `img` tags are not always closed, leading to issues,
            # therefore using a single tag for images is simpler
            'img': TRtuple(('[img]{src}[/img]', '')),
            # `span` is here for underlines to be processed
            'span': TRtuple(('', ''))
        }

        self.attr_translate = {
            'style': {'text-decoration': {'underline': ('[u]', '[/u]')},
                      'color': {}}
        }

        self.text_translate = {
            'capitalise': str.upper
        }

        # `img` may come as `<img ...>`, `<img ... />` or `<img ...></img>`;
        # nothing is pushed on `self.tree` for it, so a missing end tag
        # cannot unbalance the closing tags of the surrounding elements.
        self.void_tags = {'img'}

    def _attr_tags(self, attrs):
        """Return the `(opening, closing)` BBcode derived from `attrs`.

        Only attributes listed in `self.attr_translate` are looked at. Their
        value is read as `property:value;` declarations, e.g.
        `('style', 'text-decoration:underline;')`. Declarations without a
        translation are ignored. Closing tags are returned in reverse order
        of the opening ones so that the BBcode nests properly.
        """
        opening = []
        closing = []
        for attr, val in attrs:
            properties = self.attr_translate.get(attr)
            # `val` is `None` for attributes given without a value
            if not properties or val is None:
                continue
            for declaration in val.split(';'):
                property_, _, value = declaration.partition(':')
                pair = (properties
                        .get(property_.strip().lower(), {})
                        .get(value.strip().lower()))
                if pair:
                    opening.append(pair[0])
                    closing.append(pair[1])
        return ''.join(opening), ''.join(reversed(closing))

    def handle_starttag(self, tag, attrs):
        """Emit the opening BBcode of `tag` and remember how to close it.

        Args:
            tag: lower-cased HTML tag name.
            attrs: list of `(name, value)` attribute pairs.

        Raises:
            KeyError: if the rule of `tag` names a `txt_action` missing from
                `self.text_translate`.
        """
        # Nothing to do here?
        translation = self.tag_translate.get(tag)
        if not translation:
            return

        opening, closing = translation.tags
        try:
            # Attributes without a value are left out so that they count as
            # missing instead of being rendered as the string `None`.
            opening = opening.format_map(
                {name: val for name, val in attrs if val is not None})
        except KeyError:
            # A required attribute is missing (e.g. `<a>` without `href`):
            # drop the tags, the text inside the element is still kept.
            opening = closing = ''

        if tag in self.void_tags:
            self.output += opening + closing
            return

        attr_opening, attr_closing = self._attr_tags(attrs)

        # Looked up before touching `self.output`, so that a bad
        # configuration does not leave a half-written element behind.
        txt_action = None
        if translation.txt_action:
            txt_action = self.text_translate[translation.txt_action]

        # Exactly one entry per opened element: the attribute-derived
        # closing tags are merged with the closing tag of the element itself.
        self.output += opening + attr_opening
        self.tree.append(_OpenTag(tag, attr_closing + closing, txt_action))

    def handle_endtag(self, tag):
        """Emit the closing BBcode of `tag`.

        Elements opened inside `tag` and never closed (e.g. `<li>` without
        `</li>`) are closed first. End tags with no matching open element,
        including the ones of `self.void_tags`, are ignored.
        """
        if not self.tag_translate.get(tag):
            return

        # Innermost open element with that name
        for index in range(len(self.tree) - 1, -1, -1):
            if self.tree[index].tag == tag:
                break
        else:
            return

        closers = [entry.closing for entry in reversed(self.tree[index:])]
        del self.tree[index:]
        self.output += ''.join(closers)

    def handle_data(self, data):
        """Append `data` to the output, transformed by the open elements.

        The `txt_action` of every open element is applied, so the whole text
        of e.g. a heading is transformed, nested tags included.
        """
        # In case the incoming data is just a series of new lines, discard them
        if data and not data.strip('\n'):
            return

        for entry in self.tree:
            if entry.txt_action:
                data = entry.txt_action(data)
        self.output += data

    def feed(self, data):
        """Convert `data`; the result is available in `self.output`."""
        # Resets `output` to an empty string before calling `feed`
        self.output = ''
        super().feed(data)
|