"""A simple HTML to BBcode converter. Aimed to be used to convert WordPress
HTML into `forum.hardware.fr` BBcode. Currently missing support for colours,
and everything else might explode anytime.
"""
import html.parser
from collections import namedtuple
import types
# Translation record for one HTML tag:
#   tags       -- (opening, closing) BBcode strings, both empty by default
#   txt_action -- name of the transformation applied to the tag's text, or
#                 None (the default) when the text is left untouched
TRtuple = namedtuple(
    'TRtuple',
    'tags txt_action',
    defaults=(('', ''), None),
)
class HTMLtoBBcode(html.parser.HTMLParser):
    """Subclass of `html.parser.HTMLParser` that does the format conversion.

    Pass HTML to `feed`; the BBcode produced by that call is then available
    in `output`.

    Attributes:
        output: BBcode generated by the most recent `feed` call.
        tree: stack holding one `(tag, closing, text_action)` tuple per
            translated element that is currently open. `closing` is the
            full BBcode text to emit when the element ends and
            `text_action` is a callable applied to the element's text, or
            `None`.
        tag_translate: HTML tag name -> `TRtuple`.
        attr_translate: attribute name -> property -> value ->
            `(opening, closing)` BBcode pair.
        text_translate: `txt_action` name -> callable taking and returning
            a string.
    """

    # Translated elements that have no end tag in HTML. They are written
    # out in one go and never get an entry in `self.tree`, otherwise the
    # entry would stay on the stack and be popped by an unrelated end tag.
    _VOID_TAGS = frozenset({'img'})

    def __init__(self):
        super().__init__()
        self.tree = []
        self.output = ''
        self.tag_translate = {
            'h2': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'h3': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'h4': TRtuple(('[b][u]', '[/u][/b]\n'),
                          txt_action='capitalise'),
            'p': TRtuple(('\n', '')),
            'strong': TRtuple(('[b]', '[/b]')),
            'em': TRtuple(('[i]', '[/i]')),
            's': TRtuple(('[strike]', '[/strike]')),
            'blockquote': TRtuple(('[quote]', '[/quote]')),
            'code': TRtuple(('[cpp]', '[/cpp]')),
            'ul': TRtuple(('\n', '')),
            'ol': TRtuple(('\n', '')),
            'li': TRtuple(('[*]', '\n')),
            'a': TRtuple(('[url={href}]', '[/url]')),
            # `img` tags are not always closed, therefore the whole image
            # is written as a single opening string
            'img': TRtuple(('[img]{src}[/img]', '')),
            # `span` is here for underlines to be processed
            'span': TRtuple(('', '')),
        }
        self.attr_translate = {
            'style': {'text-decoration': {'underline': ('[u]', '[/u]')},
                      # Filled on the fly, see `_attribute_tags`
                      'color': {}},
        }
        self.text_translate = {
            'capitalise': str.upper,
        }

    def _attribute_tags(self, name, value):
        """Return the BBcode pairs requested by a single HTML attribute.

        Only attributes listed in `self.attr_translate` are considered.
        Their value is read as `property: value` declarations separated by
        `;`, e.g. `text-decoration:underline;color:#4ac456;`.

        Args:
            name: attribute name as reported by the parser.
            value: attribute value, or `None` for a valueless attribute.

        Returns:
            A list of `(opening, closing)` tuples, in declaration order.
        """
        properties = self.attr_translate.get(name)
        if not properties or not value:
            return []
        pairs = []
        for declaration in value.split(';'):
            property_, _, setting = declaration.partition(':')
            property_ = property_.strip().lower()
            setting = setting.strip().lower()
            translations = properties.get(property_)
            if translations is None:
                continue
            # Colours are not listed in advance: the tag is built from the
            # value itself and remembered in `self.attr_translate`. Only
            # `#`-prefixed values are accepted, so that arbitrary CSS
            # (`rgb(...)`, `inherit`, ...) is never turned into a tag.
            if property_ == 'color' and setting.startswith('#'):
                translations.setdefault(
                    setting, (f'[{setting}]', f'[/{setting}]'))
            if setting in translations:
                pairs.append(translations[setting])
        return pairs

    def handle_starttag(self, tag, attrs):
        """Write the BBcode opening for `tag` and record how to close it.

        Tags missing from `self.tag_translate` are ignored. When the
        opening template needs an attribute the element does not carry
        (e.g. `<a>` without `href`), no BBcode tag is written for the
        element and only its content is kept.
        """
        translation = self.tag_translate.get(tag)
        if not translation:
            return
        opening, closing = translation.tags
        # Valueless attributes (`<a href>`) come with a `None` value and
        # are treated like missing ones.
        values = {name: value for name, value in attrs if value is not None}
        try:
            opening = opening.format_map(values)
        except KeyError:
            opening = closing = ''
        if tag in self._VOID_TAGS:
            self.output += opening
            return
        # Some BBcode tags come from attributes rather than from the HTML
        # tag itself. Their closing tags are merged, innermost first, in
        # front of the element's own closing tag so that a single stack
        # entry closes everything the element opened.
        closers = []
        for name, value in attrs:
            for attr_opening, attr_closing in self._attribute_tags(name,
                                                                   value):
                opening += attr_opening
                closers.append(attr_closing)
        closers.reverse()
        closers.append(closing)
        self.output += opening
        text_action = None
        if translation.txt_action:
            text_action = self.text_translate[translation.txt_action]
        self.tree.append((tag, ''.join(closers), text_action))

    def handle_endtag(self, tag):
        """Close `tag`, along with any element left open inside it.

        End tags that are not translated, belong to a void element, or
        have no matching open element are ignored.
        """
        if tag in self._VOID_TAGS or not self.tag_translate.get(tag):
            return
        for depth in reversed(range(len(self.tree))):
            if self.tree[depth][0] == tag:
                break
        else:
            # Stray end tag: nothing to close
            return
        while len(self.tree) > depth:
            self.output += self.tree.pop()[1]

    def handle_data(self, data):
        """Append text to the output, applying the active text actions."""
        # NOTE: re-decoding `data` with `unicode_escape` (once considered
        # for webhook input) reportedly garbles the text, so the data is
        # used as received.
        # In case the incoming data is just a series of new lines, discard
        # them
        if set(data) == {'\n'}:
            return
        # Every open element can carry a text action (e.g. headings are
        # capitalised); apply them from the outermost element inwards so
        # that text nested in inline tags is transformed as well.
        for _tag, _closing, text_action in self.tree:
            if text_action is not None:
                data = text_action(data)
        self.output += data

    def feed(self, data):
        """Convert `data`; `output` only holds the result of this call."""
        # Resets `output` to an empty string before calling `feed`
        self.output = ''
        super().feed(data)