"""A simple HTML to BBcode converter.

Aimed to be used to convert WordPress HTML into `forum.hardware.fr` BBcode.

Colours are not translated yet: the colour table in
`HTMLtoBBcode.attr_translate` ships empty, so `color` declarations are
ignored until entries are added to it.
"""

import html.parser
from collections import namedtuple
import types  # noqa: F401 -- no longer used here; kept for external users

# Translation entry for one HTML tag.
#   tags:       (opening BBcode, closing BBcode). The opening string may
#               contain `{attribute}` placeholders filled from the HTML tag.
#   txt_action: optional key of `HTMLtoBBcode.text_translate` naming a
#               transformation applied to the text inside the element.
TRtuple = namedtuple('TRtuple', ['tags', 'txt_action'],
                     defaults=[('', ''), None])

# One open, translated HTML element waiting for its end tag.
#   tag:       HTML tag name, used to match the end tag.
#   closing:   BBcode to emit when the element is closed.
#   transform: callable applied to enclosed text, or None.
_Frame = namedtuple('_Frame', ['tag', 'closing', 'transform'])


class HTMLtoBBcode(html.parser.HTMLParser):
    """Subclass of `html.parser.HTMLParser` that does the format conversion.

    Usage: call `feed(html)` and read the result from `output`.

    Attributes:
        output: BBcode produced by the last call to `feed`.
        tree: stack of `_Frame` entries, one per translated element that is
            currently open (innermost last).
        tag_translate: HTML tag name -> `TRtuple`.
        attr_translate: attribute name -> CSS property -> CSS value ->
            (opening BBcode, closing BBcode). Keys are expected in lower
            case.
        text_translate: `txt_action` name -> callable taking and returning
            a `str`.
    """

    # HTML void elements never get an end tag, so they must not be left on
    # the stack waiting for one.
    VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link',
        'meta', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        self.tree = []
        self.output = ''
        self.tag_translate = {
            'h2': TRtuple(('[b][u]', '[/u][/b]\n'), txt_action='capitalise'),
            'h3': TRtuple(('[b][u]', '[/u][/b]\n'), txt_action='capitalise'),
            'h4': TRtuple(('[b][u]', '[/u][/b]\n'), txt_action='capitalise'),
            'p': TRtuple(('\n', '')),
            'strong': TRtuple(('[b]', '[/b]')),
            'em': TRtuple(('[i]', '[/i]')),
            's': TRtuple(('[strike]', '[/strike]')),
            'blockquote': TRtuple(('[quote]', '[/quote]')),
            'code': TRtuple(('[cpp]', '[/cpp]')),
            'ul': TRtuple(('\n', '')),
            'ol': TRtuple(('\n', '')),
            'li': TRtuple(('[*]', '\n')),
            'a': TRtuple(('[url={href}]', '[/url]')),
            # `img` tags are not always closed, leading to issues,
            # therefore using a single tag for images is simpler
            'img': TRtuple(('[img]{src}[/img]', '')),
            # `span` is here for underlines to be processed
            'span': TRtuple(('', '')),
        }
        self.attr_translate = {
            'style': {'text-decoration': {'underline': ('[u]', '[/u]')},
                      # Empty on purpose: colours are not translated yet.
                      'color': {}},
        }
        self.text_translate = {
            'capitalise': str.upper,
        }

    def _translate_attrs(self, attrs):
        """Return `(opening, closing)` BBcode for translatable attributes.

        Each attribute listed in `self.attr_translate` is read as a list of
        CSS declarations (`property: value; ...`). Declarations without a
        table entry are ignored. Closing tags are returned in reverse
        order so that the generated BBcode nests properly.
        """
        openers = []
        closers = []
        for attr, val in attrs:
            properties = self.attr_translate.get(attr)
            # `val` is None for attributes written without a value.
            if not properties or not val:
                continue
            for declaration in val.split(';'):
                property_, _, value = declaration.partition(':')
                values = properties.get(property_.strip().lower(), {})
                bbcode = values.get(value.strip().lower())
                if bbcode:
                    openers.append(bbcode[0])
                    closers.append(bbcode[1])
        return ''.join(openers), ''.join(reversed(closers))

    def handle_starttag(self, tag, attrs):
        """Emit the opening BBcode for `tag` and remember how to close it.

        Args:
            tag: lower-cased HTML tag name.
            attrs: list of `(name, value)` pairs as given by `HTMLParser`.

        Raises:
            KeyError: if the tag's `txt_action` is not a key of
                `self.text_translate` (a configuration error).
        """
        translation = self.tag_translate.get(tag)
        # Nothing to do here?
        if not translation:
            return

        opening, closing = translation.tags
        # Attributes written without a value are treated as missing.
        values = {name: value for name, value in attrs if value is not None}
        try:
            opening = opening.format_map(values)
        except KeyError:
            # A required attribute is missing (e.g. `<a>` without `href`):
            # emit no BBcode for this element, but keep tracking it so its
            # content and end tag are still handled.
            opening = closing = ''

        if tag in self.VOID_TAGS:
            # No end tag will ever come for this element: emit it whole.
            self.output += opening + closing
            return

        attr_opening, attr_closing = self._translate_attrs(attrs)
        self.output += opening + attr_opening

        action = translation.txt_action
        transform = self.text_translate[action] if action else None
        # Attribute-generated BBcode is opened after the tag's own BBcode,
        # so it has to be closed before it.
        self.tree.append(_Frame(tag, attr_closing + closing, transform))

    def handle_endtag(self, tag):
        """Close `tag`, plus any element left open inside it.

        End tags without a matching open, translated element are ignored;
        this covers stray end tags as well as the end tag synthesised by
        `HTMLParser` for self-closed void elements such as `<img/>`.
        """
        if not any(frame.tag == tag for frame in self.tree):
            return
        while True:
            frame = self.tree.pop()
            self.output += frame.closing
            if frame.tag == tag:
                break

    def handle_data(self, data):
        """Append text to the output, applying enclosing text actions."""
        # In case the incoming data is just a series of new lines, discard
        # them
        if set(data) == {'\n'}:
            return
        # Apply the text action of every enclosing element, outermost
        # first, so that nested markup inside e.g. a heading is covered too.
        for frame in self.tree:
            if frame.transform is not None:
                data = frame.transform(data)
        self.output += data

    def feed(self, data):
        """Convert `data`; the result is available in `self.output`.

        Every call starts a new conversion: the previous output and any
        element left open by the previous call are discarded.
        """
        self.output = ''
        self.tree = []
        super().feed(data)