nomis
/
wp_to_hfr


								"""A simple HTML to BBcode converter.

								Intended for converting WordPress HTML into `forum.hardware.fr` BBcode.
								Colour support is currently missing, and anything else may break at any
								time.
								"""


								import html.parser

								from collections import namedtuple

								import types


								# Translation rule for one HTML tag.
								#   tags:       (opening, closing) BBcode strings. The opening string may
								#               hold `{attribute}` placeholders that are filled from the
								#               attributes of the HTML tag.
								#   txt_action: key of `HTMLtoBBcode.text_translate` naming the
								#               transformation applied to the text inside the element,
								#               or None when the text is left alone.
								TRtuple = namedtuple('TRtuple',
								                     ['tags', 'txt_action'],
								                     defaults=[('', ''), None])


								class HTMLtoBBcode(html.parser.HTMLParser):
								    """Subclass of `html.parser.HTMLParser` that does the format conversion.

								    Feed HTML with `feed()` and read the resulting BBcode from `output`,
								    which is emptied at the start of every `feed()` call. Tags that are
								    missing from `tag_translate` are dropped and their text is kept.

								    Attributes:
								        output: BBcode produced since the last call to `feed()`.
								        tree: stack holding, for every translated element that is still
								            open, the BBcode to emit once it gets closed.
								        tag_translate: HTML tag name -> `TRtuple`.
								        attr_translate: attribute -> CSS property -> CSS value ->
								            (opening, closing) BBcode pair. `color` has no entries, so
								            colour declarations are ignored unless entries are added.
								        text_translate: `txt_action` name -> callable applied to the text
								            found inside the matching element.
								        void_tags: tags that are never closed; nothing is pushed on
								            `tree` for them.
								    """

								    # HTML void elements. They have no closing tag in the markup, so
								    # pushing a closing entry for them would leave `self.tree` out of step
								    # with the closing tags that follow.
								    void_tags = frozenset({
								        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
								        'link', 'meta', 'source', 'track', 'wbr',
								    })

								    def __init__(self):
								        # `HTMLParser.__init__` calls `reset()`, which creates `self.tree`,
								        # `self.output` and the stack of text actions.
								        super().__init__()

								        self.tag_translate = {
								                'h2':         TRtuple(('[b][u]', '[/u][/b]\n'),
								                                      txt_action='capitalise'),
								                'h3':         TRtuple(('[b][u]', '[/u][/b]\n'),
								                                      txt_action='capitalise'),
								                'h4':         TRtuple(('[b][u]', '[/u][/b]\n'),
								                                      txt_action='capitalise'),
								                'p':          TRtuple(('\n', '')),
								                'strong':     TRtuple(('[b]', '[/b]')),
								                'em':         TRtuple(('[i]', '[/i]')),
								                's':          TRtuple(('[strike]', '[/strike]')),
								                'blockquote': TRtuple(('[quote]', '[/quote]')),
								                'code':       TRtuple(('[cpp]', '[/cpp]')),
								                'ul':         TRtuple(('\n', '')),
								                'ol':         TRtuple(('\n', '')),
								                'li':         TRtuple(('[*]', '\n')),
								                'a':          TRtuple(('[url={href}]', '[/url]')),
								                # `img` is not always closed, so the complete BBcode is
								                # emitted by the opening part alone.
								                'img':        TRtuple(('[img]{src}[/img]', '')),
								                # `span` produces nothing by itself; it is listed so that
								                # its `style` attribute (underline) gets processed.
								                'span':       TRtuple(('', ''))
								        }

								        self.attr_translate = {
								                'style': {'text-decoration': {'underline': ('[u]', '[/u]')},
								                          'color': {}}
								        }

								        self.text_translate = {
								                'capitalise': str.upper
								        }

								    def reset(self):
								        """Discard all parsing and conversion state before a new document."""
								        super().reset()
								        self.tree = []
								        self.output = ''
								        # Text transformations of the elements currently open, outermost
								        # first. Kept apart from `self.tree` so that `handle_endtag` only
								        # ever pops strings from the latter.
								        self._text_actions = []

								    def _style_tags(self, attrs):
								        """Return the (opening, closing) BBcode derived from `attrs`.

								        Only attributes listed in `self.attr_translate` are looked at. Their
								        value is read as `property: value` declarations separated by `;`,
								        and every declaration found in the table contributes its pair.
								        Closing tags are returned in reverse order of the opening ones.
								        """
								        openers = []
								        closers = []
								        for attr, val in attrs:
								            properties = self.attr_translate.get(attr)
								            # `val` is None for an attribute written without a value.
								            if not properties or not val:
								                continue
								            # e.g. val = 'text-decoration: underline; color:#4ac456;'
								            for declaration in val.split(';'):
								                property_, _, value = declaration.partition(':')
								                known_values = properties.get(property_.strip().lower(), {})
								                pair = known_values.get(value.strip().lower())
								                if pair:
								                    openers.append(pair[0])
								                    closers.append(pair[1])
								        return ''.join(openers), ''.join(reversed(closers))

								    def handle_starttag(self, tag, attrs):
								        """Emit the opening BBcode of `tag` and remember how to close it.

								        Args:
								            tag: lower-cased HTML tag name.
								            attrs: list of `(name, value)` pairs as given by `HTMLParser`.
								        """
								        rule = self.tag_translate.get(tag)
								        # Nothing to do here?
								        if not rule:
								            return

								        opening, closing = rule.tags
								        # Attributes without a value are treated as missing.
								        attr_map = {name: value for name, value in attrs
								                    if value is not None}
								        try:
								            opening = opening.format(**attr_map)
								        except KeyError:
								            # The attribute the BBcode needs (`href`, `src`) is missing:
								            # drop the markup and keep the content of the element.
								            opening, closing = '', ''

								        # Tags coming from attributes are nested inside the tag's own ones.
								        style_opening, style_closing = self._style_tags(attrs)
								        self.output += opening + style_opening
								        closing = style_closing + closing

								        if tag in self.void_tags:
								            # Nothing will close this element: finish it right away.
								            self.output += closing
								            return

								        # One entry per open element, so that `handle_endtag` just pops it.
								        self.tree.append(closing)
								        if rule.txt_action:
								            self._text_actions.append(
								                self.text_translate[rule.txt_action])

								    def handle_endtag(self, tag):
								        """Emit the closing BBcode stored for the element being closed."""
								        rule = self.tag_translate.get(tag)
								        # Void tags never pushed anything, even when written `<img/>`.
								        if not rule or tag in self.void_tags:
								            return
								        # A stray closing tag has nothing to close.
								        if not self.tree:
								            return
								        self.output += self.tree.pop()
								        if rule.txt_action and self._text_actions:
								            self._text_actions.pop()

								    def handle_data(self, data):
								        """Append `data` to the output after the pending text actions."""
								        # TODO: an earlier `unicode_escape` round-trip of `data` (meant for
								        # webhook input) was dropped because it messed up the text.
								        # In case the incoming data is just a series of new lines, discard
								        # them.
								        if set(data) == {'\n'}:
								            return

								        # Every open element with a `txt_action` transforms the text, not
								        # only the one that was opened last.
								        for action in self._text_actions:
								            data = action(data)
								        self.output += data

								    def feed(self, data):
								        """Convert `data`; `output` is emptied first."""
								        # Resets `output` to an empty string before calling `feed`
								        self.output = ''
								        super().feed(data)