Source code for webstruct.sequence_encoding

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re


[docs]class IobEncoder(object):
    """
    Utility class for encoding tagged token streams using IOB2 encoding.

    Encode input tokens using ``encode`` method::

        >>> iob_encoder = IobEncoder()
        >>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"]
        >>> def encode(encoder, tokens): return [p for p in IobEncoder.from_indices(encoder.encode(tokens), tokens)]
        >>> encode(iob_encoder, input_tokens)
        [('John', 'B-PER'), ('said', 'O')]


        >>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"]
        >>> tokens = encode(iob_encoder, input_tokens)
        >>> tokens, tags = iob_encoder.split(tokens)
        >>> tokens, tags
        (['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O'])

    Note that IobEncoder is stateful. This means you can encode incomplete
    stream and continue the encoding later::

        >>> iob_encoder = IobEncoder()
        >>> input_tokens_partial = ["__START_PER__", "John"]
        >>> encode(iob_encoder, input_tokens_partial)
        [('John', 'B-PER')]
        >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"]
        >>> encode(iob_encoder, input_tokens_partial)
        [('Mayer', 'I-PER'), ('said', 'O')]

    To reset internal state, use ``reset method``::

        >>> iob_encoder.reset()

    Group results to entities::

        >>> iob_encoder.group(encode(iob_encoder, input_tokens))
        [(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')]

    Input token stream is processed by ``InputTokenProcessor()`` by default;
    you can pass other token processing class to customize which tokens
    are considered start/end tags.
    """

    def __init__(self, token_processor=None):
        self.token_processor = token_processor or InputTokenProcessor()
        self.reset()

[docs]    def reset(self):
        """ Reset the sequence """
        self.tag = 'O'

[docs]    def iter_encode(self, input_tokens):
        for number, token in enumerate(input_tokens):
            token_type, value = self.token_processor.classify(token)

            if token_type == 'start':
                self.tag = "B-" + value

            elif token_type == 'end':
                if value != self.tag[2:]:
                    raise ValueError(
                        "Invalid tag sequence: close tag '%s' "
                        "doesn't match open tag '%s'." % (value, self.tag)
                    )
                self.tag = "O"

            elif token_type == 'token':
                yield number, self.tag
                if self.tag[0] == 'B':
                    self.tag = "I" + self.tag[1:]

            elif token_type == 'drop':
                continue

            else:
                raise ValueError("Unknown token type '%s' for token '%s'" % (token_type, token))

[docs]    def encode(self, input_tokens):
        return list(self.iter_encode(input_tokens))

[docs]    def split(self, tokens):
        """ split ``[(token, tag)]`` to ``([token], [tags])`` tuple """
        return [t[0] for t in tokens], [t[1] for t in tokens]

[docs]    @classmethod
    def from_indices(cls, indices, input_tokens):
        for idx, tag in indices:
            yield input_tokens[idx], tag

[docs]    @classmethod
    def group(cls, data, strict=False):
        """
        Group IOB2-encoded entities. ``data`` should be an iterable
        of ``(info, iob_tag)`` tuples. ``info`` could be any Python object,
        ``iob_tag`` should be a string with a tag.

        Example::

            >>>
            >>> data = [("hello", "O"), (",", "O"), ("John", "B-PER"),
            ...         ("Doe", "I-PER"), ("Mary", "B-PER"), ("said", "O")]
            >>> for items, tag in IobEncoder.iter_group(data):
            ...     print("%s %s" % (items, tag))
            ['hello', ','] O
            ['John', 'Doe'] PER
            ['Mary'] PER
            ['said'] O

        By default, invalid sequences are fixed::

            >>> data = [("hello", "O"), ("John", "I-PER"), ("Doe", "I-PER")]
            >>> for items, tag in IobEncoder.iter_group(data):
            ...     print("%s %s" % (items, tag))
            ['hello'] O
            ['John', 'Doe'] PER

        Pass 'strict=True' argument to raise an exception for
        invalid sequences::

            >>> for items, tag in IobEncoder.iter_group(data, strict=True):
            ...     print("%s %s" % (items, tag))
            Traceback (most recent call last):
            ...
            ValueError: Invalid sequence: I-PER tag can't start sequence
        """
        return list(cls.iter_group(data, strict))

[docs]    @classmethod
    def iter_group(cls, data, strict=False):
        buf, tag = [], 'O'

        for info, iob_tag in data:
            if iob_tag.startswith('I-') and tag != iob_tag[2:]:
                if strict:
                    raise ValueError("Invalid sequence: %s tag can't start sequence" % iob_tag)
                else:
                    iob_tag = 'B-' + iob_tag[2:]  # fix bad tag

            if iob_tag.startswith('B-'):
                if buf:
                    yield buf, tag
                buf = []

            elif iob_tag == 'O':
                if buf and tag != 'O':
                    yield buf, tag
                    buf = []

            tag = 'O' if iob_tag == 'O' else iob_tag[2:]
            buf.append(info)

        if buf:
            yield buf, tag


# FIXME: this hook is incomplete: __START_TAG__ tokens are assumed everywhere.
[docs]class InputTokenProcessor(object):
    def __init__(self, tagset=None):
        if tagset is not None:
            tag_re = '|'.join(tagset)
        else:
            tag_re = '\w+?'
        self.tag_re = re.compile('__(START|END)_(%s)__' % tag_re)

[docs]    def classify(self, token):
        """
        >>> tp = InputTokenProcessor()
        >>> tp.classify('foo')
        ('token', 'foo')
        >>> tp.classify('__START_ORG__')
        ('start', 'ORG')
        >>> tp.classify('__END_ORG__')
        ('end', 'ORG')
        """

        # start/end tags
        m = self.tag_re.match(token)
        if m:
            return m.group(1).lower(), m.group(2)

        # # drop standalone commas and semicolons by default?
        # if token in {',', ';'}:
        #     return 'drop', token

        # regular token
        return 'token', token