# Source code for webstruct.loaders

# -*- coding: utf-8 -*-
"""
Webstruct supports WebAnnotator_ and GATE_ annotation formats out of the box;
WebAnnotator_ is recommended.

Both GATE and WebAnnotator embed annotations into HTML using special tags:
GATE uses custom tags like ``<ORG>`` while WebAnnotator uses tags like
``<span wa-type="ORG">``.

:mod:`webstruct.loaders` classes convert GATE and WebAnnotator tags into
``__START_TAGNAME__`` and ``__END_TAGNAME__`` tokens, clean the HTML
and return the result as a tree parsed by lxml::

    >>> from webstruct import WebAnnotatorLoader  # doctest: +SKIP
    >>> loader = WebAnnotatorLoader()  # doctest: +SKIP
    >>> loader.load('0.html')  # doctest: +SKIP
    <Element html at ...>

Such trees can be processed with utilities from
:mod:`webstruct.feature_extraction`.

.. _WebAnnotator: https://github.com/xtannier/WebAnnotator
.. _GATE: http://gate.ac.uk/
"""
from __future__ import absolute_import
import re
import glob
from collections import defaultdict
import six

import lxml.html
import lxml.html.clean
from lxml.etree import ProcessingInstruction

from webstruct.utils import human_sorted, html_document_fromstring
from webstruct import webannotator


class HtmlLoader(object):
    """
    Class for loading unannotated HTML files.

    Raw bytes are parsed with ``html_document_fromstring`` and the
    resulting tree is passed through ``cleaner.clean_html``.

    :param encoding: encoding handed to ``html_document_fromstring``
        (``None`` by default).
    :param cleaner: object providing a ``clean_html(tree)`` method. When it
        is omitted (or falsy) the module's default cleaner is used.
    """
    def __init__(self, encoding=None, cleaner=None):
        self.encoding = encoding
        # Fall back to the module-level default only when no cleaner is given.
        self.cleaner = cleaner or _get_default_cleaner()

    def load(self, filename):
        """
        Read ``filename`` in binary mode and return the result of
        :meth:`loadbytes` for its contents.
        """
        # Binary mode: decoding is left to the HTML parser.
        with open(filename, 'rb') as f:
            return self.loadbytes(f.read())

    def loadbytes(self, data):
        """
        Parse ``data`` (raw HTML) and return the tree produced by
        ``self.cleaner.clean_html``.
        """
        tree = html_document_fromstring(data, self.encoding)
        return self.cleaner.clean_html(tree)


class WebAnnotatorLoader(HtmlLoader):
    """
    Class for loading HTML annotated using
    `WebAnnotator <https://github.com/xtannier/WebAnnotator>`_.

    Annotation spans (``<span wa-id="..." wa-type="...">``) are replaced by
    ``__START_TYPE__`` / ``__END_TYPE__`` text tokens before the tree is
    cleaned.

    :param encoding: see :class:`HtmlLoader`.
    :param cleaner: see :class:`HtmlLoader`.
    :param known_entities: optional collection of ``wa-type`` values to
        keep. When it is non-empty, spans with any other ``wa-type`` are
        unwrapped and produce no tokens; when it is ``None`` or empty, all
        annotated spans are converted.

    .. note::

        Use WebAnnotator's "save format", not "export format".

    """
    def __init__(self, encoding=None, cleaner=None, known_entities=None):
        self.known_entities = known_entities
        super(WebAnnotatorLoader, self).__init__(encoding, cleaner)

    def loadbytes(self, data):
        """
        Parse WebAnnotator-annotated ``data`` (raw HTML), convert the
        annotation spans to start/end tokens and return the cleaned tree.
        """
        # defer cleaning the tree to prevent custom cleaners from cleaning
        # WebAnnotator markup
        tree = html_document_fromstring(data, encoding=self.encoding)
        webannotator.apply_wa_title(tree)
        if self.known_entities:
            self._prune_tags(tree)
        entities = self._get_entities(tree)
        self._process_entities(entities)
        return self._cleanup_tree(tree)

    def _prune_tags(self, tree):
        """remove the element with wa-type not in ``known_entities``"""
        for el in tree.xpath('//span[@wa-type]'):
            if el.attrib['wa-type'] not in self.known_entities:
                # drop_tag keeps the span's text and children in place.
                el.drop_tag()

    def _get_entities(self, tree):
        """
        Return a dict mapping each ``wa-id`` value to the list of ``<span>``
        elements carrying it, in the order returned by the XPath query.
        """
        entities = defaultdict(list)
        for el in tree.xpath('//span[@wa-id]'):
            entities[el.attrib['wa-id']].append(el)
        return dict(entities)

    def _process_entities(self, entities):
        """
        Insert ``__START_<type>__`` before the first span of every entity
        and ``__END_<type>__`` after the last one, then unwrap the spans.

        The entity type is read from the ``wa-type`` attribute of the first
        span of each group.
        """
        for elems in entities.values():
            first, last = elems[0], elems[-1]
            tp = first.attrib['wa-type']
            first.text = ' __START_%s__ %s' % (tp, first.text or '')
            if len(last):
                # ``.text`` is only the text *before* the first child, so
                # for a span with nested markup the END token has to go
                # after the last child (into its tail) to close the whole
                # entity rather than just its leading text.
                closing = last[-1]
                closing.tail = '%s __END_%s__ ' % (closing.tail or '', tp)
            else:
                last.text = '%s __END_%s__ ' % (last.text or '', tp)
            for el in elems:
                el.drop_tag()

    def _cleanup_tree(self, tree):
        """
        Remove ``<wa-color>`` elements together with their content and
        return the tree processed by ``self.cleaner``.
        """
        for el in tree.xpath('//wa-color'):
            el.drop_tree()
        return self.cleaner.clean_html(tree)


class GateLoader(HtmlLoader):
    """
    Class for loading HTML annotated using `GATE <http://gate.ac.uk/>`_

    >>> import lxml.html
    >>> from webstruct import GateLoader

    >>> loader = GateLoader(known_entities={'ORG', 'CITY'})
    >>> html = b"<html><body><p><ORG>Scrapinghub</ORG> has an <b>office</b> in <CITY>Montevideo</CITY></p></body></html>"
    >>> tree = loader.loadbytes(html)
    >>> lxml.html.tostring(tree).decode()
    '<html><body><p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in  __START_CITY__ Montevideo __END_CITY__ </p></body></html>'

    Note that you must specify known_entities when creating GateLoader.
    It should contain all entities which are present in data, even if
    you want to use only a subset of them for training. Use arguments of
    :class:`~.HtmlLoader` to train a tagger which uses a subset of labels.

    :param encoding: see :class:`HtmlLoader`.
    :param cleaner: see :class:`HtmlLoader`.
    :param known_entities: collection of entity (tag) names to convert.
    :raises ValueError: if ``known_entities`` is ``None``.
    """

    def __init__(self, encoding=None, cleaner=None, known_entities=None):
        if known_entities is None:
            raise ValueError("Please pass `known_entities` argument with a "
                             "list of all possible entities")
        self.known_entities = known_entities
        super(GateLoader, self).__init__(encoding, cleaner)

    def loadbytes(self, data):
        """
        Replace GATE entity tags in ``data`` (raw HTML bytes) with
        ``__START_*__`` / ``__END_*__`` tokens and load the result with
        :meth:`HtmlLoader.loadbytes`.
        """
        # tags are replaced before parsing data as HTML because
        # GATE's html is invalid
        data = self._replace_entities(data)
        return super(GateLoader, self).loadbytes(data)

    def _replace_entities(self, html_bytes):
        """
        Return ``html_bytes`` with ``<NAME>`` / ``</NAME>`` tags of the known
        entities replaced by `` __START_NAME__ `` / `` __END_NAME__ ``.
        """
        if not self.known_entities:
            # An empty alternation would compile to ``<()>`` and rewrite a
            # literal ``<>``; with nothing to replace the data is unchanged.
            return html_bytes
        # replace requested entities with unified tokens
        open_re, close_re = self._entity_patterns(self.known_entities)
        html_bytes = re.sub(open_re, br' __START_\1__ ', html_bytes)
        html_bytes = re.sub(close_re, br' __END_\1__ ', html_bytes)
        return html_bytes

    def _entity_patterns(self, entities):
        """
        Build case-insensitive bytes patterns matching the opening and the
        closing tag of any name in ``entities``.

        :returns: ``(open_re, close_re)`` tuple of compiled patterns; group 1
            of a match is the tag name as written in the data.
        """
        # Escape the names so characters such as ``.`` are matched
        # literally instead of being interpreted as regex syntax.
        entities_pattern = '|'.join(re.escape(name) for name in entities)
        open_re = re.compile(six.b('<(%s)>' % entities_pattern), re.I)
        close_re = re.compile(six.b('</(%s)>' % entities_pattern), re.I)
        return open_re, close_re


def load_trees(pattern, loader, verbose=False):
    """
    Load HTML data using loader ``loader`` from all files matched by
    ``pattern`` glob pattern.

    This is a generator: files are loaded lazily, one per iteration, in the
    order produced by ``human_sorted`` applied to the matched paths.

    :param pattern: glob pattern passed to :func:`glob.glob`.
    :param loader: object with a ``load(path)`` method, e.g. an
        :class:`HtmlLoader` instance.
    :param verbose: when true, print each path before it is loaded.
    :returns: iterator over the values returned by ``loader.load``.

    Example:

    >>> trees = load_trees('path/*.html', HtmlLoader())  # doctest: +SKIP

    """
    for path in human_sorted(glob.glob(pattern)):
        if verbose:
            print(path)
        yield loader.load(path)


def _get_default_cleaner():
    """
    Build the ``lxml.html.clean.Cleaner`` that loaders use when no cleaner
    is passed to them.

    Most options are switched away from their defaults (marked
    "non-default" below).
    """
    options = dict(
        # non-default: preserve scripts
        scripts=False,
        # non-default: keep external stylesheets
        # (javascript=True removes them)
        javascript=False,

        # Non-default: keep comments because they may contain <base> tag.
        # Just comments=False doesn't work; we need to disable processing
        # instructions as well (and enable them again, via kill_tags).
        comments=False,
        processing_instructions=False,  # required to keep comments
        kill_tags=[ProcessingInstruction],

        # non-default: keep stylesheets
        style=False,
        # non-default: keep external stylesheets
        links=False,
        # non-default
        meta=False,
        page_structure=False,
        forms=False,
        annoying_tags=False,
        remove_unknown_tags=False,
        safe_attrs_only=False,

        # left enabled
        embedded=True,
        frames=True,
    )
    return lxml.html.clean.Cleaner(**options)