Source code for webstruct.loaders

# -*- coding: utf-8 -*-
"""
Webstruct supports WebAnnotator_ and GATE_ annotation formats out of the box;
WebAnnotator_ is recommended.

Both GATE and WebAnnotator embed annotations into HTML using special tags:
GATE uses custom tags like ``<ORG>`` while WebAnnotator uses tags like
``<span wa-type="ORG">``.

:mod:`webstruct.loaders` classes convert GATE and WebAnnotator tags into
``__START_TAGNAME__`` and ``__END_TAGNAME__`` tokens, clean the HTML
and return the result as a tree parsed by lxml::

    >>> from webstruct import WebAnnotatorLoader  # doctest: +SKIP
    >>> loader = WebAnnotatorLoader()  # doctest: +SKIP
    >>> loader.load('0.html')  # doctest: +SKIP
    <Element html at ...>

Such trees can be processed with utilities from
:mod:`webstruct.feature_extraction`.

.. _WebAnnotator: https://github.com/xtannier/WebAnnotator
.. _GATE: http://gate.ac.uk/
"""
from __future__ import absolute_import
import re
import glob
from collections import defaultdict
import six

import lxml.html
import lxml.html.clean
from lxml.etree import ProcessingInstruction

from webstruct.utils import human_sorted, html_document_fromstring
from webstruct import webannotator


class HtmlLoader(object):
    """
    Loader for plain (unannotated) HTML documents.

    ``encoding`` is passed to the HTML parsing helper; ``cleaner`` is the
    object whose ``clean_html`` method is applied to the parsed tree.
    When no cleaner is supplied, the module's default cleaner is used.
    """

    def __init__(self, encoding=None, cleaner=None):
        self.encoding = encoding
        if not cleaner:
            cleaner = _get_default_cleaner()
        self.cleaner = cleaner

    def load(self, filename):
        """Read ``filename`` in binary mode and load its contents."""
        with open(filename, 'rb') as fp:
            raw = fp.read()
        return self.loadbytes(raw)

    def loadbytes(self, data):
        """Parse ``data`` and return the result of cleaning the tree."""
        parsed = html_document_fromstring(data, self.encoding)
        cleaned = self.cleaner.clean_html(parsed)
        return cleaned
class WebAnnotatorLoader(HtmlLoader):
    """
    Loader for HTML annotated with
    `WebAnnotator <https://github.com/xtannier/WebAnnotator>`_.

    .. note::

        Use WebAnnotator's "save format", not "export format".

    """

    def __init__(self, encoding=None, cleaner=None, known_entities=None):
        self.known_entities = known_entities
        super(WebAnnotatorLoader, self).__init__(encoding, cleaner)

    def loadbytes(self, data):
        """Parse annotated ``data`` and return the cleaned tree with
        ``__START_X__`` / ``__END_X__`` tokens in place of the markup."""
        # Cleaning is done last (in _cleanup_tree) so that a custom cleaner
        # cannot remove the WebAnnotator markup before it is processed.
        tree = html_document_fromstring(data, encoding=self.encoding)
        webannotator.apply_wa_title(tree)
        if self.known_entities:
            self._prune_tags(tree)
        self._process_entities(self._get_entities(tree))
        return self._cleanup_tree(tree)

    def _prune_tags(self, tree):
        """Unwrap spans whose ``wa-type`` is not in ``known_entities``."""
        unknown_spans = [
            span for span in tree.xpath('//span[@wa-type]')
            if span.attrib['wa-type'] not in self.known_entities
        ]
        for span in unknown_spans:
            span.drop_tag()

    def _get_entities(self, tree):
        """Group ``<span wa-id=...>`` elements by their ``wa-id`` value."""
        grouped = {}
        for span in tree.xpath('//span[@wa-id]'):
            grouped.setdefault(span.attrib['wa-id'], []).append(span)
        return grouped

    def _process_entities(self, entities):
        """Insert start/end tokens around each entity and unwrap its spans."""
        for spans in entities.values():
            first, last = spans[0], spans[-1]
            label = first.attrib['wa-type']
            # ``first`` and ``last`` may be the same element, so the start
            # token has to be written before the end token is appended.
            first.text = ' __START_%s__ %s' % (label, first.text or '')
            last.text = '%s __END_%s__ ' % (last.text or '', label)
            for span in spans:
                span.drop_tag()

    def _cleanup_tree(self, tree):
        """Drop ``<wa-color>`` elements and run the cleaner on the tree."""
        for color_el in tree.xpath('//wa-color'):
            color_el.drop_tree()
        return self.cleaner.clean_html(tree)
class GateLoader(HtmlLoader):
    """
    Class for loading HTML annotated using `GATE <http://gate.ac.uk/>`_

    >>> import lxml.html
    >>> from webstruct import GateLoader

    >>> loader = GateLoader(known_entities={'ORG', 'CITY'})
    >>> html = b"<html><body><p><ORG>Scrapinghub</ORG> has an <b>office</b> in <CITY>Montevideo</CITY></p></body></html>"
    >>> tree = loader.loadbytes(html)
    >>> lxml.html.tostring(tree).decode()
    '<html><body><p> __START_ORG__ Scrapinghub __END_ORG__  has an <b>office</b> in  __START_CITY__ Montevideo __END_CITY__ </p></body></html>'

    Note that you must specify known_entities when creating GateLoader.
    It should contain all entities which are present in data, even if
    you want to use only a subset of them for training. Use arguments of
    :class:`~.HtmlLoader` to train a tagger which uses a subset of labels.
    """

    def __init__(self, encoding=None, cleaner=None, known_entities=None):
        """
        :param encoding: passed through to :class:`HtmlLoader`.
        :param cleaner: passed through to :class:`HtmlLoader`.
        :param known_entities: iterable of entity (tag) names to convert.
        :raises ValueError: if ``known_entities`` is ``None``.
        """
        if known_entities is None:
            raise ValueError("Please pass `known_entities` argument with a "
                             "list of all possible entities")
        self.known_entities = known_entities
        super(GateLoader, self).__init__(encoding, cleaner)

    def loadbytes(self, data):
        """Replace GATE entity tags in ``data`` with tokens, then load it."""
        # tags are replaced before parsing data as HTML because
        # GATE's html is invalid
        data = self._replace_entities(data)
        return super(GateLoader, self).loadbytes(data)

    def _replace_entities(self, html_bytes):
        """Return ``html_bytes`` with ``<X>`` / ``</X>`` tags of known
        entities replaced by `` __START_X__ `` / `` __END_X__ `` tokens."""
        if not self.known_entities:
            # An empty alternation would compile to ``<()>`` / ``</()>`` and
            # rewrite a literal ``<>`` or ``</>`` in the markup into a bogus
            # token; with no entities there is nothing to replace.
            return html_bytes
        # replace requested entities with unified tokens
        open_re, close_re = self._entity_patterns(self.known_entities)
        html_bytes = open_re.sub(br' __START_\1__ ', html_bytes)
        html_bytes = close_re.sub(br' __END_\1__ ', html_bytes)
        return html_bytes

    def _entity_patterns(self, entities):
        """Build case-insensitive bytes patterns matching the opening and
        closing tags of ``entities``; returns ``(open_re, close_re)``."""
        # Entity names are literal tag names, not regular expressions, so
        # escape them to keep metacharacters from changing the pattern.
        alternatives = '|'.join(re.escape(name) for name in entities)
        open_re = re.compile(six.b('<(%s)>' % alternatives), re.I)
        close_re = re.compile(six.b('</(%s)>' % alternatives), re.I)
        return open_re, close_re
def load_trees(pattern, loader, verbose=False):
    """
    Lazily load every file matching the glob ``pattern`` with ``loader``.

    Matching paths are visited in ``human_sorted`` order; each yielded
    item is the result of ``loader.load(path)``. When ``verbose`` is
    true, every path is printed before it is loaded.

    Example:

        >>> trees = load_trees('path/*.html', HtmlLoader())  # doctest: +SKIP

    """
    matched_paths = glob.glob(pattern)
    for filename in human_sorted(matched_paths):
        if verbose:
            print(filename)
        yield loader.load(filename)
def _get_default_cleaner():
    """Create the ``lxml.html.clean.Cleaner`` used when none is supplied."""
    options = dict(
        scripts=False,  # non-default: preserve scripts
        # non-default: keep external stylesheets
        # (javascript=True removes them)
        javascript=False,

        # Non-default: keep comments because they may contain <base> tag.
        # Just comments=False doesn't work; we need to disable processing
        # instructions as well (and enable them again, via kill_tags).
        comments=False,
        processing_instructions=False,  # required to keep comments
        kill_tags=[ProcessingInstruction],

        style=False,  # non-default: keep stylesheets
        links=False,  # non-default: keep external stylesheets
        meta=False,  # non-default
        page_structure=False,  # non-default
        embedded=True,
        frames=True,
        forms=False,  # non-default
        annoying_tags=False,  # non-default
        remove_unknown_tags=False,  # non-default
        safe_attrs_only=False,  # non-default
    )
    return lxml.html.clean.Cleaner(**options)