Source code for webstruct.model

# -*- coding: utf-8 -*-
"""
:mod:`webstruct.model` contains conventional wrappers for creating NER models.
"""
from __future__ import absolute_import

import urllib2
from contextlib import closing

from webstruct.loaders import HtmlLoader
from webstruct.feature_extraction import HtmlTokenizer
from webstruct.sequence_encoding import IobEncoder
from webstruct.utils import smart_join
from webstruct.grouping import choose_best_clustering


class NER(object):
    """
    Class for extracting named entities from HTML.

    Initialize it with a trained ``model``. ``model`` must have
    ``transform`` method that accepts lists of :class:`~.HtmlToken`
    sequences and returns lists of predicted IOB2 tags.
    :func:`~.create_wapiti_pipeline` function returns such model.

    ``loader`` and ``html_tokenizer`` are optional: when omitted (or falsy),
    a default :class:`HtmlLoader` / :class:`HtmlTokenizer` instance is
    created.
    """

    def __init__(self, model, loader=None, html_tokenizer=None):
        self.model = model
        self.loader = loader or HtmlLoader()
        self.html_tokenizer = html_tokenizer or HtmlTokenizer()

    def extract(self, bytes_data):
        """
        Extract named entities from binary HTML data ``bytes_data``.
        Return a list of ``(entity_text, entity_type)`` tuples.

        Tokens tagged ``'O'`` (outside of any entity) are skipped, and
        entities for which :meth:`build_entity` returns an empty value
        are dropped.
        """
        html_tokens, tags = self.extract_raw(bytes_data)
        groups = IobEncoder.group(zip(html_tokens, tags))
        return _drop_empty(
            (self.build_entity(tokens, tag), tag)
            for (tokens, tag) in groups
            if tag != 'O'
        )

    def extract_from_url(self, url):
        """
        A convenience wrapper for :meth:`extract` method that downloads
        input data from a remote URL.
        """
        # closing() guarantees the HTTP response is released even when
        # read() raises; previously the response was never closed.
        with closing(urllib2.urlopen(url)) as response:
            data = response.read()
        return self.extract(data)

    def extract_raw(self, bytes_data):
        """
        Tokenize binary HTML data ``bytes_data`` and tag the tokens.

        Return a ``(html_tokens, tags)`` pair: the token sequence produced
        by the HTML tokenizer and the sequence of tags predicted by the
        model for it.
        """
        tree = self.loader.loadbytes(bytes_data)
        # The second value returned by the tokenizer is not needed here.
        html_tokens, _ = self.html_tokenizer.tokenize_single(tree)
        # The model works on batches of sequences: wrap the single document
        # in a list and unwrap the single result.
        tags = self.model.transform([html_tokens])[0]
        return html_tokens, tags

    def extract_groups(self, bytes_data, dont_penalize=None):
        """
        Extract groups of named entities from binary HTML data ``bytes_data``.
        Return a list of lists of ``(entity_text, entity_type)`` tuples.

        Entities are grouped using algorithm from :mod:`webstruct.grouping`;
        ``dont_penalize`` is passed to it through ``score_kwargs``.
        Groups that end up with no non-empty entities are omitted.
        """
        html_tokens, tags = self.extract_raw(bytes_data)
        _, _, clusters = choose_best_clustering(
            html_tokens,
            tags,
            score_kwargs={'dont_penalize': dont_penalize}
        )

        entities = []
        for cluster in clusters:
            # Each cluster item is a 3-tuple; its last element is not
            # needed to build the entity text.
            text_entities = _drop_empty(
                (self.build_entity(tokens, tag), tag)
                for tokens, tag, _dist in cluster
            )
            if text_entities:
                entities.append(text_entities)
        return entities

    def build_entity(self, html_tokens, tag):
        """
        Join tokens to an entity. Return an entity, as text.
        By default this function uses :func:`webstruct.utils.smart_join`.

        Override it to customize :meth:`extract`, :meth:`extract_from_url`
        and :meth:`extract_groups` results. If this function returns empty
        string or None, entity is dropped.
        """
        return smart_join(t.token for t in html_tokens)
def _drop_empty(entities): return [(text, tag) for (text, tag) in entities if text]