Source code for webstruct.feature_extraction
# -*- coding: utf-8 -*-
"""
:mod:`webstruct.feature_extraction` contains classes that help
with:
- converting HTML pages into lists of feature dicts and
- extracting annotations.
Usually, the approach is the following:
1. Convert a web page to a list of :class:`~.HtmlToken` instances
and a list of annotation tags (if present). :class:`~.HtmlTokenizer`
is used for that.
2. Run a number of "token feature functions" that return bits of information
about each token: token text, token shape (uppercased/lowercased/...),
whether token is in ``<a>`` HTML element, etc. For each token information
is combined into a single feature dictionary.
Use :class:`HtmlFeatureExtractor` at this stage. There are a number of
predefined token feature functions in :mod:`webstruct.features`.
3. Run a number of "global feature functions" that can modify token feature
dicts in place (insert new features, change, remove them) using "global"
information - information about all other tokens in a document and their
existing token-level feature dicts. Global feature functions are applied
sequentially: subsequent global feature functions get feature dicts updated
by previous feature functions.
This is also done by :class:`HtmlFeatureExtractor`.
:class:`~webstruct.features.utils.LongestMatchGlobalFeature` can be used
to create features that capture multi-token patterns. Some predefined
global feature functions can be found in :mod:`webstruct.gazetteers`.
"""
from __future__ import absolute_import, print_function
from itertools import chain
from collections import Counter
from six.moves import zip
from sklearn.base import BaseEstimator, TransformerMixin
from webstruct.utils import merge_dicts
class HtmlFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    This class extracts features from lists of :class:`~.HtmlToken` instances
    (:class:`~.HtmlTokenizer` can be used to create such lists).

    :meth:`fit` / :meth:`transform` / :meth:`fit_transform` interface
    may look familiar to you if you ever used scikit-learn_:
    :class:`HtmlFeatureExtractor` implements sklearn's
    Transformer interface. But there is one twist: usually for sequence
    labelling tasks the whole sequences are considered observations.
    So in our case a single observation is a tokenized document
    (a list of tokens), not an individual token:
    :meth:`fit` / :meth:`transform` / :meth:`fit_transform` methods accept
    lists of documents (lists of lists of tokens);
    :meth:`transform` and :meth:`fit_transform` return lists
    of documents' feature dicts (lists of lists of feature dicts).

    Note that only :meth:`fit_transform` applies ``min_df`` pruning;
    :meth:`transform` returns all extracted features.

    .. _scikit-learn: http://scikit-learn.org

    Parameters
    ----------

    token_features : list of callables
        List of "token" feature functions. Each function accepts
        a single ``html_token`` parameter and returns a dictionary
        which maps feature names to feature values. Dicts from all
        token feature functions are merged by HtmlFeatureExtractor.
        Example token feature (it just returns token text)::

            >>> def current_token(html_token):
            ...     return {'tok': html_token.token}

        :mod:`webstruct.features` module provides some predefined feature
        functions, e.g. :func:`parent_tag <webstruct.features.block_features.parent_tag>`
        which returns token's parent tag.

        Example::

            >>> from webstruct import GateLoader, HtmlTokenizer, HtmlFeatureExtractor
            >>> from webstruct.features import parent_tag

            >>> loader = GateLoader(known_entities={'PER'})
            >>> html_tokenizer = HtmlTokenizer()
            >>> feature_extractor = HtmlFeatureExtractor(token_features=[parent_tag])

            >>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>")
            >>> html_tokens, tags = html_tokenizer.tokenize_single(tree)
            >>> feature_dicts = feature_extractor.transform_single(html_tokens)
            >>> for token, tag, feat in zip(html_tokens, tags, feature_dicts):
            ...     print("%s %s %s" % (token.token, tag, feat))
            hello O {'parent_tag': 'p'}
            John B-PER {'parent_tag': 'p'}
            Doe I-PER {'parent_tag': 'b'}
            Mary B-PER {'parent_tag': 'p'}
            said O {'parent_tag': 'p'}

    global_features : list of callables, optional
        List of "global" feature functions. Each "global" feature function
        should accept a single argument - a list
        of ``(html_token, feature_dict)`` tuples.
        This list contains all tokens from the document and
        features extracted by previous feature functions.

        "Global" feature functions are applied after "token" feature
        functions in the order they are passed.

        They should change feature dicts ``feature_dict`` in place.

        ``None`` (the default) means "no global features".

    min_df : integer or Mapping, optional
        Feature values that have a document frequency strictly
        lower than the given threshold are removed.
        If ``min_df`` is integer, its value is used as threshold.

        TODO: if ``min_df`` is a dictionary, it should map feature names
        to thresholds. Only integer thresholds are implemented so far.

    """

    def __init__(self, token_features, global_features=None, min_df=1):
        # scikit-learn's get_params()/clone() contract requires __init__ to
        # store its arguments unmodified. Replacing ``None`` with a fresh
        # list here makes ``sklearn.base.clone`` fail its identity check,
        # so the ``None`` default is handled in transform_single() instead.
        self.token_features = token_features
        self.global_features = global_features
        self.min_df = min_df

    def fit(self, html_token_lists, y=None):
        """
        Do nothing and return ``self``.

        Feature extraction as implemented by this class keeps no fitted
        state, so there is nothing to learn; the method exists to complete
        the fit/transform interface described in the class docstring.
        """
        return self

    def fit_transform(self, html_token_lists, y=None, **fit_params):
        """
        Extract feature dicts for each document in ``html_token_lists``
        and drop features rarer than ``min_df`` (see :meth:`_pruned`).

        ``y`` and ``fit_params`` are accepted for interface compatibility
        and are not used.
        """
        X = [self.transform_single(html_tokens) for html_tokens in html_token_lists]
        return self._pruned(X, low=self.min_df)

    def transform(self, html_token_lists):
        """
        Extract feature dicts for each document in ``html_token_lists``.

        Returns a list (one item per document) of lists of feature dicts
        (one dict per token). No ``min_df`` pruning is applied here.
        """
        return [self.transform_single(html_tokens) for html_tokens in html_token_lists]

    def transform_single(self, html_tokens):
        """
        Extract feature dicts for a single document (a list of tokens).

        Token feature functions are applied to every token first; then
        each global feature function is called, in order, with the list of
        ``(html_token, feature_dict)`` pairs. Returns the list of feature
        dicts, one per token.
        """
        feature_func = _CombinedFeatures(*self.token_features)
        token_data = list(zip(html_tokens, map(feature_func, html_tokens)))

        # ``global_features`` may be None (the constructor default).
        for feat in self.global_features or ():
            feat(token_data)

        return [featdict for tok, featdict in token_data]

    def _pruned(self, X, low=None):
        """
        Return ``X`` without ``(feature name, value)`` pairs that occur
        in fewer than ``low`` documents.

        ``X`` is returned unchanged when ``low`` is None or not greater
        than 1; otherwise new feature dicts are built.
        """
        if low is None or low <= 1:
            return X

        cnt = self._document_frequency(X)
        keep = {k for (k, v) in cnt.items() if v >= low}
        del cnt  # the counter is no longer needed once ``keep`` is built

        return [
            [{k: v for k, v in fd.items() if (k, v) in keep} for fd in doc]
            for doc in X
        ]

    def _document_frequency(self, X):
        """
        Count, for each ``(feature name, value)`` pair, the number of
        documents in ``X`` that contain it at least once.
        """
        cnt = Counter()
        for doc in X:
            # A set ensures every pair is counted once per document.
            seen_features = set(chain.from_iterable(fd.items() for fd in doc))
            cnt.update(seen_features)
        return cnt


class _CombinedFeatures(object):
"""
Utility for combining several feature functions::
>>> from pprint import pprint
>>> def f1(tok): return {'upper': tok.isupper()}
>>> def f2(tok): return {'len': len(tok)}
>>> features = _CombinedFeatures(f1, f2)
>>> pprint(features('foo'))
{'len': 3, 'upper': False}
"""
def __init__(self, *feature_funcs):
self.feature_funcs = list(feature_funcs)
def __call__(self, *args, **kwargs):
features = [f(*args, **kwargs) for f in self.feature_funcs]
return merge_dicts(*features)