Source code for webstruct.feature_extraction

# -*- coding: utf-8 -*-
:mod:`webstruct.feature_extraction` contains classes that help with:

- converting HTML pages into lists of feature dicts and
- extracting annotations.

Usually, the approach is the following:

1. Convert a web page to a list of :class:`~.HtmlToken` instances
   and a list of annotation tags (if present). :class:`~.HtmlTokenizer`
   is used for that.

2. Run a number of "token feature functions" that return bits of information
   about each token: token text, token shape (uppercased/lowercased/...),
   whether token is in ``<a>`` HTML element, etc. For each token information
   is combined into a single feature dictionary.

   Use :class:`HtmlFeatureExtractor` at this stage. There are a number of
   predefined token feature functions in :mod:`webstruct.features`.

3. Run a number of "global feature functions" that can modify token feature
   dicts in place (insert new features, change or remove them) using "global"
   information - information about all other tokens in a document and their
   existing token-level feature dicts. Global feature functions are applied
   sequentially: subsequent global feature functions get feature dicts updated
   by previous feature functions.

   This is also done by :class:`HtmlFeatureExtractor`.

   :class:`~webstruct.features.utils.LongestMatchGlobalFeature` can be used
   to create features that capture multi-token patterns. Some predefined
   global feature functions can be found in :mod:`webstruct.gazetteers`.

from __future__ import absolute_import, print_function
from itertools import chain
from collections import Counter
from six.moves import zip

from sklearn.base import BaseEstimator, TransformerMixin
from webstruct.utils import merge_dicts

class HtmlFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    This class extracts features from lists of :class:`~.HtmlToken`
    instances (:class:`~.HtmlTokenizer` can be used to create such lists).

    :meth:`fit` / :meth:`transform` / :meth:`fit_transform` interface
    may look familiar to you if you ever used scikit-learn_:
    :class:`HtmlFeatureExtractor` implements sklearn's Transformer interface.
    But there is one twist: usually for sequence labelling tasks the whole
    sequences are considered observations. So in our case a single
    observation is a tokenized document (a list of tokens), not an individual
    token: :meth:`fit` / :meth:`transform` / :meth:`fit_transform` methods
    accept lists of documents (lists of lists of tokens), and return lists
    of documents' feature dicts (lists of lists of feature dicts).

    .. _scikit-learn: https://scikit-learn.org

    Parameters
    ----------

    token_features : list of callables
        List of "token" feature functions. Each function accepts
        a single ``html_token`` parameter and returns a dictionary
        which maps feature names to feature values. Dicts from all
        token feature functions are merged by HtmlFeatureExtractor.
        Example token feature (it just returns token text)::

            >>> def current_token(html_token):
            ...     return {'tok': html_token.token}

        :mod:`webstruct.features` module provides some predefined feature
        functions, e.g. :func:`parent_tag
        <webstruct.features.block_features.parent_tag>` which returns
        token's parent tag.

        Example::

            >>> from webstruct import GateLoader, HtmlTokenizer, HtmlFeatureExtractor
            >>> from webstruct.features import parent_tag

            >>> loader = GateLoader(known_entities={'PER'})
            >>> html_tokenizer = HtmlTokenizer()
            >>> feature_extractor = HtmlFeatureExtractor(token_features=[parent_tag])

            >>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>")
            >>> html_tokens, tags = html_tokenizer.tokenize_single(tree)
            >>> feature_dicts = feature_extractor.transform_single(html_tokens)
            >>> for token, tag, feat in zip(html_tokens, tags, feature_dicts):
            ...     print("%s %s %s" % (token.token, tag, feat))
            hello O {'parent_tag': 'p'}
            John B-PER {'parent_tag': 'p'}
            Doe I-PER {'parent_tag': 'b'}
            Mary B-PER {'parent_tag': 'p'}
            said O {'parent_tag': 'p'}

    global_features : list of callables, optional
        List of "global" feature functions. Each "global" feature function
        should accept a single argument - a list
        of ``(html_token, feature_dict)`` tuples.
        This list contains all tokens from the document and
        features extracted by previous feature functions.

        "Global" feature functions are applied after "token" feature
        functions in the order they are passed.

        They should change feature dicts ``feature_dict`` in place.

        ``None`` (the default) means that no global feature functions
        are applied.

    min_df : integer or Mapping, optional
        Feature values that have a document frequency strictly
        lower than the given threshold are removed.
        If ``min_df`` is integer, its value is used as threshold.

        TODO: if ``min_df`` is a dictionary, it should map feature names
        to thresholds.

        The threshold is applied by :meth:`fit_transform` only;
        :meth:`transform` and :meth:`transform_single` never prune.

    """

    def __init__(self, token_features, global_features=None, min_df=1):
        # Constructor arguments are stored exactly as passed. scikit-learn
        # requires this: ``sklearn.base.clone`` re-creates the estimator from
        # ``get_params()`` and rejects it if the constructor replaced a
        # parameter with another object (which ``global_features or []`` did
        # for ``None`` and for empty lists). ``None`` is handled at use time
        # in ``transform_single``.
        self.token_features = token_features
        self.global_features = global_features
        self.min_df = min_df

    def fit(self, html_token_lists, y=None):
        """
        Run :meth:`fit_transform` on ``html_token_lists``, discard
        the result and return ``self``. ``y`` is ignored.
        """
        self.fit_transform(html_token_lists)
        return self

    def fit_transform(self, html_token_lists, y=None, **fit_params):
        """
        Extract features for each document in ``html_token_lists``
        (a list of lists of tokens) and drop the feature values whose
        document frequency is below ``min_df``.

        Returns a list with one entry per document; each entry is a list
        of feature dicts, one per token. ``y`` and ``fit_params``
        are ignored.
        """
        X = [self.transform_single(html_tokens)
             for html_tokens in html_token_lists]
        return self._pruned(X, low=self.min_df)

    def transform(self, html_token_lists):
        """
        Extract features for each document in ``html_token_lists``
        (a list of lists of tokens) without any ``min_df`` pruning.

        Returns a list with one entry per document; each entry is a list
        of feature dicts, one per token.
        """
        return [self.transform_single(html_tokens)
                for html_tokens in html_token_lists]

    def transform_single(self, html_tokens):
        """
        Extract features for a single document (a list of tokens).

        All token feature functions are called for every token and their
        results are merged into one dict per token; then global feature
        functions are called, in order, with the list of
        ``(html_token, feature_dict)`` pairs.

        Returns a list of feature dicts, one per token.
        """
        feature_func = _CombinedFeatures(*self.token_features)
        token_data = list(zip(html_tokens, map(feature_func, html_tokens)))

        # ``global_features`` may be None (see ``__init__``).
        for feat in self.global_features or ():
            feat(token_data)

        return [featdict for tok, featdict in token_data]

    def _pruned(self, X, low=None):
        """
        Return ``X`` with ``(feature name, feature value)`` pairs that occur
        in fewer than ``low`` documents removed.

        ``X`` itself is returned unchanged when ``low`` is ``None`` or
        not greater than 1; otherwise new lists and dicts are built.
        """
        if low is None or low <= 1:
            return X

        cnt = self._document_frequency(X)
        keep = {k for (k, v) in cnt.items() if v >= low}
        # The counter can be large; only the set of kept pairs is needed.
        del cnt

        return [
            [{k: v for k, v in fd.items() if (k, v) in keep} for fd in doc]
            for doc in X
        ]

    def _document_frequency(self, X):
        """
        Return a :class:`~collections.Counter` mapping each
        ``(feature name, feature value)`` pair to the number of documents
        in ``X`` that contain it at least once.

        Pairs are collected into sets, so feature values must be hashable.
        """
        cnt = Counter()
        for doc in X:
            # A set makes each pair count once per document.
            seen_features = set(chain.from_iterable(fd.items() for fd in doc))
            cnt.update(seen_features)
        return cnt
class _CombinedFeatures(object): """ Utility for combining several feature functions:: >>> from pprint import pprint >>> def f1(tok): return {'upper': tok.isupper()} >>> def f2(tok): return {'len': len(tok)} >>> features = _CombinedFeatures(f1, f2) >>> pprint(features('foo')) {'len': 3, 'upper': False} """ def __init__(self, *feature_funcs): self.feature_funcs = list(feature_funcs) def __call__(self, *args, **kwargs): features = [f(*args, **kwargs) for f in self.feature_funcs] return merge_dicts(*features)