# Source code for webstruct.crfsuite

# -*- coding: utf-8 -*-
"""
CRFsuite_ backend for webstruct based on python-crfsuite_ and sklearn-crfsuite_.

.. _CRFsuite: http://www.chokkan.org/software/crfsuite/
.. _python-crfsuite: https://github.com/tpeng/python-crfsuite
.. _sklearn-crfsuite: https://github.com/TeamHG-Memex/sklearn-crfsuite

"""
from __future__ import absolute_import
from sklearn.pipeline import Pipeline

from webstruct import HtmlFeatureExtractor


class CRFsuitePipeline(Pipeline):
    """
    A pipeline for HTML tagging using CRFsuite. It combines
    a feature extractor and a CRF; they are available
    as :attr:`fe` and :attr:`crf` attributes for easier access.

    In addition to that, this class adds support for X_dev/y_dev arguments
    for :meth:`fit` and :meth:`fit_transform` methods - they work as expected,
    being transformed using feature extractor.

    :param fe: feature extractor; registered as the ``'vec'`` pipeline step.
    :param crf: CRF estimator; registered as the ``'clf'`` pipeline step.
    """
    def __init__(self, fe, crf):
        self.fe = fe
        self.crf = crf
        super(CRFsuitePipeline, self).__init__([
            ('vec', self.fe),
            ('clf', self.crf),
        ])

    def _route_dev_data(self, fit_params):
        """
        Rewrite ``X_dev``/``y_dev`` entries of ``fit_params`` into
        ``clf__X_dev``/``clf__y_dev`` so that they are addressed to
        the ``'clf'`` step.

        Both keys are always removed from ``fit_params``, even when their
        value is ``None``, so that un-prefixed names are never forwarded
        to the parent pipeline. ``X_dev`` is passed through
        ``self.fe.transform`` first; ``y_dev`` is forwarded unchanged.

        :param dict fit_params: keyword arguments given to :meth:`fit`
            or :meth:`fit_transform`; modified in place.
        :returns: the same ``fit_params`` dict.
        :raises ValueError: if ``y_dev`` is given without ``X_dev``.
        """
        X_dev = fit_params.pop('X_dev', None)
        y_dev = fit_params.pop('y_dev', None)

        if X_dev is None:
            if y_dev is not None:
                # Labels without matching inputs can't be used as holdout
                # data; fail with an explicit message.
                raise ValueError("y_dev is passed without X_dev")
            return fit_params

        fit_params['clf__X_dev'] = self.fe.transform(X_dev)
        fit_params['clf__y_dev'] = y_dev
        return fit_params

    def fit(self, X, y=None, **fit_params):
        """
        Fit the pipeline. Optional ``X_dev``/``y_dev`` keyword arguments
        are handed over to the ``'clf'`` step, with ``X_dev`` transformed
        by the feature extractor; other keyword arguments are passed to
        the parent ``fit`` untouched.
        """
        fit_params = self._route_dev_data(fit_params)
        return super(CRFsuitePipeline, self).fit(X, y, **fit_params)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Same as :meth:`fit` with respect to ``X_dev``/``y_dev`` handling,
        but delegates to the parent ``fit_transform``.
        """
        fit_params = self._route_dev_data(fit_params)
        return super(CRFsuitePipeline, self).fit_transform(X, y, **fit_params)


def create_crfsuite_pipeline(token_features=None,
                             global_features=None,
                             min_df=1,
                             **crf_kwargs):
    """
    Create :class:`CRFsuitePipeline` for HTML tagging using CRFsuite.
    This pipeline expects data produced by
    :class:`~.HtmlTokenizer` as an input and produces
    sequences of IOB2 tags as output.

    :param token_features: passed to :class:`HtmlFeatureExtractor` as its
        first argument; ``None`` (default) is replaced with an empty list.
    :param global_features: passed to :class:`HtmlFeatureExtractor` as its
        second argument.
    :param min_df: passed to :class:`HtmlFeatureExtractor` as ``min_df``.
    :param crf_kwargs: remaining keyword arguments are passed to the
        ``sklearn_crfsuite.CRF`` constructor.
    :returns: a :class:`CRFsuitePipeline` built from the feature extractor
        and the CRF.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_crfsuite_pipeline(
            token_features=EXAMPLE_TOKEN_FEATURES,
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)

    """
    # Function-level import: sklearn_crfsuite is loaded only when
    # a pipeline is actually being created.
    from sklearn_crfsuite import CRF

    if token_features is None:
        token_features = []

    fe = HtmlFeatureExtractor(token_features, global_features, min_df=min_df)
    crf = CRF(**crf_kwargs)

    return CRFsuitePipeline(fe, crf)