# Source code for webstruct.crfsuite

# -*- coding: utf-8 -*-
"""
CRFsuite_ backend for webstruct based on python-crfsuite_

.. _CRFsuite: http://www.chokkan.org/software/crfsuite/
.. _python-crfsuite: https://github.com/tpeng/python-crfsuite

"""
from __future__ import absolute_import
from sklearn.pipeline import Pipeline

from webstruct import HtmlFeatureExtractor
from webstruct.base import BaseSequenceClassifier
from webstruct._fileresource import FileResource


class CRFsuiteCRF(BaseSequenceClassifier):
    """
    Sequence classifier that trains and applies CRFsuite models
    through python-crfsuite.

    Parameters
    ----------
    algorithm : optional
        Passed unchanged as ``algorithm`` to the trainer constructor.

    train_params : optional
        Passed unchanged as ``params`` to the trainer constructor.

    verbose : bool
        Passed unchanged as ``verbose`` to the trainer constructor.

    model_filename : optional
        Forwarded to ``FileResource`` as ``filename``; the ``name`` of
        that resource is the path the model is trained to and loaded from.

    keep_tempfiles : bool
        Forwarded to ``FileResource``.

    trainer_cls : optional
        Class used to build trainers. ``pycrfsuite.Trainer`` is used
        when this is None.

    Attributes
    ----------
    training_log_
        ``logparser`` of the trainer used by the most recent :meth:`fit`
        call; None until :meth:`fit` has been called.
    """

    def __init__(self, algorithm=None, train_params=None, verbose=False,
                 model_filename=None, keep_tempfiles=False, trainer_cls=None):
        self.algorithm = algorithm
        self.train_params = train_params
        self.verbose = verbose
        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=keep_tempfiles,
            suffix=".crfsuite",
            prefix="model",
        )
        # The tagger is opened on first use; see the ``tagger`` property.
        self._tagger = None
        self.training_log_ = None

        if trainer_cls is None:
            # pycrfsuite is imported only when the default trainer is needed.
            import pycrfsuite
            trainer_cls = pycrfsuite.Trainer
        self.trainer_cls = trainer_cls

        super(CRFsuiteCRF, self).__init__()

    def fit(self, X, y, X_dev=None, y_dev=None):
        """
        Train a model and store it in the model file.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents (in a python-crfsuite format).

        y : list of lists of strings
            Labels for several documents.

        X_dev : (optional) list of lists of dicts
            Feature dicts used for testing.

        y_dev : (optional) list of lists of strings
            Labels corresponding to X_dev.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If only one of ``X_dev`` / ``y_dev`` is given.
        """
        # Holdout data is only usable when features and labels come together.
        if (X_dev is None) != (y_dev is None):
            raise ValueError("Pass both X_dev and y_dev to use the holdout data")

        # Close a tagger opened earlier before the model file is refreshed.
        if self._tagger is not None:
            self._tagger.close()
            self._tagger = None
        self.modelfile.refresh()

        use_holdout = X_dev is not None
        trainer = self._get_trainer()

        for features, labels in zip(X, y):
            trainer.append(features, labels)

        if use_holdout:
            # Dev sequences are appended with group 1; the same group
            # number is passed as ``holdout`` to ``train`` below.
            for features, labels in zip(X_dev, y_dev):
                trainer.append(features, labels, 1)

        trainer.train(self.modelfile.name, holdout=1 if use_holdout else -1)
        self.training_log_ = trainer.logparser
        return self

    def predict(self, X):
        """
        Predict labels for several documents.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists
            predicted labels, one inner list per element of ``X``

        """
        # Resolve the tagger once up front, even when X is empty.
        tagger = self.tagger
        return [tagger.tag(features) for features in X]

    @property
    def tagger(self):
        """
        ``pycrfsuite.Tagger`` opened on the model file.

        The tagger is created on first access and then reused. An
        ``Exception`` is raised when the model file has no name.
        """
        if self._tagger is not None:
            return self._tagger

        if self.modelfile.name is None:
            raise Exception("Can't load model. Is the model trained?")

        import pycrfsuite
        opened = pycrfsuite.Tagger()
        opened.open(self.modelfile.name)
        self._tagger = opened
        return self._tagger

    def _get_trainer(self):
        """Build a trainer from ``trainer_cls`` and the stored options."""
        make_trainer = self.trainer_cls
        return make_trainer(
            algorithm=self.algorithm,
            params=self.train_params,
            verbose=self.verbose,
        )

    def __getstate__(self):
        """Return a copy of the instance state with the tagger set to None."""
        state = dict(self.__dict__)
        state['_tagger'] = None
        return state


class CRFsuitePipeline(Pipeline):
    """
    A pipeline for HTML tagging using CRFsuite. It combines
    a feature extractor and a CRF; they are available
    as :attr:`fe` and :attr:`crf` attributes for easier access.

    In addition to that, this class adds support for X_dev/y_dev arguments
    for :meth:`fit` and :meth:`fit_transform` methods - they work as expected,
    being transformed using feature extractor.

    Parameters
    ----------
    fe
        Feature extractor; registered as the ``'fe'`` pipeline step.

    crf
        CRF estimator; registered as the ``'crf'`` pipeline step.
    """
    def __init__(self, fe, crf):
        self.fe = fe
        self.crf = crf
        super(CRFsuitePipeline, self).__init__([
            ('fe', self.fe),
            ('crf', self.crf),
        ])

    def _route_dev_params(self, fit_params):
        """
        Rewrite ``X_dev`` / ``y_dev`` entries of ``fit_params`` in place
        into ``crf__X_dev`` / ``crf__y_dev`` entries and return the dict.

        ``X_dev`` is passed through ``self.fe.transform`` first; ``y_dev``
        is forwarded unchanged.

        Raises
        ------
        ValueError
            If only one of ``X_dev`` / ``y_dev`` is given.
        """
        # Always remove both keys: neither has a ``step__`` prefix, so
        # neither may reach the base pipeline un-prefixed.
        X_dev = fit_params.pop('X_dev', None)
        y_dev = fit_params.pop('y_dev', None)

        if (X_dev is None) != (y_dev is None):
            raise ValueError("Pass both X_dev and y_dev to use the holdout data")

        if X_dev is not None:
            # NOTE(review): X_dev is transformed before the base pipeline
            # fits ``fe``; if the extractor's transform depends on fitted
            # state, confirm this ordering is intended.
            fit_params['crf__X_dev'] = self.fe.transform(X_dev)
            fit_params['crf__y_dev'] = y_dev
        return fit_params

    def fit(self, X, y=None, **fit_params):
        """
        Fit the pipeline; optional ``X_dev`` / ``y_dev`` keyword arguments
        are routed to the ``crf`` step as holdout data.
        """
        fit_params = self._route_dev_params(fit_params)
        return super(CRFsuitePipeline, self).fit(X, y, **fit_params)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Same as the base ``fit_transform``; optional ``X_dev`` / ``y_dev``
        keyword arguments are routed to the ``crf`` step as holdout data.
        """
        fit_params = self._route_dev_params(fit_params)
        return super(CRFsuitePipeline, self).fit_transform(X, y, **fit_params)


def create_crfsuite_pipeline(token_features=None,
                             global_features=None,
                             min_df=1,
                             **crf_kwargs):
    """
    Create :class:`CRFsuitePipeline` for HTML tagging using CRFsuite.
    This pipeline expects data produced by
    :class:`~.HtmlTokenizer` as an input and produces
    sequences of IOB2 tags as output.

    Parameters
    ----------
    token_features : (optional) list
        Passed to :class:`HtmlFeatureExtractor` as its first argument.
        An empty list is used when this is None.

    global_features : (optional)
        Passed unchanged to :class:`HtmlFeatureExtractor` as its
        second argument.

    min_df : optional, default 1
        Passed to :class:`HtmlFeatureExtractor` as ``min_df``.

    **crf_kwargs
        Forwarded to the :class:`CRFsuiteCRF` constructor.

    Returns
    -------
    CRFsuitePipeline
        Pipeline combining the feature extractor and the CRF.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_crfsuite_pipeline(
            token_features=EXAMPLE_TOKEN_FEATURES,
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)

    """
    if token_features is None:
        token_features = []

    fe = HtmlFeatureExtractor(token_features, global_features, min_df=min_df)
    crf = CRFsuiteCRF(**crf_kwargs)

    return CRFsuitePipeline(fe, crf)