# -*- coding: utf-8 -*-
"""
CRFsuite_ backend for webstruct based on python-crfsuite_
.. _CRFsuite: http://www.chokkan.org/software/crfsuite/
.. _python-crfsuite: https://github.com/tpeng/python-crfsuite
"""
from __future__ import absolute_import
from sklearn.pipeline import Pipeline
from webstruct import HtmlFeatureExtractor
from webstruct.base import BaseSequenceClassifier
from webstruct._fileresource import FileResource
class CRFsuiteCRF(BaseSequenceClassifier):
    """
    Sequence classifier that trains and applies a CRFsuite model.

    Training is delegated to ``trainer_cls`` and tagging to
    ``pycrfsuite.Tagger``. The model file is managed through a
    ``FileResource`` stored in :attr:`modelfile`.

    Parameters
    ----------
    algorithm : (optional)
        Passed to the trainer constructor as ``algorithm``.
    train_params : (optional)
        Passed to the trainer constructor as ``params``.
    verbose : bool
        Passed to the trainer constructor as ``verbose``.
    model_filename : (optional)
        Forwarded to ``FileResource`` as ``filename``.
    keep_tempfiles : bool
        Forwarded to ``FileResource`` as ``keep_tempfiles``.
    trainer_cls : (optional) class
        Class used to create trainers; ``pycrfsuite.Trainer`` when ``None``.

    Attributes
    ----------
    training_log_
        ``logparser`` attribute of the trainer used by the most recent
        :meth:`fit` call; ``None`` until the model is fitted.
    """

    def __init__(self, algorithm=None, train_params=None, verbose=False,
                 model_filename=None, keep_tempfiles=False, trainer_cls=None):
        # Plain configuration and state attributes.
        self.algorithm = algorithm
        self.train_params = train_params
        self.verbose = verbose
        self._tagger = None
        self.training_log_ = None

        self.modelfile = FileResource(
            filename=model_filename,
            keep_tempfiles=keep_tempfiles,
            suffix=".crfsuite",
            prefix="model",
        )

        if trainer_cls is None:
            # pycrfsuite is only imported when the default trainer is needed.
            import pycrfsuite
            trainer_cls = pycrfsuite.Trainer
        self.trainer_cls = trainer_cls

        super(CRFsuiteCRF, self).__init__()

    def fit(self, X, y, X_dev=None, y_dev=None):
        """
        Train a model and store it in the model file.

        Parameters
        ----------
        X : list of lists of dicts
            Feature dicts for several documents (in a python-crfsuite format).
        y : list of lists of strings
            Labels for several documents.
        X_dev : (optional) list of lists of dicts
            Feature dicts used for testing.
        y_dev : (optional) list of lists of strings
            Labels corresponding to X_dev.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If only one of ``X_dev`` / ``y_dev`` is given.
        """
        # Holdout data is all-or-nothing: exactly one missing is an error.
        if (X_dev is None) != (y_dev is None):
            raise ValueError("Pass both X_dev and y_dev to use the holdout data")

        # Close any tagger opened earlier before the model file is refreshed.
        stale_tagger = self._tagger
        if stale_tagger is not None:
            stale_tagger.close()
            self._tagger = None
        self.modelfile.refresh()

        use_holdout = X_dev is not None
        trainer = self._get_trainer()
        for features, labels in zip(X, y):
            trainer.append(features, labels)
        if use_holdout:
            # Dev sequences go to group 1, which is then named as the
            # holdout group when training starts.
            for features, labels in zip(X_dev, y_dev):
                trainer.append(features, labels, 1)

        if use_holdout:
            holdout_group = 1
        else:
            holdout_group = -1
        trainer.train(self.modelfile.name, holdout=holdout_group)

        self.training_log_ = trainer.logparser
        return self

    def predict(self, X):
        """
        Predict labels for several documents.

        Parameters
        ----------
        X : list of lists of dicts
            feature dicts in python-crfsuite format

        Returns
        -------
        y : list of lists
            predicted labels, one list per input sequence
        """
        tagger = self.tagger
        return [tagger.tag(features) for features in X]

    @property
    def tagger(self):
        """
        ``pycrfsuite.Tagger`` opened on the model file.

        The tagger is created on first access and cached in ``_tagger``.
        An ``Exception`` is raised when the model file has no name.
        """
        if self._tagger is not None:
            return self._tagger

        if self.modelfile.name is None:
            raise Exception("Can't load model. Is the model trained?")

        import pycrfsuite
        opened = pycrfsuite.Tagger()
        opened.open(self.modelfile.name)
        self._tagger = opened
        return self._tagger

    def _get_trainer(self):
        """Create a new trainer configured from this estimator's settings."""
        trainer_kwargs = {
            'algorithm': self.algorithm,
            'params': self.train_params,
            'verbose': self.verbose,
        }
        return self.trainer_cls(**trainer_kwargs)

    def __getstate__(self):
        """Return a copy of the instance dict with the cached tagger cleared."""
        # NOTE(review): presumably the open tagger cannot be pickled - confirm.
        return dict(self.__dict__, _tagger=None)
class CRFsuitePipeline(Pipeline):
    """
    A pipeline for HTML tagging using CRFsuite. It combines
    a feature extractor and a CRF; they are available
    as :attr:`fe` and :attr:`crf` attributes for easier access.

    In addition to that, this class adds support for X_dev/y_dev arguments
    for :meth:`fit` and :meth:`fit_transform` methods - they work as expected,
    being transformed using feature extractor.

    Parameters
    ----------
    fe
        Feature extractor; registered as the ``'fe'`` pipeline step.
    crf
        CRF estimator; registered as the ``'crf'`` pipeline step.
    """

    def __init__(self, fe, crf):
        self.fe = fe
        self.crf = crf
        super(CRFsuitePipeline, self).__init__([
            ('fe', self.fe),
            ('crf', self.crf),
        ])

    def _route_dev_params(self, fit_params):
        """
        Rewrite ``X_dev`` / ``y_dev`` fit parameters for the ``crf`` step.

        Both keys are always removed from ``fit_params`` so that a bare
        ``X_dev`` or ``y_dev`` is never forwarded to the base pipeline,
        which only understands ``<step>__<param>`` names. When dev data is
        present, ``X_dev`` is run through the feature extractor and the pair
        is stored as ``crf__X_dev`` / ``crf__y_dev``.

        Parameters
        ----------
        fit_params : dict
            Keyword arguments received by :meth:`fit` or
            :meth:`fit_transform`; modified in place.

        Returns
        -------
        dict
            The same ``fit_params`` dict.

        Raises
        ------
        ValueError
            If only one of ``X_dev`` / ``y_dev`` is given.
        """
        X_dev = fit_params.pop('X_dev', None)
        y_dev = fit_params.pop('y_dev', None)

        # Validate up front: this avoids a pointless feature extraction and
        # gives a clear error instead of a parameter-name failure later on.
        if (X_dev is None) != (y_dev is None):
            raise ValueError("Pass both X_dev and y_dev to use the holdout data")

        if X_dev is not None:
            # NOTE(review): X_dev is transformed before the pipeline fits
            # ``fe`` on X; this relies on ``fe.transform`` not needing a
            # prior ``fit`` - confirm for the extractor in use.
            fit_params['crf__X_dev'] = self.fe.transform(X_dev)
            fit_params['crf__y_dev'] = y_dev
        return fit_params

    def fit(self, X, y=None, **fit_params):
        """
        Fit the pipeline; accepts optional ``X_dev`` and ``y_dev`` keywords.

        ``X_dev`` is transformed with :attr:`fe` and handed, together with
        ``y_dev``, to the ``crf`` step. Remaining ``fit_params`` are passed
        to the base pipeline unchanged.

        Raises
        ------
        ValueError
            If only one of ``X_dev`` / ``y_dev`` is given.
        """
        fit_params = self._route_dev_params(fit_params)
        return super(CRFsuitePipeline, self).fit(X, y, **fit_params)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Same as :meth:`fit`, but delegates to the base ``fit_transform``.

        Raises
        ------
        ValueError
            If only one of ``X_dev`` / ``y_dev`` is given.
        """
        fit_params = self._route_dev_params(fit_params)
        return super(CRFsuitePipeline, self).fit_transform(X, y, **fit_params)
def create_crfsuite_pipeline(token_features=None,
                             global_features=None,
                             min_df=1,
                             **crf_kwargs):
    """
    Build a :class:`CRFsuitePipeline` for HTML tagging using CRFsuite.

    The pipeline takes data produced by :class:`~.HtmlTokenizer` as input
    and outputs sequences of IOB2 tags.

    Parameters
    ----------
    token_features : (optional)
        Passed to :class:`HtmlFeatureExtractor` as its first argument;
        an empty list is used when ``None``.
    global_features : (optional)
        Passed to :class:`HtmlFeatureExtractor` as its second argument.
    min_df
        Passed to :class:`HtmlFeatureExtractor` as ``min_df``.
    **crf_kwargs
        Keyword arguments for the :class:`CRFsuiteCRF` constructor.

    Returns
    -------
    CRFsuitePipeline
        Pipeline made of the feature extractor and the CRF.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_crfsuite_pipeline(
            token_features = EXAMPLE_TOKEN_FEATURES,
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)

    """
    features = [] if token_features is None else token_features
    # The extractor is constructed before the CRF, as positional arguments
    # are evaluated left to right.
    return CRFsuitePipeline(
        HtmlFeatureExtractor(features, global_features, min_df=min_df),
        CRFsuiteCRF(**crf_kwargs),
    )