# -*- coding: utf-8 -*-
"""
CRFsuite_ backend for webstruct based on python-crfsuite_ and sklearn-crfsuite_.

.. _CRFsuite: http://www.chokkan.org/software/crfsuite/
.. _python-crfsuite: https://github.com/tpeng/python-crfsuite
.. _sklearn-crfsuite: https://github.com/TeamHG-Memex/sklearn-crfsuite
"""
from __future__ import absolute_import
from sklearn.pipeline import Pipeline
from webstruct import HtmlFeatureExtractor
class CRFsuitePipeline(Pipeline):
    """
    A pipeline for HTML tagging using CRFsuite. It combines
    a feature extractor and a CRF; they are available
    as :attr:`fe` and :attr:`crf` attributes for easier access.

    In addition to that, this class adds support for X_dev/y_dev arguments
    for :meth:`fit` and :meth:`fit_transform` methods: ``X_dev`` is
    transformed using the feature extractor and both values are handed
    to the ``'clf'`` step as ``clf__X_dev`` / ``clf__y_dev``.

    :param fe: feature extractor; registered as the ``'vec'`` step.
    :param crf: CRF estimator; registered as the ``'clf'`` step.
    """

    def __init__(self, fe, crf):
        self.fe = fe
        self.crf = crf
        super(CRFsuitePipeline, self).__init__([
            ('vec', self.fe),
            ('clf', self.crf),
        ])

    def _route_dev_params(self, fit_params):
        """
        Rewrite ``X_dev`` / ``y_dev`` entries of ``fit_params`` into
        parameters addressed to the ``'clf'`` step.

        Both keys are always removed from ``fit_params`` so that a bare
        ``X_dev`` / ``y_dev`` name (without a ``step__`` prefix) is never
        forwarded to :class:`~sklearn.pipeline.Pipeline`; this makes
        ``fit(X, y, X_dev=None, y_dev=None)`` behave like ``fit(X, y)``.

        :param dict fit_params: keyword arguments received by
            :meth:`fit` or :meth:`fit_transform`; modified in place.
        :return: the same ``fit_params`` dict.
        :raises ValueError: if ``y_dev`` is given without ``X_dev``.
        """
        X_dev = fit_params.pop('X_dev', None)
        y_dev = fit_params.pop('y_dev', None)

        if X_dev is None:
            if y_dev is not None:
                # Labels without inputs cannot be used; fail with a clear
                # message instead of passing a stray 'y_dev' key down.
                raise ValueError("y_dev is passed without X_dev; "
                                 "pass both X_dev and y_dev")
            return fit_params

        # Dev data goes through the same feature extractor as train data.
        fit_params['clf__X_dev'] = self.fe.transform(X_dev)
        # y_dev is forwarded as-is (None when the caller omitted it).
        fit_params['clf__y_dev'] = y_dev
        return fit_params

    def fit(self, X, y=None, **fit_params):
        """
        Fit the pipeline.

        Accepts optional ``X_dev`` / ``y_dev`` keyword arguments; all other
        keyword arguments are passed to the parent ``fit`` unchanged.

        :return: the value returned by the parent ``fit``.
        :raises ValueError: if ``y_dev`` is given without ``X_dev``.
        """
        fit_params = self._route_dev_params(fit_params)
        return super(CRFsuitePipeline, self).fit(X, y, **fit_params)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit the pipeline and transform ``X``.

        Accepts optional ``X_dev`` / ``y_dev`` keyword arguments; all other
        keyword arguments are passed to the parent ``fit_transform``
        unchanged.

        :return: the value returned by the parent ``fit_transform``.
        :raises ValueError: if ``y_dev`` is given without ``X_dev``.
        """
        fit_params = self._route_dev_params(fit_params)
        return super(CRFsuitePipeline, self).fit_transform(X, y, **fit_params)
def create_crfsuite_pipeline(token_features=None,
                             global_features=None,
                             min_df=1,
                             **crf_kwargs):
    """
    Create :class:`CRFsuitePipeline` for HTML tagging using CRFsuite.
    This pipeline expects data produced by
    :class:`~.HtmlTokenizer` as an input and produces
    sequences of IOB2 tags as output.

    :param token_features: token features passed to
        :class:`~.HtmlFeatureExtractor`; ``None`` is replaced
        with an empty list.
    :param global_features: global features passed to
        :class:`~.HtmlFeatureExtractor` unchanged.
    :param min_df: passed to :class:`~.HtmlFeatureExtractor`
        as its ``min_df`` argument.
    :param crf_kwargs: remaining keyword arguments; passed to
        the ``sklearn_crfsuite.CRF`` constructor.
    :return: a :class:`CRFsuitePipeline` built from the feature extractor
        and the CRF.

    Example::

        import webstruct
        from webstruct.features import EXAMPLE_TOKEN_FEATURES

        # load train data
        html_tokenizer = webstruct.HtmlTokenizer()
        train_trees = webstruct.load_trees(
            "train/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_train, y_train = html_tokenizer.tokenize(train_trees)

        # train
        model = webstruct.create_crfsuite_pipeline(
            token_features = EXAMPLE_TOKEN_FEATURES,
        )
        model.fit(X_train, y_train)

        # load test data
        test_trees = webstruct.load_trees(
            "test/*.html",
            webstruct.WebAnnotatorLoader()
        )
        X_test, y_test = html_tokenizer.tokenize(test_trees)

        # do a prediction
        y_pred = model.predict(X_test)

    """
    # Imported inside the function, so sklearn_crfsuite is only needed
    # when this factory is actually called.
    from sklearn_crfsuite import CRF

    if token_features is None:
        token_features = []

    fe = HtmlFeatureExtractor(token_features, global_features, min_df=min_df)
    crf = CRF(**crf_kwargs)
    return CRFsuitePipeline(fe, crf)