Source code for webstruct.metrics

# -*- coding: utf-8 -*-
"""
:mod:`webstruct.metrics` contains metric functions that can be used for
model development: on their own or as scoring functions for
scikit-learn's `cross-validation`_ and `model selection`_.

.. _cross-validation:
.. _model selection:
"""

from __future__ import absolute_import
from itertools import chain
from functools import partial

import numpy as np
from sklearn.metrics import classification_report
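
# Usage sketch (an assumption about intended usage, not part of the module
# itself): the docstring above suggests these metrics as scoring functions
# for scikit-learn. ``avg_bio_f1_score`` (defined below) already has the
# ``(y_true, y_pred)`` signature that ``sklearn.metrics.make_scorer`` expects:
#
#     >>> from sklearn.metrics import make_scorer
#     >>> scorer = make_scorer(avg_bio_f1_score)
#     >>> # scores = cross_val_score(model, X, y, scoring=scorer)
#
# ``model``, ``X`` and ``y`` are hypothetical placeholders for a
# sequence-labelling estimator and its data.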

# stolen from seqlearn
def bio_f_score(y_true, y_pred):
    """F-score for the BIO tagging scheme, as used by CoNLL.

    This F-score variant is used for evaluating named-entity recognition and
    related problems, where the goal is to predict segments of interest within
    sequences and mark these as a "B" (begin) tag followed by zero or more "I"
    (inside) tags. A true positive is then defined as a BI* segment in both
    y_true and y_pred, with false positives and false negatives defined
    similarly.

    Support for tag schemes with classes (e.g. "B-NP") is limited: reported
    scores may be too high for inconsistent labelings.

    Parameters
    ----------
    y_true : array-like of strings, shape (n_samples,)
        Ground truth labeling.

    y_pred : array-like of strings, shape (n_samples,)
        Sequence classifier's predictions.

    Returns
    -------
    f : float
        F-score.
    """
    if len(y_true) != len(y_pred):
        msg = "Sequences not of the same length ({} != {})."
        raise ValueError(msg.format(len(y_true), len(y_pred)))

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    is_b = partial(np.char.startswith, prefix="B")
    where = np.where

    t_starts = where(is_b(y_true))[0]
    p_starts = where(is_b(y_pred))[0]

    # These lengths are off-by-one because we skip the "B", but that's ok.
    t_lengths = np.diff(where(is_b(np.r_[y_true[y_true != 'O'], ['B']]))[0])
    p_lengths = np.diff(where(is_b(np.r_[y_pred[y_pred != 'O'], ['B']]))[0])

    t_segments = set(zip(t_starts, t_lengths, y_true[t_starts]))
    p_segments = set(zip(p_starts, p_lengths, y_pred[p_starts]))

    # tp = len(t_segments & p_segments)
    # fn = len(t_segments - p_segments)
    # fp = len(p_segments - t_segments)
    tp = sum(x in t_segments for x in p_segments)
    fn = sum(x not in p_segments for x in t_segments)
    fp = sum(x not in t_segments for x in p_segments)

    if tp == 0:
        # special-cased like this in CoNLL evaluation
        return 0.

    precision = tp / float(tp + fp)
    recall = tp / float(tp + fn)
    return 2. * precision * recall / (precision + recall)
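
# A worked example with illustrative tags (not from the original source):
# ``y_true`` below contains two entities; ``y_pred`` recovers one of them
# exactly, so precision = 1/1, recall = 1/2 and the segment-level F-score
# is 2/3:
#
#     >>> bio_f_score(['B-PER', 'I-PER', 'O', 'B-ORG'],
#     ...             ['B-PER', 'I-PER', 'O', 'O'])
#     0.6666666666666666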

def avg_bio_f1_score(y_true, y_pred):
    """
    Macro-averaged F1 score of lists of BIO-encoded sequences
    ``y_true`` and ``y_pred``.

    A named entity in a sequence from ``y_pred`` is considered
    correct only if it is an exact match of the corresponding entity
    in ``y_true``.
    """
    return sum(map(bio_f_score, y_true, y_pred)) / len(y_true)
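
# Example (illustrative data): unlike ``bio_f_score``, this function takes
# *lists of sequences*, one tag sequence per document. The first document
# below is labelled perfectly (F = 1.0) and the second misses its only
# entity (F = 0.0), so the macro average is 0.5:
#
#     >>> avg_bio_f1_score([['B-PER', 'I-PER', 'O'], ['B-ORG', 'O']],
#     ...                  [['B-PER', 'I-PER', 'O'], ['O', 'O']])
#     0.5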

def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    """
    y_true_combined = list(chain.from_iterable(y_true))
    y_pred_combined = list(chain.from_iterable(y_pred))
    tagset = (set(y_true_combined) | set(y_pred_combined)) - {'O'}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1]),
    )
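
# Example (illustrative data): the report is token-level, so a partially
# matched entity still earns credit for its correct tokens, and "O" tokens
# are excluded from the label set. The exact output layout depends on the
# installed scikit-learn version:
#
#     >>> print(bio_classification_report(
#     ...     [['B-PER', 'I-PER', 'O', 'B-ORG']],
#     ...     [['B-PER', 'O', 'O', 'B-ORG']]))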