Source code for webstruct.metrics

# -*- coding: utf-8 -*-
"""
:mod:`webstruct.metrics` contains metric functions that can be used for
model development: on their own or as scoring functions for
scikit-learn's `cross-validation`_ and `model selection`_.

.. _cross-validation: http://scikit-learn.org/stable/modules/cross_validation.html
.. _model selection: http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html

"""
from __future__ import absolute_import
from itertools import chain
from functools import partial
import numpy as np
from sklearn.metrics import classification_report

# steal from seqlearn
def bio_f_score(y_true, y_pred):
    """F-score for BIO-tagging scheme, as used by CoNLL.

    This F-score variant is used for evaluating named-entity recognition and
    related problems, where the goal is to predict segments of interest within
    sequences and mark these as a "B" (begin) tag followed by zero or more "I"
    (inside) tags. A true positive is then defined as a BI* segment in both
    y_true and y_pred, with false positives and false negatives defined
    similarly.

    Support for tag schemes with classes (e.g. "B-NP") is limited: reported
    scores may be too high for inconsistent labelings.

    Parameters
    ----------
    y_true : array-like of strings, shape (n_samples,)
        Ground truth labeling.

    y_pred : array-like of strings, shape (n_samples,)
        Sequence classifier's predictions.

    Returns
    -------
    f : float
        F-score.
    """
    if len(y_true) != len(y_pred):
        msg = "Sequences not of the same length ({} != {})."
        raise ValueError(msg.format(len(y_true), len(y_pred)))

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    is_b = partial(np.char.startswith, prefix="B")
    where = np.where

    t_starts = where(is_b(y_true))[0]
    p_starts = where(is_b(y_pred))[0]

    # These lengths are off-by-one because we skip the "B", but that's ok.
    # http://stackoverflow.com/q/17929499/166749
    t_lengths = np.diff(where(is_b(np.r_[y_true[y_true != 'O'], ['B']]))[0])
    p_lengths = np.diff(where(is_b(np.r_[y_pred[y_pred != 'O'], ['B']]))[0])

    t_segments = set(zip(t_starts, t_lengths, y_true[t_starts]))
    p_segments = set(zip(p_starts, p_lengths, y_pred[p_starts]))

    # tp = len(t_segments & p_segments)
    # fn = len(t_segments - p_segments)
    # fp = len(p_segments - t_segments)

    tp = sum(x in t_segments for x in p_segments)
    fn = sum(x not in p_segments for x in t_segments)
    fp = sum(x not in t_segments for x in p_segments)

    if tp == 0:
        # special-cased like this in CoNLL evaluation
        return 0.

    precision = tp / float(tp + fp)
    recall = tp / float(tp + fn)
    return 2. * precision * recall / (precision + recall)
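
An illustrative call (not part of the original source); the tag values below are made up for the example::

    from webstruct.metrics import bio_f_score

    y_true = ['B-PER', 'I-PER', 'O', 'B-ORG']
    y_pred = ['B-PER', 'I-PER', 'O', 'O']

    # The PER segment is matched exactly and the ORG segment is missed:
    # precision = 1.0, recall = 0.5, so the BIO F-score is 2/3.
    print(bio_f_score(y_true, y_pred))  # 0.666...
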
def avg_bio_f1_score(y_true, y_pred):
    """
    Macro-averaged F1 score of lists of BIO-encoded sequences
    ``y_true`` and ``y_pred``.

    A named entity in a sequence from ``y_pred`` is considered correct only if
    it is an exact match of the corresponding entity in ``y_true``.

    The ``bio_f_score`` implementation above is borrowed from
    https://github.com/larsmans/seqlearn.
    """
    return sum(map(bio_f_score, y_true, y_pred)) / len(y_true)
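
A small usage sketch (again not part of the module); each inner list is one BIO-tagged sequence::

    from webstruct.metrics import avg_bio_f1_score

    y_true = [['B-PER', 'I-PER', 'O'], ['B-ORG', 'O']]
    y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O']]

    # The first sequence is tagged perfectly (F1 = 1.0) while the ORG entity
    # in the second sequence is missed (F1 = 0.0); the macro-average is 0.5.
    print(avg_bio_f1_score(y_true, y_pred))  # 0.5
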
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    """
    y_true_combined = list(chain.from_iterable(y_true))
    y_pred_combined = list(chain.from_iterable(y_pred))
    tagset = (set(y_true_combined) | set(y_pred_combined)) - {'O'}
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    )
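
To illustrate the report format (hypothetical data, not part of the module), the second predicted sequence below contains one spurious ``B-ORG`` token::

    from webstruct.metrics import bio_classification_report

    y_true = [['B-PER', 'I-PER', 'O'], ['B-ORG', 'O']]
    y_pred = [['B-PER', 'I-PER', 'O'], ['B-ORG', 'B-ORG']]

    # Prints token-level precision/recall/F1 for each non-"O" tag; labels are
    # sorted so that B- and I- tags of the same entity type appear together.
    print(bio_classification_report(y_true, y_pred))
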