Source code for colourlab.statistics

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
statistics: Colour metric statistics, part of the colourlab package

Copyright (C) 2013-2016 Ivar Farup

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import numpy as np
import scipy.integrate
import scipy.optimize


# =============================================================================
# Statistics of metrics
# =============================================================================


[docs]def stress(diff1, diff2, weights=None, confidence=.95):
    """
    Compute the STRESS for the two sets of differences.

    The STRESS (standardised residual sum of squares) is returned as a
    percentage. If weights are given, WSTRESS is calculated.

    Parameters
    ----------
    diff1 : ndarray
        1D array of colour differences.
    diff2 : ndarray
        1D array of colour differences.
    weights : ndarray
        1D array of individual weights for the colour differences. If None,
        the standard STRESS is calculated, if given, WSTRESS is calculated.
    confidence : float
        The size of the confidence interval (e.g., .95 for a 95%
        confidence interval)

    Returns
    -------
    stress : float
        Standard residual sum of squares.
    interval : tuple
        The confidence interval for STRESS_a / STRESS_b
    """
    from scipy.stats import f
    if weights is None:
        weights = np.ones(np.shape(diff1))
    F = (diff1**2).sum() / (diff1 * diff2).sum()
    stress = np.sqrt((weights * (diff1 - F * diff2)**2).sum() /
                     (weights * (F * diff2)**2).sum())
    N = np.shape(weights)[0]
    return stress, f.interval(confidence, N - 1, N - 1)


def _ellipse_union(th, ell1, ell2):
    """
    For the Pant R values. Integrand the union of two ellipses.
    """
    r1 = ell1[0] * ell1[1] / (np.sqrt(ell1[1]**2 * np.cos(th-ell1[2])**2 +
                              ell1[0]**2 * np.sin(th-ell1[2])**2))
    r2 = ell2[0] * ell2[1] / (np.sqrt(ell2[1]**2 * np.cos(th-ell2[2])**2 +
                              ell2[0]**2 * np.sin(th-ell2[2])**2))
    u = max(r1, r2)
    return .5 * u**2


def _ellipse_intersection(th, ell1, ell2):
    """
    For the Pant R values. Integrand for the intersection of two ellipses.
    """
    r1 = ell1[0] * ell1[1] / (np.sqrt(ell1[1]**2 * np.cos(th-ell1[2])**2 +
                              ell1[0]**2 * np.sin(th-ell1[2])**2))
    r2 = ell2[0] * ell2[1] / (np.sqrt(ell2[1]**2 * np.cos(th-ell2[2])**2 +
                              ell2[0]**2 * np.sin(th-ell2[2])**2))
    u = min(r1, r2)
    return .5 * u**2


def _pant_R_value(ell1, ell2):
    """
    Compute single R value for the two given ellipses.
    """
    area_intersection = scipy.integrate.quad(_ellipse_intersection,
                                             0, 2 * np.pi, (ell1, ell2))
    area_union = scipy.integrate.quad(_ellipse_union,
                                      0, 2 * np.pi, (ell1, ell2))
    return area_intersection[0] / area_union[0]


def _pant_R_values(ells1, ells2, scale=1):
    """
    Compute set of R values for the two given sets of ellipses.
    """
    ells1 = ells1.copy()
    ells1[:, 0:2] = ells1[:, 0:2] * scale
    N = np.shape(ells1)[0]
    r_values = np.zeros(N)
    for i in range(N):
        r_values[i] = _pant_R_value(ells1[i], ells2[i])
    return r_values


def _cost_function_pant(scale, ells1, ells2):
    """
    Cost function for the optimisation of the scale for the R values.
    """
    r_values = _pant_R_values(ells1, ells2, scale)
    return 1 - r_values.mean()


[docs]def pant_R_values(space, tdata1, tdata2, optimise=True, plane=None):
    """
    Compute the list of R values for the given metric tensors in tdataN.

    The R values are computed in the given colour space. If optimise=True,
    the maximum overall R values are found by scaling one of the data sets.
    The ellipses are computed in the given plane. If plane=None, all three
    principal planes are used, and the resulting array of R values will
    be three times the length ot tdataN.

    Parameters
    ----------
    space : Space
        The colour space in which to compute the R values.
    tdata1 : Tensors
        The first set of colour metric tensors.
    tdata2 : Tensors
        The second set of colour metric tensors.
    optimise : bool
        Whether or not to optimise the scaling of the ellipse set.
    plane : slice
        The principal plan for the ellipsoid cross sections.

    Returns
    -------
    r_values : ndarray
        Pant R values
    """
    if plane is None:
        ell1a = tdata1.get_ellipse_parameters(space, tdata1.plane_01)
        ell1b = tdata1.get_ellipse_parameters(space, tdata1.plane_12)
        ell1c = tdata1.get_ellipse_parameters(space, tdata1.plane_20)
        ell1 = np.concatenate((ell1a, ell1b, ell1c))
        ell2a = tdata2.get_ellipse_parameters(space, tdata1.plane_01)
        ell2b = tdata2.get_ellipse_parameters(space, tdata1.plane_12)
        ell2c = tdata2.get_ellipse_parameters(space, tdata1.plane_20)
        ell2 = np.concatenate((ell2a, ell2b, ell2c))
    else:
        ell1 = tdata1.get_ellipse_parameters(space, plane)
        ell2 = tdata2.get_ellipse_parameters(space, plane)
    if optimise:
        res = scipy.optimize.fmin(_cost_function_pant, 1, (ell1, ell2))
        return _pant_R_values(ell1, ell2, res[0]), res[0]
    else:
        return _pant_R_values(ell1, ell2), 0


[docs]def dataset_distance(data1, data2):
    """
    Euclidean distances between data in the two data sets.

    Parameters
    ----------
    data1 : ndarray
        The first dataset, Nx3.
    data2 : ndarray
        The second data set, Nx3.

    Returns
    -------
    diff : ndarray
        Array of Euclidean distances, N.
    """
    return np.sqrt(((data1 - data2)**2).sum(axis=1))


def _scale_rot_dataset(params, dataset):
    """
    Scale and rotate dataset for optimisation.

    Parameters
    ----------
    params : ndarray
        Optimising parameters, scale and angle.
    dataset : ndarray
        The data set to optimise.

    Returns
    -------
    new_set : ndarray
        The scaled and rotated dataset.
    """
    scale_mat = params[1] * np.eye(3)
    scale_mat[0, 0] = params[0]
    th = params[2]
    rot_mat = np.array([[1, 0, 0],
                        [0, np.cos(th), -np.sin(th)],
                        [0, np.sin(th),  np.cos(th)]])
    sys_mat = np.dot(scale_mat, rot_mat)
    return np.dot(sys_mat, dataset.T).T


def _cost_function_dataset(params, dataset, ground_truth):
    """
    Cost function for the optimisation of the scale for the R values.

    Parameters
    ----------
    params : ndarray
        Optimising parameters, L-scale, C-scale and angle.
    dataset : ndarray
        The data set to optimise.
    ground_truth : ndarray
        The ground truth dataset.

    Returns
    -------
    cost : float
        The cost for the current values of scale and angle.
    """
    return dataset_distance(
        _scale_rot_dataset(params, dataset), ground_truth).sum()


[docs]def minimal_dataset_distance(dataset, ground_truth):
    """
    Return the minimal dataset distance between a dataset and a ground truth.

    The dataset is assumed to be on the Lab form (as an Nx3 ndarray) and is
    changed by scaling and rotation about the L axis.

    Parameters
    ----------
    dataset : ndarray
        Nx3 ndarray with the colour data.
    ground_truth : ndarray
        Nx3 ndarray with the ground truth colour data.

    Returns
    -------
    diff : ndarray
        Array of minimal Euclidean distances.
    opt_data : ndarray
        The optimised data set by scaling and rotation.
    L-scale : float
        The optimal scale.
    C-scale : float
        The optimal scale.
    angle : float
        The optimal angle.
    """
    params = scipy.optimize.fmin(_cost_function_dataset, np.array([1, 1, 0]),
                                 (dataset, ground_truth))
    opt_data = _scale_rot_dataset(params, dataset)
    return dataset_distance(opt_data, ground_truth), \
        opt_data, params[0], params[1], params[2]