Source code for mousestyles.distributions.kolmogorov_test
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import numpy as np
from scipy import stats, optimize
from mousestyles.data import load_movement


def get_travel_distances(strain=0, mouse=0, day=0):
""" Get distances travelled in 20ms for this strain, this mouse,
on this day.
Parameters
----------
strain: int {0, 1, 2}
The strain of mouse to test
mouse: int {0, 1, 2, 3}
The mouse twin id with in the strain
day: int {0, 1, ..., 11}
The day to calculate the distance
Returns
-------
x: np.ndarray shape (n, 1)
The distances travelled in 20ms for this mouse on this day, truncated
at 1cm (i.e. only record mouse movement when it moves more than 1cm)
Examples:
>>> get_travel_distances(0, 0, 0)[:3]
array([ 1.00648944, 1.02094319, 1.0178885 ])
"""
    df = load_movement(strain=strain, mouse=mouse, day=day)
    # Euclidean length of each 20ms step; diff() puts a NaN in the first
    # position, hence the [1:] slice
    x = np.array(np.sqrt(df.x.diff()**2 + df.y.diff()**2))[1:]
    # Keep only movements of at least 1cm
    x = x[x >= 1]
    return x
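

# An added, minimal sketch of the displacement computation in
# get_travel_distances, run on synthetic coordinates instead of the real
# load_movement() output. It assumes only what the function itself does:
# that the data is a pandas DataFrame with `x` and `y` columns. The name
# _sketch_travel_distances is hypothetical, not part of the package.
def _sketch_travel_distances():
    import pandas as pd
    df = pd.DataFrame({"x": [0.0, 0.3, 1.5, 1.5],
                       "y": [0.0, 0.4, 2.0, 1.4]})
    # Step lengths are [0.5, 2.0, 0.6] after dropping the leading NaN
    steps = np.array(np.sqrt(df.x.diff() ** 2 + df.y.diff() ** 2))[1:]
    # Only the 2.0cm step survives the 1cm cutoff
    return steps[steps >= 1]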


def perform_kstest(x, distribution=stats.pareto,
                   verbose=True):
"""This function fits a distribution to data, and then test the
fit of the distribution using Kolmogorov-Smirnov test.
The Kolmogorov-Smirnov test constructs the test statistic, which is defined
as $\sup |F_n(x) - F(x)|$, for $F_n$ is the sample CDF, and F is the
theoretical CDF. This statistics can be considered as a measure of
distance between the sample distribution and the theoretical distribution.
The smaller it is, the more similar the two distributions.
We first estimate the parameter using MLE, then by minimizing the KS test
statistic.
The Pareto distribution is sometimes known as the Power Law distribution,
with the PDF: $b / x**(b + 1)$ for $x >= 1, b > 0$.
The truncated exponential distribution is the same as the rescaled
exponential distribution.

    Parameters
    ----------
    x: np.ndarray (n,)
        The sample data whose distribution is tested
    distribution: a scipy.stats continuous distribution
        {stats.pareto, stats.expon, stats.gamma}
        The distribution to test against. Currently pareto, expon, and
        gamma are supported, but any one-sided continuous distribution
        in scipy.stats should work.
    verbose: boolean
        If True, print the test results

    Returns
    -------
    params: np.ndarray shape (p,)
        The optimal parameters for the distribution, optimal in the
        sense of minimizing the K-S statistic.

    The function also prints the Kolmogorov-Smirnov test result for
    three cases:

    1. The empirical distribution against the distribution with
       parameters estimated by MLE
    2. The empirical distribution against the distribution with
       parameters estimated by explicitly minimizing the K-S statistic
    3. A resample, with replacement, of the empirical distribution
       against the distribution fitted in 2.

    A p-value > 0.05 means we fail to reject the null hypothesis that
    the empirical distribution follows the specified distribution.

    Notes
    -----
    The MLE often does not fit the data very well. We instead minimize
    the K-S distance and obtain a better fit, as seen in the similarity
    of the PDF and CDF between the sample data and the fit.

    References
    ----------
    1. Kolmogorov-Smirnov test:
       https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test
    2. Pareto distribution (also known as the power law distribution):
       https://en.wikipedia.org/wiki/Pareto_distribution

    Examples
    --------
>>> x = get_travel_distances(0, 0, 0)
>>> res = perform_kstest(x, verbose=False)
>>> np.allclose(res, np.array([3.67593246, 0.62795748, 0.37224205]))
True
"""
dist_name = distribution.name
# Fit the parameters by MLE
mle_params = distribution.fit(x)
    # Fit the parameters by explicitly minimizing the K-S statistic, and
    # perform the K-S test. First define a helper function to minimize.
def calculate_ks_statistics(params):
return stats.kstest(x, dist_name, args=params)[0]
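
    # An added, illustrative sketch (defined but never called): the
    # statistic returned by stats.kstest above is the largest gap between
    # the empirical CDF and the fitted CDF. Computed by hand it looks like
    # this, checking both sides of each jump of the empirical CDF.
    def _ks_statistic_by_hand(params):
        xs = np.sort(x)
        n = len(xs)
        cdf = distribution.cdf(xs, *params)
        return max(np.max(np.arange(1, n + 1) / n - cdf),
                   np.max(cdf - np.arange(n) / n))
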
    # Choose good initial parameters to help the optimizer find the optimum
if dist_name == "pareto":
init_params = [4.5, .5, .5]
else:
init_params = mle_params
    # Nelder-Mead simplex search: derivative-free, which suits the
    # non-smooth K-S objective
    opt_params = optimize.fmin(calculate_ks_statistics, x0=init_params, disp=0)
if verbose:
print("1. Testing {} distribution with MLE parameters".format(
dist_name))
print(stats.kstest(x, dist_name, args=mle_params))
print("2. Testing {} distribution with optimal parameters".
format(dist_name))
print(stats.kstest(x, dist_name, args=opt_params))
        # Bootstrap: resample x with replacement, and test again
x_bag = np.random.choice(x, size=len(x), replace=True)
print("3. Similar to 2., but on a resample of x")
print(stats.kstest(x_bag, dist_name, args=opt_params))
return opt_params
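

# An added, hedged demo: quick numerical checks of two docstring claims
# above (scipy's stats.pareto has PDF b / x**(b + 1) for x >= 1, and the
# truncated exponential equals the exponential rescaled to the truncation
# interval), followed by an end-to-end run of the two functions. Guarded
# so the module stays safe to import; the strain/mouse/day values simply
# mirror the docstring examples.
if __name__ == "__main__":
    b = 3.0
    xs = np.linspace(1.0, 5.0, 9)
    assert np.allclose(stats.pareto.pdf(xs, b), b / xs ** (b + 1))
    ts = np.linspace(0.0, b - 0.1, 9)
    assert np.allclose(stats.truncexpon.pdf(ts, b),
                       stats.expon.pdf(ts) / stats.expon.cdf(b))
    distances = get_travel_distances(strain=0, mouse=0, day=0)
    perform_kstest(distances, distribution=stats.pareto)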