from __future__ import (absolute_import, division,
print_function, unicode_literals)
import numpy as np
from mousestyles import data
def random_powerlaw(n, a, seed=-1):
"""
Random generate points of truncated power law.
The method we generate is to inverse Cumulative Density Function
of truncated powerlaw function, and put random number draw from
Unif[0,1]. The theory behind it is $F^{-1}(U) \sim F$.
Parameters
----------
n : int
number of points
a : int>1
power law parameter alpha
Returns
-------
points : a vector of float number
n points have the target distribution.
Examples
--------
>>> random_powerlaw(4,2)
array([ 1.18097435, 1.04584078, 1.4650779 , 36.03967524])
"""
    if seed != -1:
        np.random.seed(seed)
    y = np.random.sample(n)
    # Inverse CDF of the power law truncated at 1:
    # F(x) = 1 - x**(-(a - 1)) for x >= 1, so
    # F^{-1}(u) = (1 - u)**(-1 / (a - 1)).
    return (1 - y)**(-1 / (a - 1))
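
# A minimal sanity check for the inverse-CDF sampler above; this helper is
# our addition (the name _check_random_powerlaw is hypothetical, not part
# of the original module). It compares the empirical CDF of a large sample
# with the theoretical truncated power law CDF F(x) = 1 - x**(-(a - 1)).
def _check_random_powerlaw(a=2.5, n=100000, seed=0):
    x = np.sort(random_powerlaw(n, a, seed=seed))
    empirical = np.arange(1, n + 1) / n
    theoretical = 1 - x**(-(a - 1))
    # The maximum discrepancy behaves like a KS statistic and should be
    # small, roughly O(1/sqrt(n)).
    return np.max(np.abs(empirical - theoretical))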
def random_exp(n, l, seed=-1):
"""
Random generate points of truncated exponential.
The method we generate is to use the memorylessness property
of exponential distribution. As the survival function of
exponential distribution is always the same, for truncated
exponential distribution, it is just the same to draw from
regular exponential distribtion and shift the truncated
value.
Parameters
----------
n : int
number of points
l : int
exponential parameter lambda
Returns
-------
points : a vector of float number
n points have the target distribution.
Examples
--------
>>> random_exp(4,2)
array([ 1.07592496, 1.19789646, 1.19759663, 1.03993227])
"""
    if seed != -1:
        np.random.seed(seed)
    # By memorylessness, Exp(l) shifted by the truncation point 1 has the
    # truncated distribution; numpy parameterizes np.random.exponential
    # by the scale 1 / lambda.
    y = np.random.exponential(1.0 / l, n)
    return y + 1
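
# A minimal sanity check for the shifted-exponential sampler; the helper
# name _check_random_exp is our addition, not part of the original module.
# For an exponential truncated at 1 the mean is 1 + 1/l, so the sample
# mean of a large draw should sit close to that value.
def _check_random_exp(l=2.0, n=100000, seed=0):
    x = random_exp(n, l, seed=seed)
    return abs(np.mean(x) - (1 + 1.0 / l))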
def hypo_powerLaw_null(strain, mouse, day, law_est=0, seed=-1):
"""
Return the outcome from GLRT with null hypothesis law distribution.
This function used the Generalized Likelihood Ratio Test to test the
goodness of fit: in other words, which distribution is more likely.
In this function, we choose the powerLaw distributin to be the null
and exponential distribution to be the alternative. We derived the
test statistics by theory and pluged in MLE as our estimation of
best parameters.
After we calculated the paramters, we need to find the rejection
region, critical value or pvalue. To get a more general test, we
want to use pvalue, instead of critical value under certain
significance level.
To find the p-value, we use simulation methods, and all random
numbers are drawn from previous functions. Therefore, although
p value should be a constant given data, it is not a constant in
our function, if we did not set the seed.
In general, in this function, if the p value is too small, then we
will reject the null, and we say powerlaw is not a better fit
compared to exponential distribution.
Parameters
----------
strain : int
the strain number of the mouse
mouse : int
the mouse number in its strain
day : int
the day number
law_est: double (optional)
the estimated parameter in law distribution
Returns
-------
p_value:
the probablity under null reject.
Examples
--------
>>> hypo_law_null (0, 0, 0)
0.0070000000000000001
"""
    if seed != -1:
        np.random.seed(seed)
    df = data.load_movement(strain, mouse, day)
    xcood = df["x"]
    ycood = df["y"]
    # Step lengths between consecutive recorded positions.
    distance_vector = np.sqrt(np.diff(xcood)**2 + np.diff(ycood)**2)
    # Keep only steps longer than the truncation threshold of 1.
    msk = distance_vector > 1
    cut_dist = distance_vector[msk]
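    # MLE of the power law exponent for data truncated below at
    # x_min = min(cut_dist) (the Hill estimator):
    # alpha_hat = 1 + n / sum(log(x_i / x_min)).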
    if law_est == 0:
        law_est = 1 + len(cut_dist) / \
            np.sum(np.log(cut_dist / np.min(cut_dist)))
    n = len(cut_dist)
    sum_cut = np.sum(cut_dist)
    log_cut = np.log(cut_dist)
    sum_log_cut = np.sum(log_cut)
    # GLRT statistic evaluated on the observed data; it must have the
    # same form as the simulated statistic in the loop below (and as in
    # hypo_exp_null), so it uses the sum of the distances and the sum of
    # their logs, not the sum of logs in both places.
    test_stat = n * (np.log(sum_cut - n) - np.log(sum_log_cut)) - \
        law_est * sum_log_cut
    sample_stat = []
    for i in range(1000):
        # Draw from the fitted power law and recompute the same
        # statistic to simulate its null distribution.
        sample = random_powerlaw(len(cut_dist), law_est)
        sum_sam = np.sum(sample)
        log_sam = np.log(sample)
        sum_log_sam = np.sum(log_sam)
        tmp = n * (np.log(sum_sam - n) - np.log(sum_log_sam)) - \
            law_est * sum_log_sam
        sample_stat.append(tmp)
# critical_value = ss.mstats.mquantiles(sample_stat, prob = 0.05)[0]
    p_value = np.sum(np.array(sample_stat) > test_stat) / len(sample_stat)
    return p_value
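
# Note: the p-value above is Monte Carlo based, so repeated calls differ
# unless the seed is fixed. A usage sketch (the indices 0, 0, 0 are only
# illustrative and assume the movement data is available):
#
#     p1 = hypo_powerLaw_null(0, 0, 0, seed=42)
#     p2 = hypo_powerLaw_null(0, 0, 0, seed=42)
#     assert p1 == p2  # reproducible once the seed is set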
def hypo_exp_null(strain, mouse, day, law_est=0, exp_est=0, seed=-1):
"""
Return the outcome from GLRT with null hypothesis law distribution.
This function also used the Generalized Likelihood Ratio Test to test
goodness of fit: in other words, which distribution is more likely.
In this function, we choose the exponential distributin to be the null
and powerlaw distribution to be the alternative. We derived the
test statistics by theory and pluged in MLE as our estimation of
best parameters.
After we calculated the paramters, we need to find the rejection
region, critical value or pvalue. To get a more general test, we
want to use pvalue, instead of critical value under certain
significance level.
To find the p-value, we use simulation methods, and all random
numbers are drawn from previous functions. Therefore, although
p value should be a constant given data, it is not a constant in
our function, if we did not set the seed.
In general, in this function, if the p value is too small, then we
will reject the null, and we say exponential is not a better fit
compared to exponential distribution.
Parameters
----------
strain : int
the strain number of the mouse
mouse : int
the mouse number in its strain
day : int
the day number
law_est: double (optional)
the estimated parameter in law distribution
exp_est: double (optional)
the estimated parameter in exponential distribution
Returns
-------
p_value:
the probablity under null reject.
Examples
--------
>>> hypo_exp_null (0, 0, 0)
1.0
"""
    if seed != -1:
        np.random.seed(seed)
    df = data.load_movement(strain, mouse, day)
    xcood = df["x"]
    ycood = df["y"]
    # Step lengths between consecutive recorded positions.
    distance_vector = np.sqrt(np.diff(xcood)**2 + np.diff(ycood)**2)
    # Keep only steps longer than the truncation threshold of 1.
    msk = distance_vector > 1
    cut_dist = distance_vector[msk]
    if law_est == 0:
        # Power law MLE (Hill estimator), as in hypo_powerLaw_null above.
        law_est = 1 + len(cut_dist) / \
            np.sum(np.log(cut_dist / np.min(cut_dist)))
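    # MLE of the rate of an exponential truncated at 1:
    # lambda_hat = n / (sum(x_i) - n), since x_i - 1 is Exp(lambda).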
if exp_est == 0:
exp_est = len(cut_dist) / (np.sum(cut_dist) - len(cut_dist))
    n = len(cut_dist)
    sum_cut = np.sum(cut_dist)
    log_cut = np.log(cut_dist)
    sum_log_cut = np.sum(log_cut)
    # GLRT statistic evaluated on the observed data, matching the
    # simulated statistic in the loop below.
    test_stat = n * (np.log(sum_cut - n) - np.log(sum_log_cut)) - \
        law_est * sum_log_cut
    sample_stat = []
    for i in range(1000):
        # Draw from the fitted truncated exponential and recompute the
        # statistic to simulate its null distribution.
        sample = random_exp(len(cut_dist), exp_est)
        sum_sam = np.sum(sample)
        log_sam = np.log(sample)
        sum_log_sam = np.sum(log_sam)
        tmp = n * (np.log(sum_sam - n) - np.log(sum_log_sam)) - \
            law_est * sum_log_sam
        sample_stat.append(tmp)
# critical_value = ss.mstats.mquantiles(sample_stat, prob = 0.95)[0]
    p_value = np.sum(np.array(sample_stat) <= test_stat) / len(sample_stat)
    return p_value
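
# A hedged end-to-end sketch, our addition rather than part of the original
# module (_compare_fits is a hypothetical name): run both GLRTs on the same
# mouse-day with a fixed seed. A small hypo_powerLaw_null p-value argues
# against the power law; a small hypo_exp_null p-value argues against the
# exponential.
def _compare_fits(strain=0, mouse=0, day=0, seed=42):
    p_law = hypo_powerLaw_null(strain, mouse, day, seed=seed)
    p_exp = hypo_exp_null(strain, mouse, day, seed=seed)
    return {"power_law_null": p_law, "exponential_null": p_exp}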