Source code for mousestyles.GLRT_distribution

from __future__ import (absolute_import, division,
                        print_function, unicode_literals)


import numpy as np
from mousestyles import data


def random_powerlaw(n, a, seed=-1):
    """
    Generate random points from a truncated power law.

    The method is to invert the cumulative distribution function of the
    truncated power law and apply it to draws from Unif[0, 1]. The
    theory behind it is that $F^{-1}(U) \sim F$ when $U \sim$ Unif[0, 1].

    Parameters
    ----------
    n : int
        number of points
    a : float, > 1
        power law parameter alpha
    seed : int (optional)
        random seed; the default of -1 leaves the random state untouched

    Returns
    -------
    points : a vector of floats
        n points with the target distribution.

    Examples
    --------
    >>> random_powerlaw(4, 2)
    array([  1.18097435,   1.04584078,   1.4650779 ,  36.03967524])
    """
    if seed != -1:
        np.random.seed(seed)
    y = np.random.sample(n)
    # Inverse CDF of the power law truncated at 1:
    # F(x) = 1 - x^{-(a - 1)}, so F^{-1}(u) = (1 - u)^{-1 / (a - 1)}.
    return (1 - y)**(-1 / (a - 1))
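
# Illustration (not part of the original module): a quick sanity check that
# random_powerlaw has the claimed distribution. The helper name
# `_check_powerlaw_mle` and its default arguments are hypothetical. For a
# power law truncated at 1, the MLE of the exponent is
# alpha_hat = 1 + n / sum(log x_i), which should land near the alpha used
# to generate the sample.
def _check_powerlaw_mle(a=2.5, n=100000, seed=1):
    x = random_powerlaw(n, a, seed=seed)
    return 1 + n / np.sum(np.log(x))  # expect a value close to `a`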
def random_exp(n, l, seed=-1):
    """
    Generate random points from a truncated exponential.

    The method uses the memorylessness property of the exponential
    distribution: its survival function is unchanged by conditioning on
    exceeding the truncation point, so drawing from the truncated
    exponential is the same as drawing from the regular exponential
    distribution and shifting by the truncation value.

    Parameters
    ----------
    n : int
        number of points
    l : float
        exponential parameter lambda
    seed : int (optional)
        random seed; the default of -1 leaves the random state untouched

    Returns
    -------
    points : a vector of floats
        n points with the target distribution.

    Examples
    --------
    >>> random_exp(4, 2)
    array([ 1.07592496,  1.19789646,  1.19759663,  1.03993227])
    """
    if seed != -1:
        np.random.seed(seed)
    # numpy parameterizes the exponential by its scale 1 / lambda;
    # adding 1 shifts the draws onto [1, inf), the truncated support.
    y = np.random.exponential(1.0 / l, n)
    return y + 1
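
# Illustration (hypothetical helper, not module API): the truncated
# exponential returned by random_exp lives on [1, inf) and, by the
# memorylessness argument in the docstring, its mean is 1 + 1/lambda.
def _check_exp_mean(l=2.0, n=100000, seed=1):
    y = random_exp(n, l, seed=seed)
    return y.min(), y.mean()  # expect roughly (1.0, 1 + 1/l)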
def hypo_powerLaw_null(strain, mouse, day, law_est=0, seed=-1):
    """
    Return the outcome of the GLRT with the power law as the null.

    This function uses the Generalized Likelihood Ratio Test to test
    goodness of fit: in other words, which distribution is more likely.
    Here the power law distribution is the null and the exponential
    distribution is the alternative. We derived the test statistic by
    theory and plugged in the MLE as the estimate of the best parameter.
    After estimating the parameter, we need a rejection region, either a
    critical value or a p-value. To get a more general test, we report a
    p-value rather than a critical value at a fixed significance level.
    The p-value is found by simulation, with all random numbers drawn
    from the functions above; therefore, although the p-value should be
    a constant given the data, it varies across runs unless the seed is
    set. If the p-value is small, we reject the null and conclude that
    the power law is not a better fit than the exponential distribution.

    Parameters
    ----------
    strain : int
        the strain number of the mouse
    mouse : int
        the mouse number in its strain
    day : int
        the day number
    law_est : double (optional)
        the estimated power law parameter; 0 (default) means estimate
        it from the data by MLE
    seed : int (optional)
        random seed; the default of -1 leaves the random state untouched

    Returns
    -------
    p_value : the p-value under the null, estimated by simulation.

    Examples
    --------
    >>> hypo_powerLaw_null(0, 0, 0)
    0.0070000000000000001
    """
    if seed != -1:
        np.random.seed(seed)
    df = data.load_movement(strain, mouse, day)
    xcood = df["x"]
    ycood = df["y"]
    distance_vector = np.sqrt(np.diff(xcood)**2 + np.diff(ycood)**2)
    # Keep only steps longer than the truncation point 1.
    msk = distance_vector > 1
    cut_dist = distance_vector[msk]
    if law_est == 0:
        # MLE of the power law exponent alpha.
        law_est = 1 + len(cut_dist) / \
            np.sum(np.log(cut_dist / np.min(cut_dist)))
    n = len(cut_dist)
    sum_dist = np.sum(cut_dist)
    sum_log = np.sum(np.log(cut_dist))
    # Same form as the simulated statistic below: the raw sum enters the
    # first log term and the sum of logs enters the remaining terms.
    test_stat = n * (np.log(sum_dist - n) - np.log(sum_log)) - \
        law_est * sum_log
    sample_stat = []
    for i in range(1000):
        sample = random_powerlaw(len(cut_dist), law_est)
        sum_sam = np.sum(sample)
        sum_log_sam = np.sum(np.log(sample))
        tmp = n * (np.log(sum_sam - n) - np.log(sum_log_sam)) - \
            law_est * sum_log_sam
        sample_stat.append(tmp)
    # critical_value = ss.mstats.mquantiles(sample_stat, prob=0.05)[0]
    sample_stat = np.array(sample_stat)
    p_value = np.sum(sample_stat > test_stat) / len(sample_stat)
    return p_value
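
# Sketch (illustrative, not module API) of the Monte Carlo p-value pattern
# used in hypo_powerLaw_null, written generically: compare the observed
# statistic against statistics recomputed on data drawn under the null and
# report the empirical tail frequency. `stat` and `sampler` are hypothetical
# callables supplied by the caller.
def _simulated_pvalue(test_stat, stat, sampler, n_sim=1000):
    sims = np.array([stat(sampler()) for _ in range(n_sim)])
    return np.mean(sims > test_stat)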
def hypo_exp_null(strain, mouse, day, law_est=0, exp_est=0, seed=-1):
    """
    Return the outcome of the GLRT with the exponential as the null.

    This function also uses the Generalized Likelihood Ratio Test to
    test goodness of fit: in other words, which distribution is more
    likely. Here the exponential distribution is the null and the power
    law distribution is the alternative. We derived the test statistic
    by theory and plugged in the MLEs as the estimates of the best
    parameters. After estimating the parameters, we need a rejection
    region, either a critical value or a p-value. To get a more general
    test, we report a p-value rather than a critical value at a fixed
    significance level. The p-value is found by simulation, with all
    random numbers drawn from the functions above; therefore, although
    the p-value should be a constant given the data, it varies across
    runs unless the seed is set. If the p-value is small, we reject the
    null and conclude that the exponential is not a better fit than the
    power law distribution.

    Parameters
    ----------
    strain : int
        the strain number of the mouse
    mouse : int
        the mouse number in its strain
    day : int
        the day number
    law_est : double (optional)
        the estimated power law parameter; 0 (default) means estimate
        it from the data by MLE
    exp_est : double (optional)
        the estimated exponential parameter; 0 (default) means estimate
        it from the data by MLE
    seed : int (optional)
        random seed; the default of -1 leaves the random state untouched

    Returns
    -------
    p_value : the p-value under the null, estimated by simulation.

    Examples
    --------
    >>> hypo_exp_null(0, 0, 0)
    1.0
    """
    if seed != -1:
        np.random.seed(seed)
    df = data.load_movement(strain, mouse, day)
    xcood = df["x"]
    ycood = df["y"]
    distance_vector = np.sqrt(np.diff(xcood)**2 + np.diff(ycood)**2)
    # Keep only steps longer than the truncation point 1.
    msk = distance_vector > 1
    cut_dist = distance_vector[msk]
    if law_est == 0:
        # MLE of the power law exponent alpha.
        law_est = 1 + len(cut_dist) / \
            np.sum(np.log(cut_dist / np.min(cut_dist)))
    if exp_est == 0:
        # MLE of the truncated exponential rate lambda.
        exp_est = len(cut_dist) / (np.sum(cut_dist) - len(cut_dist))
    n = len(cut_dist)
    sum_cut = np.sum(cut_dist)
    sum_log_cut = np.sum(np.log(cut_dist))
    test_stat = n * (np.log(sum_cut - n) - np.log(sum_log_cut)) - \
        law_est * sum_log_cut
    sample_stat = []
    for i in range(1000):
        sample = random_exp(len(cut_dist), exp_est)
        sum_sam = np.sum(sample)
        sum_log_sam = np.sum(np.log(sample))
        tmp = n * (np.log(sum_sam - n) - np.log(sum_log_sam)) - \
            law_est * sum_log_sam
        sample_stat.append(tmp)
    # critical_value = ss.mstats.mquantiles(sample_stat, prob=0.95)[0]
    sample_stat = np.array(sample_stat)
    p_value = np.sum(sample_stat <= test_stat) / len(sample_stat)
    return p_value
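
# Usage sketch: run both tests on the same mouse-day and read them together.
# Assumes the mousestyles movement data are available locally; the
# strain/mouse/day indices and the helper name `_compare_fits` are
# illustrative, not part of the original module.
def _compare_fits(strain=0, mouse=0, day=0, seed=1):
    p_law = hypo_powerLaw_null(strain, mouse, day, seed=seed)
    p_exp = hypo_exp_null(strain, mouse, day, seed=seed)
    # A small p-value in one test, paired with a large p-value in the
    # other, indicates which of the two distributions fits better.
    return p_law, p_exp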