Source code for mousestyles.kde

import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.grid_search import GridSearchCV


[docs]def kde(x, x_grid, symmetric_correction=False, cutoff=1):
    """
    Return a numpy.ndarray object of estimated density

    Parameters
    ----------
    x      : numpy.ndarray
        data, as realiztions of variable X
    x_grid : numpy.ndarray
        the grid points for the estimated density
    symmetric_correction : boolean
        a method indicator. If False, do common gaussian kernel density
        estimation (kde). If True, do common gaussian kde on data generated
        from x concatenating with its reflection around the cutoff point. Then
        transform the estimated kde back by a factor of 2. Used for e.g. kde
        for nonnegative kernel estimation
    cutoff : float
        the axis of symmetry for symmetric correction
    Returns
    -------
    pdf : numpy.ndarray
        estimated density at the specified grid points x_grid

    Examples
    --------
    >>> kde(x = np.array([2,3,1,0]), x_grid=np.linspace(0, 5, 10))
    array([ 0.17483395,  0.21599529,  0.23685855,  0.24007961,  0.22670763,
        0.19365019,  0.14228937,  0.08552725,  0.04043597,  0.01463953])
    >>> x1 = np.concatenate([norm(-1, 1.).rvs(400), norm(1, 0.3).rvs(100)])
    >>> pdf1 = kde(x=x1, x_grid=np.linspace(0, 5, 100), symmetric_correction
                   =True, cutoff=1)
    array([ 0.26625297,  0.26818492,  0.27105849,  0.27489486,  0.27968752, ...
        0.07764054,  0.07239964,  0.06736559,  0.06254175,  0.05793043])
    """
    # if we want to do a symmetric correction to do kde, we transform
    # the data to be the symmetric format by adding the counterpart
    # obtained by reflecting the data around x=cutoff
    if symmetric_correction:
        x = np.concatenate([x, 2*cutoff - x])

    # Use GridSearchCV to search for the best bandwidth by 5 fold
    # cross-validation max loglikelihood
    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': np.linspace(0.01, 0.5, 10)},
                        cv=min(5, len(x)))
    grid.fit(x[:, None])
    bandwidth = grid.best_params_['bandwidth']

    # do desity estimation
    kde_skl = KernelDensity(bandwidth=bandwidth)
    kde_skl.fit(x[:, np.newaxis])
    log_pdf = kde_skl.score_samples(x_grid[:, np.newaxis])

    # transform the fuction score function to density
    if symmetric_correction:
        # transform back to the one-sided density in symmetric correction
        pdf = 2 * np.exp(log_pdf)
    else:
        # vanilla case
        pdf = np.exp(log_pdf)
    return pdf