Last active
January 5, 2022 00:22
-
-
Save dustinstansbury/a880568dca5fcbec087bc120a6779e13 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scipy import stats | |
from typing import Tuple, Union | |
import numpy as np | |
import pandas as pd | |
def xi_corr(
    x: np.ndarray, y: np.ndarray, return_p_values: bool = False
) -> Union[float, Tuple[float, float]]:
    """Implementation of Chatterjee's correlation, which can capture both monotonic
    and non-monotonic coupling between variables. The metric incorporates degree
    of uncertainty, so as the length of x, y increases, so too does the confidence
    in the correlation.

    Parameters
    ----------
    x, y: np.ndarray
        Vectors of equal length
    return_p_values : bool
        Whether to return p-value for the correlation estimate. Note that we use
        the asymptotic limit, assuming no ties amongst values.

    Returns
    -------
    xi : float
        The Chatterjee correlation
    p_value : float (optional)
        If return_p_values=True, returns the asymptotic p-value for the correlation.
        Just like any p-value, as the length of x, y increases, p-value decreases
        if there is any relationship between the variables.

    Note
    ----
    Unexpected behavior, when x, y are linearly independent, p-value does not approach
    .5, which is expected. But maybe DS just needs to read the paper more deeply.

    NOTE(review): ties in x are broken randomly (see `pi` below), so repeated
    calls on tie-containing x can return different values.

    References
    ----------
    - "A new coefficient of correlation": https://arxiv.org/pdf/1909.10140.pdf
    - R implementation: https://souravchatterjee.su.domains//xi.R
    """

    def _preprocess(x, y):
        # convert values to overlapping-valued DataFrame to take advantage
        # of useful rank methods; drops any row where x or y is NaN so only
        # complete (x, y) pairs enter the statistic
        df = pd.DataFrame(np.vstack([x.ravel(), y.ravel()]).T, columns=["x", "y"])
        return df.dropna(axis=0, how="any")

    xy = _preprocess(x, y)
    # Random-tie-broken rank of x: shuffle the rows, rank by first occurrence
    # in the shuffled order, then restore the original row order.
    # this is equivalent to .rank(method='random'), which isn't readily available
    # in scipy.rankdata or pandas.rank -- ties are broken randomly
    pi = xy.x.sample(frac=1).rank(method="first").reindex_like(xy)
    n = len(xy)
    # f[i] = #{j : y_j <= y_i} / n, i.e. the empirical CDF of y (max-rank / n)
    f = xy.y.rank(method="max").values / n
    # reorder f so it follows x in increasing (tie-randomized) order
    order = np.argsort(pi)
    f = f[order]
    # g[i] = #{j : y_j >= y_i} / n; only used inside the symmetric mean C,
    # so its row ordering is irrelevant
    g = (-xy.y).rank(method="max").values / n
    # xi = 1 - A1/C reduces algebraically to the paper's
    # 1 - n * sum|r_{i+1} - r_i| / (2 * sum l_i (n - l_i))
    A1 = np.mean(np.abs(f[:-1] - f[1:])) * (n - 1) / (2 * n)
    C = np.mean(g * (1 - g))
    xi = 1 - (A1 / C)
    if return_p_values:
        # asymptotic null distribution (no ties in y): sqrt(n)*xi ~ N(0, 2/5)
        pval = 1 - stats.norm.cdf(np.sqrt(n) * xi, scale=np.sqrt(2.0 / 5))
        return xi, pval
    return xi
def xi_corr2(x, y, return_p_values: bool = False):
    """NumPy-only reimplementation of Chatterjee's xi correlation, reported
    to be much faster than the pandas-based version (in practice roughly 2x).

    Note: ties in x are broken by quicksort's arbitrary (deterministic)
    order, not randomly.

    Reference: https://github.com/czbiohub/xicor/issues/17
    """
    n = x.size
    # Reorder y so it follows x in ascending order.
    y_by_x = y[np.argsort(x, kind="quicksort")]
    # right_rank[i] = #{j : y_j <= y_by_x[i]}  (max-rank), via cumulative
    # counts of the sorted unique values mapped back through the inverse.
    _, inverse, counts = np.unique(y_by_x, return_inverse=True, return_counts=True)
    right_rank = np.cumsum(counts)[inverse]
    # left_rank[i] = #{j : y_j >= y_by_x[i]}  (max-rank of the negated values)
    _, inverse, counts = np.unique(-y_by_x, return_inverse=True, return_counts=True)
    left_rank = np.cumsum(counts)[inverse]
    # Chatterjee's formula: 1 - n * sum|r_{i+1}-r_i| / (2 * sum l_i(n - l_i))
    denominator = 2 * (left_rank * (n - left_rank)).sum()
    xi = 1 - n * np.abs(np.diff(right_rank)).sum() / denominator
    if not return_p_values:
        return xi
    # asymptotic null distribution: sqrt(n)*xi ~ N(0, 2/5)
    pval = 1 - stats.norm.cdf(np.sqrt(n) * xi, scale=np.sqrt(2.0 / 5))
    return xi, pval
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Not 300x faster, but the second implementation does look a little over 2x faster.