Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: added support for ordered categoricals in kendall/spearman correlation #60493

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ Other enhancements
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
Expand Down
24 changes: 24 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -11034,6 +11034,10 @@ def corr(
data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()

if method in ("spearman", "kendall"):
data = data._convert_ordered_cat_to_code()

mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
Expand Down Expand Up @@ -11321,6 +11325,8 @@ def corrwith(
correl = num / dom

elif method in ["kendall", "spearman"] or callable(method):
left = left._convert_ordered_cat_to_code()
right = right._convert_ordered_cat_to_code()

def c(x):
return nanops.nancorr(x[0], x[1], method=method)
Expand Down Expand Up @@ -11352,6 +11358,24 @@ def c(x):

return correl

def _convert_ordered_cat_to_code(self) -> DataFrame:
    """
    Convert ordered categorical columns to their integer codes.

    Unordered categorical columns and non-categorical columns are left
    untouched. In converted columns the missing-value code ``-1`` is
    replaced with ``np.nan``.

    Returns
    -------
    DataFrame
        ``self`` when there is no ordered categorical column to convert,
        otherwise a shallow copy in which every ordered categorical column
        has been replaced by its codes.
    """
    categ = self.select_dtypes("category")
    if len(categ.columns) == 0:
        return self

    cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns
    if len(cols_convert) == 0:
        # Only unordered categoricals are present. Without this guard the
        # function would fall through to ``return data`` with ``data`` unbound.
        return self

    # Shallow copy so the caller's frame keeps its categorical columns.
    data = self.copy(deep=False)
    data[cols_convert] = data[cols_convert].transform(
        lambda x: x.cat.codes.replace(-1, np.nan)
    )
    return data

# ----------------------------------------------------------------------
# ndarray-like stats methods

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2680,6 +2680,12 @@ def corr(
if len(this) == 0:
return np.nan

if method in ("spearman", "kendall"):
if this.dtype == "category" and this.cat.ordered:
this = this.cat.codes.replace(-1, np.nan)
if other.dtype == "category" and other.cat.ordered:
other = other.cat.codes.replace(-1, np.nan)

this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
Expand All @@ -16,6 +17,19 @@
import pandas._testing as tm


@pytest.fixture
def categorical_frame():
    """
    Two-column frame of ordered categoricals containing missing values.

    Column "A" cycles through "a".."e" with category order b < a < c < d < e;
    column "B" cycles through "1".."3" with category order 3 < 2 < 1. The
    first five rows of "A" and rows 3-5 of "B" are set to NaN.
    """
    col_a = Categorical(list("abcde") * 6, list("bacde"), ordered=True)
    col_b = Categorical(list("123") * 10, list("321"), ordered=True)
    df = DataFrame({"A": col_a, "B": col_b})

    labels = df.index
    df.loc[labels[:5], "A"] = np.nan
    df.loc[labels[3:6], "B"] = np.nan
    return df


class TestDataFrameCov:
def test_cov(self, float_frame, float_string_frame):
# min_periods no NAs (corner case)
Expand Down Expand Up @@ -116,6 +130,13 @@ def test_corr_scipy_method(self, float_frame, method):
expected = float_frame["A"].corr(float_frame["C"], method=method)
tm.assert_almost_equal(correls["A"]["C"], expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_scipy_method_category(self, method, categorical_frame):
    # The frame-level correlation matrix entry must agree with the pairwise
    # Series.corr result computed on the same two columns.
    pytest.importorskip("scipy")
    matrix = categorical_frame.corr(method=method)
    col_a = categorical_frame["A"]
    col_b = categorical_frame["B"]
    expected = col_a.corr(col_b, method=method)
    tm.assert_almost_equal(matrix["A"]["B"], expected)

# ---------------------------------------------------------------------

def test_corr_non_numeric(self, float_string_frame):
Expand Down Expand Up @@ -303,6 +324,15 @@ def test_corrwith(self, datetime_frame, dtype):
dropped = a.corrwith(b, axis=1, drop=True)
assert a.index[-1] not in dropped.index

@pytest.mark.parametrize("method", ["spearman", "kendall"])
def test_corrwith_categorical(self, categorical_frame, method):
    # corrwith against a single column must match applying Series.corr to
    # each column of the frame in turn.
    pytest.importorskip("scipy")
    other = categorical_frame["B"]
    result = categorical_frame.corrwith(other, method=method)
    expected = categorical_frame.agg(lambda x: x.corr(other, method=method))
    for name in ("A", "B"):
        tm.assert_almost_equal(result[name], expected[name])

def test_corrwith_non_timeseries_data(self):
index = ["a", "b", "c", "d", "e"]
columns = ["one", "two", "three", "four"]
Expand Down
76 changes: 57 additions & 19 deletions pandas/tests/series/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,43 @@

import pandas as pd
from pandas import (
Categorical,
Series,
date_range,
isna,
)
import pandas._testing as tm


@pytest.fixture
def A():
    """Float series holding 0..4 repeated twice over ten daily timestamps."""
    values = np.concatenate([np.arange(5, dtype=np.float64)] * 2)
    dates = date_range("2020-01-01", periods=10)
    return Series(values, index=dates, name="ts")


@pytest.fixture
def B():
    """Float series holding 0..9 over ten daily timestamps."""
    values = np.arange(10, dtype=np.float64)
    dates = date_range("2020-01-01", periods=10)
    return Series(values, index=dates, name="ts")


@pytest.fixture
def C():
    """
    Ordered categorical series over ten daily timestamps with one NaN.

    Values cycle through "1".."5" twice; the category order is
    5 < 4 < 3 < 2 < 1. The entry at 2020-01-03 is set to NaN.
    """
    values = Categorical(list("12345") * 2, categories=list("54321"), ordered=True)
    dates = date_range("2020-01-01", periods=10)
    ser = Series(data=values, index=dates, name="categorical")
    ser["2020-01-03"] = np.nan
    return ser


class TestSeriesCov:
def test_cov(self, datetime_series):
# full overlap
Expand Down Expand Up @@ -56,7 +86,7 @@ def test_cov_ddof(self, test_ddof, dtype):


class TestSeriesCorr:
def test_corr(self, datetime_series, any_float_dtype):
def test_corr(self, B, datetime_series, any_float_dtype):
stats = pytest.importorskip("scipy.stats")

datetime_series = datetime_series.astype(any_float_dtype)
Expand All @@ -81,29 +111,14 @@ def test_corr(self, datetime_series, any_float_dtype):
cp[:] = np.nan
assert isna(cp.corr(cp))

A = Series(
np.arange(10, dtype=np.float64),
index=date_range("2020-01-01", periods=10),
name="ts",
)
result = A.corr(A)
expected, _ = stats.pearsonr(A, A)
result = B.corr(B)
expected, _ = stats.pearsonr(B, B)
tm.assert_almost_equal(result, expected)

def test_corr_rank(self):
def test_corr_rank(self, A, B):
stats = pytest.importorskip("scipy.stats")

# kendall and spearman
B = Series(
np.arange(10, dtype=np.float64),
index=date_range("2020-01-01", periods=10),
name="ts",
)
A = Series(
np.concatenate([np.arange(5, dtype=np.float64)] * 2),
index=date_range("2020-01-01", periods=10),
name="ts",
)
result = A.corr(B, method="kendall")
expected = stats.kendalltau(A, B)[0]
tm.assert_almost_equal(result, expected)
Expand Down Expand Up @@ -146,6 +161,29 @@ def test_corr_rank(self):
tm.assert_almost_equal(A.corr(B, method="kendall"), kexp)
tm.assert_almost_equal(A.corr(B, method="spearman"), sexp)

def test_corr_category(self, A, C):
    # Check Series.corr between a float series and an ordered categorical
    # against the scipy reference implementations.
    stats = pytest.importorskip("scipy.stats")

    def get_codes(s: Series) -> Series:
        # Codes of the categorical ``s`` with the missing marker (-1) as NaN.
        # Uses the argument rather than the enclosing ``C`` so the helper
        # works for whichever series it is given.
        return s.cat.codes.replace(-1, np.nan)

    # Pearson: the reference is computed on the category values cast to float.
    result = A.corr(C, method="pearson")
    expected = stats.pearsonr(A[C.notna()], C.dropna().astype("float"))[0]
    tm.assert_almost_equal(result, expected)
    tm.assert_almost_equal(result, 1)

    # Spearman: the reference is computed on the category codes.
    result = A.corr(C, method="spearman")
    expected = stats.spearmanr(A, get_codes(C), nan_policy="omit")[0]
    expected_pearson = stats.pearsonr(A[C.notna()], get_codes(C).dropna())[0]

    tm.assert_almost_equal(result, expected)
    tm.assert_almost_equal(result, expected_pearson)
    tm.assert_almost_equal(result, -1)

    # Kendall: the reference is computed on the category codes as well.
    result = A.corr(C, method="kendall")
    expected = stats.kendalltau(A, get_codes(C), nan_policy="omit")[0]
    tm.assert_almost_equal(result, expected)

def test_corr_invalid_method(self):
# GH PR #22298
s1 = Series(np.random.default_rng(2).standard_normal(10))
Expand Down
Loading