Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 1 addition & 34 deletions sdc/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2196,40 +2196,7 @@ def hpat_pandas_series_corr(self, other, method='pearson', min_periods=None):
ty_checker.raise_exc(min_periods, 'int64', 'min_periods')

def hpat_pandas_series_corr_impl(self, other, method='pearson', min_periods=None):
if method not in ('pearson', ''):
raise ValueError("Method corr(). Unsupported parameter. Given method != 'pearson'")

if min_periods is None:
min_periods = 1

if len(self._data) == 0 or len(other._data) == 0:
return numpy.nan

self_arr = self._data[:min(len(self._data), len(other._data))]
other_arr = other._data[:min(len(self._data), len(other._data))]

invalid = numpy.isnan(self_arr) | numpy.isnan(other_arr)
if invalid.any():
self_arr = self_arr[~invalid]
other_arr = other_arr[~invalid]

if len(self_arr) < min_periods:
return numpy.nan

new_self = pandas.Series(self_arr)
new_other = pandas.Series(other_arr)

n = new_self.count()
ma = new_self.sum()
mb = new_other.sum()
a = n * (self_arr * other_arr).sum() - ma * mb
b1 = n * (self_arr * self_arr).sum() - ma * ma
b2 = n * (other_arr * other_arr).sum() - mb * mb

if b1 == 0 or b2 == 0:
return numpy.nan

return a / numpy.sqrt(b1 * b2)
return numpy_like.corr(self._data, other._data, method, min_periods)

return hpat_pandas_series_corr_impl

Expand Down
74 changes: 74 additions & 0 deletions sdc/functions/numpy_like.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from sdc.utilities.sdc_typing_utils import TypeChecker
from sdc.str_arr_ext import (StringArrayType, pre_alloc_string_array, get_utf8_size, str_arr_is_na)
from sdc.utilities.utils import sdc_overload, sdc_register_jitable
from sdc.utilities.prange_utils import parallel_chunks


def astype(self, dtype):
Expand Down Expand Up @@ -73,6 +74,10 @@ def nansum(self):
pass


def corr(self):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

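Duplicate function definition: `corr` is defined again at line 507 with its full signature (`self, other, method='pearson', min_periods=None`), so this stub is redundant and should be removed.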

pass


@sdc_overload(astype)
def sdc_astype_overload(self, dtype):
"""
Expand Down Expand Up @@ -499,6 +504,75 @@ def nanmean_impl(a):
return nanmean_impl


def corr(self, other, method='pearson', min_periods=None):
    """
    Placeholder for the array correlation routine.

    The body is intentionally empty: this function is the target passed to
    ``sdc_overload`` by ``corr_overload``, which supplies the implementation.
    Called directly as plain Python it does nothing and returns None.
    """


@sdc_overload(corr)
def corr_overload(self, other, method='pearson', min_periods=None):
    """
    Overload of corr() computing the Pearson correlation of two arrays.

    Only the first min(len(self), len(other)) positions are compared, and a
    position is used only when BOTH values are non-NaN (pairwise-complete
    observations), matching the pandas-based implementation this replaces.

    Parameters
    ----------
    self, other: arrays exposing ``dtype``, ``len()`` and integer indexing
    method: only 'pearson' (or the empty string) is accepted
    min_periods: minimum number of valid pairs required; None is treated as 1

    Returns
    -------
    The correlation coefficient, or numpy.nan when either input is empty,
    when fewer than min_periods valid pairs remain, or when either variance
    term is zero.

    Raises
    ------
    ValueError (at run time) if method is neither 'pearson' nor ''.
    """
    dtype_self = self.dtype
    dtype_other = other.dtype
    # NaN predicates are chosen once per input dtype, outside the implementation.
    isnan_self = get_isnan(dtype_self)
    isnan_other = get_isnan(dtype_other)

    def corr_impl(self, other, method='pearson', min_periods=None):
        len_self = len(self)
        len_other = len(other)
        if method not in ('pearson', ''):
            raise ValueError("Method corr(). Unsupported parameter. Given method != 'pearson'")

        if min_periods is None:
            min_periods = 1

        if len_self == 0 or len_other == 0:
            return numpy.nan

        min_len = min(len_self, len_other)
        # NOTE(review): parallel_chunks is assumed to split [0, min_len) into
        # consecutive ranges exposing .start/.stop - confirm against prange_utils.
        chunks = parallel_chunks(min_len)
        arr_len = numpy.empty(len(chunks), dtype=numpy.int64)
        length = 0

        # Pass 1: count the valid pairs per chunk so that each chunk knows how
        # many output slots it needs.
        for i in prange(len(chunks)):
            chunk = chunks[i]
            res = 0
            for j in range(chunk.start, chunk.stop):
                # A pair is valid only if neither side is NaN ('and', not 'or').
                if not isnan_self(self[j]) and not isnan_other(other[j]):
                    res += 1
            length += res
            arr_len[i] = res

        result_self = numpy.empty(shape=length, dtype=dtype_self)
        result_other = numpy.empty(shape=length, dtype=dtype_other)

        # Pass 2: copy the valid pairs into the compacted arrays. A chunk's
        # write offset is the number of valid pairs in all preceding chunks.
        for i in prange(len(chunks)):
            chunk = chunks[i]
            current_pos = int(sum(arr_len[0:i]))

            for j in range(chunk.start, chunk.stop):
                if not isnan_self(self[j]) and not isnan_other(other[j]):
                    result_self[current_pos] = self[j]
                    result_other[current_pos] = other[j]
                    current_pos += 1

        if len(result_self) < min_periods:
            return numpy.nan

        # Pearson r = (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2))
        n = length
        ma = sum(result_self)
        mb = sum(result_other)
        a = n * (result_self * result_other).sum() - ma * mb
        b1 = n * (result_self * result_self).sum() - ma * ma
        b2 = n * (result_other * result_other).sum() - mb * mb

        # A zero variance term makes the coefficient undefined.
        if b1 == 0 or b2 == 0:
            return numpy.nan

        return a / numpy.sqrt(b1 * b2)

    return corr_impl


def nanvar(a):
    """
    Placeholder stub with an intentionally empty body; returns None when
    called directly.

    NOTE(review): presumably given its real implementation through an overload
    further down the file, as the other stubs here are - not visible in this
    view, confirm.
    """

Expand Down