Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 14bda78

Browse files
authored
capitalize, casefold, title, swapcase: use prange and handle None (#575)
* parallel and add mask for methods: capitalize, casefold, swapcase, title
* correct master
* change methods: capitalize, casefold, swapcase, title
* correct doc
* correct doc
* add variable len_arr
* change tests
* use subtest
* fix problem with PEP8
1 parent 0f2c5d6 commit 14bda78

4 files changed

Lines changed: 51 additions & 19 deletions

File tree

sdc/datatypes/hpat_pandas_stringmethods_functions.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ def hpat_pandas_stringmethods_upper_impl(self):
8787
from sdc.utilities.sdc_typing_utils import TypeChecker
8888
from sdc.datatypes.hpat_pandas_stringmethods_types import StringMethodsType
8989
from sdc.utilities.utils import sdc_overload_method
90+
from sdc.hiframes.api import get_nan_mask
91+
from sdc.str_arr_ext import str_arr_set_na_by_mask, create_str_arr_from_list
9092

9193
_hpat_pandas_stringmethods_autogen_global_dict = {
9294
'pandas': pandas,
@@ -1037,10 +1039,13 @@ def hpat_pandas_stringmethods_capitalize(self):
10371039
ty_checker.check(self, StringMethodsType)
10381040

10391041
def hpat_pandas_stringmethods_capitalize_impl(self):
1042+
mask = get_nan_mask(self._data._data)
10401043
item_count = len(self._data)
1041-
result = [''] * item_count
1044+
res_list = [''] * item_count
10421045
for idx in numba.prange(item_count):
1043-
result[idx] = self._data._data[idx].capitalize()
1046+
res_list[idx] = self._data._data[idx].capitalize()
1047+
str_arr = create_str_arr_from_list(res_list)
1048+
result = str_arr_set_na_by_mask(str_arr, mask)
10441049

10451050
return pandas.Series(result, self._data._index, name=self._data._name)
10461051

@@ -1053,10 +1058,13 @@ def hpat_pandas_stringmethods_title(self):
10531058
ty_checker.check(self, StringMethodsType)
10541059

10551060
def hpat_pandas_stringmethods_title_impl(self):
1061+
mask = get_nan_mask(self._data._data)
10561062
item_count = len(self._data)
1057-
result = [''] * item_count
1063+
res_list = [''] * item_count
10581064
for idx in numba.prange(item_count):
1059-
result[idx] = self._data._data[idx].title()
1065+
res_list[idx] = self._data._data[idx].title()
1066+
str_arr = create_str_arr_from_list(res_list)
1067+
result = str_arr_set_na_by_mask(str_arr, mask)
10601068

10611069
return pandas.Series(result, self._data._index, name=self._data._name)
10621070

@@ -1069,10 +1077,13 @@ def hpat_pandas_stringmethods_swapcase(self):
10691077
ty_checker.check(self, StringMethodsType)
10701078

10711079
def hpat_pandas_stringmethods_swapcase_impl(self):
1080+
mask = get_nan_mask(self._data._data)
10721081
item_count = len(self._data)
1073-
result = [''] * item_count
1082+
res_list = [''] * item_count
10741083
for idx in numba.prange(item_count):
1075-
result[idx] = self._data._data[idx].swapcase()
1084+
res_list[idx] = self._data._data[idx].swapcase()
1085+
str_arr = create_str_arr_from_list(res_list)
1086+
result = str_arr_set_na_by_mask(str_arr, mask)
10761087

10771088
return pandas.Series(result, self._data._index, name=self._data._name)
10781089

@@ -1085,10 +1096,13 @@ def hpat_pandas_stringmethods_casefold(self):
10851096
ty_checker.check(self, StringMethodsType)
10861097

10871098
def hpat_pandas_stringmethods_casefold_impl(self):
1099+
mask = get_nan_mask(self._data._data)
10881100
item_count = len(self._data)
1089-
result = [''] * item_count
1101+
res_list = [''] * item_count
10901102
for idx in numba.prange(item_count):
1091-
result[idx] = self._data._data[idx].casefold()
1103+
res_list[idx] = self._data._data[idx].casefold()
1104+
str_arr = create_str_arr_from_list(res_list)
1105+
result = str_arr_set_na_by_mask(str_arr, mask)
10921106

10931107
return pandas.Series(result, self._data._index, name=self._data._name)
10941108

sdc/hiframes/api.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,11 @@ def get_nan_mask(arr):
570570
def get_nan_mask_overload(arr):
571571

572572
def get_nan_mask_via_isna_impl(arr):
573-
return np.array([isna(arr, i) for i in np.arange(len(arr))])
573+
len_arr = len(arr)
574+
res = np.empty(len_arr, dtype=np.bool_)
575+
for i in numba.prange(len_arr):
576+
res[i] = isna(arr, i)
577+
return res
574578

575579
if isinstance(arr, types.Array):
576580
dtype = arr.dtype

sdc/str_arr_ext.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ def typer(string_list=None):
7474
def iternext_str_array(context, builder, sig, args, result):
7575
"""
7676
Implementation of iternext() for the StringArrayIterator type
77-
7877
:param context: context descriptor
7978
:param builder: llvmlite IR Builder
8079
:param sig: iterator signature

sdc/tests/test_series.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,7 +1301,6 @@ def test_impl(A, n):
13011301
-3 6
13021302
-3 3
13031303
dtype: int64
1304-
13051304
>>>S.loc[0:-3]
13061305
0 6
13071306
-3 6
@@ -3196,32 +3195,48 @@ def test_impl(S):
31963195
return S.str.capitalize()
31973196

31983197
sdc_func = self.jit(test_impl)
3199-
s = pd.Series(test_global_input_data_unicode_kind4)
3200-
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
3198+
test_data = [test_global_input_data_unicode_kind4,
3199+
['lower', None, 'CAPITALS', None, 'this is a sentence', 'SwApCaSe', None]]
3200+
for data in test_data:
3201+
with self.subTest(data=data):
3202+
s = pd.Series(data)
3203+
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
32013204

32023205
def test_series_title_str(self):
32033206
def test_impl(S):
32043207
return S.str.title()
32053208

32063209
sdc_func = self.jit(test_impl)
3207-
s = pd.Series(test_global_input_data_unicode_kind4)
3208-
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
3210+
test_data = [test_global_input_data_unicode_kind4,
3211+
['lower', None, 'CAPITALS', None, 'this is a sentence', 'SwApCaSe', None]]
3212+
for data in test_data:
3213+
with self.subTest(data=data):
3214+
s = pd.Series(data)
3215+
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
32093216

32103217
def test_series_swapcase_str(self):
32113218
def test_impl(S):
32123219
return S.str.swapcase()
32133220

32143221
sdc_func = self.jit(test_impl)
3215-
s = pd.Series(test_global_input_data_unicode_kind4)
3216-
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
3222+
test_data = [test_global_input_data_unicode_kind4,
3223+
['lower', None, 'CAPITALS', None, 'this is a sentence', 'SwApCaSe', None]]
3224+
for data in test_data:
3225+
with self.subTest(data=data):
3226+
s = pd.Series(data)
3227+
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
32173228

32183229
def test_series_casefold_str(self):
32193230
def test_impl(S):
32203231
return S.str.casefold()
32213232

32223233
sdc_func = self.jit(test_impl)
3223-
s = pd.Series(test_global_input_data_unicode_kind4)
3224-
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
3234+
test_data = [test_global_input_data_unicode_kind4,
3235+
['lower', None, 'CAPITALS', None, 'this is a sentence', 'SwApCaSe', None]]
3236+
for data in test_data:
3237+
with self.subTest(data=data):
3238+
s = pd.Series(data)
3239+
pd.testing.assert_series_equal(sdc_func(s), test_impl(s))
32253240

32263241
@sdc_limitation
32273242
def test_series_append_same_names(self):

0 commit comments

Comments
 (0)