Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 7fc0df6

Browse files
Overload df.getitem with idx of tuple (#573)
Co-authored-by: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com>
1 parent 566d961 commit 7fc0df6

2 files changed

Lines changed: 107 additions & 29 deletions

File tree

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 84 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -929,21 +929,46 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
929929
return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)
930930

931931

932-
def df_getitem_slice_idx_main_codelines(self):
933-
"""Generate main code lines for df.getitem"""
932+
def df_index_codelines(self):
933+
"""Generate code lines to get or create index of DF"""
934934
if isinstance(self.index, types.NoneType):
935935
func_lines = [' length = len(get_dataframe_data(self, 0))',
936936
' _index = numpy.arange(length)',
937-
' res_index = _index[idx]']
937+
' res_index = _index']
938938
else:
939-
func_lines = [' res_index = self._index[idx]']
939+
func_lines = [' res_index = self._index']
940940

941+
return func_lines
942+
943+
944+
def df_getitem_slice_idx_main_codelines(self, idx):
945+
"""Generate main code lines for df.getitem with idx of slice"""
941946
results = []
947+
func_lines = df_index_codelines(self)
942948
for i, col in enumerate(self.columns):
943949
res_data = f'res_data_{i}'
944950
func_lines += [
945951
f' data_{i} = get_dataframe_data(self, {i})',
946-
f' {res_data} = pandas.Series(data_{i}[idx], index=res_index, name="{col}")'
952+
f' {res_data} = pandas.Series(data_{i}[idx], index=res_index[idx], name="{col}")'
953+
]
954+
results.append((col, res_data))
955+
956+
data = ', '.join(f'"{col}": {data}' for col, data in results)
957+
func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index[idx])']
958+
959+
return func_lines
960+
961+
962+
def df_getitem_tuple_idx_main_codelines(self, literal_idx):
963+
"""Generate main code lines for df.getitem with idx of tuple"""
964+
results = []
965+
func_lines = df_index_codelines(self)
966+
needed_cols = {col: i for i, col in enumerate(self.columns) if col in literal_idx}
967+
for col, i in needed_cols.items():
968+
res_data = f'res_data_{i}'
969+
func_lines += [
970+
f' data_{i} = get_dataframe_data(self, {i})',
971+
f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")'
947972
]
948973
results.append((col, res_data))
949974

@@ -953,20 +978,20 @@ def df_getitem_slice_idx_main_codelines(self):
953978
return func_lines
954979

955980

956-
def df_getitem_str_slice_codegen(self):
981+
def df_getitem_slice_idx_codegen(self, idx):
957982
"""
958983
Example of generated implementation with provided index:
959-
def _df_getitem_slice_idx_impl(self, idx):
960-
res_index = self._index[idx]
984+
def _df_getitem_slice_idx_impl(self, idx):
985+
res_index = self._index
961986
data_0 = get_dataframe_data(self, 0)
962-
res_data_0 = pandas.Series(data_0[idx], index=res_index, name="A")
987+
res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
963988
data_1 = get_dataframe_data(self, 1)
964989
res_data_1 = pandas.Series(data_1[idx], index=res_index[idx], name="B")
965-
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
990+
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
966991
"""
967992
func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
968993
if self.columns:
969-
func_lines += df_getitem_slice_idx_main_codelines(self)
994+
func_lines += df_getitem_slice_idx_main_codelines(self, idx)
970995
else:
971996
# raise KeyError if input DF is empty
972997
func_lines += [' raise KeyError']
@@ -978,14 +1003,52 @@ def _df_getitem_slice_idx_impl(self, idx):
9781003
return func_text, global_vars
9791004

9801005

981-
def gen_df_getitem_slice_idx_impl(self):
982-
func_text, global_vars = df_getitem_str_slice_codegen(self)
1006+
def df_getitem_tuple_idx_codegen(self, idx):
1007+
"""
1008+
Example of generated implementation with provided index:
1009+
def _df_getitem_tuple_idx_impl(self, idx):
1010+
res_index = self._index
1011+
data_1 = get_dataframe_data(self, 1)
1012+
res_data_1 = pandas.Series(data_1, index=res_index, name="B")
1013+
data_2 = get_dataframe_data(self, 2)
1014+
res_data_2 = pandas.Series(data_2, index=res_index, name="C")
1015+
return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index)
1016+
"""
1017+
func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):']
1018+
literal_idx = {col.literal_value for col in idx}
1019+
key_error = any(i not in self.columns for i in literal_idx)
9831020

984-
loc_vars = {}
985-
exec(func_text, global_vars, loc_vars)
986-
_impl = loc_vars['_df_getitem_slice_idx_impl']
1021+
if self.columns and not key_error:
1022+
func_lines += df_getitem_tuple_idx_main_codelines(self, literal_idx)
1023+
else:
1024+
# raise KeyError if input DF is empty or idx is invalid
1025+
func_lines += [' raise KeyError']
1026+
1027+
func_text = '\n'.join(func_lines)
1028+
global_vars = {'pandas': pandas, 'numpy': numpy,
1029+
'get_dataframe_data': get_dataframe_data}
1030+
1031+
return func_text, global_vars
1032+
1033+
1034+
def gen_df_getitem_impl_generator(codegen, impl_name):
1035+
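"""Build a df.getitem implementation generator: the returned function runs codegen(self, idx), executes the generated source and returns the function named impl_name"""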
1036+
def _df_getitem_impl_generator(self, idx):
1037+
func_text, global_vars = codegen(self, idx)
1038+
1039+
loc_vars = {}
1040+
exec(func_text, global_vars, loc_vars)
1041+
_impl = loc_vars[impl_name]
9871042

988-
return _impl
1043+
return _impl
1044+
1045+
return _df_getitem_impl_generator
1046+
1047+
1048+
gen_df_getitem_slice_idx_impl = gen_df_getitem_impl_generator(
1049+
df_getitem_slice_idx_codegen, '_df_getitem_slice_idx_impl')
1050+
gen_df_getitem_tuple_idx_impl = gen_df_getitem_impl_generator(
1051+
df_getitem_tuple_idx_codegen, '_df_getitem_tuple_idx_impl')
9891052

9901053

9911054
@sdc_overload(operator.getitem)
@@ -1018,8 +1081,11 @@ def _df_getitem_unicode_idx_impl(self, idx):
10181081

10191082
return _df_getitem_unicode_idx_impl
10201083

1084+
if isinstance(idx, types.Tuple):
1085+
return gen_df_getitem_tuple_idx_impl(self, idx)
1086+
10211087
if isinstance(idx, types.SliceType):
1022-
return gen_df_getitem_slice_idx_impl(self)
1088+
return gen_df_getitem_slice_idx_impl(self, idx)
10231089

10241090
ty_checker = TypeChecker('Operator getitem().')
10251091
ty_checker.raise_exc(idx, 'str', 'idx')

sdc/tests/test_dataframe.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,6 +1236,17 @@ def test_impl(df, start, end):
12361236
ref_result = test_impl(df, start, end)
12371237
pd.testing.assert_frame_equal(jit_result, ref_result)
12381238

1239+
@skip_sdc_jit('DF.getitem unsupported Series name')
1240+
def _test_df_getitem_tuple_idx(self, df):
1241+
def test_impl(df):
1242+
# pandas DataFrame.__getitem__ does not select multiple columns from a tuple idx, so the reference implementation uses a list
1243+
return df[['A', 'C']]
1244+
1245+
# SDC's DataFrame getitem overload does not support a list idx, so the jitted function selects the same columns with a tuple
1246+
sdc_func = self.jit(lambda df: df[('A', 'C')])
1247+
1248+
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
1249+
12391250
@skip_sdc_jit('DF.getitem unsupported exceptions')
12401251
def test_df_getitem_str_literal_idx_exception_key_error(self):
12411252
def test_impl(df):
@@ -1260,17 +1271,27 @@ def test_impl(df, idx):
12601271
with self.assertRaises(KeyError):
12611272
sdc_func(df, 'ABC')
12621273

1274+
@skip_sdc_jit('DF.getitem unsupported Series name')
1275+
def test_df_getitem_tuple_idx_exception_key_error(self):
1276+
sdc_func = self.jit(lambda df: df[('A', 'Z')])
1277+
1278+
for df in [gen_df(test_global_input_data_float64), pd.DataFrame()]:
1279+
with self.subTest(df=df):
1280+
with self.assertRaises(KeyError):
1281+
sdc_func(df)
1282+
12631283
@skip_sdc_jit('DF.getitem unsupported Series name')
12641284
def test_df_getitem_idx(self):
12651285
dfs = [gen_df(test_global_input_data_float64),
12661286
gen_df(test_global_input_data_float64, with_index=True),
1267-
pd.DataFrame({'A': []})]
1287+
pd.DataFrame({'A': [], 'B': [], 'C': []})]
12681288
for df in dfs:
12691289
with self.subTest(df=df):
12701290
self._test_df_getitem_str_literal_idx(df)
12711291
self._test_df_getitem_unicode_idx(df, 'A')
12721292
self._test_df_getitem_slice_idx(df)
12731293
self._test_df_getitem_unbox_slice_idx(df, 1, 3)
1294+
self._test_df_getitem_tuple_idx(df)
12741295

12751296
@skip_sdc_jit('DF.getitem unsupported Series name')
12761297
def test_df_getitem_idx_multiple_types(self):
@@ -1284,6 +1305,7 @@ def test_df_getitem_idx_multiple_types(self):
12841305
self._test_df_getitem_unicode_idx(df, 'A')
12851306
self._test_df_getitem_slice_idx(df)
12861307
self._test_df_getitem_unbox_slice_idx(df, 1, 3)
1308+
self._test_df_getitem_tuple_idx(df)
12871309

12881310
@unittest.skip('DF.getitem unsupported integer columns')
12891311
def test_df_getitem_int_literal_idx(self):
@@ -1295,16 +1317,6 @@ def test_impl(df):
12951317

12961318
pd.testing.assert_series_equal(sdc_func(df), test_impl(df))
12971319

1298-
@unittest.skip('DF.getitem unsupported idx as a tuple')
1299-
def test_df_getitem_unicode_tuple_idx(self):
1300-
def test_impl(df):
1301-
return df[['A', 'B']]
1302-
1303-
sdc_func = self.jit(lambda df: df[('A', 'B')])
1304-
df = gen_df(test_global_input_data_float64)
1305-
1306-
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
1307-
13081320
@skip_numba_jit
13091321
def test_isin_df1(self):
13101322
def test_impl(df, df2):

0 commit comments

Comments
 (0)