Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit e8ffa05

Browse files
Refactoring Series.setitem to support pd.Series as value and index based assignment (#536)
* Refactor Series.setitem to support index based assignment * Applying comments from review * Fixing code style * Skipping new tests in old-pipeline * Adding one more utility function for tests * Fixing issues from merge conflicts Co-authored-by: Alexander Kalistratov <alexander.kalistratov@intel.com>
1 parent 7fc0df6 commit e8ffa05

3 files changed

Lines changed: 855 additions & 186 deletions

File tree

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 256 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -432,12 +432,19 @@ def hpat_pandas_series_getitem_idx_series_impl(self, idx):
432432

433433

434434
@sdc_overload(operator.setitem)
435-
def hpat_pandas_series_setitem(self, idx, value):
435+
def sdc_pandas_series_setitem(self, idx, value):
436436
"""
437437
Intel Scalable Dataframe Compiler User Guide
438438
********************************************
439439
Pandas API: pandas.Series.__setitem__
440440
441+
Limitations
442+
-----------
443+
Not supported for idx as a string slice, e.g. S['a':'f'] = value
444+
Not supported for string series
445+
Not supported when setting a value for a non-existing index
446+
Not supported for cases when setting causes change of the Series dtype
447+
441448
Examples
442449
--------
443450
.. literalinclude:: ../../../examples/series_setitem_int.py
@@ -477,56 +484,278 @@ def hpat_pandas_series_setitem(self, idx, value):
477484
478485
Intel Scalable Dataframe Compiler Developer Guide
479486
*************************************************
480-
Pandas Series operator :attr:`pandas.Series.set` implementation
487+
Pandas Series operator :attr:`pandas.Series.__setitem__` implementation
481488
482489
Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_setitem*
483490
484491
Parameters
485492
----------
486493
series: :obj:`pandas.Series`
487494
input series
488-
idx: :obj:`int`, :obj:`slice` or :obj:`pandas.Series`
495+
idx: :obj:`scalar`, :obj:`slice`, :obj:`array` or :obj:`pandas.Series`
489496
input index
490-
value: :object
497+
value: :obj:`scalar`, :obj:`array` or :obj:`pandas.Series`
491498
input value
492499
493500
Returns
494501
-------
495-
:class:`pandas.Series` or an element of the underneath type
502+
:class:`pandas.Series`
496503
object of :class:`pandas.Series`
497504
"""
498505

499-
ty_checker = TypeChecker('Operator setitem.')
506+
_func_name = 'Operator setitem().'
507+
ty_checker = TypeChecker(_func_name)
500508
ty_checker.check(self, SeriesType)
501509

502-
if not (isinstance(idx, (types.Integer, types.SliceType, SeriesType))):
503-
ty_checker.raise_exc(idx, 'int, Slice, Series', 'idx')
510+
if not (isinstance(idx, (types.Number, types.UnicodeType, types.SliceType, types.Array, SeriesType))):
511+
ty_checker.raise_exc(idx, 'scalar, Slice, Array or Series', 'idx')
512+
513+
all_supported_scalar_types = (types.Number, types.UnicodeType, types.Boolean)
514+
if not (isinstance(value, all_supported_scalar_types) or isinstance(value, (SeriesType, types.Array))):
515+
ty_checker.raise_exc(value, 'scalar, Array or Series', 'value')
516+
517+
if not check_types_comparable(self, value):
518+
msg = '{} The value and Series data must be comparable. Given: self.dtype={}, value={}'
519+
raise TypingError(msg.format(_func_name, self.dtype, value))
520+
521+
# idx is not necessarily of the same dtype as self.index, e.g. it might be a Boolean indexer or a Slice
522+
if not (check_types_comparable(idx, self.index)
523+
or isinstance(idx, (types.Integer, types.SliceType))
524+
or (isinstance(idx, (SeriesType, types.Array)) and isinstance(idx.dtype, (types.Integer, types.Boolean)))):
525+
msg = '{} The idx is not comparable to Series index, not a Boolean or integer indexer or a Slice. ' + \
526+
'Given: self.index={}, idx={}'
527+
raise TypingError(msg.format(_func_name, self.index, idx))
528+
529+
value_is_series = isinstance(value, SeriesType)
530+
value_is_array = isinstance(value, types.Array)
531+
532+
# for many cases pandas setitem assigns values along positions in self._data
533+
# not considering Series index, so a common implementation exists
534+
idx_is_boolean_array = isinstance(idx, types.Array) and isinstance(idx.dtype, types.Boolean)
535+
idx_is_boolean_series = isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean)
536+
idx_and_self_index_comparable = check_types_comparable(self.index, idx)
537+
self_index_is_none = isinstance(self.index, types.NoneType)
538+
assign_along_positions = ((self_index_is_none
539+
or isinstance(idx, types.SliceType)
540+
or not idx_and_self_index_comparable)
541+
and not idx_is_boolean_series
542+
and not idx_is_boolean_array)
543+
544+
idx_is_scalar = isinstance(idx, (types.Number, types.UnicodeType))
545+
if assign_along_positions or idx_is_scalar:
546+
547+
idx_is_numeric_or_boolean_series = (isinstance(idx, SeriesType)
548+
and isinstance(idx.dtype, (types.Number, types.Boolean)))
549+
assign_via_idx_mask = idx_is_scalar and idx_and_self_index_comparable
550+
assign_via_idx_data = idx_is_numeric_or_boolean_series and not idx_and_self_index_comparable
551+
552+
def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value):
553+
554+
if assign_via_idx_mask == True: # noqa
555+
_idx = self._index == idx
556+
elif assign_via_idx_data == True: # noqa
557+
_idx = idx._data
558+
else:
559+
_idx = idx
504560

505-
if not((isinstance(value, SeriesType) and isinstance(value.dtype, self.dtype)) or \
506-
isinstance(value, type(self.dtype))):
507-
ty_checker.raise_exc(value, self.dtype, 'value')
561+
_value = value._data if value_is_series == True else value # noqa
508562

509-
if isinstance(idx, types.Integer) or isinstance(idx, types.SliceType):
510-
def hpat_pandas_series_setitem_idx_integer_impl(self, idx, value):
511-
"""
512-
Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_series_setitem_for_value
513-
Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_series_setitem_for_slice
514-
"""
515-
self._data[idx] = value
563+
self._data[_idx] = _value
516564
return self
517565

518-
return hpat_pandas_series_setitem_idx_integer_impl
566+
return sdc_pandas_series_setitem_no_reindexing_impl
567+
568+
if (idx_is_boolean_array or idx_is_boolean_series) and value_is_series:
569+
570+
self_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype
571+
value_index_dtype = types.int64 if isinstance(value.index, types.NoneType) else value.index.dtype
572+
if (isinstance(self_index_dtype, types.Number) and isinstance(value_index_dtype, types.Number)):
573+
indexes_common_dtype = find_common_dtype_from_numpy_dtypes([self_index_dtype, value_index_dtype], [])
574+
elif (isinstance(self_index_dtype, types.UnicodeType) and isinstance(value_index_dtype, types.UnicodeType)):
575+
indexes_common_dtype = types.unicode_type
576+
else:
577+
msg = '{} The self and value indexes must be comparable. Given: self.dtype={}, value.dtype={}'
578+
raise TypingError(msg.format(_func_name, self_index_dtype, value_index_dtype))
579+
580+
if idx_is_boolean_array:
581+
582+
def sdc_pandas_series_setitem_idx_bool_array_align_impl(self, idx, value):
583+
584+
# if idx is a Boolean array (and value is a series) it's used as a mask for self.index
585+
# and the filtered indexes are looked up in value.index; if found, the corresponding value is set
586+
if value_is_series == True: # noqa
587+
value_index, self_index = value.index, self.index
588+
unique_value_indices, unique_self_indices = set(value_index), set(self_index)
589+
590+
# pandas behaves differently depending on whether value.index has duplicates:
591+
# in case of duplicates in value.index assignment is made via positions
592+
# in case there are no duplicates, value.index is used as reindexer
593+
self_index_has_duplicates = len(unique_self_indices) != len(self_index)
594+
value_index_has_duplicates = len(unique_value_indices) != len(value_index)
595+
if (self_index_has_duplicates or value_index_has_duplicates):
596+
self._data[idx] = value._data
597+
else:
598+
map_index_to_position = Dict.empty(
599+
key_type=indexes_common_dtype,
600+
value_type=types.int32
601+
)
602+
for i, index_value in enumerate(value_index):
603+
map_index_to_position[index_value] = types.int32(i)
604+
605+
# such iterative setitem on a StringArray will be inefficient
606+
# TODO: refactor this when str_arr setitem is fully supported
607+
for i in numba.prange(len(self_index)):
608+
if idx[i]:
609+
self_index_value = self_index[i]
610+
if self_index_value in map_index_to_position:
611+
self._data[i] = value._data[map_index_to_position[self_index_value]]
612+
else:
613+
sdc.hiframes.join.setitem_arr_nan(self._data, i)
614+
615+
else:
616+
# if value has no index - nothing to reindex and assignment is made along positions set by idx mask
617+
self._data[idx] = value
519618

520-
if isinstance(idx, SeriesType):
521-
def hpat_pandas_series_setitem_idx_series_impl(self, idx, value):
522-
"""
523-
Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_series_setitem_for_series
524-
"""
525-
super_index = idx._data
526-
self._data[super_index] = value
527619
return self
528620

529-
return hpat_pandas_series_setitem_idx_series_impl
621+
return sdc_pandas_series_setitem_idx_bool_array_align_impl
622+
623+
elif idx_is_boolean_series:
624+
625+
def sdc_pandas_series_setitem_idx_bool_series_align_impl(self, idx, value):
626+
627+
self_index, idx_index = self.index, idx.index
628+
# FIXME: for now just use sorted, as == is not implemented for sets of unicode strings
629+
if (sorted(self_index) != sorted(idx_index)):
630+
msg = "Unalignable boolean Series provided as indexer " + \
631+
"(index of the boolean Series and of the indexed object do not match)"
632+
raise ValueError(msg)
633+
634+
# if idx is a Boolean Series its data is used as a mask for its index
635+
# and filtered indexes are either looked up in value.index (if value is a Series)
636+
# or in self.index (if value is scalar or array)
637+
filtered_idx_indices = idx_index[idx._data]
638+
filtered_idx_indices_set = set(filtered_idx_indices)
639+
if value_is_series == True: # noqa
640+
641+
if len(filtered_idx_indices_set) != len(filtered_idx_indices):
642+
raise ValueError("cannot reindex from a duplicate axis")
643+
644+
map_self_index_to_position = Dict.empty(
645+
key_type=indexes_common_dtype,
646+
value_type=types.int32
647+
)
648+
for i, index_value in enumerate(self_index):
649+
map_self_index_to_position[index_value] = types.int32(i)
650+
651+
value_index = value.index
652+
map_value_index_to_position = Dict.empty(
653+
key_type=indexes_common_dtype,
654+
value_type=types.int32
655+
)
656+
for i, index_value in enumerate(value_index):
657+
map_value_index_to_position[index_value] = types.int32(i)
658+
659+
# for all index values in filtered index assign element of value with this index
660+
# to element of self with this index
661+
for i in numba.prange(len(filtered_idx_indices)):
662+
idx_index_value = filtered_idx_indices[i]
663+
if idx_index_value in map_value_index_to_position:
664+
self_index_pos = map_self_index_to_position[idx_index_value]
665+
value_index_pos = map_value_index_to_position[idx_index_value]
666+
self._data[self_index_pos] = value._data[value_index_pos]
667+
else:
668+
sdc.hiframes.join.setitem_arr_nan(self._data, map_self_index_to_position[idx_index_value])
669+
else:
670+
# use filtered index values to create a set mask, then make assignment to self
671+
# using this mask (i.e. the order of filtered indices in self.index does not matter)
672+
self_index_size = len(self_index)
673+
set_mask = numpy.zeros(self_index_size, dtype=numpy.bool_)
674+
for i in numba.prange(self_index_size):
675+
if self_index[i] in filtered_idx_indices_set:
676+
set_mask[i] = True
677+
self._data[set_mask] = value
678+
679+
return self
680+
681+
return sdc_pandas_series_setitem_idx_bool_series_align_impl
682+
683+
elif isinstance(idx, (SeriesType, types.Array)) and idx_and_self_index_comparable:
684+
685+
# idx is numeric Series or array comparable with self.index, hence reindexing is possible
686+
if isinstance(self.index.dtype, types.Number):
687+
688+
idx_is_series = isinstance(idx, SeriesType)
689+
value_is_scalar = not (value_is_series or value_is_array)
690+
def sdc_pandas_series_setitem_idx_int_series_align_impl(self, idx, value):
691+
692+
_idx = idx._data if idx_is_series == True else idx # noqa
693+
_value = value._data if value_is_series == True else value # noqa
694+
695+
self_index_size = len(self._index)
696+
idx_size = len(_idx)
697+
valid_indices = numpy.repeat(-1, self_index_size)
698+
for i in numba.prange(self_index_size):
699+
for j in numpy.arange(idx_size):
700+
if self._index[i] == _idx[j]:
701+
valid_indices[i] = j
702+
703+
valid_indices_positions = numpy.arange(self_index_size)[valid_indices != -1]
704+
valid_indices_masked = valid_indices[valid_indices != -1]
705+
706+
indexes_found = self._index[valid_indices_positions]
707+
if len(numpy.unique(indexes_found)) != len(indexes_found):
708+
raise ValueError("Reindexing only valid with uniquely valued Index objects")
709+
710+
if len(valid_indices_masked) != idx_size:
711+
raise ValueError("Reindexing not possible: idx has index not found in Series")
712+
713+
if value_is_scalar == True: # noqa
714+
self._data[valid_indices_positions] = _value
715+
else:
716+
self._data[valid_indices_positions] = numpy.take(_value, valid_indices_masked)
717+
718+
return self
719+
720+
return sdc_pandas_series_setitem_idx_int_series_align_impl
721+
722+
elif isinstance(self.index.dtype, types.UnicodeType):
723+
724+
def sdc_pandas_series_setitem_idx_str_series_align_impl(self, idx, value):
725+
726+
map_index_to_position = Dict.empty(
727+
key_type=types.unicode_type,
728+
value_type=types.int32
729+
)
730+
for i, index_value in enumerate(self._index):
731+
if index_value in map_index_to_position:
732+
raise ValueError("Reindexing only valid with uniquely valued Index objects")
733+
map_index_to_position[index_value] = types.int32(i)
734+
735+
idx_data_size = len(idx._data)
736+
number_of_found = 0
737+
set_positions = numpy.empty(idx_data_size, dtype=types.int32)
738+
for i in numba.prange(len(idx._data)):
739+
index_value = idx._data[i]
740+
if index_value in map_index_to_position:
741+
number_of_found += 1
742+
set_positions[i] = map_index_to_position[index_value]
743+
744+
if number_of_found != idx_data_size:
745+
raise ValueError("Reindexing not possible: idx has index not found in Series")
746+
747+
if value_is_series == True: # noqa
748+
self._data[set_positions] = value._data
749+
else:
750+
self._data[set_positions] = value
751+
return self
752+
753+
return sdc_pandas_series_setitem_idx_str_series_align_impl
754+
755+
else: # self.index.dtype other than types.Number or types.Unicode
756+
return None
757+
758+
return None
530759

531760

532761
@sdc_overload_attribute(SeriesType, 'iloc')

0 commit comments

Comments
 (0)