Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit e3f53bc

Browse files
Hardcode84 and shssf
authored and committed
WIP: Port to numba master (#338)
* remove boost.regex dependency
* adapt for get_parfor_reductions interface changes
* disable tests due to dead-code parfor regression
* lambda type_infer quickfix
* quick fix
* fix define sig
* old style: fixes for lambda inlining
* fix series combine
* some work on df.apply
* fix rolling
* remove commented code
* expected failures
* style
1 parent 10dd390 commit e3f53bc

13 files changed

Lines changed: 24 additions & 63 deletions

sdc/distributed.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1770,7 +1770,7 @@ def _gen_parfor_reductions(self, parfor, namevar_table):
17701770
_, reductions = get_parfor_reductions(
17711771
parfor, parfor.params, self.state.calltypes)
17721772

1773-
for reduce_varname, (init_val, reduce_nodes) in reductions.items():
1773+
for reduce_varname, (init_val, reduce_nodes, _) in reductions.items():
17741774
reduce_op = guard(self._get_reduce_op, reduce_nodes)
17751775
# TODO: initialize reduction vars (arrays)
17761776
reduce_var = namevar_table[reduce_varname]

sdc/hiframes/aggregate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,7 @@ def aggregate_array_analysis(aggregate_node, equiv_set, typemap,
438438
equiv_set.insert_equiv(col_var, shape)
439439
post.extend(c_post)
440440
all_shapes.append(shape[0])
441-
equiv_set.define(col_var)
441+
equiv_set.define(col_var, {})
442442

443443
if len(all_shapes) > 1:
444444
equiv_set.insert_equiv(*all_shapes)

sdc/hiframes/dataframe_pass.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,8 @@ def run_pass(self):
140140
out_nodes = [inst]
141141

142142
if isinstance(inst, ir.Assign):
143-
self.state.func_ir._definitions[inst.target.name].remove(inst.value)
143+
if inst.value in self.state.func_ir._definitions[inst.target.name]:
144+
self.state.func_ir._definitions[inst.target.name].remove(inst.value)
144145
out_nodes = self._run_assign(inst)
145146
elif isinstance(inst, (ir.SetItem, ir.StaticSetItem)):
146147
out_nodes = self._run_setitem(inst)

sdc/hiframes/filter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def filter_array_analysis(filter_node, equiv_set, typemap, array_analysis):
100100
equiv_set.insert_equiv(col_var, shape)
101101
post.extend(c_post)
102102
all_shapes.append(shape[0])
103-
equiv_set.define(col_var)
103+
equiv_set.define(col_var, {})
104104

105105
if len(all_shapes) > 1:
106106
equiv_set.insert_equiv(*all_shapes)

sdc/hiframes/hiframes_typed.py

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1212,10 +1212,7 @@ def _handle_series_map(self, assign, lhs, rhs, series_var):
12121212
# error checking: make sure there is function input only
12131213
if len(rhs.args) != 1:
12141214
raise ValueError("map expects 1 argument")
1215-
func = guard(get_definition, self.state.func_ir, rhs.args[0])
1216-
if func is None or not (isinstance(func, ir.Expr)
1217-
and func.op == 'make_function'):
1218-
raise ValueError("lambda for map not found")
1215+
func = guard(get_definition, self.state.func_ir, rhs.args[0]).value.py_func
12191216

12201217
dtype = self.state.typemap[series_var.name].dtype
12211218
nodes = []
@@ -1382,11 +1379,7 @@ def _handle_series_combine(self, assign, lhs, rhs, series_var):
13821379
raise ValueError("not enough arguments in call to combine")
13831380
if len(rhs.args) > 3:
13841381
raise ValueError("too many arguments in call to combine")
1385-
func = guard(get_definition, self.state.func_ir, rhs.args[1])
1386-
if func is None or not (isinstance(func, ir.Expr)
1387-
and func.op == 'make_function'):
1388-
raise ValueError("lambda for combine not found")
1389-
1382+
func = guard(get_definition, self.state.func_ir, rhs.args[1]).value.py_func
13901383
out_typ = self.state.typemap[lhs.name].dtype
13911384
other = rhs.args[0]
13921385
nodes = []
@@ -1533,19 +1526,16 @@ def f(arr, w, center): # pragma: no cover
15331526
def _handle_rolling_apply_func(self, func_node, dtype, out_dtype):
15341527
if func_node is None:
15351528
raise ValueError("cannot find kernel function for rolling.apply() call")
1529+
func_node = func_node.value.py_func
15361530
# TODO: more error checking on the kernel to make sure it doesn't
15371531
# use global/closure variables
1538-
if func_node.closure is not None:
1539-
raise ValueError("rolling apply kernel functions cannot have closure variables")
1540-
if func_node.defaults is not None:
1541-
raise ValueError("rolling apply kernel functions cannot have default arguments")
15421532
# create a function from the code object
15431533
glbs = self.state.func_ir.func_id.func.__globals__
15441534
lcs = {}
15451535
exec("def f(A): return A", glbs, lcs)
15461536
kernel_func = lcs['f']
1547-
kernel_func.__code__ = func_node.code
1548-
kernel_func.__name__ = func_node.code.co_name
1537+
kernel_func.__code__ = func_node.__code__
1538+
kernel_func.__name__ = func_node.__code__.co_name
15491539
# use hpat's sequential pipeline to enable pandas operations
15501540
# XXX seq pipeline used since dist pass causes a hang
15511541
m = numba.ir_utils._max_label

sdc/hiframes/join.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def join_array_analysis(join_node, equiv_set, typemap, array_analysis):
131131
equiv_set.insert_equiv(col_var, shape)
132132
post.extend(c_post)
133133
all_shapes.append(shape[0])
134-
equiv_set.define(col_var)
134+
equiv_set.define(col_var, {})
135135

136136
if len(all_shapes) > 1:
137137
equiv_set.insert_equiv(*all_shapes)

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -142,10 +142,6 @@ def resolve_values(self, ary):
142142
def resolve_apply(self, df, args, kws):
143143
kws = dict(kws)
144144
func = args[0] if len(args) > 0 else kws.get('func', None)
145-
# check lambda
146-
if not isinstance(func, types.MakeFunctionLiteral):
147-
raise ValueError("df.apply(): lambda not found")
148-
149145
# check axis
150146
axis = args[1] if len(args) > 1 else kws.get('axis', None)
151147
if (axis is None or not isinstance(axis, types.IntegerLiteral)
@@ -165,12 +161,8 @@ def resolve_apply(self, df, args, kws):
165161
dtypes.append(el_typ)
166162

167163
row_typ = types.NamedTuple(dtypes, Row)
168-
code = func.literal_value.code
169-
f_ir = numba.ir_utils.get_ir_of_code({'np': np}, code)
170-
_, f_return_type, _ = numba.typed_passes.type_inference_stage(
171-
self.context, f_ir, (row_typ,), None)
172-
173-
return signature(SeriesType(f_return_type), *args)
164+
t = func.get_call_type(self.context, (row_typ,), {})
165+
return signature(SeriesType(t.return_type), *args)
174166

175167
@bound_function("df.describe")
176168
def resolve_describe(self, df, args, kws):

sdc/hiframes/pd_series_ext.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -564,18 +564,8 @@ def _resolve_map_func(self, ary, args, kws):
564564
# getitem returns Timestamp for dt_index and series(dt64)
565565
if dtype == types.NPDatetime('ns'):
566566
dtype = pandas_timestamp_type
567-
code = args[0].literal_value.code
568-
_globals = {'np': np}
569-
# XXX hack in hiframes_typed to make globals available
570-
if hasattr(args[0].literal_value, 'globals'):
571-
# TODO: use code.co_names to find globals actually used?
572-
_globals = args[0].literal_value.globals
573-
574-
f_ir = numba.ir_utils.get_ir_of_code(_globals, code)
575-
f_typemap, f_return_type, f_calltypes = numba.typed_passes.type_inference_stage(
576-
self.context, f_ir, (dtype,), None)
577-
578-
return signature(SeriesType(f_return_type), *args)
567+
t = args[0].get_call_type(self.context, (dtype,), {})
568+
return signature(SeriesType(t.return_type), *args)
579569

580570
@bound_function("series.map")
581571
def resolve_map(self, ary, args, kws):
@@ -594,11 +584,8 @@ def _resolve_combine_func(self, ary, args, kws):
594584
dtype2 = args[0].dtype
595585
if dtype2 == types.NPDatetime('ns'):
596586
dtype2 = pandas_timestamp_type
597-
code = args[1].literal_value.code
598-
f_ir = numba.ir_utils.get_ir_of_code({'np': np}, code)
599-
f_typemap, f_return_type, f_calltypes = numba.typed_passes.type_inference_stage(
600-
self.context, f_ir, (dtype1, dtype2,), None)
601-
return signature(SeriesType(f_return_type), *args)
587+
t = args[1].get_call_type(self.context, (dtype1, dtype2,), {})
588+
return signature(SeriesType(t.return_type), *args)
602589

603590
@bound_function("series.combine")
604591
def resolve_combine(self, ary, args, kws):

sdc/io/csv_ext.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def csv_array_analysis(csv_node, equiv_set, typemap, array_analysis):
9393
equiv_set.insert_equiv(col_var, shape)
9494
post.extend(c_post)
9595
all_shapes.append(shape[0])
96-
equiv_set.define(col_var)
96+
equiv_set.define(col_var, {})
9797

9898
if len(all_shapes) > 1:
9999
equiv_set.insert_equiv(*all_shapes)

sdc/tests/test_basic.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,7 @@ def test_array_reduce(self):
327327
self.assertEqual(count_array_OneDs(), 0)
328328
self.assertEqual(count_parfor_OneDs(), 1)
329329

330-
@unittest.skipIf(check_numba_version('0.46.0'),
331-
"Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
330+
@unittest.expectedFailure # https://github.com/numba/numba/issues/4690
332331
def test_dist_return(self):
333332
def test_impl(N):
334333
A = np.arange(N)
@@ -345,8 +344,7 @@ def test_impl(N):
345344
self.assertEqual(count_array_OneDs(), 1)
346345
self.assertEqual(count_parfor_OneDs(), 1)
347346

348-
@unittest.skipIf(check_numba_version('0.46.0'),
349-
"Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
347+
@unittest.expectedFailure # https://github.com/numba/numba/issues/4690
350348
def test_dist_return_tuple(self):
351349
def test_impl(N):
352350
A = np.arange(N)
@@ -375,8 +373,7 @@ def test_impl(A):
375373
np.testing.assert_allclose(hpat_func(arr) / self.num_ranks, test_impl(arr))
376374
self.assertEqual(count_array_OneDs(), 1)
377375

378-
@unittest.skipIf(check_numba_version('0.46.0'),
379-
"Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
376+
@unittest.expectedFailure # https://github.com/numba/numba/issues/4690
380377
def test_rebalance(self):
381378
def test_impl(N):
382379
A = np.arange(n)
@@ -394,8 +391,7 @@ def test_impl(N):
394391
finally:
395392
sdc.distributed_analysis.auto_rebalance = False
396393

397-
@unittest.skipIf(check_numba_version('0.46.0'),
398-
"Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
394+
@unittest.expectedFailure # https://github.com/numba/numba/issues/4690
399395
def test_rebalance_loop(self):
400396
def test_impl(N):
401397
A = np.arange(n)

0 commit comments

Comments (0)