IntelPython
diff --git a/‎dpctl_ext/tensor/CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions b/‎dpctl_ext/tensor/CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp‎
Lines changed: 185 additions & 0 deletions b/‎dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp‎
Lines changed: 185 additions & 0 deletions
diff --git a/‎dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp‎
Lines changed: 190 additions & 0 deletions b/‎dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp‎
Lines changed: 190 additions & 0 deletions
@@ -72,9 +72,9 @@ set(_accumulator_sources
 set(_sorting_sources
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp
     #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp
 )
 
@@ -0,0 +1,185 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/memory_overlap.hpp"
+#include "utils/output_validation.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace dpctl::tensor::py_internal
+{
+
+/// Validates an argsort request and dispatches it to a contiguous
+/// implementation looked up in `sort_contig_fns`.
+///
+/// The leading `src.get_ndim() - trailing_dims_to_sort` axes are iterated
+/// over; the extents of the remaining trailing axes are multiplied together
+/// to give the number of elements in each sorted run.
+///
+/// @param src                    Input array.
+/// @param trailing_dims_to_sort  Number of trailing axes to sort over; must
+///                               be in the range [1, src.get_ndim()].
+/// @param dst                    Output array; must have the same rank and
+///                               shape as `src`, be writable, not overlap
+///                               `src`, and have an int32 or int64 type id.
+/// @param exec_q                 Queue the work is submitted to.
+/// @param depends                Events the submitted work depends on.
+/// @param sort_contig_fns        Table of implementation pointers, indexed as
+///                               `[src type id][dst type id]`.
+///
+/// @return A pair of events: first the event returned by
+///         `dpctl::utils::keep_args_alive`, second the event of the submitted
+///         computation. Two default-constructed events are returned when
+///         there is nothing to sort.
+///
+/// @throws py::value_error if the ranks differ, `trailing_dims_to_sort` is
+///         out of range, the shapes differ, the queue is incompatible with
+///         the arrays, the arrays overlap, `dst` is not int32/int64, the
+///         table holds no implementation for the type pair, or either array
+///         is not C-contiguous. `CheckWritable::throw_if_not_writable` and
+///         `AmpleMemory::throw_if_not_ample` perform their own checks
+///         (exception types are defined by those helpers, not shown here).
+template <typename sorting_contig_impl_fnT>
+std::pair<sycl::event, sycl::event>
+    py_argsort(const dpctl::tensor::usm_ndarray &src,
+               const int trailing_dims_to_sort,
+               const dpctl::tensor::usm_ndarray &dst,
+               sycl::queue &exec_q,
+               const std::vector<sycl::event> &depends,
+               const sorting_contig_impl_fnT &sort_contig_fns)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+    if (src_nd != dst_nd) {
+        throw py::value_error("The input and output arrays must have "
+                              "the same array ranks");
+    }
+    // Number of leading axes that are iterated over rather than sorted.
+    int iteration_nd = src_nd - trailing_dims_to_sort;
+    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
+                              "greater than rank of the array being sorted");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    std::size_t iter_nelems(1);
+
+    // Both loops stop at the first mismatching extent (`same_shapes` is part
+    // of the loop condition), so the element counts are only complete when
+    // `same_shapes` is still true afterwards.
+    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        iter_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    std::size_t sort_nelems(1);
+    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
+        auto src_shape_i = src_shape_ptr[i];
+        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
+        sort_nelems *= static_cast<std::size_t>(src_shape_i);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error(
+            "Destination shape does not match the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // Empty input: the overlap, memory-size, dtype and contiguity checks
+    // below are skipped and nothing is submitted to the queue.
+    if ((iter_nelems == 0) || (sort_nelems == 0)) {
+        // Nothing to do
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    // dst must be able to hold one index per input element.
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
+        dst, sort_nelems * iter_nelems);
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if ((dst_typeid != static_cast<int>(td_ns::typenum_t::INT64)) &&
+        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32)))
+    {
+        throw py::value_error(
+            "Output index array must have data type int32 or int64");
+    }
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+
+    // Only the case where both arrays are C-contiguous is handled; anything
+    // else falls through to the exception at the end of the function.
+    if (is_src_c_contig && is_dst_c_contig) {
+        if (sort_nelems > 1) {
+            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
+
+            // A null table entry means no implementation was registered for
+            // this (source type, index type) pair.
+            auto fn = sort_contig_fns[src_typeid][dst_typeid];
+
+            if (fn == nullptr) {
+                throw py::value_error(
+                    "Not implemented for dtypes of input arrays");
+            }
+
+            // All four offset arguments of the implementation are zero.
+            sycl::event comp_ev =
+                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
+                   dst.get_data(), zero_offset, zero_offset, zero_offset,
+                   zero_offset, depends);
+
+            // NOTE(review): presumably extends the lifetime of src and dst
+            // until comp_ev completes - confirm against dpctl::utils.
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
+
+            return std::make_pair(keep_args_alive_ev, comp_ev);
+        }
+        else {
+            // sort_nelems == 1 here: each sorted run holds a single element,
+            // so every output index is 0 and dst is filled with zero bytes
+            // without calling a sort implementation.
+            assert(dst.get_size() == iter_nelems);
+            int dst_elemsize = dst.get_elemsize();
+            static constexpr int memset_val(0);
+
+            sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(depends);
+
+                cgh.memset(reinterpret_cast<void *>(dst.get_data()), memset_val,
+                           iter_nelems * dst_elemsize);
+            });
+
+            sycl::event keep_args_alive_ev =
+                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {fill_ev});
+
+            return std::make_pair(keep_args_alive_ev, fill_ev);
+        }
+    }
+
+    throw py::value_error(
+        "Both source and destination arrays must be C-contiguous");
+}
+
+} // namespace dpctl::tensor::py_internal
@@ -0,0 +1,190 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_sorting_impl
+/// extension.
+//===----------------------------------------------------------------------===//
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/sorting/radix_sort.hpp"
+#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
+
+#include "py_argsort_common.hpp"
+#include "radix_argsort.hpp"
+#include "radix_sort_support.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
+
+using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+// Tables of radix-argsort implementation pointers, one per sort direction.
+// Both are `num_types` x `num_types` and are filled in by
+// init_radix_argsort_dispatch_tables(). As objects with static storage
+// duration they are zero-initialized, so every entry is null until then.
+static sort_contig_fn_ptr_t
+    ascending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+static sort_contig_fn_ptr_t
+    descending_radix_argsort_contig_dispatch_table[td_ns::num_types]
+                                                  [td_ns::num_types];
+
+namespace
+{
+
+/// Adapter around `radix_argsort_axis1_contig_impl<T, I>` whose sort
+/// direction is fixed at compile time.
+///
+/// The `is_ascending` template parameter is forwarded as the second argument
+/// of the implementation; every runtime argument is passed through
+/// unchanged, and the event returned by the implementation is returned.
+template <bool is_ascending, typename T, typename I>
+sycl::event argsort_axis1_contig_caller(sycl::queue &q,
+                                        std::size_t iter_nelems,
+                                        std::size_t sort_nelems,
+                                        const char *arg_cp,
+                                        char *res_cp,
+                                        ssize_t iter_arg_offset,
+                                        ssize_t iter_res_offset,
+                                        ssize_t sort_arg_offset,
+                                        ssize_t sort_res_offset,
+                                        const std::vector<sycl::event> &depends)
+{
+    using dpctl::tensor::kernels::radix_argsort_axis1_contig_impl;
+
+    // Direction baked into this instantiation.
+    static constexpr bool sort_ascending = is_ascending;
+
+    sycl::event sort_ev = radix_argsort_axis1_contig_impl<T, I>(
+        q, sort_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
+        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
+        depends);
+
+    return sort_ev;
+}
+
+} // end of anonymous namespace
+
+/// Factory producing the ascending radix-argsort entry for one
+/// (argTy, IndexTy) pair.
+///
+/// `get()` yields a pointer to the ascending caller when
+/// `RadixSortSupportVector<argTy>::is_defined` is true and `IndexTy` is
+/// `std::int32_t` or `std::int64_t`; otherwise it yields `nullptr`.
+template <typename fnT, typename argTy, typename IndexTy>
+struct AscendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        constexpr bool value_type_ok =
+            RadixSortSupportVector<argTy>::is_defined;
+        constexpr bool index_type_ok =
+            std::is_same_v<IndexTy, std::int64_t> ||
+            std::is_same_v<IndexTy, std::int32_t>;
+
+        if constexpr (!(value_type_ok && index_type_ok)) {
+            // Unsupported type combination: leave the table slot empty.
+            return nullptr;
+        }
+        else {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ true, argTy, IndexTy>;
+        }
+    }
+};
+
+/// Factory producing the descending radix-argsort entry for one
+/// (argTy, IndexTy) pair.
+///
+/// `get()` yields a pointer to the descending caller when
+/// `RadixSortSupportVector<argTy>::is_defined` is true and `IndexTy` is
+/// `std::int32_t` or `std::int64_t`; otherwise it yields `nullptr`.
+template <typename fnT, typename argTy, typename IndexTy>
+struct DescendingRadixArgSortContigFactory
+{
+    fnT get()
+    {
+        constexpr bool value_type_ok =
+            RadixSortSupportVector<argTy>::is_defined;
+        constexpr bool index_type_ok =
+            std::is_same_v<IndexTy, std::int64_t> ||
+            std::is_same_v<IndexTy, std::int32_t>;
+
+        if constexpr (!(value_type_ok && index_type_ok)) {
+            // Unsupported type combination: leave the table slot empty.
+            return nullptr;
+        }
+        else {
+            return argsort_axis1_contig_caller<
+                /*ascending*/ false, argTy, IndexTy>;
+        }
+    }
+};
+
+/// Populates the ascending and the descending radix-argsort dispatch tables,
+/// in that order, using the corresponding factory for each.
+void init_radix_argsort_dispatch_tables(void)
+{
+    using fn_ptr_t = dpctl::tensor::kernels::sort_contig_fn_ptr_t;
+
+    using AscendingBuilderT =
+        td_ns::DispatchTableBuilder<fn_ptr_t,
+                                    AscendingRadixArgSortContigFactory,
+                                    td_ns::num_types>;
+    using DescendingBuilderT =
+        td_ns::DispatchTableBuilder<fn_ptr_t,
+                                    DescendingRadixArgSortContigFactory,
+                                    td_ns::num_types>;
+
+    AscendingBuilderT ascending_builder;
+    ascending_builder.populate_dispatch_table(
+        ascending_radix_argsort_contig_dispatch_table);
+
+    DescendingBuilderT descending_builder;
+    descending_builder.populate_dispatch_table(
+        descending_radix_argsort_contig_dispatch_table);
+}
+
+/// Registers `_radix_argsort_ascending` and `_radix_argsort_descending` on
+/// module `m`.
+///
+/// The dispatch tables are populated first. Each registered function forwards
+/// its arguments to `py_argsort` together with the dispatch table for its
+/// sort direction. `depends` defaults to an empty list on the Python side.
+void init_radix_argsort_functions(py::module_ m)
+{
+    init_radix_argsort_dispatch_tables();
+
+    using dpctl::tensor::usm_ndarray;
+    using event_vec_t = std::vector<sycl::event>;
+    using event_pair_t = std::pair<sycl::event, sycl::event>;
+
+    // Ascending order.
+    auto argsort_ascending_fn =
+        [](const usm_ndarray &src, const int trailing_dims_to_sort,
+           const usm_ndarray &dst, sycl::queue &exec_q,
+           const event_vec_t &depends) -> event_pair_t {
+        return dpctl::tensor::py_internal::py_argsort(
+            src, trailing_dims_to_sort, dst, exec_q, depends,
+            ascending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_ascending", argsort_ascending_fn, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+    // Descending order.
+    auto argsort_descending_fn =
+        [](const usm_ndarray &src, const int trailing_dims_to_sort,
+           const usm_ndarray &dst, sycl::queue &exec_q,
+           const event_vec_t &depends) -> event_pair_t {
+        return dpctl::tensor::py_internal::py_argsort(
+            src, trailing_dims_to_sort, dst, exec_q, depends,
+            descending_radix_argsort_contig_dispatch_table);
+    };
+    m.def("_radix_argsort_descending", argsort_descending_fn, py::arg("src"),
+          py::arg("trailing_dims_to_sort"), py::arg("dst"),
+          py::arg("sycl_queue"), py::arg("depends") = py::list());
+}
+
+} // namespace dpctl::tensor::py_internal