Skip to content

Commit 400c94d

Browse files
committed
use tbb::concurrent_vector for multithreaded use of matrix_cl types
1 parent 3789de6 commit 400c94d

5 files changed

Lines changed: 50 additions & 134 deletions

File tree

stan/math/opencl/copy.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,10 @@ inline auto from_matrix_cl(const T& src) {
9898
try {
9999
cl::Event copy_event;
100100
const cl::CommandQueue queue = opencl_context.queue();
101+
std::vector<cl::Event> copy_write_events(src.write_events().begin(), src.write_events().end());
101102
queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
102103
sizeof(T_val) * dst.size(), dst.data(),
103-
&src.write_events(), &copy_event);
104+
&copy_write_events, &copy_event);
104105
copy_event.wait();
105106
src.clear_write_events();
106107
} catch (const cl::Error& e) {
@@ -151,8 +152,9 @@ inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
151152
try {
152153
cl::Event copy_event;
153154
const cl::CommandQueue queue = opencl_context.queue();
155+
std::vector<cl::Event> copy_write_events(src.write_events().begin(), src.write_events().end());
154156
queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
155-
sizeof(T), &dst, &src.write_events(), &copy_event);
157+
sizeof(T), &dst, &copy_write_events, &copy_event);
156158
copy_event.wait();
157159
src.clear_write_events();
158160
} catch (const cl::Error& e) {
@@ -183,9 +185,10 @@ inline T_dst from_matrix_cl(const matrix_cl<T>& src) {
183185
try {
184186
cl::Event copy_event;
185187
const cl::CommandQueue queue = opencl_context.queue();
188+
std::vector<cl::Event> copy_write_events(src.write_events().begin(), src.write_events().end());
186189
queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0,
187190
sizeof(T) * src.rows(), dst.data(),
188-
&src.write_events(), &copy_event);
191+
&copy_write_events, &copy_event);
189192
copy_event.wait();
190193
src.clear_write_events();
191194
} catch (const cl::Error& e) {
@@ -257,7 +260,7 @@ inline auto packed_copy(const T& src) {
257260
packed, src, src.rows(), src.rows(),
258261
src.view());
259262
const std::vector<cl::Event> mat_events
260-
= vec_concat(packed.read_write_events(), src.write_events());
263+
= vec_concat(std::vector<cl::Event>{}, packed.read_write_events(), src.write_events());
261264
cl::Event copy_event;
262265
queue.enqueueReadBuffer(packed.buffer(), opencl_context.in_order(), 0,
263266
sizeof(T_val) * packed_size, dst.data(),

stan/math/opencl/kernel_cl.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -109,17 +109,17 @@ inline void assign_events(const cl::Event& new_event, CallArg& m,
109109
* @return A vector of OpenCL events.
110110
*/
111111
template <typename T, require_not_matrix_cl_t<T>* = nullptr>
112-
inline std::vector<cl::Event> select_events(const T& m) {
113-
return {};
112+
inline tbb::concurrent_vector<cl::Event> select_events(const T& m) {
113+
return tbb::concurrent_vector<cl::Event>{};
114114
}
115115
template <typename T, typename K, require_matrix_cl_t<K>* = nullptr,
116116
require_same_t<T, in_buffer>* = nullptr>
117-
inline const std::vector<cl::Event>& select_events(const K& m) {
117+
inline const tbb::concurrent_vector<cl::Event>& select_events(const K& m) {
118118
return m.write_events();
119119
}
120120
template <typename T, typename K, require_matrix_cl_t<K>* = nullptr,
121121
require_any_same_t<T, out_buffer, in_out_buffer>* = nullptr>
122-
inline std::vector<cl::Event> select_events(K& m) {
122+
inline tbb::concurrent_vector<cl::Event> select_events(K& m) {
123123
static_assert(!std::is_const<K>::value, "Can not write to const matrix_cl!");
124124
return m.read_write_events();
125125
}
@@ -205,7 +205,7 @@ struct kernel_cl {
205205
opencl_context.register_kernel_cache(&kernel_);
206206
}
207207
cl::EnqueueArgs eargs(opencl_context.queue(),
208-
vec_concat(internal::select_events<Args>(args)...),
208+
vec_concat(std::vector<cl::Event>{}, internal::select_events<Args>(args)...),
209209
global_thread_size);
210210
cl::KernelFunctor<internal::to_const_buffer_t<Args>&...> kernel_functor(
211211
kernel_);
@@ -232,7 +232,7 @@ struct kernel_cl {
232232
opencl_context.register_kernel_cache(&kernel_);
233233
}
234234
cl::EnqueueArgs eargs(opencl_context.queue(),
235-
vec_concat(internal::select_events<Args>(args)...),
235+
vec_concat(std::vector<cl::Event>{}, internal::select_events<Args>(args)...),
236236
global_thread_size, thread_block_size);
237237
cl::KernelFunctor<internal::to_const_buffer_t<Args>&...> kernel_functor(
238238
kernel_);

stan/math/opencl/matrix_cl.hpp

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <stan/math/prim/fun/Eigen.hpp>
1313
#include <stan/math/prim/fun/vec_concat.hpp>
1414
#include <CL/opencl.hpp>
15+
#include <tbb/concurrent_vector.h>
1516
#include <algorithm>
1617
#include <iostream>
1718
#include <string>
@@ -50,8 +51,8 @@ class matrix_cl : public matrix_cl_base {
5051
int cols_{0}; // Number of columns.
5152
// Holds info on if matrix is a special type
5253
matrix_cl_view view_{matrix_cl_view::Entire};
53-
mutable std::vector<cl::Event> write_events_; // Tracks write jobs
54-
mutable std::vector<cl::Event> read_events_; // Tracks reads
54+
mutable tbb::concurrent_vector<cl::Event> write_events_; // Tracks write jobs
55+
mutable tbb::concurrent_vector<cl::Event> read_events_; // Tracks reads
5556

5657
public:
5758
using Scalar = T; // Underlying type of the matrix
@@ -99,23 +100,23 @@ class matrix_cl : public matrix_cl_base {
99100
* Get the events from the event stacks.
100101
* @return The write event stack.
101102
*/
102-
inline const std::vector<cl::Event>& write_events() const {
103+
inline const tbb::concurrent_vector<cl::Event>& write_events() const {
103104
return write_events_;
104105
}
105106

106107
/**
107108
* Get the events from the event stacks.
108109
* @return The read/write event stack.
109110
*/
110-
inline const std::vector<cl::Event>& read_events() const {
111+
inline const tbb::concurrent_vector<cl::Event>& read_events() const {
111112
return read_events_;
112113
}
113114

114115
/**
115116
* Get the events from the event stacks.
116117
* @return The read/write event stack.
117118
*/
118-
inline const std::vector<cl::Event> read_write_events() const {
119+
inline const tbb::concurrent_vector<cl::Event> read_write_events() const {
119120
return vec_concat(this->read_events(), this->write_events());
120121
}
121122

@@ -615,15 +616,29 @@ class matrix_cl : public matrix_cl_base {
615616
* @param A matrix_cl
616617
*/
617618
void initialize_buffer_cl(const matrix_cl<T>& A) {
619+
cl::Event cstr_event;
620+
std::vector<cl::Event>* dep_events =
621+
new std::vector<cl::Event>(A.write_events().begin(),
622+
A.write_events().end());
618623
try {
619-
cl::Event cstr_event;
620624
opencl_context.queue().enqueueCopyBuffer(A.buffer(), this->buffer(), 0, 0,
621625
A.size() * sizeof(T),
622-
&A.write_events(), &cstr_event);
626+
dep_events, &cstr_event);
627+
if (opencl_context.device()[0].getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>()) {
628+
buffer_cl_.setDestructorCallback(
629+
&delete_it_destructor<std::vector<cl::Event>>, dep_events);
630+
} else {
631+
cstr_event.setCallback(CL_COMPLETE,
632+
&delete_it_event<std::vector<cl::Event>>, dep_events);
633+
}
623634
this->add_write_event(cstr_event);
624635
A.add_read_event(cstr_event);
625636
} catch (const cl::Error& e) {
637+
delete dep_events;
626638
check_opencl_error("copy (OpenCL)->(OpenCL)", e);
639+
} catch (...) {
640+
delete dep_events;
641+
throw;
627642
}
628643
}
629644

stan/math/prim/fun/vec_concat.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ inline void append_vectors(VecInOut& x) {}
3737
template <typename VecInOut, typename VecIn, typename... VecArgs>
3838
inline void append_vectors(VecInOut& x, const VecIn& y,
3939
const VecArgs&... args) {
40-
x.insert(x.end(), y.begin(), y.end());
40+
for (auto& yy : y) {
41+
x.push_back(yy);
42+
}
4143
append_vectors(x, args...);
4244
}
4345
} // namespace internal
@@ -53,7 +55,7 @@ inline void append_vectors(VecInOut& x, const VecIn& y,
5355
*/
5456
template <typename Vec, typename... Args>
5557
inline auto vec_concat(const Vec& v1, const Args&... args) {
56-
std::vector<value_type_t<Vec>> vec;
58+
Vec vec;
5759
vec.reserve(internal::sum_vector_sizes(v1, args...));
5860
internal::append_vectors(vec, v1, args...);
5961
return vec;

test/unit/math/opencl/rev/normal_lccdf_test.cpp

Lines changed: 11 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -5,139 +5,35 @@
55
#include <test/unit/math/opencl/util.hpp>
66
#include <vector>
77

8-
TEST(ProbDistributionsNormalLccdf, error_checking) {
9-
int N = 3;
10-
11-
Eigen::VectorXd y(N);
12-
y << 0.3, 0.8, 1.0;
13-
Eigen::VectorXd y_size(N - 1);
14-
y_size << 0.3, 0.8;
15-
Eigen::VectorXd y_value(N);
16-
y_value << 0.3, NAN, 0.5;
17-
18-
Eigen::VectorXd mu(N);
19-
mu << 0.3, 0.8, 1.0;
20-
Eigen::VectorXd mu_size(N - 1);
21-
mu_size << 0.3, 0.8;
22-
Eigen::VectorXd mu_value(N);
23-
mu_value << 0.3, -INFINITY, 0.5;
24-
25-
Eigen::VectorXd sigma(N);
26-
sigma << 0.3, 0.8, 1.0;
27-
Eigen::VectorXd sigma_size(N - 1);
28-
sigma_size << 0.3, 0.8;
29-
Eigen::VectorXd sigma_value(N);
30-
sigma_value << 0.3, 0, 0.5;
31-
32-
stan::math::matrix_cl<double> y_cl(y);
33-
stan::math::matrix_cl<double> y_size_cl(y_size);
34-
stan::math::matrix_cl<double> y_value_cl(y_value);
35-
stan::math::matrix_cl<double> mu_cl(mu);
36-
stan::math::matrix_cl<double> mu_size_cl(mu_size);
37-
stan::math::matrix_cl<double> mu_value_cl(mu_value);
38-
stan::math::matrix_cl<double> sigma_cl(sigma);
39-
stan::math::matrix_cl<double> sigma_size_cl(sigma_size);
40-
stan::math::matrix_cl<double> sigma_value_cl(sigma_value);
41-
42-
EXPECT_NO_THROW(stan::math::normal_lccdf(y_cl, mu_cl, sigma_cl));
43-
44-
EXPECT_THROW(stan::math::normal_lccdf(y_size_cl, mu_cl, sigma_cl),
45-
std::invalid_argument);
46-
EXPECT_THROW(stan::math::normal_lccdf(y_cl, mu_size_cl, sigma_cl),
47-
std::invalid_argument);
48-
EXPECT_THROW(stan::math::normal_lccdf(y_cl, mu_cl, sigma_size_cl),
49-
std::invalid_argument);
50-
51-
EXPECT_THROW(stan::math::normal_lccdf(y_value_cl, mu_cl, sigma_cl),
52-
std::domain_error);
53-
EXPECT_THROW(stan::math::normal_lccdf(y_cl, mu_value_cl, sigma_cl),
54-
std::domain_error);
55-
EXPECT_THROW(stan::math::normal_lccdf(y_cl, mu_cl, sigma_value_cl),
56-
std::domain_error);
57-
}
588

599
auto normal_lccdf_functor
6010
= [](const auto& y, const auto& mu, const auto& sigma) {
6111
return stan::math::normal_lccdf(y, mu, sigma);
6212
};
6313

64-
TEST(ProbDistributionsNormalLccdf, opencl_matches_cpu_small) {
65-
int N = 3;
66-
int M = 2;
6714

68-
Eigen::VectorXd y(N);
69-
y << 0.3, 0.8, 1.0;
70-
Eigen::VectorXd mu(N);
71-
mu << -0.3, -0.8, 1.01;
72-
Eigen::VectorXd sigma(N);
73-
sigma << 0.3, 0.1, 1.0;
74-
75-
stan::math::test::compare_cpu_opencl_prim_rev(normal_lccdf_functor, y, mu,
76-
sigma);
77-
stan::math::test::compare_cpu_opencl_prim_rev(
78-
normal_lccdf_functor, y.transpose().eval(), mu.transpose().eval(),
79-
sigma.transpose().eval());
80-
}
81-
82-
TEST(ProbDistributionsNormalLccdf, opencl_broadcast_y) {
83-
int N = 3;
84-
85-
double y_scal = 12.3;
86-
Eigen::VectorXd mu(N);
87-
mu << 0.5, 1.2, 1.0;
88-
Eigen::VectorXd sigma(N);
89-
sigma << 0.3, 0.8, 1.0;
90-
91-
stan::math::test::test_opencl_broadcasting_prim_rev<0>(normal_lccdf_functor,
92-
y_scal, mu, sigma);
93-
stan::math::test::test_opencl_broadcasting_prim_rev<0>(
94-
normal_lccdf_functor, y_scal, mu.transpose().eval(), sigma);
95-
}
96-
97-
TEST(ProbDistributionsNormalLccdf, opencl_broadcast_mu) {
98-
int N = 3;
99-
100-
Eigen::VectorXd y(N);
101-
y << 0.3, 0.8, 1.0;
102-
double mu_scal = 12.3;
103-
Eigen::VectorXd sigma(N);
104-
sigma << 0.3, 0.8, 1.0;
105-
106-
stan::math::test::test_opencl_broadcasting_prim_rev<1>(normal_lccdf_functor,
107-
y, mu_scal, sigma);
108-
stan::math::test::test_opencl_broadcasting_prim_rev<1>(
109-
normal_lccdf_functor, y.transpose().eval(), mu_scal, sigma);
110-
}
111-
112-
TEST(ProbDistributionsNormalLccdf, opencl_broadcast_sigma) {
113-
int N = 3;
114-
115-
Eigen::VectorXd y(N);
116-
y << 0.3, 0.8, 1.0;
117-
Eigen::VectorXd mu(N);
118-
mu << 0.3, 0.8, 1.0;
119-
double sigma_scal = 12.3;
120-
121-
stan::math::test::test_opencl_broadcasting_prim_rev<2>(normal_lccdf_functor,
122-
y, mu, sigma_scal);
123-
stan::math::test::test_opencl_broadcasting_prim_rev<2>(
124-
normal_lccdf_functor, y.transpose().eval(), mu, sigma_scal);
125-
}
12615

12716
TEST(ProbDistributionsNormalLccdf, opencl_matches_cpu_big) {
12817
int N = 153;
12918

130-
Eigen::Matrix<double, Eigen::Dynamic, 1> y
131-
= Eigen::Array<double, Eigen::Dynamic, 1>::Random(N, 1).abs();
19+
std::srand(123);
20+
for (int i = 0; i < 10; ++i) {
13221
Eigen::Matrix<double, Eigen::Dynamic, 1> mu
133-
= Eigen::Array<double, Eigen::Dynamic, 1>::Random(N, 1).abs();
22+
= Eigen::Array<double, Eigen::Dynamic, 1>::Random(N, 1) + 1.0;
13423
Eigen::Matrix<double, Eigen::Dynamic, 1> sigma
13524
= Eigen::Array<double, Eigen::Dynamic, 1>::Random(N, 1).abs() + 0.01;
136-
25+
Eigen::Matrix<double, Eigen::Dynamic, 1> y = (mu.array() * sigma.array()).matrix();
26+
std::cout << "Iter: " << i << " mu, sigma, y" << std::endl;
27+
for (int j = 0; j < N; j++) {
28+
std::cout << mu(j) << ", " << sigma(j) << ", " << y(j) << std::endl;
29+
}
30+
std::cout << "-----------compare_cpu_opencl_prim_rev" << std::endl;
13731
stan::math::test::compare_cpu_opencl_prim_rev(normal_lccdf_functor, y, mu,
13832
sigma);
33+
std::cout << "-----------compare_cpu_opencl_prim_rev transpose" << std::endl;
13934
stan::math::test::compare_cpu_opencl_prim_rev(
14035
normal_lccdf_functor, y.transpose().eval(), mu.transpose().eval(),
14136
sigma.transpose().eval());
14237
}
38+
}
14339
#endif

0 commit comments

Comments
 (0)