1616#include < utility> // std::pair
1717#include < vector>
1818
19-
2019#ifdef __EMSCRIPTEN__
2120#include " emscripten/emscripten.h"
2221#endif
@@ -1106,7 +1105,7 @@ inline Context createContext(const WGPUInstanceDescriptor &desc = {},
11061105 * @param ctx The Context containing the WebGPU instance handle.
11071106 * @return std::vector<dawn::native::Adapter> A vector of available GPU
11081107 * adapters.
1109- *
1108+ *
11101109 * @code
11111110 * std::vector<dawn::native::Adapter> adapters = getAdapters(ctx);
11121111 * @endcode
@@ -1118,21 +1117,25 @@ inline std::vector<dawn::native::Adapter> getAdapters(Context &ctx) {
11181117}
11191118
11201119/* *
1121- * @brief Formats the given vector of Dawn adapters into a single concatenated string.
1120+ * @brief Formats the given vector of Dawn adapters into a single concatenated
1121+ * string.
11221122 *
1123- * This function iterates over each Dawn adapter in the provided vector, retrieves its
1124- * description using the WebGPU API, and converts the description from a WGPUStringView
1125- * to an std::string using the formatWGPUStringView helper. The resulting descriptions
1126- * are concatenated into a single string separated by newline characters.
1123+ * This function iterates over each Dawn adapter in the provided vector,
1124+ * retrieves its description using the WebGPU API, and converts the description
1125+ * from a WGPUStringView to an std::string using the formatWGPUStringView
1126+ * helper. The resulting descriptions are concatenated into a single string
1127+ * separated by newline characters.
11271128 *
11281129 * @param adapters A vector of Dawn adapters obtained from a WebGPU instance.
1129- * @return std::string A newline-delimited string listing each adapter's description.
1130- *
1130+ * @return std::string A newline-delimited string listing each adapter's
1131+ * description.
1132+ *
11311133 * @code
11321134 * std::string adapterList = formatAdapters(adapters);
11331135 * @endcode
11341136 */
1135- inline std::string formatAdapters (const std::vector<dawn::native::Adapter> &adapters) {
1137+ inline std::string
1138+ formatAdapters (const std::vector<dawn::native::Adapter> &adapters) {
11361139 std::string adapterList;
11371140 for (size_t i = 0 ; i < adapters.size (); ++i) {
11381141 auto adapterPtr = adapters[i].Get ();
@@ -1157,7 +1160,7 @@ inline std::string formatAdapters(const std::vector<dawn::native::Adapter> &adap
11571160 * @param ctx The Context containing the WebGPU instance handle.
11581161 * @return std::string A newline-delimited string listing each adapter's
11591162 * description.
1160- *
1163+ *
11611164 * @code
11621165 * std::string adapterList = listAdapters(ctx);
11631166 * @endcode
@@ -1181,7 +1184,7 @@ inline std::string listAdapters(Context &ctx) {
11811184 * @param devDescriptor Device descriptor for the WebGPU device (optional)
11821185 * @return std::future<Context> A future that will eventually hold the created
11831186 * Context.
1184- *
1187+ *
11851188 * @code
11861189 * std::future<Context> contextFuture = createContextByGpuIdxAsync(0);
11871190 * Context ctx = waitForContextFuture(contextFuture);
@@ -1270,9 +1273,9 @@ createContextByGpuIdxAsync(int gpuIdx, const WGPUInstanceDescriptor &desc = {},
12701273 * Context ctx = createContextByGpuIdx(0);
12711274 * @endcode
12721275 */
1273- inline Context createContextByGpuIdx ( int gpuIdx,
1274- const WGPUInstanceDescriptor &desc = {},
1275- const WGPUDeviceDescriptor &devDescriptor = {}) {
1276+ inline Context
1277+ createContextByGpuIdx ( int gpuIdx, const WGPUInstanceDescriptor &desc = {},
1278+ const WGPUDeviceDescriptor &devDescriptor = {}) {
12761279 std::future<Context> contextFuture =
12771280 createContextByGpuIdxAsync (gpuIdx, desc, devDescriptor);
12781281 return waitForContextFuture<Context>(contextFuture);
@@ -1365,17 +1368,19 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status,
13651368/* *
13661369 * @brief Copies data from a GPU buffer to CPU memory.
13671370 * @param[in] ctx Context instance to manage the operation
1368- * @param[in] tensor Tensor instance representing the GPU buffer to copy from
13691371 * @param[out] data Pointer to the CPU memory to copy the data to
13701372 * @param[in] bufferSize Size of the data buffer in bytes
13711373 * @param[in] op StagingBuffer instance to manage the operation
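1374+ * @param[in] sourceOffset Offset in the GPU buffer to start copying from. NOTE(review): not referenced in the visible body (the copy is already recorded in op.commandBuffer); confirm it is actually used.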
13721375 *
13731376 * @code
13741377 * toCPU(ctx, tensor, data, bufferSize);
13751378 * @endcode
13761379 */
1377- inline std::future<void > toCPUAsync (Context &ctx, Tensor &tensor, void *data,
1378- size_t bufferSize, CopyData &op) {
1380+
1381+ // NOTE(review): this overload may be redundant; CopyData does not appear to be used externally. Confirm before removing.
1382+ inline std::future<void > toCPUAsync (Context &ctx, void *data, size_t bufferSize,
1383+ CopyData &op, size_t sourceOffset = 0 ) {
13791384 // Submit the command buffer and release it.
13801385 wgpuQueueSubmit (ctx.queue , 1 , &op.commandBuffer );
13811386 wgpuCommandBufferRelease (op.commandBuffer );
@@ -1388,8 +1393,8 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
13881393 CallbackData *cbData = new CallbackData{
13891394 op.readbackBuffer , // The GPU buffer to be read back.
13901395 bufferSize,
1391- data, // CPU memory destination.
1392- promise // The promise to be signaled.
1396+ data, // CPU memory destination.
1397+ promise, // The promise to be signaled.
13931398 };
13941399
13951400 // Set up the work-done callback to initiate the buffer mapping.
@@ -1402,6 +1407,11 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
14021407 // Begin the asynchronous chain by registering the queue work-done callback.
14031408 wgpuQueueOnSubmittedWorkDone (ctx.queue , workDoneCallbackInfo);
14041409
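1410+ // Drop this function's reference to the readback buffer. NOTE(review): cbData still holds this raw handle for the pending map/readback; confirm the buffer stays alive until the callback chain has finished.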
1411+ if (op.readbackBuffer ) {
1412+ wgpuBufferRelease (op.readbackBuffer );
1413+ }
1414+
14051415 return promise->get_future ();
14061416}
14071417
@@ -1417,11 +1427,13 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
14171427 *
14181428 * @param[in] ctx Context instance to manage the operation
14191429 * @param[in] tensor Tensor instance representing the GPU buffer to copy from
1420- * @param[in] bufferSize Size of the data buffer in bytes
1430+ * @param[in] bufferSize Number of bytes to read back into data.
14211431 * @param[out] data Pointer to the CPU memory to copy the data to
1432+ * @param[in] sourceOffset Byte offset in the tensor's GPU buffer at which the copy starts.
14221433 */
14231434inline std::future<void > toCPUAsync (Context &ctx, Tensor &tensor, void *data,
1424- size_t bufferSize) {
1435+ size_t bufferSize,
1436+ size_t sourceOffset = 0 ) {
14251437 // Create a promise that will later be satisfied when the async copy
14261438 // completes.
14271439 auto promise = std::make_shared<std::promise<void >>();
@@ -1430,16 +1442,17 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
14301442 WGPUBufferDescriptor readbackBufferDescriptor = {
14311443 .label = {.data = nullptr , .length = 0 },
14321444 .usage = WGPUBufferUsage_CopyDst | WGPUBufferUsage_MapRead,
1433- .size = bufferSize,
1445+ .size = bufferSize, // Size of the readback buffer.
14341446 };
14351447 WGPUBuffer readbackBuffer =
14361448 wgpuDeviceCreateBuffer (ctx.device , &readbackBufferDescriptor);
14371449
14381450 // Create a command encoder and record a copy from the tensor GPU buffer
14391451 WGPUCommandEncoder commandEncoder =
14401452 wgpuDeviceCreateCommandEncoder (ctx.device , nullptr );
1441- wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, tensor.data .buffer , 0 ,
1442- readbackBuffer, 0 , bufferSize);
1453+ wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, tensor.data .buffer ,
1454+ sourceOffset, readbackBuffer, 0 ,
1455+ bufferSize);
14431456 // Finish recording by creating a command buffer and release the encoder.
14441457 WGPUCommandBuffer commandBuffer =
14451458 wgpuCommandEncoderFinish (commandEncoder, nullptr );
@@ -1472,13 +1485,16 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor, void *data,
14721485 // queueWorkDoneCallback.
14731486 wgpuQueueOnSubmittedWorkDone (ctx.queue , workDoneCallbackInfo);
14741487
1488+ if (readbackBuffer) {
1489+ wgpuBufferRelease (readbackBuffer);
1490+ }
1491+
14751492 return promise->get_future ();
14761493}
14771494
14781495inline std::future<void > toCPUAsync (Context &ctx, WGPUBuffer buffer, void *data,
1479- size_t size) {
1480- // The size (in bytes) for the copy.
1481- uint64_t bufferSize = size;
1496+ size_t bufferSize,
1497+ size_t sourceOffset = 0 ) {
14821498
14831499 // Create an operation structure (here we reuse CopyData solely for its
14841500 // members that we need to create a readback buffer and command buffer).
@@ -1503,7 +1519,7 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
15031519 {
15041520 WGPUCommandEncoder commandEncoder =
15051521 wgpuDeviceCreateCommandEncoder (ctx.device , nullptr );
1506- wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, buffer, 0 ,
1522+ wgpuCommandEncoderCopyBufferToBuffer (commandEncoder, buffer, sourceOffset ,
15071523 op.readbackBuffer , 0 , bufferSize);
15081524 op.commandBuffer = wgpuCommandEncoderFinish (commandEncoder, nullptr );
15091525 wgpuCommandEncoderRelease (commandEncoder);
@@ -1516,10 +1532,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
15161532
15171533 // Allocate callback data
15181534 CallbackData *cbData = new CallbackData{
1519- op.readbackBuffer , // The readback buffer created above.
1520- static_cast < size_t >( bufferSize), // Size of the copy.
1521- data, // Destination CPU memory .
1522- promise // Our promise to satisfy when done.
1535+ op.readbackBuffer , // The readback buffer created above.
1536+ bufferSize, // Size of the copy.
1537+ data, // Destination CPU memory.
1538+ promise // Our promise to satisfy when done.
15231539 };
15241540
15251541 // Set up the queue work-done callback info.
@@ -1532,6 +1548,10 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
15321548 // Start the asynchronous chain by registering the work-done callback.
15331549 wgpuQueueOnSubmittedWorkDone (ctx.queue , workDoneCallbackInfo);
15341550
1551+ if (op.readbackBuffer ) {
1552+ wgpuBufferRelease (op.readbackBuffer );
1553+ }
1554+
15351555 return promise->get_future ();
15361556}
15371557
@@ -1548,9 +1568,11 @@ inline std::future<void> toCPUAsync(Context &ctx, WGPUBuffer buffer, void *data,
15481568 * @endcode
15491569 */
15501570template <size_t N>
1551- inline std::future<void > toCPUAsync (Context &ctx, Tensor &tensor,
1552- std::array<float , N> &data) {
1553- return toCPUAsync (ctx, tensor, data.data (), sizeof (data));
1571+ inline std::future<void >
1572+ toCPUAsync (Context &ctx, Tensor &tensor, std::array<float , N> &data,
1573+ size_t sourceOffset = 0 ) {
1574+ return toCPUAsync (ctx, tensor, data.data (), sizeof (data), sourceOffset
1575+ );
15541576}
15551577
15561578/* *
@@ -1571,8 +1593,10 @@ inline std::future<void> toCPUAsync(Context &ctx, Tensor &tensor,
15711593 * toCPU(ctx, tensor, data, bufferSize, instance);
15721594 * @endcode
15731595 */
1574- inline void toCPU (Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
1575- auto future = toCPUAsync (ctx, tensor, data, bufferSize);
1596+ inline void toCPU (Context &ctx, Tensor &tensor, void *data, size_t bufferSize,
1597+ size_t sourceOffset = 0 ) {
1598+ auto future =
1599+ toCPUAsync (ctx, tensor, data, bufferSize, sourceOffset);
15761600 wait (ctx, future);
15771601}
15781602
@@ -1593,8 +1617,9 @@ inline void toCPU(Context &ctx, Tensor &tensor, void *data, size_t bufferSize) {
15931617 * toCPU(ctx, buffer, data, size, instance);
15941618 * @endcode
15951619 */
1596- inline void toCPU (Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
1597- auto future = toCPUAsync (ctx, buffer, data, size);
1620+ inline void toCPU (Context &ctx, WGPUBuffer buffer, void *data, size_t size,
1621+ size_t sourceOffset = 0 ) {
1622+ auto future = toCPUAsync (ctx, buffer, data, size, sourceOffset);
15981623 wait (ctx, future);
15991624}
16001625
@@ -1616,8 +1641,9 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, void *data, size_t size) {
16161641 * @endcode
16171642 */
16181643template <size_t N>
1619- inline void toCPU (Context &ctx, Tensor &tensor, std::array<float , N> &data) {
1620- auto future = toCPUAsync (ctx, tensor, data);
1644+ inline void toCPU (Context &ctx, Tensor &tensor, std::array<float , N> &data,
1645+ size_t sourceOffset = 0 ) {
1646+ auto future = toCPUAsync (ctx, tensor, data, sourceOffset);
16211647 wait (ctx, future);
16221648}
16231649
0 commit comments