bashbaug · bashbaug · Jan 4, 2024 · Jan 4, 2024 · Jan 5, 2024 · Jan 5, 2024
diff --git a/include/bfloat16.hpp b/include/bfloat16.hpp
@@ -0,0 +1,130 @@
+/*
+// Copyright (c) 2024-2026 Ben Ashbaugh
+//
+// SPDX-License-Identifier: MIT
+*/
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+
+class bfloat16;
+
+class bfloat16 { // 16-bit float type: holds the upper 16 bits of a float's bit pattern; math is done in float
+  using StorageType = uint16_t;
+  StorageType value; // raw 16-bit encoding; not set by the defaulted default constructor
+
+  static StorageType from_float(const float &a) { // encode a float as 16 bits
+    if (std::isnan(a))
+      return 0xffc1; // every NaN input yields this single fixed encoding
+    union { // exposes the float's 32-bit pattern; NOTE(review): union punning is not portable ISO C++, std::memcpy is
+      uint32_t intStorage;
+      float floatValue;
+    };
+    floatValue = a;
+    // Round to nearest, ties to even (RNE): add 0x7FFF plus the lowest kept bit, then drop the low 16 bits
+    uint32_t roundingBias = ((intStorage >> 16) & 0x1) + 0x00007FFF;
+    return static_cast<StorageType>((intStorage + roundingBias) >> 16);
+  }
+
+  static float to_float(const StorageType &a) { // decode the 16 stored bits back to a float
+    union {
+      uint32_t intStorage;
+      float floatValue;
+    };
+    intStorage = a << 16; // stored bits become the upper half, low 16 bits are zero; NOTE(review): `a` is promoted to int before the shift
+    return floatValue;
+  }
+
+public:
+  bfloat16() = default;
+  bfloat16(const bfloat16 &) = default;
+  ~bfloat16() = default;
+
+  // Implicit conversion from float to bfloat16 (rounds via from_float)
+  bfloat16(const float &a) { value = from_float(a); }
+
+  bfloat16 &operator=(const float &rhs) { // assignment from float re-encodes with the same rounding
+    value = from_float(rhs);
+    return *this;
+  }
+
+  // Implicit conversion from bfloat16 to float (to_float only shifts the stored bits into place)
+  operator float() const { return to_float(value); }
+
+  // Logical operators (!,||,&&) are covered if we can cast to bool
+  explicit operator bool() const { return to_float(value) != 0.0f; } // false only when the value compares equal to zero
+
+  // Unary minus: negate in float, then convert the result back to bfloat16
+  friend bfloat16 operator-(const bfloat16 &lhs) {
+    return -to_float(lhs.value);
+  }
+
+  // Prefix and postfix ++/--: decode to float, step it, re-encode; postfix returns a copy taken before the step
+#define OP(op)                                                                 \
+  friend bfloat16 &operator op(bfloat16 &lhs) {                                \
+    float f = to_float(lhs.value);                                             \
+    lhs.value = from_float(op f);                                              \
+    return lhs;                                                                \
+  }                                                                            \
+  friend bfloat16 operator op(bfloat16 &lhs, int) {                            \
+    bfloat16 old = lhs;                                                        \
+    operator op(lhs);                                                          \
+    return old;                                                                \
+  }
+  OP(++)
+  OP(--)
+#undef OP
+
+  // Compound assignment (+=, -=, *=, /=): computed in float and assigned back to lhs; overloads take bfloat16 or any T on either side
+#define OP(op)                                                                 \
+  friend bfloat16 &operator op(bfloat16 &lhs, const bfloat16 &rhs) {           \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend bfloat16 &operator op(bfloat16 &lhs, const T &rhs) {                  \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }                                                                            \
+  template <typename T> friend T &operator op(T &lhs, const bfloat16 &rhs) {   \
+    float f = static_cast<float>(lhs);                                         \
+    f op static_cast<float>(rhs);                                              \
+    return lhs = f;                                                            \
+  }
+  OP(+=)
+  OP(-=)
+  OP(*=)
+  OP(/=)
+#undef OP
+
+// Binary operators: both operands are cast to float; arithmetic results are re-encoded as bfloat16, comparisons return bool
+#define OP(type, op)                                                           \
+  friend type operator op(const bfloat16 &lhs, const bfloat16 &rhs) {          \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend type operator op(const bfloat16 &lhs, const T &rhs) {                 \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }                                                                            \
+  template <typename T>                                                        \
+  friend type operator op(const T &lhs, const bfloat16 &rhs) {                 \
+    return type{static_cast<float>(lhs) op static_cast<float>(rhs)};           \
+  }
+  OP(bfloat16, +)
+  OP(bfloat16, -)
+  OP(bfloat16, *)
+  OP(bfloat16, /)
+  OP(bool, ==)
+  OP(bool, !=)
+  OP(bool, <)
+  OP(bool, >)
+  OP(bool, <=)
+  OP(bool, >=)
+#undef OP
+
+  // Bitwise(|,&,~,^), modulo(%) and shift(<<,>>) operations are not supported
+  // for floating-point types.
+};
diff --git a/include/util.hpp b/include/util.hpp
@@ -6,6 +6,12 @@
 #pragma once
 
 #include <CL/opencl.hpp>
+
+#include <cctype>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iterator>
 #include <string>
 
 static cl_version getDeviceOpenCLVersion(
@@ -68,6 +74,22 @@ static bool checkDeviceForExtension(
     return supported;
 }
 
+static std::string readStringFromFile( // returns the entire contents of `filename` as one string
+    const std::string& filename )
+{
+    std::ifstream is(filename, std::ios::binary); // opened in binary mode
+    if (!is.good()) {
+        printf("Couldn't open file '%s'!\n", filename.c_str());
+        return ""; // failure is printed to stdout and reported as an empty string (same result as an empty file)
+    }
+
+    std::string source{ // built from an iterator pair that consumes the stream up to end-of-file
+        std::istreambuf_iterator<char>(is),
+        std::istreambuf_iterator<char>() };
+
+    return source;
+}
+
 static bool checkPlatformIndex(
     const std::vector<cl::Platform>& platforms,
     int platformIndex)

diff --git a/samples/05_kernelfromfile/main.cpp b/samples/05_kernelfromfile/main.cpp
@@ -13,27 +13,6 @@
 
 #include "util.hpp"
 
-static std::string readStringFromFile(
-    const std::string& filename )
-{
-    std::ifstream is(filename, std::ios::binary);
-    if (!is.good()) {
-        printf("Couldn't open file '%s'!\n", filename.c_str());
-        return "";
-    }
-
-    size_t filesize = 0;
-    is.seekg(0, std::ios::end);
-    filesize = (size_t)is.tellg();
-    is.seekg(0, std::ios::beg);
-
-    std::string source{
-        std::istreambuf_iterator<char>(is),
-        std::istreambuf_iterator<char>() };
-
-    return source;
-}
-
 int main(
     int argc,
     char** argv )

diff --git a/samples/06_ndrangekernelfromfile/main.cpp b/samples/06_ndrangekernelfromfile/main.cpp
@@ -13,27 +13,6 @@
 
 #include "util.hpp"
 
-static std::string readStringFromFile(
-    const std::string& filename )
-{
-    std::ifstream is(filename, std::ios::binary);
-    if (!is.good()) {
-        printf("Couldn't open file '%s'!\n", filename.c_str());
-        return "";
-    }
-
-    size_t filesize = 0;
-    is.seekg(0, std::ios::end);
-    filesize = (size_t)is.tellg();
-    is.seekg(0, std::ios::beg);
-
-    std::string source{
-        std::istreambuf_iterator<char>(is),
-        std::istreambuf_iterator<char>() };
-
-    return source;
-}
-
 int main(
     int argc,
     char** argv )

diff --git a/samples/20_matrixexperiments-bf16/CMakeLists.txt b/samples/20_matrixexperiments-bf16/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright (c) 2024-2026 Ben Ashbaugh
+#
+# SPDX-License-Identifier: MIT
+
+add_opencl_sample( # project-defined helper (defined outside this file); argument meanings below are assumptions to confirm there
+    TEST
+    NUMBER 20 # matches the "20_" prefix of this sample's directory
+    TARGET matrixexperiments-bf16
+    VERSION 200 # for clSetKernelExecInfo (presumably selects OpenCL 2.0 - confirm against add_opencl_sample)
+    SOURCES main.cpp
+    KERNELS matrix_helpers_bf16.cl matrix_kernels_bf16.cl matrix_kernel_tiled_bf16.cl)
diff --git a/samples/20_matrixexperiments-bf16/README.md b/samples/20_matrixexperiments-bf16/README.md
@@ -0,0 +1,60 @@
+# matrixexperiments-bf16
+
+## Sample Purpose
+
+This sample demonstrates various techniques to perform a large matrix multiplication where the matrix elements contain 16-bit `bfloat16` data.
+The sample includes many different implementations:
+
+1. The "naive" implementation is a very simple implementation.
+It is not very fast, but it is easy to understand, and it has no extension dependencies so it will run on many devices.
+2. The "dpas" kernels use sub-group extensions to improve performance.
+On some devices, they will also use specialized matrix multiplication extensions to further improve performance.
+Because these kernels require certain extensions or a specific sub-group size, they may not run on all devices.
+3. The "dpas blockread" kernels use additional sub-group extensions to further improve performance.
+
+Most of the optimized kernels operate on fixed size tiles of matrix data.
+For some of these kernels, parameters such as the number of matrix tiles per sub-group or the number of sub-groups per work-group may be modified via program build options.
+Experiment with different options to see what performs the best!
+
+A good place to start for some devices is:
+
+```sh
+./matrixexperiments-bf16 -m4096 --options="-DSGS_PER_WG_X=4 -DSGS_PER_WG_Y=8 -DKK=2 -cl-intel-256-GRF-per-thread" --zero
+```
+
+## Key APIs and Concepts
+
+This sample will optionally use the following OpenCL extensions:
+
+* cl_intel_bfloat16_conversions
+* cl_intel_required_subgroup_size
+* cl_intel_split_work_group_barrier
+* cl_intel_subgroup_2d_block_io
+* cl_intel_subgroup_matrix_multiply_accumulate
+* cl_intel_subgroups
+* cl_intel_subgroups_short
+
+## Command Line Options
+
+| Option | Default Value | Description |
+|:--|:-:|:--|
+| `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
+| `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute the sample on.
+| `--file <string>` | `matrix_kernels_bf16.cl` | Specify the name of the file with the OpenCL kernel source.
+| `--options <string>` | None | Specify optional program build options.
+| `--matrixsize <int>` | 512 | Specify the dimensions of the matrix.
+| `--iterations <int>` | 16 | Specify the number of iterations for performance testing.
+| `--validate` | n/a | Validate results for correctness.
+| `--zero` | n/a | Initialize all matrices to zero.
+| `--identity` | n/a | Initialize all matrices to one.
+| `--fixed` | n/a | Initialize all matrices to values computed from the matrix row and column.
+| `--emulate` | n/a | Do not use specialized matrix multiplication extensions.
+| `--wallclock` | n/a | Measure performance using wallclock time instead of event profiling.
+| `--skipinit` | n/a | Skip initialization of source matrices.
+| `--roundrobin` | n/a | Use round robin thread scheduling.
+| `--threshold <float>` | 0.01 | Set the threshold used when validating results.
+| `--mask <int>` | ~0 | Set a mask to only run a subset of tests.
+
+By default, the source matrices are populated with random data.
+When validating results, it is recommended to use either "fixed" or "identity" data.
+For best performance, use "zero" data.