diff --git a/.gitmodules b/.gitmodules
index 496c7d22f..30109eddc 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,5 +1,5 @@
 [submodule "ggml"]
-    path = ggml
+	path = ggml
 	url = https://github.com/tetherto/qvac-ext-ggml.git
 	branch = 2026-06-06
 [submodule "examples/server/frontend"]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2804bad46..242b84292 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,11 @@ if (MSVC)
         $<$<COMPILE_LANGUAGE:C>:/utf-8>
         $<$<COMPILE_LANGUAGE:CXX>:/MP>
         $<$<COMPILE_LANGUAGE:CXX>:/utf-8>
+        # stable-diffusion.cpp is a large translation unit; with the LTX-2
+        # additions it exceeds the COFF 2^16 section limit, so MSVC needs
+        # /bigobj (clang/gcc have no equivalent limit) to avoid fatal error C1128.
+        $<$<COMPILE_LANGUAGE:C>:/bigobj>
+        $<$<COMPILE_LANGUAGE:CXX>:/bigobj>
     )
 endif()
 
@@ -298,13 +303,28 @@ endif()
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
     if (SD_USE_SYSTEM_GGML)
-        find_package(ggml REQUIRED)
+        # System ggml (e.g. the qvac-ext-ggml vcpkg port). The port exports
+        # GGML_MAX_NAME=128 as a PUBLIC/INTERFACE compile definition on
+        # ggml::ggml, so consumers inherit it automatically (no need for the
+        # add_definitions() above under system ggml).
+        find_package(ggml CONFIG)
         if (NOT ggml_FOUND)
-            message(FATAL_ERROR "System-installed GGML library not found.")
+            message(FATAL_ERROR
+                "SD_USE_SYSTEM_GGML is ON but no system GGML was found. Provide ggml "
+                "via the qvac-ext-ggml vcpkg port (or any package exporting the "
+                "ggml::ggml CMake target) and configure with the vcpkg toolchain "
+                "file, e.g. -DCMAKE_TOOLCHAIN_FILE=<vcpkg>/scripts/buildsystems/vcpkg.cmake")
         endif()
         add_library(ggml ALIAS ggml::ggml)
-    else()
+    elseif (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/ggml/CMakeLists.txt")
+        # Vendored submodule (default): standalone builds with no external tooling.
         add_subdirectory(ggml)
+    else()
+        message(FATAL_ERROR
+            "ggml not found. Either initialize the bundled submodule "
+            "(git submodule update --init ggml, or clone with --recursive), or "
+            "build against system ggml with -DSD_USE_SYSTEM_GGML=ON together with "
+            "the vcpkg toolchain file (qvac-ext-ggml port).")
     endif()
 endif()
 
diff --git a/docs/build.md b/docs/build.md
index d33f9329a..616817326 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -3,19 +3,46 @@
 ## Get the Code
 
 ```
-git clone --recursive https://github.com/leejet/stable-diffusion.cpp
-cd stable-diffusion.cpp
+git clone --recursive https://github.com/tetherto/qvac-ext-stable-diffusion.cpp
+cd qvac-ext-stable-diffusion.cpp
 ```
 
-- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
+- If you have already cloned the repository, use the following commands to update it to the latest code and fetch the submodules.
 
 ```
-cd stable-diffusion.cpp
-git pull origin master
-git submodule init
-git submodule update
+cd qvac-ext-stable-diffusion.cpp
+git pull
+git submodule update --init --recursive
 ```
 
+## GGML dependency (vendored submodule or system / vcpkg)
+
+`ggml` can be provided in two ways:
+
+- **Vendored submodule (default).** `SD_USE_SYSTEM_GGML` defaults to `OFF`, and
+  CMake builds the bundled `ggml` submodule
+  ([qvac-ext-ggml](https://github.com/tetherto/qvac-ext-ggml)) via
+  `add_subdirectory(ggml)`. This is the no-extra-tooling path for building the
+  repository standalone — just clone with `--recursive` (or run
+  `git submodule update --init --recursive`) and the plain `cmake ..` invocations
+  below work as-is.
+- **System / vcpkg ggml.** Pass `-DSD_USE_SYSTEM_GGML=ON` and configure with the
+  vcpkg toolchain file so CMake resolves the `ggml::ggml` target from the
+  qvac-ext-ggml vcpkg port:
+
+  ```shell
+  mkdir build && cd build
+  cmake .. -DSD_USE_SYSTEM_GGML=ON -DCMAKE_TOOLCHAIN_FILE=<vcpkg>/scripts/buildsystems/vcpkg.cmake
+  cmake --build . --config Release
+  ```
+
+The qvac-ext-ggml port is built with `GGML_MAX_NAME=128` and exports it as a
+PUBLIC compile definition, so system-ggml consumers inherit it automatically. For
+vendored builds, `add_definitions(-DGGML_MAX_NAME=128)` in the top-level `CMakeLists.txt` applies.
+
+The GPU backend flags below (`-DSD_METAL=ON`, `-DSD_CUDA=ON`, ...) apply to both
+paths.
+
 ## WebP and WebM Support in Examples
 
 The example applications (`examples/cli` and `examples/server`) use `libwebp` to support WebP image I/O, and `examples/cli` can also use `libwebm` for `.webm` video output. Both are enabled by default. WebM output currently reuses `libwebp` to encode each frame as VP8 before muxing with `libwebm`.
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index 0ecc72dc6..6ca3b5e36 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -808,6 +808,7 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool vae_decode_only, bool f
         stream_layers,
         backend.c_str(),
         params_backend.c_str(),
+        SD_BACKEND_PREF_GPU,  // qvac: default to GPU; honored only when --backend is unset
     };
     return sd_ctx_params;
 }
diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 085c80b5c..da2afd7d0 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -1338,8 +1338,17 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_attention_ext(ggml_context* ctx,
         }
         k_in = ggml_cast(ctx, k_in, GGML_TYPE_F16);
 
-        auto v_fused = ggml_rope_flux(ctx, v_in, nullptr);
-        if (ggml_backend_supports_op(backend, v_fused)) {
+        // ggml_rope_flux(ctx, v_in, nullptr): the null position tensor means NO
+        // rotation is applied — V is never RoPE-rotated (only q/k are). With a
+        // null pe the fused kernel degenerates to exactly the permute(0,2,1,3) +
+        // reshape_3d layout transform in the else branch below; we use it purely
+        // as a fused-kernel fast path for that reshape. Gate it on the same
+        // GGML_ROPE_FLUX_DISABLE switch as the q/k fused path in rope.hpp so the
+        // whole fused-RoPE kernel family can be turned off together for debugging /
+        // backend bring-up. Only presence is tested (even "0" disables); read once.
+        static const bool rope_flux_disabled = std::getenv("GGML_ROPE_FLUX_DISABLE") != nullptr;
+        ggml_tensor* v_fused                 = rope_flux_disabled ? nullptr : ggml_rope_flux(ctx, v_in, nullptr);
+        if (v_fused != nullptr && ggml_backend_supports_op(backend, v_fused)) {
             v_in = v_fused;
         } else {
             v_in = ggml_ext_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3));
diff --git a/src/ggml_graph_cut.cpp b/src/ggml_graph_cut.cpp
index 61234eaf2..d22f24230 100644
--- a/src/ggml_graph_cut.cpp
+++ b/src/ggml_graph_cut.cpp
@@ -12,7 +12,6 @@
 #include "ggml-backend.h"
 #include "util.h"
 
-#include "../ggml/src/ggml-impl.h"
 
 namespace sd::ggml_graph_cut {
 
@@ -31,8 +30,8 @@ namespace sd::ggml_graph_cut {
     static int graph_leaf_index(ggml_cgraph* gf, const ggml_tensor* tensor) {
         GGML_ASSERT(gf != nullptr);
         GGML_ASSERT(tensor != nullptr);
-        for (int i = 0; i < gf->n_leafs; ++i) {
-            if (gf->leafs[i] == tensor) {
+        for (int i = 0; i < ggml_graph_n_leafs(gf); ++i) {
+            if (ggml_graph_leaf(gf, i) == tensor) {
                 return i;
             }
         }
@@ -293,15 +292,15 @@ namespace sd::ggml_graph_cut {
 
     int leaf_count(ggml_cgraph* gf) {
         GGML_ASSERT(gf != nullptr);
-        return gf->n_leafs;
+        return ggml_graph_n_leafs(gf);
     }
 
     ggml_tensor* leaf_tensor(ggml_cgraph* gf, int leaf_index) {
         GGML_ASSERT(gf != nullptr);
-        if (leaf_index < 0 || leaf_index >= gf->n_leafs) {
+        if (leaf_index < 0 || leaf_index >= ggml_graph_n_leafs(gf)) {
             return nullptr;
         }
-        return gf->leafs[leaf_index];
+        return ggml_graph_leaf(gf, leaf_index);
     }
 
     ggml_backend_buffer_t tensor_buffer(const ggml_tensor* tensor) {
@@ -333,14 +332,14 @@ namespace sd::ggml_graph_cut {
 
     bool plan_matches_graph(ggml_cgraph* gf, const Plan& plan) {
         GGML_ASSERT(gf != nullptr);
-        if (ggml_graph_n_nodes(gf) != plan.n_nodes || gf->n_leafs != plan.n_leafs) {
+        if (ggml_graph_n_nodes(gf) != plan.n_nodes || ggml_graph_n_leafs(gf) != plan.n_leafs) {
             return false;
         }
         for (const auto& input_shape_ref : plan.input_shapes) {
-            if (input_shape_ref.leaf_index < 0 || input_shape_ref.leaf_index >= gf->n_leafs) {
+            if (input_shape_ref.leaf_index < 0 || input_shape_ref.leaf_index >= ggml_graph_n_leafs(gf)) {
                 return false;
             }
-            ggml_tensor* leaf = gf->leafs[input_shape_ref.leaf_index];
+            ggml_tensor* leaf = ggml_graph_leaf(gf, input_shape_ref.leaf_index);
             if (leaf == nullptr || input_shape_ref.type != leaf->type) {
                 return false;
             }
@@ -373,7 +372,7 @@ namespace sd::ggml_graph_cut {
             }
             return ggml_graph_node(gf, input_ref.node_index);
         }
-        if (input_ref.leaf_index < 0 || input_ref.leaf_index >= gf->n_leafs) {
+        if (input_ref.leaf_index < 0 || input_ref.leaf_index >= ggml_graph_n_leafs(gf)) {
             return nullptr;
         }
         return leaf_tensor(gf, input_ref.leaf_index);
@@ -459,8 +458,7 @@ namespace sd::ggml_graph_cut {
             if (current_input == nullptr) {
                 continue;
             }
-            GGML_ASSERT(segment_graph->n_leafs < segment_graph->size);
-            segment_graph->leafs[segment_graph->n_leafs++] = current_input;
+            ggml_graph_add_leaf(segment_graph, current_input);
         }
 
         for (int output_node_index : segment.output_node_indices) {
@@ -518,9 +516,9 @@ namespace sd::ggml_graph_cut {
             return plan;
         }
         plan.n_nodes = n_nodes;
-        plan.n_leafs = gf->n_leafs;
-        for (int i = 0; i < gf->n_leafs; ++i) {
-            ggml_tensor* leaf = gf->leafs[i];
+        plan.n_leafs = ggml_graph_n_leafs(gf);
+        for (int i = 0; i < ggml_graph_n_leafs(gf); ++i) {
+            ggml_tensor* leaf = ggml_graph_leaf(gf, i);
             if (is_params_tensor(params_tensor_set, leaf)) {
                 continue;
             }