Skip to content

Commit 3b074eb

Browse files
committed
merge 96ce1d9
1 parent f416df6 commit 3b074eb

19 files changed

Lines changed: 607 additions & 78 deletions

.github/workflows/build.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,9 @@ jobs:
284284
- macOS-latest-cmake
285285
- windows-latest-cmake
286286

287+
permissions:
288+
contents: write
289+
287290
steps:
288291
- name: Download artifacts
289292
id: download-artifact

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ endif()
105105

106106
if (RWKV_CUBLAS)
107107
cmake_minimum_required(VERSION 3.17)
108+
set(CMAKE_CUDA_COMPILER_FORCED TRUE)
108109

109110
find_package(CUDAToolkit)
110111

@@ -417,6 +418,11 @@ target_compile_features(ggml PUBLIC c_std_11) # Don't bump
417418

418419
if (MSVC)
419420
target_link_libraries(ggml PUBLIC ${RWKV_EXTRA_LIBS} Threads::Threads)
421+
if (RWKV_CUBLAS)
422+
target_compile_options(ggml PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
423+
-allow-unsupported-compiler
424+
>)
425+
endif()
420426
else()
421427
if (WIN32 AND RWKV_HIPBLAS)
422428
target_link_libraries(ggml PUBLIC ${RWKV_EXTRA_LIBS} Threads::Threads)

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ This project provides [a C library rwkv.h](rwkv.h) and [a convenient Python wrapper
1010

1111
[RWKV v5](https://huggingface.co/BlinkDL/rwkv-5-world) is a major upgrade to RWKV architecture, making it competitive with Transformers in quality. RWKV v5 models are supported.
1212

13+
[RWKV v6](https://huggingface.co/BlinkDL/rwkv-6-world) is a further improvement to RWKV architecture, with better quality. RWKV v6 models are supported.
14+
1315
Loading LoRA checkpoints in [Blealtan's format](https://github.com/Blealtan/RWKV-LM-LoRA) is supported through [merge_lora_into_ggml.py script](rwkv%2Fmerge_lora_into_ggml.py).
1416

1517
## Quality and performance

python/convert_pytorch_to_ggml.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,11 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
3434

3535
is_v5_1_or_2: bool = 'blocks.0.att.ln_x.weight' in state_dict
3636
is_v5_2: bool = 'blocks.0.att.gate.weight' in state_dict
37+
is_v6_0: bool = 'blocks.0.att.time_maa_x' in state_dict
3738

38-
if is_v5_2:
39+
if is_v6_0:
40+
print('Detected RWKV v6.0')
41+
elif is_v5_2:
3942
print('Detected RWKV v5.2')
4043
elif is_v5_1_or_2:
4144
print('Detected RWKV v5.1')
@@ -57,13 +60,25 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
5760
1 if is_FP16 else 0
5861
))
5962

63+
if is_v6_0:
64+
n_head: int = state_dict['blocks.0.att.time_faaaa'].shape[0]
6065
for k in state_dict.keys():
6166
tensor: torch.Tensor = state_dict[k].float()
6267

6368
if '.time_' in k:
6469
tensor = tensor.squeeze()
6570

66-
if is_v5_1_or_2:
71+
if is_v6_0:
72+
if '.time_faaaa' in k:
73+
tensor = tensor.unsqueeze(-1)
74+
if '.time_maa_w1' in k or '.time_decay_w' in k:
75+
tensor = tensor.transpose(0, 1)
76+
if '.time_maa_w2' in k:
77+
tensor = tensor.transpose(1, 2)
78+
if '.time_decay' in k and '_w' not in k:
79+
tensor = tensor.reshape(n_head, -1, 1)
80+
81+
elif is_v5_1_or_2:
6782
if '.time_decay' in k:
6883
if is_v5_2:
6984
tensor = torch.exp(-torch.exp(tensor)).unsqueeze(-1)
@@ -105,7 +120,7 @@ def write_state_dict(state_dict: Dict[str, torch.Tensor], dest_path: str, data_t
105120

106121
out_file.write(k_encoded)
107122

108-
tensor.numpy().tofile(out_file)
123+
tensor.detach().numpy().tofile(out_file)
109124

110125
def main() -> None:
111126
args = parse_args()

python/merge_lora_into_ggml.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
def parse_args():
1414
parser = argparse.ArgumentParser(description='Merge a PyTorch LoRA checkpoint (.pth) into an rwkv.cpp model file')
1515
parser.add_argument('src_path', help='Path to source rwkv.cpp model')
16-
parser.add_argument('rwkv_arch_version', help='Version of RWKV architecture: v4, v5.1, v5.2', type=str, choices=['v4', 'v5.1', 'v5.2'])
16+
parser.add_argument('rwkv_arch_version', help='Version of RWKV architecture: v4, v5.1, v5.2, v6.0', type=str, choices=['v4', 'v5.1', 'v5.2', 'v6.0'])
1717
parser.add_argument('lora_path', help='Path to LoRA checkpoint in PyTorch format')
1818
parser.add_argument('lora_alpha', help='Value of lora_alpha parameter used when training this LoRA checkpoint', type=int)
1919
parser.add_argument('dest_path', help='Path to destination rwkv.cpp model, will be overwritten with the merged model')
@@ -47,7 +47,7 @@ def main() -> None:
4747

4848
arch_version: str = args.rwkv_arch_version
4949

50-
if not (arch_version == 'v4' or arch_version == 'v5.1' or arch_version == 'v5.2'):
50+
if not (arch_version == 'v4' or arch_version == 'v5.1' or arch_version == 'v5.2' or arch_version == 'v6.0'):
5151
raise ValueError(f'Invalid RWKV architecture version {arch_version}')
5252

5353
print(f'Reading {args.lora_path}')
@@ -108,7 +108,17 @@ def main() -> None:
108108
if '.time_' in key:
109109
replacement = replacement.squeeze()
110110

111-
if arch_version == 'v5.1' or arch_version == 'v5.2':
111+
if arch_version == 'v6.0':
112+
if '.time_faaaa' in key:
113+
replacement = replacement.unsqueeze(-1)
114+
if '.time_maa_w1' in key or '.time_decay_w' in key:
115+
replacement = replacement.transpose(0, 1)
116+
if '.time_maa_w2' in key:
117+
n_head: int = replacement.shape[1]
118+
replacement = replacement.transpose(1, 2)
119+
if '.time_decay' in key and '_w' not in key:
120+
replacement = replacement.reshape(n_head, -1, 1)
121+
elif arch_version == 'v5.1' or arch_version == 'v5.2':
112122
if '.time_decay' in key:
113123
if arch_version == 'v5.2':
114124
replacement = torch.exp(-torch.exp(replacement)).unsqueeze(-1)

rwkv.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ static_assert(sizeof(decltype(ftell(NULL))) >= 8, "File offsets should be 64-bit
4949

5050
#include "rwkv_operators_wkv_v5.inc"
5151

52+
#include "rwkv_operators_wkv_v6.inc"
53+
5254
#include "rwkv_graph.inc"
5355

5456
// API function.

0 commit comments

Comments
 (0)