Introduce and use _aes_enc_full

pthariensflame · pthariensflame · commit 77e98ebbf9a1 · 2023-12-25T22:03:39.000-08:00
diff --git a/src/aarch64/aesni.jl b/src/aarch64/aesni.jl
@@ -211,22 +211,8 @@ get_ctr_uint64x2(o::AESNI1x)::Tuple{uint64x2} = (o.ctr,)
 get_key(o::Union{AESNI1x, AESNI4x})::NTuple{11,UInt128} = map(UInt128, get_key_uint64x2(o))
 get_ctr(o::Union{AESNI1x, AESNI4x})::Tuple{UInt128} = map(UInt128, get_ctr_uint64x2(o))
 
-@inline function aesni(key::NTuple{11,uint64x2}, ctr::Tuple{uint64x2})::Tuple{uint64x2}
-    key1, key2, key3, key4, key5, key6, key7, key8, key9, key10, key11 = key
-    ctr1 = only(ctr)
-    x = key1 ⊻ ctr1
-    x = _aes_enc(x, key2)
-    x = _aes_enc(x, key3)
-    x = _aes_enc(x, key4)
-    x = _aes_enc(x, key5)
-    x = _aes_enc(x, key6)
-    x = _aes_enc(x, key7)
-    x = _aes_enc(x, key8)
-    x = _aes_enc(x, key9)
-    x = _aes_enc(x, key10)
-    x = _aes_enc_last(x, key11)
-    (x,)
-end
+@inline aesni(key::NTuple{11,uint64x2}, ctr::Tuple{uint64x2})::Tuple{uint64x2} =
+    (_aes_enc_full(only(ctr), key),)
 
 """
     aesni(key::NTuple{11,UInt128}, ctr::Tuple{UInt128})::Tuple{UInt128}
diff --git a/src/aarch64/aesni_common.jl b/src/aarch64/aesni_common.jl
@@ -10,51 +10,51 @@ const uint64x2_lvec = NTuple{2, VecElement{UInt64}}
 struct uint64x2
     data::uint64x2_lvec
 end
-Base.convert(::Type{uint64x2}, x::UInt128) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x))))
-Base.convert(::Type{UInt128}, x::uint64x2) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x))))
-UInt128(x::uint64x2) = convert(UInt128, x)
-uint64x2(x::UInt128) = convert(uint64x2, x)
-Base.convert(::Type{uint64x2}, x::Union{Signed, Unsigned}) = convert(uint64x2, UInt128(x))
-Base.convert(::Type{T}, x::uint64x2) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x))
-
-uint64x2(hi::UInt64, lo::UInt64) = if LITTLE_ENDIAN
+@inline Base.convert(::Type{uint64x2}, x::UInt128) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x))))
+@inline Base.convert(::Type{UInt128}, x::uint64x2) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x))))
+@inline UInt128(x::uint64x2) = convert(UInt128, x)
+@inline uint64x2(x::UInt128) = convert(uint64x2, x)
+@inline Base.convert(::Type{uint64x2}, x::Union{Signed, Unsigned}) = convert(uint64x2, UInt128(x))
+@inline Base.convert(::Type{T}, x::uint64x2) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x))
+
+@inline uint64x2(hi::UInt64, lo::UInt64) = @static if LITTLE_ENDIAN
     uint64x2((VecElement(lo), VecElement(hi)))
 else
     uint64x2((VecElement(hi), VecElement(lo)))
 end
 
-Base.zero(::Type{uint64x2}) = convert(uint64x2, 0)
-Base.one(::Type{uint64x2}) = uint64x2(zero(UInt64), one(UInt64))
-Base.xor(a::uint64x2, b::uint64x2) = llvmcall(
+@inline Base.zero(::Type{uint64x2}) = convert(uint64x2, 0)
+@inline Base.one(::Type{uint64x2}) = uint64x2(zero(UInt64), one(UInt64))
+@inline Base.xor(a::uint64x2, b::uint64x2) = llvmcall(
     """%3 = xor <2 x i64> %1, %0
     ret <2 x i64> %3""",
     uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec},
     a.data, b.data,
 ) |> uint64x2
-(+)(a::uint64x2, b::uint64x2) = llvmcall(
+@inline (+)(a::uint64x2, b::uint64x2) = llvmcall(
     """%3 = add <2 x i64> %1, %0
     ret <2 x i64> %3""",
     uint64x2_lvec, Tuple{uint64x2_lvec, uint64x2_lvec},
     a.data, b.data,
 ) |> uint64x2
-(+)(a::uint64x2, b::Integer) = a + uint64x2(UInt128(b))
+@inline (+)(a::uint64x2, b::Integer) = a + uint64x2(UInt128(b))
 
 const uint8x16_lvec = NTuple{16, VecElement{UInt8}}
 struct uint8x16
     data::uint8x16_lvec
 end
-Base.convert(::Type{uint64x2}, x::uint8x16) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x))))
-Base.convert(::Type{uint8x16}, x::uint64x2) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x))))
-uint8x16(x::uint64x2) = convert(uint8x16, x)
-uint64x2(x::uint8x16) = convert(uint64x2, x)
-Base.convert(::Type{uint8x16}, x::UInt128) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x))))
-Base.convert(::Type{UInt128}, x::uint8x16) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x))))
-UInt128(x::uint8x16) = convert(UInt128, x)
-uint8x16(x::UInt128) = convert(uint8x16, x)
-Base.convert(::Type{uint8x16}, x::Union{Signed, Unsigned}) = convert(uint8x16, UInt128(x))
-Base.convert(::Type{T}, x::uint8x16) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x))
-
-function uint8x16(bytes::Vararg{UInt8, 16})
+@inline Base.convert(::Type{uint64x2}, x::uint8x16) = unsafe_load(Ptr{uint64x2}(pointer_from_objref(Ref(x))))
+@inline Base.convert(::Type{uint8x16}, x::uint64x2) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x))))
+@inline uint8x16(x::uint64x2) = convert(uint8x16, x)
+@inline uint64x2(x::uint8x16) = convert(uint64x2, x)
+@inline Base.convert(::Type{uint8x16}, x::UInt128) = unsafe_load(Ptr{uint8x16}(pointer_from_objref(Ref(x))))
+@inline Base.convert(::Type{UInt128}, x::uint8x16) = unsafe_load(Ptr{UInt128}(pointer_from_objref(Ref(x))))
+@inline UInt128(x::uint8x16) = convert(UInt128, x)
+@inline uint8x16(x::UInt128) = convert(uint8x16, x)
+@inline Base.convert(::Type{uint8x16}, x::Union{Signed, Unsigned}) = convert(uint8x16, UInt128(x))
+@inline Base.convert(::Type{T}, x::uint8x16) where T <: Union{Signed, Unsigned} = convert(T, UInt128(x))
+
+@inline function uint8x16(bytes::Vararg{UInt8, 16})
     bytes_prepped = bytes
     if LITTLE_ENDIAN
         bytes_prepped = reverse(bytes_prepped)
@@ -63,23 +63,23 @@ function uint8x16(bytes::Vararg{UInt8, 16})
     return uint8x16(bytes_vec)
 end
 
-Base.zero(::Type{uint8x16}) = convert(uint8x16, 0)
-Base.xor(a::uint8x16, b::uint8x16) = llvmcall(
+@inline Base.zero(::Type{uint8x16}) = convert(uint8x16, 0)
+@inline Base.xor(a::uint8x16, b::uint8x16) = llvmcall(
     """%3 = xor <16 x i8> %1, %0
     ret <16 x i8> %3""",
     uint8x16_lvec, Tuple{uint8x16_lvec, uint8x16_lvec},
     a.data, b.data,
 ) |> uint8x16
 
 # Raw NEON instrinsics, provided by FEAT_AES
-_vaese(a::uint8x16, b::uint8x16) = ccall(
+@inline _vaese(a::uint8x16, b::uint8x16) = ccall(
     "llvm.aarch64.crypto.aese",
     llvmcall,
     uint8x16_lvec,
     (uint8x16_lvec, uint8x16_lvec),
     a.data, b.data,
 ) |> uint8x16
-_vaesmc(a::uint8x16) = ccall(
+@inline _vaesmc(a::uint8x16) = ccall(
     "llvm.aarch64.crypto.aesmc",
     llvmcall,
     uint8x16_lvec,
@@ -104,7 +104,7 @@ uint8x16_t _mm_aeskeygenassist_helper(uint8x16_t a)
 ```
 Then made architecture-agnostic as LLVM IR.
 """
-_aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall(
+@inline _aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall(
     """%2 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 4, i32 1, i32 14, i32 11, i32 1, i32 14, i32 11, i32 4, i32 12, i32 9, i32 6, i32 3, i32 9, i32 6, i32 3, i32 12>
     ret <16 x i8> %2""",
     uint8x16_lvec, Tuple{uint8x16_lvec},
@@ -116,21 +116,39 @@ _aes_key_gen_shuffle_helper(a::uint8x16) = llvmcall(
 # Algorithm translations courtesy of the SIMD Everywhere and SSE2NEON projects:
 # https://github.com/simd-everywhere/simde/blob/v0.8.0-rc1/simde/x86/aes.h
 # https://github.com/DLTcollab/sse2neon/blob/v1.6.0/sse2neon.h
-function _aes_enc(a::uint64x2, round_key::uint64x2)
+@inline function _aes_enc(a::uint64x2, round_key::uint64x2)
     res = _vaesmc(_vaese(uint8x16(a), zero(uint8x16)))
     return uint64x2(res) ⊻ round_key
 end
-function _aes_enc_last(a::uint64x2, round_key::uint64x2)
+@inline function _aes_enc_last(a::uint64x2, round_key::uint64x2)
     res = _vaese(uint8x16(a), zero(uint8x16))
     return uint64x2(res) ⊻ round_key
 end
-
-function _aes_key_gen_assist(a::uint64x2, ::Val{R}) where {R}
+@inline function _aes_key_gen_assist(a::uint64x2, ::Val{R}) where {R}
     res = _aes_key_gen_shuffle_helper(_vaese(uint8x16(a), zero(uint8x16)))
     r = R % UInt64
     return uint64x2(res) ⊻ uint64x2(r, r)
 end
 
+"""
+    _aes_enc_full(a::uint64x2, round_keys::NTuple{N,uint64x2})::uint64x2 where {N}
+
+Full AES encryption flow using the N round keys in `round_keys` (N - 1 rounds; e.g. N = 11 for AES-128).
+"""
+@inline function _aes_enc_full(a::uint64x2, round_keys::NTuple{N,uint64x2})::uint64x2 where {N}
+    res = uint8x16(a)
+    for (i, key) in enumerate(round_keys)
+        if i ≢ N
+            res = _vaese(res, uint8x16(key))
+            if i ≢ N - 1
+                res = _vaesmc(res)
+            end
+        else
+            return uint64x2(res ⊻ uint8x16(key))
+        end
+    end
+end
+
 "Abstract RNG that generates one number at a time and is based on AESNI."
 abstract type AbstractAESNI1x <: R123Generator1x{UInt128} end
 "Abstract RNG that generates four numbers at a time and is based on AESNI."
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -23,7 +23,7 @@ using Printf: @printf
         (Philox4x(UInt32  , seed2) , philox  , (Val(10),)) ,
         (Philox4x(UInt64  , seed2) , philox  , (Val(10),)) ,
     ]
-    if R123_USE_AESNI
+    @static if R123_USE_AESNI
         append!(alg_choices, AlgChoice[
             (AESNI1x(seed1) , aesni , ()        ) ,
             (AESNI4x(seed4) , aesni , ()        ) ,
@@ -172,7 +172,7 @@ redirect_stdout(stdout_)
 compare_dirs("expected", "actual")
 cd(pwd_)
 
-if Random123.R123_USE_X86_AES_NI
+@static if Random123.R123_USE_X86_AES_NI
     include("./x86/aesni.jl")
     include("./x86/ars.jl")
 elseif Random123.R123_USE_AARCH64_FEAT_AES