@@ -16,13 +16,25 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
1616*/
1717#include <stdint.h>
1818
19- #ifdef _MSC_VER
20- #if (defined(_M_AMD64 ) || defined(_M_X64 ) || (_M_IX86_FP == 2 ))
21- #define __SSE2__ // MSVC does not have __SSE2__ macro
22- #endif
23- #include <emmintrin.h>
24- #else
25- #include <x86intrin.h>
19+ #if defined(__aarch64__ ) || defined(_M_ARM64 )
20+ #define HAVE_NEON 1
21+ #include <arm_neon.h>
22+
23+ #elif defined(_MSC_VER )
24+ /* MSVC */
25+ #if defined(_M_AMD64 ) || defined(_M_X64 ) || \
26+ (defined(_M_IX86_FP ) && (_M_IX86_FP == 2 ))
27+ #define HAVE_SSE2 1
28+ #include <emmintrin.h>
29+ #endif
30+
31+ #elif defined(__x86_64__ )
32+ /* GCC / Clang */
33+ #if defined(__SSE2__ )
34+ #define HAVE_SSE2 1
35+ #include <x86intrin.h>
36+ #endif
37+
2638#endif
2739
2840void map_yuv420_yuyv (uint8_t * * data , uint32_t * linesize , uint8_t * dst ,
@@ -50,7 +62,7 @@ void map_yuv420_yuyv(uint8_t** data, uint32_t *linesize, uint8_t* dst,
5062 }
5163
5264 // Each row N and N+1 use the same UV values (4:2:0 -> 4:2:2)
53- #ifdef __SSE2__
65+ #if HAVE_SSE2
5466 if (is_aligned_128b )
5567 {
5668 for (int y = 0 ; y < (height >>1 ); ++ y ) {
@@ -65,7 +77,6 @@ void map_yuv420_yuyv(uint8_t** data, uint32_t *linesize, uint8_t* dst,
6577 __m128i yuv0 = _mm_unpacklo_epi8(y, uv); \
6678 __m128i yuv1 = _mm_unpackhi_epi8(y, uv); \
6779 _mm_stream_si128((__m128i*)(dst + (x<<1)), yuv0); \
68- \
6980 _mm_stream_si128((__m128i*)(dst + (x<<1) + 16), yuv1); \
7081 } \
7182 if (shift_x) dst += shift_x2;
@@ -83,10 +94,45 @@ void map_yuv420_yuyv(uint8_t** data, uint32_t *linesize, uint8_t* dst,
8394
8495 return ;
8596 }
97+ #elif HAVE_NEON
98+ if (is_aligned_128b )
99+ {
100+ for (int y = 0 ; y < (height >>1 ); ++ y ) {
101+ #define CONVERT_ROW \
102+ if (shift_x) dst += shift_x; \
103+ for (int x = 0; x < width; x += 16) { \
104+ uint8x16_t yq = vld1q_u8(src_y + x); \
105+ uint8x8_t u8 = vld1_u8(src_u + (x >> 1)); \
106+ uint8x8_t v8 = vld1_u8(src_v + (x >> 1)); \
107+ /* interleave U and V */ \
108+ uint8x8x2_t uvz = vzip_u8 (u8 , v8 ); \
109+ /* combine into one 16-byte vector */ \
110+ uint8x16_t uvq = vcombine_u8 (uvz .val [0 ], uvz .val [1 ]); \
111+ /* interleave Y and UV bytes */ \
112+ uint8x16x2_t yuv = vzipq_u8 (yq , uvq ); \
113+ vst1q_u8 (dst + (x << 1 ), yuv .val [0 ]); \
114+ vst1q_u8 (dst + (x << 1 ) + 16 , yuv .val [1 ]); \
115+ } \
116+ if (shift_x ) dst += shift_x2 ;
117+
118+ CONVERT_ROW
119+ dst += linesize_dst ;
120+ src_y += linesize [0 ];
121+
122+ CONVERT_ROW
123+ dst += linesize_dst ;
124+ src_y += linesize [0 ];
125+ src_u += linesize [1 ];
126+ src_v += linesize [2 ];
127+ }
128+
129+ return ;
130+ }
86131 #else // neither HAVE_SSE2 nor HAVE_NEON
87132 (void ) is_aligned_128b ;
88133 #endif
89134
135+ #undef CONVERT_ROW
90136 for (int y = 0 ; y < (height >>1 ); y ++ ) {
91137 #define CONVERT_ROW \
92138 if (shift_x) dst += shift_x; \
0 commit comments