@@ -47,11 +47,46 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
4747
4848 if ( (inc_x == 1 ) && (inc_y == 1 ) )
4949 {
50- #if V_SIMD && !defined(DSDOT )
51- const int vstep = v_nlanes_f32 ;
52- const int unrollx4 = n & (- vstep * 4 );
53- const int unrollx = n & - vstep ;
54- v_f32 vsum0 = v_zero_f32 ();
50+ #if defined(DOUBLE ) && V_SIMD && V_SIMD_F64 && !defined(DSDOT )
51+ const int vstep = v_nlanes_f64 ;
52+ const int unrollx4 = n & (- vstep * 4 );
53+ const int unrollx = n & - vstep ;
54+ v_f64 vsum0 = v_zero_f64 ();
55+ v_f64 vsum1 = v_zero_f64 ();
56+ v_f64 vsum2 = v_zero_f64 ();
57+ v_f64 vsum3 = v_zero_f64 ();
58+ while (i < unrollx4 )
59+ {
60+ vsum0 = v_muladd_f64 (
61+ v_loadu_f64 (x + i ), v_loadu_f64 (y + i ), vsum0
62+ );
63+ vsum1 = v_muladd_f64 (
64+ v_loadu_f64 (x + i + vstep ), v_loadu_f64 (y + i + vstep ), vsum1
65+ );
66+ vsum2 = v_muladd_f64 (
67+ v_loadu_f64 (x + i + vstep * 2 ), v_loadu_f64 (y + i + vstep * 2 ), vsum2
68+ );
69+ vsum3 = v_muladd_f64 (
70+ v_loadu_f64 (x + i + vstep * 3 ), v_loadu_f64 (y + i + vstep * 3 ), vsum3
71+ );
72+ i += vstep * 4 ;
73+ }
74+ vsum0 = v_add_f64 (
75+ v_add_f64 (vsum0 , vsum1 ), v_add_f64 (vsum2 , vsum3 )
76+ );
77+ while (i < unrollx )
78+ {
79+ vsum0 = v_muladd_f64 (
80+ v_loadu_f64 (x + i ), v_loadu_f64 (y + i ), vsum0
81+ );
82+ i += vstep ;
83+ }
84+ dot = v_sum_f64 (vsum0 );
85+ #elif V_SIMD && !defined(DSDOT )
86+ const int vstep = v_nlanes_f32 ;
87+ const int unrollx4 = n & (- vstep * 4 );
88+ const int unrollx = n & - vstep ;
89+ v_f32 vsum0 = v_zero_f32 ();
5590 v_f32 vsum1 = v_zero_f32 ();
5691 v_f32 vsum2 = v_zero_f32 ();
5792 v_f32 vsum3 = v_zero_f32 ();
@@ -82,10 +117,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
82117 i += vstep ;
83118 }
84119 dot = v_sum_f32 (vsum0 );
85- #elif defined(DSDOT )
86- int n1 = n & -4 ;
87- for (; i < n1 ; i += 4 )
88- {
120+ #elif defined(DSDOT )
121+ int n1 = n & -4 ;
122+ for (; i < n1 ; i += 4 )
123+ {
89124 dot += (double ) y [i ] * (double ) x [i ]
90125 + (double ) y [i + 1 ] * (double ) x [i + 1 ]
91126 + (double ) y [i + 2 ] * (double ) x [i + 2 ]
@@ -133,5 +168,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
133168 return (dot );
134169
135170}
136-
137-
0 commit comments