// Auto-generated by `genhbc`, do not edit!
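//
// Note (editor's assumption, not produced by the generator): this include
// file supplies the bodies of symmetric half-band convolution functions for
// 1 to 14 filter taps, with an SSE2, a NEON and a plain scalar variant
// selected at compile time below. The R8BHBC1( name ) / R8BHBC2 macros are
// expected to be defined by the file that includes this one; they open and
// close each function and provide `op` (output pointer), `flt` (filter
// coefficients) and `rp` (read pointer centered on the current sample).
// Every variant evaluates the same sum:
//
//   op[ 1 ] = sum over i of flt[ i ] * ( rp[ i + 1 ] + rp[ -i ] )
//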
#if defined( R8B_SSE2 )
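// SSE2 path: each step loads two samples from each side of the center,
// reverses the left-hand pair with _mm_shuffle_pd(), adds the mirrored
// samples and multiplies the pair sums by two coefficients at once. Two
// accumulators (s1, s3) shorten the dependency chain; the final horizontal
// add stores a single double via _mm_storel_pd().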

R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve3 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve5 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve7 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve9 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve11 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

R8BHBC1( convolve13 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
	op[ 1 ] += flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	__m128d v1, v2, m1, s1;
	v2 = _mm_loadu_pd( rp - 1 ); v1 = _mm_loadu_pd( rp + 1 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 0 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = m1;
	__m128d v3, v4, m3, s3;
	v4 = _mm_loadu_pd( rp - 3 ); v3 = _mm_loadu_pd( rp + 3 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 2 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = m3;
	v2 = _mm_loadu_pd( rp - 5 ); v1 = _mm_loadu_pd( rp + 5 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 4 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 7 ); v3 = _mm_loadu_pd( rp + 7 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 6 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 9 ); v1 = _mm_loadu_pd( rp + 9 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 8 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	v4 = _mm_loadu_pd( rp - 11 ); v3 = _mm_loadu_pd( rp + 11 );
	m3 = _mm_mul_pd( _mm_load_pd( flt + 10 ),
		_mm_add_pd( v3, _mm_shuffle_pd( v4, v4, 1 )));
	s3 = _mm_add_pd( s3, m3 );
	v2 = _mm_loadu_pd( rp - 13 ); v1 = _mm_loadu_pd( rp + 13 );
	m1 = _mm_mul_pd( _mm_load_pd( flt + 12 ),
		_mm_add_pd( v1, _mm_shuffle_pd( v2, v2, 1 )));
	s1 = _mm_add_pd( s1, m1 );
	s1 = _mm_add_pd( s1, s3 );
	_mm_storel_pd( op + 1, _mm_add_pd( s1, _mm_shuffle_pd( s1, s1, 1 )));
R8BHBC2

#elif defined( R8B_NEON )
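// NEON path (AArch64): the same scheme using float64x2_t vectors, with
// vextq_f64() reversing the left-hand pair, vmlaq_f64() multiply-accumulating
// into the running sums, and vaddvq_f64() doing the final horizontal add.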

R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve3 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve5 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve7 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve9 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve11 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

R8BHBC1( convolve13 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 ) + flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	float64x2_t v1, v2, s1;
	s1 = vdupq_n_f64( 0.0 );
	v2 = vld1q_f64( rp - 1 ); v1 = vld1q_f64( rp + 1 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 0 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	float64x2_t v3, v4, s3;
	s3 = vdupq_n_f64( 0.0 );
	v4 = vld1q_f64( rp - 3 ); v3 = vld1q_f64( rp + 3 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 2 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 5 ); v1 = vld1q_f64( rp + 5 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 4 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 7 ); v3 = vld1q_f64( rp + 7 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 6 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 9 ); v1 = vld1q_f64( rp + 9 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 8 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	v4 = vld1q_f64( rp - 11 ); v3 = vld1q_f64( rp + 11 );
	s3 = vmlaq_f64( s3, vld1q_f64( flt + 10 ),
		vaddq_f64( v3, vextq_f64( v4, v4, 1 )));
	v2 = vld1q_f64( rp - 13 ); v1 = vld1q_f64( rp + 13 );
	s1 = vmlaq_f64( s1, vld1q_f64( flt + 12 ),
		vaddq_f64( v1, vextq_f64( v2, v2, 1 )));
	s1 = vaddq_f64( s1, s3 );
	op[ 1 ] = vaddvq_f64( s1 );
R8BHBC2

#else // SIMD
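// Scalar fallback: the symmetric FIR sum written out directly, one
// coefficient and one mirrored sample pair per term.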

R8BHBC1( convolve1 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ]);
R8BHBC2

R8BHBC1( convolve2 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ]);
R8BHBC2

R8BHBC1( convolve3 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ]);
R8BHBC2

R8BHBC1( convolve4 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ]);
R8BHBC2

R8BHBC1( convolve5 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ]);
R8BHBC2

R8BHBC1( convolve6 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ]);
R8BHBC2

R8BHBC1( convolve7 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ]);
R8BHBC2

R8BHBC1( convolve8 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ]);
R8BHBC2

R8BHBC1( convolve9 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ]);
R8BHBC2

R8BHBC1( convolve10 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ]);
R8BHBC2

R8BHBC1( convolve11 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ]);
R8BHBC2

R8BHBC1( convolve12 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ]);
R8BHBC2

R8BHBC1( convolve13 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
		+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ]);
R8BHBC2

R8BHBC1( convolve14 )
	op[ 1 ] = flt[ 0 ] * ( rp[ 1 ] + rp[ 0 ])
		+ flt[ 1 ] * ( rp[ 2 ] + rp[ -1 ])
		+ flt[ 2 ] * ( rp[ 3 ] + rp[ -2 ])
		+ flt[ 3 ] * ( rp[ 4 ] + rp[ -3 ])
		+ flt[ 4 ] * ( rp[ 5 ] + rp[ -4 ])
		+ flt[ 5 ] * ( rp[ 6 ] + rp[ -5 ])
		+ flt[ 6 ] * ( rp[ 7 ] + rp[ -6 ])
		+ flt[ 7 ] * ( rp[ 8 ] + rp[ -7 ])
		+ flt[ 8 ] * ( rp[ 9 ] + rp[ -8 ])
		+ flt[ 9 ] * ( rp[ 10 ] + rp[ -9 ])
		+ flt[ 10 ] * ( rp[ 11 ] + rp[ -10 ])
		+ flt[ 11 ] * ( rp[ 12 ] + rp[ -11 ])
		+ flt[ 12 ] * ( rp[ 13 ] + rp[ -12 ])
		+ flt[ 13 ] * ( rp[ 14 ] + rp[ -13 ]);
R8BHBC2

#endif // SIMD
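//
// For reference only (editor's sketch, kept in a comment and not part of the
// generated code): a plain-C equivalent of the sum every convolveN body above
// computes, assuming `fltlen` taps and the same pointer conventions.
//
//	double hbc_ref( const double* const flt, const double* const rp,
//		const int fltlen )
//	{
//		double s = 0.0;
//		int i;
//
//		for( i = 0; i < fltlen; i++ )
//		{
//			s += flt[ i ] * ( rp[ i + 1 ] + rp[ -i ]);
//		}
//
//		return( s );
//	}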