我想优化下面这个函数里的第二个(内层)for 循环。
思路是用一个 xmm 寄存器同时存放四次循环的数据,从而消除这个 for 循环。原始代码如下:
/*
 * Chroma deblocking filter for one edge, handled as two segments of four
 * sample positions each.
 *
 * Parameters:
 *   _pix      byte pointer to the samples at the edge; reinterpreted as pixel*.
 *   _xstride  stride in bytes, converted to units of pixel. It is not used
 *             directly in this body.
 *             NOTE(review): P0/P1/Q0/Q1 are macros defined outside this block;
 *             presumably they address samples relative to `pix` using
 *             `xstride`, so both local names must be kept -- confirm.
 *   _ystride  stride in bytes, converted to units of pixel; `pix` advances by
 *             this amount after each filtered position.
 *   _tc       two clipping thresholds, one per segment. Each is scaled by
 *             1 << (BIT_DEPTH - 8); a segment whose scaled value is <= 0 is
 *             skipped entirely.
 *   _no_p     two flags, one per segment; non-zero leaves P0 unmodified.
 *   _no_q     two flags, one per segment; non-zero leaves Q0 unmodified.
 *
 * Changes relative to the previous revision:
 *   - no_p / no_q were loaded but never consulted, so both sides were always
 *     overwritten. The writes are now guarded by the flags.
 *   - Left shifts of possibly negative signed values (undefined behaviour in
 *     C) are replaced by equivalent multiplications.
 */
static Void FUNC(hevc_loop_filter_chroma)(U08 *_pix, PTR_DIFF _xstride,
                                          PTR_DIFF _ystride, S32 *_tc,
                                          U08 *_no_p, U08 *_no_q)
{
    S32 d, j, no_p, no_q;
    pixel *pix = (pixel *)_pix;
    PTR_DIFF xstride = _xstride / sizeof(pixel);
    PTR_DIFF ystride = _ystride / sizeof(pixel);

    for (j = 0; j < 2; j++)
    {
        /* Multiply instead of shifting: _tc[j] may be negative. */
        const S32 tc = _tc[j] * (1 << (BIT_DEPTH - 8));
        if (tc <= 0)
        {
            /* Nothing to filter in this segment; step over its 4 positions. */
            pix += 4 * ystride;
            continue;
        }
        no_p = _no_p[j];
        no_q = _no_q[j];

        for (d = 0; d < 4; d++)
        {
            S32 delta0;
            const S32 p1 = P1;
            const S32 p0 = P0;
            const S32 q0 = Q0;
            const S32 q1 = Q1;

            /* (q0 - p0) can be negative, so use * 4 rather than << 2. */
            delta0 = CLIP3(-tc, tc, (((q0 - p0) * 4) + p1 - q1 + 4) >> 3);

            /* Only touch a side whose "no filter" flag is clear. */
            if (!no_p)
                P0 = HEVC_clip_pixel(p0 + delta0);
            if (!no_q)
                Q0 = HEVC_clip_pixel(q0 - delta0);

            pix += ystride;
        }
    }
}
优化后的代码如下:
Void HEVCDEC_h_loop_filter_chroma_8_see4(U08 *_pix, PTR_DIFF xstride, S32 *_tc, U08 *_no_p, U08 *_no_q)
{
S32 j, no_p, no_q;
S32 delta0;
U32 ystride = 1;
pixel *pix = (pixel *)_pix;
DECLSPEC_ALIGN_32 S32 n_4[4] = {4, 4, 4, 4};
DECLSPEC_ALIGN_32 S32 n_FF[4] = {0xFF, 0xFF, 0xFF, 0xFF};
DECLSPEC_ALIGN_32 S08 n_8FF[4] = {0xFF, 0xFF, 0xFF, 0xFF};
for (j = 0; j < 2; j++)
{
const S32 tc = _tc[j] << (BIT_DEPTH - 8);
if (tc <= 0)
{
pix += 4 * ystride;
continue;
}
no_p = _no_p[j];
no_q = _no_q[j];
__asm
{
mov ESI_TBD, pix
psubd xmm0, xmm0 //P0
sub ESI_TBD, xstride
pinsrb xmm0, [ESI_TBD], 0
pinsrb xmm0, [ESI_TBD + 1], 4
pinsrb xmm0, [ESI_TBD + 2], 8
pinsrb xmm0, [ESI_TBD + 3], 12
psubd xmm1, xmm1 //P1
sub ESI_TBD, xstride
pinsrb xmm1, [ESI_TBD], 0
pinsrb xmm1, [ESI_TBD + 1], 4
pinsrb xmm1, [ESI_TBD + 2], 8
pinsrb xmm1, [ESI_TBD + 3], 12
psubd xmm2, xmm2 //Q0
add ESI_TBD, xstride
add ESI_TBD, xstride
pinsrb xmm2, [ESI_TBD], 0
pinsrb xmm2, [ESI_TBD + 1], 4
pinsrb xmm2, [ESI_TBD + 2], 8
pinsrb xmm2, [ESI_TBD + 3], 12
psubd xmm3, xmm3 //Q1
add ESI_TBD, xstride
pinsrb xmm3, [ESI_TBD], 0
pinsrb xmm3, [ESI_TBD + 1], 4
pinsrb xmm3, [ESI_TBD + 2], 8
pinsrb xmm3, [ESI_TBD + 3], 12
psubd xmm4, xmm4
psubd xmm5, xmm5
psubd xmm6, xmm6
psubd xmm7, xmm7
movdqa xmm4, xmm0 //p0 p1 q0 q1
movdqa xmm5, xmm1
movdqa xmm6, xmm2
movdqa xmm7, xmm3
psubd xmm6, xmm4
pslld xmm6, 2
paddd xmm6, xmm5
psubd xmm6, xmm7
psubd xmm1, xmm1
movdqa xmm1, n_4
paddd xmm6, xmm1
psrld xmm6, 3 //((q0 - p0) << 2) + p1 - q1 + 4) >> 3)
psubd xmm1, xmm1
PINSRD xmm1, tc, 0 //把tc和-tc存到寄存器中
PINSRD xmm1, tc, 1
PINSRD xmm1, tc, 2
PINSRD xmm1, tc, 3
psubd xmm3, xmm3
psubd xmm3, xmm1
psubd xmm4, xmm4
psubd xmm5, xmm5
movdqa xmm4, xmm6 // A
movdqa xmm5, xmm3 //-tc
pcmpgtd xmm5, xmm4 //CLIP3(-tc, tc, (((q0 - p0) << 2) + p1 - q1 + 4) >> 3) mask0
pand xmm3, xmm5 //mask0 & -tc