750 lines
21 KiB
Plaintext
750 lines
21 KiB
Plaintext
|
Index: src/x86/ipred_avx2.asm
|
||
|
--- src/x86/ipred_avx2.asm.orig
|
||
|
+++ src/x86/ipred_avx2.asm
|
||
|
@@ -215,19 +215,24 @@ cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl,
|
||
|
add wq, r5
|
||
|
jmp r6
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+32] ; unaligned when jumping here from dc_top
|
||
|
pmaddubsw m1, m2
|
||
|
paddw m0, m1
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
vextracti128 xm1, m0, 1
|
||
|
paddw xm0, xm1
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
punpckhqdq xm1, xm0, xm0
|
||
|
paddw xm0, xm1
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
psrlq xm1, xm0, 32
|
||
|
paddw xm0, xm1
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
pmaddwd xm0, xm2
|
||
|
pmulhrsw xm0, xm3
|
||
|
lea stride3q, [strideq*3]
|
||
|
@@ -254,10 +259,12 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp r6
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
movd xm0, [tlq-4]
|
||
|
pmaddubsw xm0, xm3
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
movd xm1, [tlq+1]
|
||
|
pmaddubsw xm1, xm3
|
||
|
psubw xm0, xm4
|
||
|
@@ -281,6 +288,7 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
|
||
|
.w4_end:
|
||
|
vpbroadcastb xm0, xm0
|
||
|
.s4:
|
||
|
+ _CET_ENDBR
|
||
|
movd [dstq+strideq*0], xm0
|
||
|
movd [dstq+strideq*1], xm0
|
||
|
movd [dstq+strideq*2], xm0
|
||
|
@@ -291,10 +299,12 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
movq xm0, [tlq-8]
|
||
|
pmaddubsw xm0, xm3
|
||
|
jmp wq
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
movq xm1, [tlq+1]
|
||
|
vextracti128 xm2, m0, 1
|
||
|
pmaddubsw xm1, xm3
|
||
|
@@ -318,6 +328,7 @@ ALIGN function_align
|
||
|
.w8_end:
|
||
|
vpbroadcastb xm0, xm0
|
||
|
.s8:
|
||
|
+ _CET_ENDBR
|
||
|
movq [dstq+strideq*0], xm0
|
||
|
movq [dstq+strideq*1], xm0
|
||
|
movq [dstq+strideq*2], xm0
|
||
|
@@ -328,10 +339,12 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
mova xm0, [tlq-16]
|
||
|
pmaddubsw xm0, xm3
|
||
|
jmp wq
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movu xm1, [tlq+1]
|
||
|
vextracti128 xm2, m0, 1
|
||
|
pmaddubsw xm1, xm3
|
||
|
@@ -355,6 +368,7 @@ ALIGN function_align
|
||
|
.w16_end:
|
||
|
vpbroadcastb xm0, xm0
|
||
|
.s16:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0], xm0
|
||
|
mova [dstq+strideq*1], xm0
|
||
|
mova [dstq+strideq*2], xm0
|
||
|
@@ -365,10 +379,12 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-32]
|
||
|
pmaddubsw m0, m3
|
||
|
jmp wq
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+1]
|
||
|
pmaddubsw m1, m3
|
||
|
paddw m0, m1
|
||
|
@@ -391,6 +407,7 @@ ALIGN function_align
|
||
|
.w32_end:
|
||
|
vpbroadcastb m0, xm0
|
||
|
.s32:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0], m0
|
||
|
mova [dstq+strideq*1], m0
|
||
|
mova [dstq+strideq*2], m0
|
||
|
@@ -401,6 +418,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-64]
|
||
|
mova m1, [tlq-32]
|
||
|
pmaddubsw m0, m3
|
||
|
@@ -408,6 +426,7 @@ ALIGN function_align
|
||
|
paddw m0, m1
|
||
|
jmp wq
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 1]
|
||
|
movu m2, [tlq+33]
|
||
|
pmaddubsw m1, m3
|
||
|
@@ -433,6 +452,7 @@ ALIGN function_align
|
||
|
vpbroadcastb m0, xm0
|
||
|
mova m1, m0
|
||
|
.s64:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0+32*0], m0
|
||
|
mova [dstq+strideq*0+32*1], m1
|
||
|
mova [dstq+strideq*1+32*0], m0
|
||
|
@@ -495,15 +515,20 @@ cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
IPRED_H 4, d
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
IPRED_H 8, q
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
IPRED_H 16, a
|
||
|
INIT_YMM avx2
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
IPRED_H 32, a
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastb m0, [tlq-1]
|
||
|
vpbroadcastb m1, [tlq-2]
|
||
|
vpbroadcastb m2, [tlq-3]
|
||
|
@@ -554,6 +579,7 @@ cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w,
|
||
|
add wq, r5
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastd m6, [tlq+1] ; top
|
||
|
mova m8, [base+ipred_h_shuf]
|
||
|
lea r3, [strideq*3]
|
||
|
@@ -584,6 +610,7 @@ cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w,
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m6, [tlq+1]
|
||
|
mova m8, [base+ipred_h_shuf]
|
||
|
lea r3, [strideq*3]
|
||
|
@@ -606,6 +633,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti128 m6, [tlq+1]
|
||
|
mova xm8, xm4 ; lower half = 1, upper half = 0
|
||
|
psubusb m7, m5, m6
|
||
|
@@ -624,6 +652,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m6, [tlq+1]
|
||
|
psubusb m7, m5, m6
|
||
|
psubusb m0, m6, m5
|
||
|
@@ -639,6 +668,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m6, [tlq+ 1]
|
||
|
movu m7, [tlq+33]
|
||
|
%if WIN64
|
||
|
@@ -691,6 +721,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl,
|
||
|
add wq, r6
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastd m2, [tlq+1]
|
||
|
punpcklbw m2, m5 ; top, bottom
|
||
|
mova m5, [base+ipred_v_shuf]
|
||
|
@@ -724,6 +755,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl,
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m2, [tlq+1]
|
||
|
punpcklbw m2, m5
|
||
|
mova m5, [base+ipred_v_shuf]
|
||
|
@@ -749,6 +781,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
WIN64_SPILL_XMM 7
|
||
|
vbroadcasti128 m3, [tlq+1]
|
||
|
mova m6, [base+ipred_v_shuf]
|
||
|
@@ -772,6 +805,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset stack_offset - stack_size_padded
|
||
|
WIN64_SPILL_XMM 6
|
||
|
movu m3, [tlq+1]
|
||
|
@@ -793,6 +827,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
WIN64_SPILL_XMM 11
|
||
|
movu m4, [tlq+ 1]
|
||
|
movu m8, [tlq+33]
|
||
|
@@ -848,6 +883,7 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl,
|
||
|
add wq, r6
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
WIN64_SPILL_XMM 8
|
||
|
vpbroadcastq m6, [base+smooth_weights+4*2]
|
||
|
mova m7, [base+ipred_h_shuf]
|
||
|
@@ -891,6 +927,7 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl,
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset stack_offset - stack_size_padded
|
||
|
WIN64_SPILL_XMM 8
|
||
|
vbroadcasti128 m6, [base+smooth_weights+8*2]
|
||
|
@@ -927,6 +964,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
SETUP_STACK_FRAME 32*4, 7, 8
|
||
|
lea r3, [rsp+64*2-4]
|
||
|
call .prep ; only worthwhile for for w16 and above
|
||
|
@@ -951,6 +989,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
SETUP_STACK_FRAME 32*4, 7, 6
|
||
|
lea r3, [rsp+64*2-2]
|
||
|
call .prep
|
||
|
@@ -971,6 +1010,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
SETUP_STACK_FRAME 32*4, 7, 9
|
||
|
lea r3, [rsp+64*2-2]
|
||
|
call .prep
|
||
|
@@ -1062,6 +1102,7 @@ cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w
|
||
|
lea v_weightsq, [base+smooth_weights+hq*2]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
WIN64_SPILL_XMM 12
|
||
|
mova m10, [base+ipred_h_shuf]
|
||
|
vpbroadcastq m11, [base+smooth_weights+4*2]
|
||
|
@@ -1113,6 +1154,7 @@ cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset stack_offset - stack_size_padded
|
||
|
WIN64_SPILL_XMM 12
|
||
|
mova m10, [base+ipred_h_shuf]
|
||
|
@@ -1157,6 +1199,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
SETUP_STACK_FRAME 32*4, 7, 14
|
||
|
vbroadcasti128 m11, [tlq+1]
|
||
|
lea r3, [rsp+64*2-4]
|
||
|
@@ -1197,6 +1240,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
SETUP_STACK_FRAME 32*4, 7, 11
|
||
|
movu m8, [tlq+1]
|
||
|
lea r3, [rsp+64*2-2]
|
||
|
@@ -1232,6 +1276,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
SETUP_STACK_FRAME 32*8, 7, 16
|
||
|
movu m13, [tlq+1 ]
|
||
|
movu m15, [tlq+33]
|
||
|
@@ -1335,6 +1380,7 @@ cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h,
|
||
|
vpbroadcastd m5, [pw_64]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
cmp angleb, 40
|
||
|
jae .w4_no_upsample
|
||
|
lea r3d, [angleq-1024]
|
||
|
@@ -1518,6 +1564,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
lea r3d, [angleq+216]
|
||
|
mov r3b, hb
|
||
|
cmp r3d, 8
|
||
|
@@ -1696,6 +1743,7 @@ ALIGN function_align
|
||
|
jmp .w16_main
|
||
|
ALIGN function_align
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset org_stack_offset
|
||
|
ALLOC_STACK -64, 12
|
||
|
lea maxbased, [hq+15]
|
||
|
@@ -1816,6 +1864,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset org_stack_offset
|
||
|
ALLOC_STACK -96, 15
|
||
|
lea r3d, [hq+31]
|
||
|
@@ -1960,6 +2009,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset org_stack_offset
|
||
|
ALLOC_STACK -128, 16
|
||
|
lea maxbased, [hq+63]
|
||
|
@@ -2175,6 +2225,7 @@ cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl
|
||
|
neg dyd
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6
|
||
|
vbroadcasti128 m10, [base+z1_shuf_w4]
|
||
|
vbroadcasti128 m11, [base+z2_shuf_h4]
|
||
|
@@ -2424,6 +2475,7 @@ ALIGN function_align
|
||
|
.w4_end:
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6
|
||
|
movd xm5, dyd
|
||
|
vbroadcasti128 m10, [base+z_filter_s+2]
|
||
|
@@ -2655,6 +2707,7 @@ ALIGN function_align
|
||
|
.w8_end:
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
mov r8d, hd
|
||
|
test angled, 0x400
|
||
|
jnz .w16_main
|
||
|
@@ -2901,6 +2954,7 @@ ALIGN function_align
|
||
|
.w16_ret:
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
mova m2, [tlq+32]
|
||
|
lea r8d, [hq+(1<<8)]
|
||
|
mova [rsp+96], m2
|
||
|
@@ -2946,6 +3000,7 @@ ALIGN function_align
|
||
|
movu [rsp+65], m0
|
||
|
jmp .w16_filter_left
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
mova m2, [tlq+32]
|
||
|
mov r3d, [tlq+64]
|
||
|
lea r8d, [hq+(3<<8)]
|
||
|
@@ -3021,6 +3076,7 @@ cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h,
|
||
|
mov org_wd, wd
|
||
|
jmp hq
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
lea r7, [strideq*3]
|
||
|
cmp angleb, 40
|
||
|
jae .h4_no_upsample
|
||
|
@@ -3211,6 +3267,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
lea r4d, [angleq+216]
|
||
|
mov r4b, wb
|
||
|
cmp r4d, 8
|
||
|
@@ -3455,6 +3512,7 @@ ALIGN function_align
|
||
|
jmp .h16_main
|
||
|
ALIGN function_align
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset org_stack_offset
|
||
|
ALLOC_STACK -64, 12
|
||
|
lea maxbased, [wq+15]
|
||
|
@@ -3661,6 +3719,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset org_stack_offset
|
||
|
ALLOC_STACK -96, 15
|
||
|
lea maxbased, [wq+31]
|
||
|
@@ -3890,6 +3949,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset org_stack_offset
|
||
|
ALLOC_STACK -128, 16
|
||
|
lea maxbased, [wq+63]
|
||
|
@@ -4234,6 +4294,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w
|
||
|
mov hd, hm
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
WIN64_SPILL_XMM 9
|
||
|
mova xm8, [base+filter_shuf2]
|
||
|
sub tlq, 3
|
||
|
@@ -4251,6 +4312,7 @@ cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%assign stack_offset stack_offset - stack_size_padded
|
||
|
WIN64_SPILL_XMM 10
|
||
|
mova m8, [base+filter_shuf1]
|
||
|
@@ -4278,6 +4340,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
%if WIN64
|
||
|
%assign stack_offset stack_offset - stack_size_padded
|
||
|
%assign xmm_regs_used 15
|
||
|
@@ -4350,6 +4413,7 @@ ALIGN function_align
|
||
|
ret
|
||
|
ALIGN function_align
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
sub rsp, stack_size_padded
|
||
|
sub hd, 2
|
||
|
lea r3, [dstq+16]
|
||
|
@@ -4474,15 +4538,19 @@ cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl,
|
||
|
movifnidn acq, acmp
|
||
|
jmp r6
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
vextracti128 xm1, m0, 1
|
||
|
paddw xm0, xm1
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
punpckhqdq xm1, xm0, xm0
|
||
|
paddw xm0, xm1
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
psrlq xm1, xm0, 32
|
||
|
paddw xm0, xm1
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
pmaddwd xm0, xm2
|
||
|
pmulhrsw xm0, xm3
|
||
|
vpbroadcastw m0, xm0
|
||
|
@@ -4507,10 +4575,12 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
|
||
|
movifnidn acq, acmp
|
||
|
jmp r6
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
movd xm0, [tlq-4]
|
||
|
pmaddubsw xm0, xm3
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
movd xm1, [tlq+1]
|
||
|
pmaddubsw xm1, xm3
|
||
|
psubw xm0, xm4
|
||
|
@@ -4534,6 +4604,7 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
|
||
|
.w4_end:
|
||
|
vpbroadcastw m0, xm0
|
||
|
.s4:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastw m1, alpham
|
||
|
lea r6, [strideq*3]
|
||
|
pabsw m2, m1
|
||
|
@@ -4554,10 +4625,12 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
movq xm0, [tlq-8]
|
||
|
pmaddubsw xm0, xm3
|
||
|
jmp wq
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
movq xm1, [tlq+1]
|
||
|
vextracti128 xm2, m0, 1
|
||
|
pmaddubsw xm1, xm3
|
||
|
@@ -4581,6 +4654,7 @@ ALIGN function_align
|
||
|
.w8_end:
|
||
|
vpbroadcastw m0, xm0
|
||
|
.s8:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastw m1, alpham
|
||
|
lea r6, [strideq*3]
|
||
|
pabsw m2, m1
|
||
|
@@ -4603,10 +4677,12 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
mova xm0, [tlq-16]
|
||
|
pmaddubsw xm0, xm3
|
||
|
jmp wq
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movu xm1, [tlq+1]
|
||
|
vextracti128 xm2, m0, 1
|
||
|
pmaddubsw xm1, xm3
|
||
|
@@ -4630,6 +4706,7 @@ ALIGN function_align
|
||
|
.w16_end:
|
||
|
vpbroadcastw m0, xm0
|
||
|
.s16:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastw m1, alpham
|
||
|
pabsw m2, m1
|
||
|
psllw m2, 9
|
||
|
@@ -4649,10 +4726,12 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-32]
|
||
|
pmaddubsw m0, m3
|
||
|
jmp wq
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+1]
|
||
|
pmaddubsw m1, m3
|
||
|
paddw m0, m1
|
||
|
@@ -4675,6 +4754,7 @@ ALIGN function_align
|
||
|
.w32_end:
|
||
|
vpbroadcastw m0, xm0
|
||
|
.s32:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastw m1, alpham
|
||
|
pabsw m2, m1
|
||
|
psllw m2, 9
|
||
|
@@ -4720,6 +4800,7 @@ cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride,
|
||
|
|
||
|
DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
lea stride3q, [strideq*3]
|
||
|
.w4_loop:
|
||
|
movq xm0, [yq]
|
||
|
@@ -4747,6 +4828,7 @@ cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride,
|
||
|
jmp .calc_avg
|
||
|
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
lea stride3q, [strideq*3]
|
||
|
test wpadd, wpadd
|
||
|
jnz .w8_wpad
|
||
|
@@ -4797,6 +4879,7 @@ cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride,
|
||
|
jmp .calc_avg
|
||
|
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
test wpadd, wpadd
|
||
|
jnz .w16_wpad
|
||
|
.w16_loop:
|
||
|
@@ -4824,14 +4907,17 @@ cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride,
|
||
|
add iptrq, wpadq
|
||
|
jmp iptrq
|
||
|
.w16_pad3:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m0, [yq]
|
||
|
vpbroadcastq m1, [yq+strideq]
|
||
|
jmp .w16_wpad_end
|
||
|
.w16_pad2:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti128 m0, [yq]
|
||
|
vbroadcasti128 m1, [yq+strideq]
|
||
|
jmp .w16_wpad_end
|
||
|
.w16_pad1:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [yq]
|
||
|
mova m1, [yq+strideq]
|
||
|
; fall-through
|
||
|
@@ -4902,6 +4988,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
|
||
|
DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
lea stride3q, [strideq*3]
|
||
|
.w4_loop:
|
||
|
movq xm1, [yq]
|
||
|
@@ -4930,6 +5017,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
jmp .calc_avg
|
||
|
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
lea stride3q, [strideq*3]
|
||
|
test wpadd, wpadd
|
||
|
jnz .w8_wpad
|
||
|
@@ -4983,6 +5071,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
jmp .calc_avg
|
||
|
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
test wpadd, wpadd
|
||
|
jnz .w16_wpad
|
||
|
.w16_loop:
|
||
|
@@ -5011,14 +5100,17 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
add iptrq, wpadq
|
||
|
jmp iptrq
|
||
|
.w16_pad3:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m1, [yq]
|
||
|
vpbroadcastq m0, [yq+strideq]
|
||
|
jmp .w16_wpad_end
|
||
|
.w16_pad2:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti128 m1, [yq]
|
||
|
vbroadcasti128 m0, [yq+strideq]
|
||
|
jmp .w16_wpad_end
|
||
|
.w16_pad1:
|
||
|
+ _CET_ENDBR
|
||
|
mova m1, [yq]
|
||
|
mova m0, [yq+strideq]
|
||
|
; fall-through
|
||
|
@@ -5096,6 +5188,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
jmp r5
|
||
|
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
lea stride3q, [strideq*3]
|
||
|
pxor xm2, xm2
|
||
|
.w4_loop:
|
||
|
@@ -5129,6 +5222,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
jmp .calc_avg_mul
|
||
|
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
lea stride3q, [strideq*3]
|
||
|
pxor m2, m2
|
||
|
.w8_loop:
|
||
|
@@ -5162,6 +5256,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
jmp .calc_avg_mul
|
||
|
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
test wpadd, wpadd
|
||
|
jnz .w16_wpad
|
||
|
.w16_loop:
|
||
|
@@ -5214,6 +5309,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
jmp .calc_avg
|
||
|
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
test wpadd, wpadd
|
||
|
jnz .w32_wpad
|
||
|
.w32_loop:
|
||
|
@@ -5242,16 +5338,19 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride,
|
||
|
add iptrq, wpadq
|
||
|
jmp iptrq
|
||
|
.w32_pad3:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m1, [yq]
|
||
|
pshufb m1, m3
|
||
|
vpermq m0, m1, q3232
|
||
|
jmp .w32_wpad_end
|
||
|
.w32_pad2:
|
||
|
+ _CET_ENDBR
|
||
|
pmovzxbw m1, [yq]
|
||
|
pshufhw m0, m1, q3333
|
||
|
vpermq m0, m0, q3333
|
||
|
jmp .w32_wpad_end
|
||
|
.w32_pad1:
|
||
|
+ _CET_ENDBR
|
||
|
pmovzxbw m1, [yq]
|
||
|
vpbroadcastq m0, [yq+16]
|
||
|
pshufb m0, m3
|
||
|
@@ -5317,6 +5416,7 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx,
|
||
|
lea r2, [strideq*3]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb xm0, xm4, [idxq]
|
||
|
add idxq, 16
|
||
|
movd [dstq+strideq*0], xm0
|
||
|
@@ -5329,6 +5429,7 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx,
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb xm0, xm4, [idxq+16*0]
|
||
|
pshufb xm1, xm4, [idxq+16*1]
|
||
|
add idxq, 16*2
|
||
|
@@ -5342,6 +5443,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb m0, m4, [idxq+32*0]
|
||
|
pshufb m1, m4, [idxq+32*1]
|
||
|
add idxq, 32*2
|
||
|
@@ -5355,6 +5457,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb m0, m4, [idxq+32*0]
|
||
|
pshufb m1, m4, [idxq+32*1]
|
||
|
pshufb m2, m4, [idxq+32*2]
|
||
|
@@ -5370,6 +5473,7 @@ ALIGN function_align
|
||
|
RET
|
||
|
ALIGN function_align
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb m0, m4, [idxq+32*0]
|
||
|
pshufb m1, m4, [idxq+32*1]
|
||
|
pshufb m2, m4, [idxq+32*2]
|