#include
.text

# ossl_rsaz_avxifma_eligible:
# Returns non-zero when bit 0x800000 of the dword at offset 20 of
# OPENSSL_ia32cap_P is set, i.e. when the CPU advertises AVX-IFMA.
.globl ossl_rsaz_avxifma_eligible
.type ossl_rsaz_avxifma_eligible,@function
.align 32
ossl_rsaz_avxifma_eligible:
movl OPENSSL_ia32cap_P+20(%rip),%ecx
xorl %eax,%eax
andl $8388608,%ecx
cmpl $8388608,%ecx
cmovel %ecx,%eax
.byte 0xf3,0xc3
.size ossl_rsaz_avxifma_eligible, .-ossl_rsaz_avxifma_eligible
.text

# ossl_rsaz_amm52x20_x1_avxifma256:
# 20x52-bit almost Montgomery multiplication ("AMM", per the function name)
# built on VEX-encoded VPMADD52LUQ/VPMADD52HUQ.
# Register usage (SysV ABI): %rdi = result, %rsi = a, %rdx = b,
# %rcx = modulus, %r8 = k0 (Montgomery constant, used modulo 2^52).
.globl ossl_rsaz_amm52x20_x1_avxifma256
.type ossl_rsaz_amm52x20_x1_avxifma256,@function
.align 32
ossl_rsaz_amm52x20_x1_avxifma256:
.cfi_startproc
.byte 243,15,30,250
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lossl_rsaz_amm52x20_x1_avxifma256_body:
vpxor %ymm0,%ymm0,%ymm0
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
xorl %r9d,%r9d
movq %rdx,%r11
movq $0xfffffffffffff,%rax
movl $5,%ebx
# 5 iterations, consuming 4 words of b each (20 x 52-bit limbs in total).
.align 32
.Lloop5:
movq 0(%r11),%r13
vpbroadcastq 0(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
movq 8(%r11),%r13
vpbroadcastq 8(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
movq 16(%r11),%r13
vpbroadcastq 16(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
movq 24(%r11),%r13
vpbroadcastq 24(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
leaq 32(%r11),%r11
decl %ebx
jne .Lloop5
# Normalize: propagate the high bits (>= 2^52) of each limb into the next
# limb, then conditionally subtract the 52-bit mask where lanes overflowed.
vmovq %r9,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
vpsrlq $52,%ymm3,%ymm0
vpsrlq $52,%ymm5,%ymm1
vpsrlq $52,%ymm6,%ymm2
vpsrlq $52,%ymm7,%ymm13
vpsrlq $52,%ymm8,%ymm14
vpermq $144,%ymm14,%ymm14
vpermq $3,%ymm13,%ymm15
vblendpd $1,%ymm15,%ymm14,%ymm14
vpermq $144,%ymm13,%ymm13
vpermq $3,%ymm2,%ymm15
vblendpd $1,%ymm15,%ymm13,%ymm13
vpermq $144,%ymm2,%ymm2
vpermq $3,%ymm1,%ymm15
vblendpd $1,%ymm15,%ymm2,%ymm2
vpermq $144,%ymm1,%ymm1
vpermq $3,%ymm0,%ymm15
vblendpd $1,%ymm15,%ymm1,%ymm1
vpermq $144,%ymm0,%ymm0
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpaddq %ymm0,%ymm3,%ymm3
vpaddq %ymm1,%ymm5,%ymm5
vpaddq %ymm2,%ymm6,%ymm6
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm8,%ymm8
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r14d
vmovmskpd %ymm1,%r13d
vmovmskpd %ymm2,%r12d
vmovmskpd %ymm13,%r11d
vmovmskpd %ymm14,%r10d
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r9d
vmovmskpd %ymm1,%r8d
vmovmskpd %ymm2,%ebx
vmovmskpd %ymm13,%ecx
vmovmskpd %ymm14,%edx
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
leaq .Lkmasklut(%rip),%rdx
movb %r14b,%r13b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
shlq $5,%r14
vmovapd (%rdx,%r14,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
shrb $4,%r13b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
shlq $5,%r13
vmovapd (%rdx,%r13,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
movb %r12b,%r11b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
shlq $5,%r12
vmovapd (%rdx,%r12,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
shrb $4,%r11b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
shlq $5,%r11
vmovapd (%rdx,%r11,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
shlq $5,%r10
vmovapd (%rdx,%r10,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vmovdqu %ymm3,0(%rdi)
vmovdqu %ymm5,32(%rdi)
vmovdqu %ymm6,64(%rdi)
vmovdqu %ymm7,96(%rdi)
vmovdqu %ymm8,128(%rdi)
vzeroupper
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lossl_rsaz_amm52x20_x1_avxifma256_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_rsaz_amm52x20_x1_avxifma256, .-ossl_rsaz_amm52x20_x1_avxifma256
.section .rodata
.align 32
# 52-bit mask, replicated across all four 64-bit lanes.
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
# All-ones in lanes 1..3, zero in lane 0.
.Lhigh64x3:
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
# 16 four-lane blend masks: entry i has lane j set to all-ones when bit j
# of i is set (indexed by 4-bit lane masks, used with vblendvpd).
.Lkmasklut:
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.text

# ossl_rsaz_amm52x20_x2_avxifma256:
# Two interleaved 20x52-bit almost Montgomery multiplications.  The second
# set of operands lives at byte offset 160 of a, b and m, the second k0 at
# 8(%r8); the two results are written to 0(%rdi) and 160(%rdi).
.globl ossl_rsaz_amm52x20_x2_avxifma256
.type ossl_rsaz_amm52x20_x2_avxifma256,@function
.align 32
ossl_rsaz_amm52x20_x2_avxifma256:
.cfi_startproc
.byte 243,15,30,250
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lossl_rsaz_amm52x20_x2_avxifma256_body:
vpxor %ymm0,%ymm0,%ymm0
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm9
vmovapd %ymm0,%ymm10
vmovapd %ymm0,%ymm11
vmovapd %ymm0,%ymm12
xorl %r9d,%r9d
xorl %r15d,%r15d
movq %rdx,%r11
movq $0xfffffffffffff,%rax
movl $20,%ebx
# 20 iterations, one word of b per iteration for each of the two products.
.align 32
.Lloop20:
movq 0(%r11),%r13
vpbroadcastq 0(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq (%r8),%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -168(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm8
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm5,32(%rsp)
vmovdqu %ymm6,64(%rsp)
vmovdqu %ymm7,96(%rsp)
vmovdqu %ymm8,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm5
vmovdqu 72(%rsp),%ymm6
vmovdqu 104(%rsp),%ymm7
vmovdqu 136(%rsp),%ymm8
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm8
leaq 168(%rsp),%rsp
movq 160(%r11),%r13
vpbroadcastq 160(%r11),%ymm1
movq 160(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r15
movq %r12,%r10
adcq $0,%r10
movq 8(%r8),%r13
imulq %r15,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 160(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r15
adcq %r12,%r10
shrq $52,%r15
salq $12,%r10
orq %r10,%r15
leaq -168(%rsp),%rsp
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
vmovdqu %ymm4,0(%rsp)
vmovdqu %ymm9,32(%rsp)
vmovdqu %ymm10,64(%rsp)
vmovdqu %ymm11,96(%rsp)
vmovdqu %ymm12,128(%rsp)
movq $0,160(%rsp)
vmovdqu 8(%rsp),%ymm4
vmovdqu 40(%rsp),%ymm9
vmovdqu 72(%rsp),%ymm10
vmovdqu 104(%rsp),%ymm11
vmovdqu 136(%rsp),%ymm12
addq 8(%rsp),%r15
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
leaq 168(%rsp),%rsp
leaq 8(%r11),%r11
decl %ebx
jne .Lloop20
# Normalize the first product (ymm3,ymm5..ymm8), then the second
# (ymm4,ymm9..ymm12).
vmovq %r9,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
vpsrlq $52,%ymm3,%ymm0
vpsrlq $52,%ymm5,%ymm1
vpsrlq $52,%ymm6,%ymm2
vpsrlq $52,%ymm7,%ymm13
vpsrlq $52,%ymm8,%ymm14
vpermq $144,%ymm14,%ymm14
vpermq $3,%ymm13,%ymm15
vblendpd $1,%ymm15,%ymm14,%ymm14
vpermq $144,%ymm13,%ymm13
vpermq $3,%ymm2,%ymm15
vblendpd $1,%ymm15,%ymm13,%ymm13
vpermq $144,%ymm2,%ymm2
vpermq $3,%ymm1,%ymm15
vblendpd $1,%ymm15,%ymm2,%ymm2
vpermq $144,%ymm1,%ymm1
vpermq $3,%ymm0,%ymm15
vblendpd $1,%ymm15,%ymm1,%ymm1
vpermq $144,%ymm0,%ymm0
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpaddq %ymm0,%ymm3,%ymm3
vpaddq %ymm1,%ymm5,%ymm5
vpaddq %ymm2,%ymm6,%ymm6
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm8,%ymm8
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r14d
vmovmskpd %ymm1,%r13d
vmovmskpd %ymm2,%r12d
vmovmskpd %ymm13,%r11d
vmovmskpd %ymm14,%r10d
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm1
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm2
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm14
vmovmskpd %ymm0,%r9d
vmovmskpd %ymm1,%r8d
vmovmskpd %ymm2,%ebx
vmovmskpd %ymm13,%ecx
vmovmskpd %ymm14,%edx
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
leaq .Lkmasklut(%rip),%rdx
movb %r14b,%r13b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
shlq $5,%r14
vmovapd (%rdx,%r14,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
shrb $4,%r13b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
shlq $5,%r13
vmovapd (%rdx,%r13,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
movb %r12b,%r11b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
shlq $5,%r12
vmovapd (%rdx,%r12,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
shrb $4,%r11b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
shlq $5,%r11
vmovapd (%rdx,%r11,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
shlq $5,%r10
vmovapd (%rdx,%r10,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vmovq %r15,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm4,%ymm4
vpsrlq $52,%ymm4,%ymm0
vpsrlq $52,%ymm9,%ymm1
vpsrlq $52,%ymm10,%ymm2
vpsrlq $52,%ymm11,%ymm13
vpsrlq $52,%ymm12,%ymm14
vpermq $144,%ymm14,%ymm14
vpermq $3,%ymm13,%ymm15
vblendpd $1,%ymm15,%ymm14,%ymm14
vpermq $144,%ymm13,%ymm13
vpermq $3,%ymm2,%ymm15
vblendpd $1,%ymm15,%ymm13,%ymm13
vpermq $144,%ymm2,%ymm2
vpermq $3,%ymm1,%ymm15
vblendpd $1,%ymm15,%ymm2,%ymm2
vpermq $144,%ymm1,%ymm1
vpermq $3,%ymm0,%ymm15
vblendpd $1,%ymm15,%ymm1,%ymm1
vpermq $144,%ymm0,%ymm0
vpand .Lhigh64x3(%rip),%ymm0,%ymm0
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vpaddq %ymm0,%ymm4,%ymm4
vpaddq %ymm1,%ymm9,%ymm9
vpaddq %ymm2,%ymm10,%ymm10
vpaddq %ymm13,%ymm11,%ymm11
vpaddq %ymm14,%ymm12,%ymm12
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm0
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm1
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm2
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm14
vmovmskpd %ymm0,%r14d
vmovmskpd %ymm1,%r13d
vmovmskpd %ymm2,%r12d
vmovmskpd %ymm13,%r11d
vmovmskpd %ymm14,%r10d
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm0
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm1
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm2
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm14
vmovmskpd %ymm0,%r9d
vmovmskpd %ymm1,%r8d
vmovmskpd %ymm2,%ebx
vmovmskpd %ymm13,%ecx
vmovmskpd %ymm14,%edx
shlb $4,%r13b
orb %r13b,%r14b
shlb $4,%r11b
orb %r11b,%r12b
addb %r14b,%r14b
adcb %r12b,%r12b
adcb %r10b,%r10b
shlb $4,%r8b
orb %r8b,%r9b
shlb $4,%cl
orb %cl,%bl
addb %r9b,%r14b
adcb %bl,%r12b
adcb %dl,%r10b
xorb %r9b,%r14b
xorb %bl,%r12b
xorb %dl,%r10b
leaq .Lkmasklut(%rip),%rdx
movb %r14b,%r13b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm4,%ymm0
shlq $5,%r14
vmovapd (%rdx,%r14,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm4,%ymm4
shrb $4,%r13b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm9,%ymm0
shlq $5,%r13
vmovapd (%rdx,%r13,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm9,%ymm9
movb %r12b,%r11b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm10,%ymm0
shlq $5,%r12
vmovapd (%rdx,%r12,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm10,%ymm10
shrb $4,%r11b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm11,%ymm0
shlq $5,%r11
vmovapd (%rdx,%r11,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm11,%ymm11
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm12,%ymm0
shlq $5,%r10
vmovapd (%rdx,%r10,1),%ymm2
vblendvpd %ymm2,%ymm0,%ymm12,%ymm12
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vmovdqu %ymm3,0(%rdi)
vmovdqu %ymm5,32(%rdi)
vmovdqu %ymm6,64(%rdi)
vmovdqu %ymm7,96(%rdi)
vmovdqu %ymm8,128(%rdi)
vmovdqu %ymm4,160(%rdi)
vmovdqu %ymm9,192(%rdi)
vmovdqu %ymm10,224(%rdi)
vmovdqu %ymm11,256(%rdi)
vmovdqu %ymm12,288(%rdi)
vzeroupper
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lossl_rsaz_amm52x20_x2_avxifma256_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_rsaz_amm52x20_x2_avxifma256, .-ossl_rsaz_amm52x20_x2_avxifma256
.text

# ossl_extract_multiplier_2x20_win5_avx:
# Constant-time table lookup: scans all 32 entries (320 bytes each) of the
# window-5 table at %rsi and keeps the entries whose indices equal %rdx
# (first 20 limbs) and %rcx (second 20 limbs), storing 320 bytes at %rdi.
# Every entry is read, so the access pattern is independent of the indices.
.align 32
.globl ossl_extract_multiplier_2x20_win5_avx
.type ossl_extract_multiplier_2x20_win5_avx,@function
ossl_extract_multiplier_2x20_win5_avx:
.cfi_startproc
.byte 243,15,30,250
vmovapd .Lones(%rip),%ymm14
vmovq %rdx,%xmm10
vpbroadcastq %xmm10,%ymm12
vmovq %rcx,%xmm10
vpbroadcastq %xmm10,%ymm13
leaq 10240(%rsi),%rax
vpxor %xmm0,%xmm0,%xmm0
vmovapd %ymm0,%ymm11
vmovapd %ymm0,%ymm1
vmovapd %ymm0,%ymm2
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm9
.align 32
.Lloop:
vpcmpeqq %ymm11,%ymm12,%ymm15
vmovdqu 0(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm0,%ymm0
vmovdqu 32(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm1,%ymm1
vmovdqu 64(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm2,%ymm2
vmovdqu 96(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm3,%ymm3
vmovdqu 128(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm4,%ymm4
vpcmpeqq %ymm11,%ymm13,%ymm15
vmovdqu 160(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm5,%ymm5
vmovdqu 192(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm6,%ymm6
vmovdqu 224(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm7,%ymm7
vmovdqu 256(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm8,%ymm8
vmovdqu 288(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm9,%ymm9
vpaddq %ymm14,%ymm11,%ymm11
addq $320,%rsi
cmpq %rsi,%rax
jne .Lloop
vmovdqu %ymm0,0(%rdi)
vmovdqu %ymm1,32(%rdi)
vmovdqu %ymm2,64(%rdi)
vmovdqu %ymm3,96(%rdi)
vmovdqu %ymm4,128(%rdi)
vmovdqu %ymm5,160(%rdi)
vmovdqu %ymm6,192(%rdi)
vmovdqu %ymm7,224(%rdi)
vmovdqu %ymm8,256(%rdi)
vmovdqu %ymm9,288(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_extract_multiplier_2x20_win5_avx, .-ossl_extract_multiplier_2x20_win5_avx
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4:
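
# Presumed C-level interface, inferred only from the register usage in the
# routines above (SysV AMD64 ABI: %rdi, %rsi, %rdx, %rcx, %r8).  These
# prototypes are an assumption added for readability; the authoritative
# declarations live in the C callers.
#
#   int  ossl_rsaz_avxifma_eligible(void);
#   void ossl_rsaz_amm52x20_x1_avxifma256(uint64_t *res, const uint64_t *a,
#                                         const uint64_t *b, const uint64_t *m,
#                                         uint64_t k0);
#   void ossl_rsaz_amm52x20_x2_avxifma256(uint64_t *out, const uint64_t *a,
#                                         const uint64_t *b, const uint64_t *m,
#                                         const uint64_t k0[2]);
#   void ossl_extract_multiplier_2x20_win5_avx(uint64_t *out,
#                                              const uint64_t *table,
#                                              int idx1, int idx2);
#
# Each big-number operand is 20 limbs of 52 bits stored in 64-bit words.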