#include
.text
.globl ossl_rsaz_amm52x40_x1_avxifma256
.type ossl_rsaz_amm52x40_x1_avxifma256,@function
.align 32
ossl_rsaz_amm52x40_x1_avxifma256:
.cfi_startproc
.byte 243,15,30,250
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
vpxor %ymm0,%ymm0,%ymm0
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm9
vmovapd %ymm0,%ymm10
vmovapd %ymm0,%ymm11
vmovapd %ymm0,%ymm12
xorl %r9d,%r9d
movq %rdx,%r11
movq $0xfffffffffffff,%rax
movl $10,%ebx
.align 32
.Lloop10:
movq 0(%r11),%r13
vpbroadcastq 0(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -328(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm4,32(%rsp)
vmovdqu %ymm5,64(%rsp)
vmovdqu %ymm6,96(%rsp)
vmovdqu %ymm7,128(%rsp)
vmovdqu %ymm8,160(%rsp)
vmovdqu %ymm9,192(%rsp)
vmovdqu %ymm10,224(%rsp)
vmovdqu %ymm11,256(%rsp)
vmovdqu %ymm12,288(%rsp)
movq $0,320(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm4
vmovdqu 72(%rsp),%ymm5
vmovdqu 104(%rsp),%ymm6
vmovdqu 136(%rsp),%ymm7
vmovdqu 168(%rsp),%ymm8
vmovdqu 200(%rsp),%ymm9
vmovdqu 232(%rsp),%ymm10
vmovdqu 264(%rsp),%ymm11
vmovdqu 296(%rsp),%ymm12
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
leaq 328(%rsp),%rsp
movq 8(%r11),%r13
vpbroadcastq 8(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -328(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm4,32(%rsp)
vmovdqu %ymm5,64(%rsp)
vmovdqu %ymm6,96(%rsp)
vmovdqu %ymm7,128(%rsp)
vmovdqu %ymm8,160(%rsp)
vmovdqu %ymm9,192(%rsp)
vmovdqu %ymm10,224(%rsp)
vmovdqu %ymm11,256(%rsp)
vmovdqu %ymm12,288(%rsp)
movq $0,320(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm4
vmovdqu 72(%rsp),%ymm5
vmovdqu 104(%rsp),%ymm6
vmovdqu 136(%rsp),%ymm7
vmovdqu 168(%rsp),%ymm8
vmovdqu 200(%rsp),%ymm9
vmovdqu 232(%rsp),%ymm10
vmovdqu 264(%rsp),%ymm11
vmovdqu 296(%rsp),%ymm12
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
leaq 328(%rsp),%rsp
movq 16(%r11),%r13
vpbroadcastq 16(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -328(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm4,32(%rsp)
vmovdqu %ymm5,64(%rsp)
vmovdqu %ymm6,96(%rsp)
vmovdqu %ymm7,128(%rsp)
vmovdqu %ymm8,160(%rsp)
vmovdqu %ymm9,192(%rsp)
vmovdqu %ymm10,224(%rsp)
vmovdqu %ymm11,256(%rsp)
vmovdqu %ymm12,288(%rsp)
movq $0,320(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm4
vmovdqu 72(%rsp),%ymm5
vmovdqu 104(%rsp),%ymm6
vmovdqu 136(%rsp),%ymm7
vmovdqu 168(%rsp),%ymm8
vmovdqu 200(%rsp),%ymm9
vmovdqu 232(%rsp),%ymm10
vmovdqu 264(%rsp),%ymm11
vmovdqu 296(%rsp),%ymm12
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
leaq 328(%rsp),%rsp
movq 24(%r11),%r13
vpbroadcastq 24(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq %r8,%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -328(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm4,32(%rsp)
vmovdqu %ymm5,64(%rsp)
vmovdqu %ymm6,96(%rsp)
vmovdqu %ymm7,128(%rsp)
vmovdqu %ymm8,160(%rsp)
vmovdqu %ymm9,192(%rsp)
vmovdqu %ymm10,224(%rsp)
vmovdqu %ymm11,256(%rsp)
vmovdqu %ymm12,288(%rsp)
movq $0,320(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm4
vmovdqu 72(%rsp),%ymm5
vmovdqu 104(%rsp),%ymm6
vmovdqu 136(%rsp),%ymm7
vmovdqu 168(%rsp),%ymm8
vmovdqu 200(%rsp),%ymm9
vmovdqu 232(%rsp),%ymm10
vmovdqu 264(%rsp),%ymm11
vmovdqu 296(%rsp),%ymm12
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
leaq 328(%rsp),%rsp
leaq 32(%r11),%r11
decl %ebx
jne .Lloop10
vmovq %r9,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
leaq -640(%rsp),%rsp
vmovupd %ymm3,0(%rsp)
vmovupd %ymm4,32(%rsp)
vmovupd %ymm5,64(%rsp)
vmovupd %ymm6,96(%rsp)
vmovupd %ymm7,128(%rsp)
vmovupd %ymm8,160(%rsp)
vmovupd %ymm9,192(%rsp)
vmovupd %ymm10,224(%rsp)
vmovupd %ymm11,256(%rsp)
vmovupd %ymm12,288(%rsp)
vpsrlq $52,%ymm3,%ymm3
vpsrlq $52,%ymm4,%ymm4
vpsrlq $52,%ymm5,%ymm5
vpsrlq $52,%ymm6,%ymm6
vpsrlq $52,%ymm7,%ymm7
vpsrlq $52,%ymm8,%ymm8
vpsrlq $52,%ymm9,%ymm9
vpsrlq $52,%ymm10,%ymm10
vpsrlq $52,%ymm11,%ymm11
vpsrlq $52,%ymm12,%ymm12
vpermq $144,%ymm12,%ymm12
vpermq $3,%ymm11,%ymm13
vblendpd $1,%ymm13,%ymm12,%ymm12
vpermq $144,%ymm11,%ymm11
vpermq $3,%ymm10,%ymm13
vblendpd $1,%ymm13,%ymm11,%ymm11
vpermq $144,%ymm10,%ymm10
vpermq $3,%ymm9,%ymm13
vblendpd $1,%ymm13,%ymm10,%ymm10
vpermq $144,%ymm9,%ymm9
vpermq $3,%ymm8,%ymm13
vblendpd $1,%ymm13,%ymm9,%ymm9
vpermq $144,%ymm8,%ymm8
vpermq $3,%ymm7,%ymm13
vblendpd $1,%ymm13,%ymm8,%ymm8
vpermq $144,%ymm7,%ymm7
vpermq $3,%ymm6,%ymm13
vblendpd $1,%ymm13,%ymm7,%ymm7
vpermq $144,%ymm6,%ymm6
vpermq $3,%ymm5,%ymm13
vblendpd $1,%ymm13,%ymm6,%ymm6
vpermq $144,%ymm5,%ymm5
vpermq $3,%ymm4,%ymm13
vblendpd $1,%ymm13,%ymm5,%ymm5
vpermq $144,%ymm4,%ymm4
vpermq $3,%ymm3,%ymm13
vblendpd $1,%ymm13,%ymm4,%ymm4
vpermq $144,%ymm3,%ymm3
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
vmovupd %ymm3,320(%rsp)
vmovupd %ymm4,352(%rsp)
vmovupd %ymm5,384(%rsp)
vmovupd %ymm6,416(%rsp)
vmovupd %ymm7,448(%rsp)
vmovupd %ymm8,480(%rsp)
vmovupd %ymm9,512(%rsp)
vmovupd %ymm10,544(%rsp)
vmovupd %ymm11,576(%rsp)
vmovupd %ymm12,608(%rsp)
vmovupd 0(%rsp),%ymm3
vmovupd 32(%rsp),%ymm4
vmovupd 64(%rsp),%ymm5
vmovupd 96(%rsp),%ymm6
vmovupd 128(%rsp),%ymm7
vmovupd 160(%rsp),%ymm8
vmovupd 192(%rsp),%ymm9
vmovupd 224(%rsp),%ymm10
vmovupd 256(%rsp),%ymm11
vmovupd 288(%rsp),%ymm12
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vpaddq 320(%rsp),%ymm3,%ymm3
vpaddq 352(%rsp),%ymm4,%ymm4
vpaddq 384(%rsp),%ymm5,%ymm5
vpaddq 416(%rsp),%ymm6,%ymm6
vpaddq 448(%rsp),%ymm7,%ymm7
vpaddq 480(%rsp),%ymm8,%ymm8
vpaddq 512(%rsp),%ymm9,%ymm9
vpaddq 544(%rsp),%ymm10,%ymm10
vpaddq 576(%rsp),%ymm11,%ymm11
vpaddq 608(%rsp),%ymm12,%ymm12
leaq 640(%rsp),%rsp
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
vmovmskpd %ymm13,%r14d
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
vmovmskpd %ymm13,%r13d
shlb $4,%r13b
orb %r13b,%r14b
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
vmovmskpd %ymm13,%r13d
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
vmovmskpd %ymm13,%r12d
shlb $4,%r12b
orb %r12b,%r13b
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
vmovmskpd %ymm13,%r12d
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd %ymm13,%r11d
shlb $4,%r11b
orb %r11b,%r12b
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
vmovmskpd %ymm13,%r11d
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
vmovmskpd %ymm13,%r10d
shlb $4,%r10b
orb %r10b,%r11b
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
vmovmskpd %ymm13,%r10d
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
vmovmskpd %ymm13,%r9d
shlb $4,%r9b
orb %r9b,%r10b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
adcb %r10b,%r10b
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
vmovmskpd %ymm13,%r9d
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
vmovmskpd %ymm13,%r8d
shlb $4,%r8b
orb %r8b,%r9b
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
vmovmskpd %ymm13,%r8d
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
vmovmskpd %ymm13,%edx
shlb $4,%dl
orb %dl,%r8b
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
vmovmskpd %ymm13,%edx
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd %ymm13,%ecx
shlb $4,%cl
orb %cl,%dl
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
vmovmskpd %ymm13,%ecx
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
vmovmskpd %ymm13,%ebx
shlb $4,%bl
orb %bl,%cl
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
vmovmskpd %ymm13,%ebx
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
vmovmskpd %ymm13,%eax
shlb $4,%al
orb %al,%bl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
adcb %bl,%r10b
xorb %r9b,%r14b
xorb %r8b,%r13b
xorb %dl,%r12b
xorb %cl,%r11b
xorb %bl,%r10b
pushq %r9
pushq %r8
leaq .Lkmasklut(%rip),%r8
movb %r14b,%r9b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
shlq $5,%r14
vmovapd (%r8,%r14,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
movb %r13b,%r9b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
shlq $5,%r13
vmovapd (%r8,%r13,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
movb %r12b,%r9b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
shlq $5,%r12
vmovapd (%r8,%r12,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
movb %r11b,%r9b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
shlq $5,%r11
vmovapd (%r8,%r11,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
movb %r10b,%r9b
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
shlq $5,%r10
vmovapd (%r8,%r10,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
popq %r8
popq %r9
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vmovdqu %ymm3,0(%rdi)
vmovdqu %ymm4,32(%rdi)
vmovdqu %ymm5,64(%rdi)
vmovdqu %ymm6,96(%rdi)
vmovdqu %ymm7,128(%rdi)
vmovdqu %ymm8,160(%rdi)
vmovdqu %ymm9,192(%rdi)
vmovdqu %ymm10,224(%rdi)
vmovdqu %ymm11,256(%rdi)
vmovdqu %ymm12,288(%rdi)
vzeroupper
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
movq 0(%rax),%r15
.cfi_restore %r15
movq 8(%rax),%r14
.cfi_restore %r14
movq 16(%rax),%r13
.cfi_restore %r13
movq 24(%rax),%r12
.cfi_restore %r12
movq 32(%rax),%rbp
.cfi_restore %rbp
movq 40(%rax),%rbx
.cfi_restore %rbx
leaq 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lossl_rsaz_amm52x40_x1_avxifma256_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_rsaz_amm52x40_x1_avxifma256, .-ossl_rsaz_amm52x40_x1_avxifma256
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.Lhigh64x3:
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.Lkmasklut:
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0x0
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0x0
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.text
.globl ossl_rsaz_amm52x40_x2_avxifma256
.type ossl_rsaz_amm52x40_x2_avxifma256,@function
.align 32
ossl_rsaz_amm52x40_x2_avxifma256:
.cfi_startproc
.byte 243,15,30,250
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
vpxor %ymm0,%ymm0,%ymm0
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm9
vmovapd %ymm0,%ymm10
vmovapd %ymm0,%ymm11
vmovapd %ymm0,%ymm12
xorl %r9d,%r9d
movq %rdx,%r11
movq $0xfffffffffffff,%rax
movl $40,%ebx
.align 32
.Lloop40:
movq 0(%r11),%r13
vpbroadcastq 0(%r11),%ymm1
movq 0(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq (%r8),%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 0(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -328(%rsp),%rsp
{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm12
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm4,32(%rsp)
vmovdqu %ymm5,64(%rsp)
vmovdqu %ymm6,96(%rsp)
vmovdqu %ymm7,128(%rsp)
vmovdqu %ymm8,160(%rsp)
vmovdqu %ymm9,192(%rsp)
vmovdqu %ymm10,224(%rsp)
vmovdqu %ymm11,256(%rsp)
vmovdqu %ymm12,288(%rsp)
movq $0,320(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm4
vmovdqu 72(%rsp),%ymm5
vmovdqu 104(%rsp),%ymm6
vmovdqu 136(%rsp),%ymm7
vmovdqu 168(%rsp),%ymm8
vmovdqu 200(%rsp),%ymm9
vmovdqu 232(%rsp),%ymm10
vmovdqu 264(%rsp),%ymm11
vmovdqu 296(%rsp),%ymm12
addq 8(%rsp),%r9
{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm12
leaq 328(%rsp),%rsp
leaq 8(%r11),%r11
decl %ebx
jne .Lloop40
pushq %r11
pushq %rsi
pushq %rcx
pushq %r8
vmovq %r9,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
leaq -640(%rsp),%rsp
vmovupd %ymm3,0(%rsp)
vmovupd %ymm4,32(%rsp)
vmovupd %ymm5,64(%rsp)
vmovupd %ymm6,96(%rsp)
vmovupd %ymm7,128(%rsp)
vmovupd %ymm8,160(%rsp)
vmovupd %ymm9,192(%rsp)
vmovupd %ymm10,224(%rsp)
vmovupd %ymm11,256(%rsp)
vmovupd %ymm12,288(%rsp)
vpsrlq $52,%ymm3,%ymm3
vpsrlq $52,%ymm4,%ymm4
vpsrlq $52,%ymm5,%ymm5
vpsrlq $52,%ymm6,%ymm6
vpsrlq $52,%ymm7,%ymm7
vpsrlq $52,%ymm8,%ymm8
vpsrlq $52,%ymm9,%ymm9
vpsrlq $52,%ymm10,%ymm10
vpsrlq $52,%ymm11,%ymm11
vpsrlq $52,%ymm12,%ymm12
vpermq $144,%ymm12,%ymm12
vpermq $3,%ymm11,%ymm13
vblendpd $1,%ymm13,%ymm12,%ymm12
vpermq $144,%ymm11,%ymm11
vpermq $3,%ymm10,%ymm13
vblendpd $1,%ymm13,%ymm11,%ymm11
vpermq $144,%ymm10,%ymm10
vpermq $3,%ymm9,%ymm13
vblendpd $1,%ymm13,%ymm10,%ymm10
vpermq $144,%ymm9,%ymm9
vpermq $3,%ymm8,%ymm13
vblendpd $1,%ymm13,%ymm9,%ymm9
vpermq $144,%ymm8,%ymm8
vpermq $3,%ymm7,%ymm13
vblendpd $1,%ymm13,%ymm8,%ymm8
vpermq $144,%ymm7,%ymm7
vpermq $3,%ymm6,%ymm13
vblendpd $1,%ymm13,%ymm7,%ymm7
vpermq $144,%ymm6,%ymm6
vpermq $3,%ymm5,%ymm13
vblendpd $1,%ymm13,%ymm6,%ymm6
vpermq $144,%ymm5,%ymm5
vpermq $3,%ymm4,%ymm13
vblendpd $1,%ymm13,%ymm5,%ymm5
vpermq $144,%ymm4,%ymm4
vpermq $3,%ymm3,%ymm13
vblendpd $1,%ymm13,%ymm4,%ymm4
vpermq $144,%ymm3,%ymm3
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
vmovupd %ymm3,320(%rsp)
vmovupd %ymm4,352(%rsp)
vmovupd %ymm5,384(%rsp)
vmovupd %ymm6,416(%rsp)
vmovupd %ymm7,448(%rsp)
vmovupd %ymm8,480(%rsp)
vmovupd %ymm9,512(%rsp)
vmovupd %ymm10,544(%rsp)
vmovupd %ymm11,576(%rsp)
vmovupd %ymm12,608(%rsp)
vmovupd 0(%rsp),%ymm3
vmovupd 32(%rsp),%ymm4
vmovupd 64(%rsp),%ymm5
vmovupd 96(%rsp),%ymm6
vmovupd 128(%rsp),%ymm7
vmovupd 160(%rsp),%ymm8
vmovupd 192(%rsp),%ymm9
vmovupd 224(%rsp),%ymm10
vmovupd 256(%rsp),%ymm11
vmovupd 288(%rsp),%ymm12
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vpaddq 320(%rsp),%ymm3,%ymm3
vpaddq 352(%rsp),%ymm4,%ymm4
vpaddq 384(%rsp),%ymm5,%ymm5
vpaddq 416(%rsp),%ymm6,%ymm6
vpaddq 448(%rsp),%ymm7,%ymm7
vpaddq 480(%rsp),%ymm8,%ymm8
vpaddq 512(%rsp),%ymm9,%ymm9
vpaddq 544(%rsp),%ymm10,%ymm10
vpaddq 576(%rsp),%ymm11,%ymm11
vpaddq 608(%rsp),%ymm12,%ymm12
leaq 640(%rsp),%rsp
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
vmovmskpd %ymm13,%r14d
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
vmovmskpd %ymm13,%r13d
shlb $4,%r13b
orb %r13b,%r14b
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
vmovmskpd %ymm13,%r13d
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
vmovmskpd %ymm13,%r12d
shlb $4,%r12b
orb %r12b,%r13b
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
vmovmskpd %ymm13,%r12d
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd %ymm13,%r11d
shlb $4,%r11b
orb %r11b,%r12b
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
vmovmskpd %ymm13,%r11d
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
vmovmskpd %ymm13,%r10d
shlb $4,%r10b
orb %r10b,%r11b
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
vmovmskpd %ymm13,%r10d
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
vmovmskpd %ymm13,%r9d
shlb $4,%r9b
orb %r9b,%r10b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
adcb %r10b,%r10b
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
vmovmskpd %ymm13,%r9d
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
vmovmskpd %ymm13,%r8d
shlb $4,%r8b
orb %r8b,%r9b
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
vmovmskpd %ymm13,%r8d
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
vmovmskpd %ymm13,%edx
shlb $4,%dl
orb %dl,%r8b
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
vmovmskpd %ymm13,%edx
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd %ymm13,%ecx
shlb $4,%cl
orb %cl,%dl
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
vmovmskpd %ymm13,%ecx
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
vmovmskpd %ymm13,%ebx
shlb $4,%bl
orb %bl,%cl
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
vmovmskpd %ymm13,%ebx
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
vmovmskpd %ymm13,%eax
shlb $4,%al
orb %al,%bl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
adcb %bl,%r10b
xorb %r9b,%r14b
xorb %r8b,%r13b
xorb %dl,%r12b
xorb %cl,%r11b
xorb %bl,%r10b
pushq %r9
pushq %r8
leaq .Lkmasklut(%rip),%r8
movb %r14b,%r9b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
shlq $5,%r14
vmovapd (%r8,%r14,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
movb %r13b,%r9b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
shlq $5,%r13
vmovapd (%r8,%r13,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
movb %r12b,%r9b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
shlq $5,%r12
vmovapd (%r8,%r12,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
movb %r11b,%r9b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
shlq $5,%r11
vmovapd (%r8,%r11,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
movb %r10b,%r9b
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
shlq $5,%r10
vmovapd (%r8,%r10,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
popq %r8
popq %r9
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
popq %r8
popq %rcx
popq %rsi
popq %r11
vmovdqu %ymm3,0(%rdi)
vmovdqu %ymm4,32(%rdi)
vmovdqu %ymm5,64(%rdi)
vmovdqu %ymm6,96(%rdi)
vmovdqu %ymm7,128(%rdi)
vmovdqu %ymm8,160(%rdi)
vmovdqu %ymm9,192(%rdi)
vmovdqu %ymm10,224(%rdi)
vmovdqu %ymm11,256(%rdi)
vmovdqu %ymm12,288(%rdi)
xorl %r15d,%r15d
movq $0xfffffffffffff,%rax
movl $40,%ebx
vpxor %ymm0,%ymm0,%ymm0
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm9
vmovapd %ymm0,%ymm10
vmovapd %ymm0,%ymm11
vmovapd %ymm0,%ymm12
.align 32
.Lloop40_1:
movq 0(%r11),%r13
vpbroadcastq 0(%r11),%ymm1
movq 320(%rsi),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
movq %r12,%r10
adcq $0,%r10
movq 8(%r8),%r13
imulq %r9,%r13
andq %rax,%r13
vmovq %r13,%xmm2
vpbroadcastq %xmm2,%ymm2
movq 320(%rcx),%rdx
mulxq %r13,%r13,%r12
addq %r13,%r9
adcq %r12,%r10
shrq $52,%r9
salq $12,%r10
orq %r10,%r9
leaq -328(%rsp),%rsp
{vex} vpmadd52luq 320(%rsi),%ymm1,%ymm3
{vex} vpmadd52luq 352(%rsi),%ymm1,%ymm4
{vex} vpmadd52luq 384(%rsi),%ymm1,%ymm5
{vex} vpmadd52luq 416(%rsi),%ymm1,%ymm6
{vex} vpmadd52luq 448(%rsi),%ymm1,%ymm7
{vex} vpmadd52luq 480(%rsi),%ymm1,%ymm8
{vex} vpmadd52luq 512(%rsi),%ymm1,%ymm9
{vex} vpmadd52luq 544(%rsi),%ymm1,%ymm10
{vex} vpmadd52luq 576(%rsi),%ymm1,%ymm11
{vex} vpmadd52luq 608(%rsi),%ymm1,%ymm12
{vex} vpmadd52luq 320(%rcx),%ymm2,%ymm3
{vex} vpmadd52luq 352(%rcx),%ymm2,%ymm4
{vex} vpmadd52luq 384(%rcx),%ymm2,%ymm5
{vex} vpmadd52luq 416(%rcx),%ymm2,%ymm6
{vex} vpmadd52luq 448(%rcx),%ymm2,%ymm7
{vex} vpmadd52luq 480(%rcx),%ymm2,%ymm8
{vex} vpmadd52luq 512(%rcx),%ymm2,%ymm9
{vex} vpmadd52luq 544(%rcx),%ymm2,%ymm10
{vex} vpmadd52luq 576(%rcx),%ymm2,%ymm11
{vex} vpmadd52luq 608(%rcx),%ymm2,%ymm12
vmovdqu %ymm3,0(%rsp)
vmovdqu %ymm4,32(%rsp)
vmovdqu %ymm5,64(%rsp)
vmovdqu %ymm6,96(%rsp)
vmovdqu %ymm7,128(%rsp)
vmovdqu %ymm8,160(%rsp)
vmovdqu %ymm9,192(%rsp)
vmovdqu %ymm10,224(%rsp)
vmovdqu %ymm11,256(%rsp)
vmovdqu %ymm12,288(%rsp)
movq $0,320(%rsp)
vmovdqu 8(%rsp),%ymm3
vmovdqu 40(%rsp),%ymm4
vmovdqu 72(%rsp),%ymm5
vmovdqu 104(%rsp),%ymm6
vmovdqu 136(%rsp),%ymm7
vmovdqu 168(%rsp),%ymm8
vmovdqu 200(%rsp),%ymm9
vmovdqu 232(%rsp),%ymm10
vmovdqu 264(%rsp),%ymm11
vmovdqu 296(%rsp),%ymm12
addq 8(%rsp),%r9
{vex} vpmadd52huq 320(%rsi),%ymm1,%ymm3
{vex} vpmadd52huq 352(%rsi),%ymm1,%ymm4
{vex} vpmadd52huq 384(%rsi),%ymm1,%ymm5
{vex} vpmadd52huq 416(%rsi),%ymm1,%ymm6
{vex} vpmadd52huq 448(%rsi),%ymm1,%ymm7
{vex} vpmadd52huq 480(%rsi),%ymm1,%ymm8
{vex} vpmadd52huq 512(%rsi),%ymm1,%ymm9
{vex} vpmadd52huq 544(%rsi),%ymm1,%ymm10
{vex} vpmadd52huq 576(%rsi),%ymm1,%ymm11
{vex} vpmadd52huq 608(%rsi),%ymm1,%ymm12
{vex} vpmadd52huq 320(%rcx),%ymm2,%ymm3
{vex} vpmadd52huq 352(%rcx),%ymm2,%ymm4
{vex} vpmadd52huq 384(%rcx),%ymm2,%ymm5
{vex} vpmadd52huq 416(%rcx),%ymm2,%ymm6
{vex} vpmadd52huq 448(%rcx),%ymm2,%ymm7
{vex} vpmadd52huq 480(%rcx),%ymm2,%ymm8
{vex} vpmadd52huq 512(%rcx),%ymm2,%ymm9
{vex} vpmadd52huq 544(%rcx),%ymm2,%ymm10
{vex} vpmadd52huq 576(%rcx),%ymm2,%ymm11
{vex} vpmadd52huq 608(%rcx),%ymm2,%ymm12
leaq 328(%rsp),%rsp
leaq 8(%r11),%r11
decl %ebx
jne .Lloop40_1
vmovq %r9,%xmm0
vpbroadcastq %xmm0,%ymm0
vpblendd $3,%ymm0,%ymm3,%ymm3
leaq -640(%rsp),%rsp
vmovupd %ymm3,0(%rsp)
vmovupd %ymm4,32(%rsp)
vmovupd %ymm5,64(%rsp)
vmovupd %ymm6,96(%rsp)
vmovupd %ymm7,128(%rsp)
vmovupd %ymm8,160(%rsp)
vmovupd %ymm9,192(%rsp)
vmovupd %ymm10,224(%rsp)
vmovupd %ymm11,256(%rsp)
vmovupd %ymm12,288(%rsp)
vpsrlq $52,%ymm3,%ymm3
vpsrlq $52,%ymm4,%ymm4
vpsrlq $52,%ymm5,%ymm5
vpsrlq $52,%ymm6,%ymm6
vpsrlq $52,%ymm7,%ymm7
vpsrlq $52,%ymm8,%ymm8
vpsrlq $52,%ymm9,%ymm9
vpsrlq $52,%ymm10,%ymm10
vpsrlq $52,%ymm11,%ymm11
vpsrlq $52,%ymm12,%ymm12
vpermq $144,%ymm12,%ymm12
vpermq $3,%ymm11,%ymm13
vblendpd $1,%ymm13,%ymm12,%ymm12
vpermq $144,%ymm11,%ymm11
vpermq $3,%ymm10,%ymm13
vblendpd $1,%ymm13,%ymm11,%ymm11
vpermq $144,%ymm10,%ymm10
vpermq $3,%ymm9,%ymm13
vblendpd $1,%ymm13,%ymm10,%ymm10
vpermq $144,%ymm9,%ymm9
vpermq $3,%ymm8,%ymm13
vblendpd $1,%ymm13,%ymm9,%ymm9
vpermq $144,%ymm8,%ymm8
vpermq $3,%ymm7,%ymm13
vblendpd $1,%ymm13,%ymm8,%ymm8
vpermq $144,%ymm7,%ymm7
vpermq $3,%ymm6,%ymm13
vblendpd $1,%ymm13,%ymm7,%ymm7
vpermq $144,%ymm6,%ymm6
vpermq $3,%ymm5,%ymm13
vblendpd $1,%ymm13,%ymm6,%ymm6
vpermq $144,%ymm5,%ymm5
vpermq $3,%ymm4,%ymm13
vblendpd $1,%ymm13,%ymm5,%ymm5
vpermq $144,%ymm4,%ymm4
vpermq $3,%ymm3,%ymm13
vblendpd $1,%ymm13,%ymm4,%ymm4
vpermq $144,%ymm3,%ymm3
vpand .Lhigh64x3(%rip),%ymm3,%ymm3
vmovupd %ymm3,320(%rsp)
vmovupd %ymm4,352(%rsp)
vmovupd %ymm5,384(%rsp)
vmovupd %ymm6,416(%rsp)
vmovupd %ymm7,448(%rsp)
vmovupd %ymm8,480(%rsp)
vmovupd %ymm9,512(%rsp)
vmovupd %ymm10,544(%rsp)
vmovupd %ymm11,576(%rsp)
vmovupd %ymm12,608(%rsp)
vmovupd 0(%rsp),%ymm3
vmovupd 32(%rsp),%ymm4
vmovupd 64(%rsp),%ymm5
vmovupd 96(%rsp),%ymm6
vmovupd 128(%rsp),%ymm7
vmovupd 160(%rsp),%ymm8
vmovupd 192(%rsp),%ymm9
vmovupd 224(%rsp),%ymm10
vmovupd 256(%rsp),%ymm11
vmovupd 288(%rsp),%ymm12
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vpaddq 320(%rsp),%ymm3,%ymm3
vpaddq 352(%rsp),%ymm4,%ymm4
vpaddq 384(%rsp),%ymm5,%ymm5
vpaddq 416(%rsp),%ymm6,%ymm6
vpaddq 448(%rsp),%ymm7,%ymm7
vpaddq 480(%rsp),%ymm8,%ymm8
vpaddq 512(%rsp),%ymm9,%ymm9
vpaddq 544(%rsp),%ymm10,%ymm10
vpaddq 576(%rsp),%ymm11,%ymm11
vpaddq 608(%rsp),%ymm12,%ymm12
leaq 640(%rsp),%rsp
vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm13
vmovmskpd %ymm13,%r14d
vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm13
vmovmskpd %ymm13,%r13d
shlb $4,%r13b
orb %r13b,%r14b
vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm13
vmovmskpd %ymm13,%r13d
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm13
vmovmskpd %ymm13,%r12d
shlb $4,%r12b
orb %r12b,%r13b
vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm13
vmovmskpd %ymm13,%r12d
vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd %ymm13,%r11d
shlb $4,%r11b
orb %r11b,%r12b
vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm13
vmovmskpd %ymm13,%r11d
vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm13
vmovmskpd %ymm13,%r10d
shlb $4,%r10b
orb %r10b,%r11b
vpcmpgtq .Lmask52x4(%rip),%ymm11,%ymm13
vmovmskpd %ymm13,%r10d
vpcmpgtq .Lmask52x4(%rip),%ymm12,%ymm13
vmovmskpd %ymm13,%r9d
shlb $4,%r9b
orb %r9b,%r10b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
adcb %r10b,%r10b
vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm13
vmovmskpd %ymm13,%r9d
vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm13
vmovmskpd %ymm13,%r8d
shlb $4,%r8b
orb %r8b,%r9b
vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm13
vmovmskpd %ymm13,%r8d
vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm13
vmovmskpd %ymm13,%edx
shlb $4,%dl
orb %dl,%r8b
vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm13
vmovmskpd %ymm13,%edx
vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
vmovmskpd %ymm13,%ecx
shlb $4,%cl
orb %cl,%dl
vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm13
vmovmskpd %ymm13,%ecx
vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm13
vmovmskpd %ymm13,%ebx
shlb $4,%bl
orb %bl,%cl
vpcmpeqq .Lmask52x4(%rip),%ymm11,%ymm13
vmovmskpd %ymm13,%ebx
vpcmpeqq .Lmask52x4(%rip),%ymm12,%ymm13
vmovmskpd %ymm13,%eax
shlb $4,%al
orb %al,%bl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
adcb %bl,%r10b
xorb %r9b,%r14b
xorb %r8b,%r13b
xorb %dl,%r12b
xorb %cl,%r11b
xorb %bl,%r10b
pushq %r9
pushq %r8
leaq .Lkmasklut(%rip),%r8
movb %r14b,%r9b
andq $0xf,%r14
vpsubq .Lmask52x4(%rip),%ymm3,%ymm13
shlq $5,%r14
vmovapd (%r8,%r14,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm3,%ymm3
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm4,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm4,%ymm4
movb %r13b,%r9b
andq $0xf,%r13
vpsubq .Lmask52x4(%rip),%ymm5,%ymm13
shlq $5,%r13
vmovapd (%r8,%r13,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm5,%ymm5
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm6,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm6,%ymm6
movb %r12b,%r9b
andq $0xf,%r12
vpsubq .Lmask52x4(%rip),%ymm7,%ymm13
shlq $5,%r12
vmovapd (%r8,%r12,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm7,%ymm7
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm8,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm8,%ymm8
movb %r11b,%r9b
andq $0xf,%r11
vpsubq .Lmask52x4(%rip),%ymm9,%ymm13
shlq $5,%r11
vmovapd (%r8,%r11,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm9,%ymm9
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm10,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm10,%ymm10
movb %r10b,%r9b
andq $0xf,%r10
vpsubq .Lmask52x4(%rip),%ymm11,%ymm13
shlq $5,%r10
vmovapd (%r8,%r10,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm11,%ymm11
shrb $4,%r9b
andq $0xf,%r9
vpsubq .Lmask52x4(%rip),%ymm12,%ymm13
shlq $5,%r9
vmovapd (%r8,%r9,1),%ymm14
vblendvpd %ymm14,%ymm13,%ymm12,%ymm12
popq %r8
popq %r9
vpand .Lmask52x4(%rip),%ymm3,%ymm3
vpand .Lmask52x4(%rip),%ymm4,%ymm4
vpand .Lmask52x4(%rip),%ymm5,%ymm5
vpand .Lmask52x4(%rip),%ymm6,%ymm6
vpand .Lmask52x4(%rip),%ymm7,%ymm7
vpand .Lmask52x4(%rip),%ymm8,%ymm8
vpand .Lmask52x4(%rip),%ymm9,%ymm9
vpand .Lmask52x4(%rip),%ymm10,%ymm10
vpand .Lmask52x4(%rip),%ymm11,%ymm11
vpand .Lmask52x4(%rip),%ymm12,%ymm12
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm4,352(%rdi)
vmovdqu %ymm5,384(%rdi)
vmovdqu %ymm6,416(%rdi)
vmovdqu %ymm7,448(%rdi)
vmovdqu %ymm8,480(%rdi)
vmovdqu %ymm9,512(%rdi)
vmovdqu %ymm10,544(%rdi)
vmovdqu %ymm11,576(%rdi)
vmovdqu %ymm12,608(%rdi)
vzeroupper
leaq (%rsp),%rax
.cfi_def_cfa_register %rax
movq 0(%rax),%r15
.cfi_restore %r15
movq 8(%rax),%r14
.cfi_restore %r14
movq 16(%rax),%r13
.cfi_restore %r13
movq 24(%rax),%r12
.cfi_restore %r12
movq 32(%rax),%rbp
.cfi_restore %rbp
movq 40(%rax),%rbx
.cfi_restore %rbx
leaq 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lossl_rsaz_amm52x40_x2_avxifma256_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_rsaz_amm52x40_x2_avxifma256, .-ossl_rsaz_amm52x40_x2_avxifma256
.text
.align 32
.globl ossl_extract_multiplier_2x40_win5_avx
.type ossl_extract_multiplier_2x40_win5_avx,@function
ossl_extract_multiplier_2x40_win5_avx:
.cfi_startproc
.byte 243,15,30,250
vmovapd .Lones(%rip),%ymm14
vmovq %rdx,%xmm10
vpbroadcastq %xmm10,%ymm12
vmovq %rcx,%xmm10
vpbroadcastq %xmm10,%ymm13
leaq 20480(%rsi),%rax
movq %rsi,%r10
vpxor %xmm0,%xmm0,%xmm0
vmovapd %ymm0,%ymm1
vmovapd %ymm0,%ymm2
vmovapd %ymm0,%ymm3
vmovapd %ymm0,%ymm4
vmovapd %ymm0,%ymm5
vmovapd %ymm0,%ymm6
vmovapd %ymm0,%ymm7
vmovapd %ymm0,%ymm8
vmovapd %ymm0,%ymm9
vpxor %ymm11,%ymm11,%ymm11
.align 32
.Lloop_0:
vpcmpeqq %ymm11,%ymm12,%ymm15
vmovdqu 0(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm0,%ymm0
vmovdqu 32(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm1,%ymm1
vmovdqu 64(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm2,%ymm2
vmovdqu 96(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm3,%ymm3
vmovdqu 128(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm4,%ymm4
vmovdqu 160(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm5,%ymm5
vmovdqu 192(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm6,%ymm6
vmovdqu 224(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm7,%ymm7
vmovdqu 256(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm8,%ymm8
vmovdqu 288(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm9,%ymm9
vpaddq %ymm14,%ymm11,%ymm11
addq $640,%rsi
cmpq %rsi,%rax
jne .Lloop_0
vmovdqu %ymm0,0(%rdi)
vmovdqu %ymm1,32(%rdi)
vmovdqu %ymm2,64(%rdi)
vmovdqu %ymm3,96(%rdi)
vmovdqu %ymm4,128(%rdi)
vmovdqu %ymm5,160(%rdi)
vmovdqu %ymm6,192(%rdi)
vmovdqu %ymm7,224(%rdi)
vmovdqu %ymm8,256(%rdi)
vmovdqu %ymm9,288(%rdi)
movq %r10,%rsi
vpxor %ymm11,%ymm11,%ymm11
.align 32
.Lloop_320:
vpcmpeqq %ymm11,%ymm13,%ymm15
vmovdqu 320(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm0,%ymm0
vmovdqu 352(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm1,%ymm1
vmovdqu 384(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm2,%ymm2
vmovdqu 416(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm3,%ymm3
vmovdqu 448(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm4,%ymm4
vmovdqu 480(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm5,%ymm5
vmovdqu 512(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm6,%ymm6
vmovdqu 544(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm7,%ymm7
vmovdqu 576(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm8,%ymm8
vmovdqu 608(%rsi),%ymm10
vblendvpd %ymm15,%ymm10,%ymm9,%ymm9
vpaddq %ymm14,%ymm11,%ymm11
addq $640,%rsi
cmpq %rsi,%rax
jne .Lloop_320
vmovdqu %ymm0,320(%rdi)
vmovdqu %ymm1,352(%rdi)
vmovdqu %ymm2,384(%rdi)
vmovdqu %ymm3,416(%rdi)
vmovdqu %ymm4,448(%rdi)
vmovdqu %ymm5,480(%rdi)
vmovdqu %ymm6,512(%rdi)
vmovdqu %ymm7,544(%rdi)
vmovdqu %ymm8,576(%rdi)
vmovdqu %ymm9,608(%rdi)
.byte 0xf3,0xc3
.cfi_endproc
.size ossl_extract_multiplier_2x40_win5_avx, .-ossl_extract_multiplier_2x40_win5_avx
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002
.long 3f - 2f
2:
.long 3
3:
.p2align 3
4: