#include .text .globl ossl_rsaz_amm52x30_x1_avxifma256 .type ossl_rsaz_amm52x30_x1_avxifma256,@function .align 32 ossl_rsaz_amm52x30_x1_avxifma256: .cfi_startproc .byte 243,15,30,250 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 vpxor %ymm0,%ymm0,%ymm0 vmovapd %ymm0,%ymm3 vmovapd %ymm0,%ymm4 vmovapd %ymm0,%ymm5 vmovapd %ymm0,%ymm6 vmovapd %ymm0,%ymm7 vmovapd %ymm0,%ymm8 vmovapd %ymm0,%ymm9 vmovapd %ymm0,%ymm10 xorl %r9d,%r9d movq %rdx,%r11 movq $0xfffffffffffff,%rax movl $7,%ebx .align 32 .Lloop7: movq 0(%r11),%r13 vpbroadcastq 0(%r11),%ymm1 movq 0(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq %r8,%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 0(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 vmovdqu %ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu %ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu %ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 
vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp movq 8(%r11),%r13 vpbroadcastq 8(%r11),%ymm1 movq 0(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq %r8,%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 0(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 vmovdqu %ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu %ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu %ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) 
movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp movq 16(%r11),%r13 vpbroadcastq 16(%r11),%ymm1 movq 0(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq %r8,%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 0(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 vmovdqu %ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu %ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu 
%ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp movq 24(%r11),%r13 vpbroadcastq 24(%r11),%ymm1 movq 0(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq %r8,%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 0(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 vmovdqu %ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu 
%ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu %ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp leaq 32(%r11),%r11 decl %ebx jne .Lloop7 movq 0(%r11),%r13 vpbroadcastq 0(%r11),%ymm1 movq 0(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq %r8,%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 0(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 vmovdqu 
%ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu %ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu %ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp movq 8(%r11),%r13 vpbroadcastq 8(%r11),%ymm1 movq 0(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq %r8,%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 0(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 {vex} 
vpmadd52luq 224(%rcx),%ymm2,%ymm10 vmovdqu %ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu %ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu %ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp vmovq %r9,%xmm0 vpbroadcastq %xmm0,%ymm0 vpblendd $3,%ymm0,%ymm3,%ymm3 vpsrlq $52,%ymm3,%ymm0 vpsrlq $52,%ymm4,%ymm1 vpsrlq $52,%ymm5,%ymm2 vpsrlq $52,%ymm6,%ymm11 vpsrlq $52,%ymm7,%ymm12 vpsrlq $52,%ymm8,%ymm13 vpsrlq $52,%ymm9,%ymm14 vpsrlq $52,%ymm10,%ymm15 leaq -32(%rsp),%rsp vmovupd %ymm3,(%rsp) vpermq $144,%ymm15,%ymm15 vpermq $3,%ymm14,%ymm3 vblendpd $1,%ymm3,%ymm15,%ymm15 vpermq $144,%ymm14,%ymm14 vpermq $3,%ymm13,%ymm3 vblendpd $1,%ymm3,%ymm14,%ymm14 vpermq $144,%ymm13,%ymm13 vpermq $3,%ymm12,%ymm3 vblendpd $1,%ymm3,%ymm13,%ymm13 vpermq $144,%ymm12,%ymm12 vpermq $3,%ymm11,%ymm3 vblendpd $1,%ymm3,%ymm12,%ymm12 vpermq $144,%ymm11,%ymm11 vpermq $3,%ymm2,%ymm3 vblendpd $1,%ymm3,%ymm11,%ymm11 vpermq $144,%ymm2,%ymm2 vpermq $3,%ymm1,%ymm3 vblendpd $1,%ymm3,%ymm2,%ymm2 vpermq $144,%ymm1,%ymm1 vpermq $3,%ymm0,%ymm3 vblendpd $1,%ymm3,%ymm1,%ymm1 vpermq $144,%ymm0,%ymm0 vpand 
.Lhigh64x3(%rip),%ymm0,%ymm0 vmovupd (%rsp),%ymm3 leaq 32(%rsp),%rsp vpand .Lmask52x4(%rip),%ymm3,%ymm3 vpand .Lmask52x4(%rip),%ymm4,%ymm4 vpand .Lmask52x4(%rip),%ymm5,%ymm5 vpand .Lmask52x4(%rip),%ymm6,%ymm6 vpand .Lmask52x4(%rip),%ymm7,%ymm7 vpand .Lmask52x4(%rip),%ymm8,%ymm8 vpand .Lmask52x4(%rip),%ymm9,%ymm9 vpand .Lmask52x4(%rip),%ymm10,%ymm10 vpaddq %ymm0,%ymm3,%ymm3 vpaddq %ymm1,%ymm4,%ymm4 vpaddq %ymm2,%ymm5,%ymm5 vpaddq %ymm11,%ymm6,%ymm6 vpaddq %ymm12,%ymm7,%ymm7 vpaddq %ymm13,%ymm8,%ymm8 vpaddq %ymm14,%ymm9,%ymm9 vpaddq %ymm15,%ymm10,%ymm10 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1 vmovmskpd %ymm0,%r14d vmovmskpd %ymm1,%r13d shlb $4,%r13b orb %r13b,%r14b vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 vmovmskpd %ymm2,%r13d vmovmskpd %ymm11,%r12d shlb $4,%r12b orb %r12b,%r13b vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 vmovmskpd %ymm12,%r12d vmovmskpd %ymm13,%r11d shlb $4,%r11b orb %r11b,%r12b vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 vmovmskpd %ymm14,%r11d vmovmskpd %ymm15,%r10d shlb $4,%r10b orb %r10b,%r11b addb %r14b,%r14b adcb %r13b,%r13b adcb %r12b,%r12b adcb %r11b,%r11b vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 vmovmskpd %ymm0,%r9d vmovmskpd %ymm1,%r8d shlb $4,%r8b orb %r8b,%r9b vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 vmovmskpd %ymm2,%r8d vmovmskpd %ymm11,%edx shlb $4,%dl orb %dl,%r8b vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 vmovmskpd %ymm12,%edx vmovmskpd %ymm13,%ecx shlb $4,%cl orb %cl,%dl vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 vmovmskpd %ymm14,%ecx vmovmskpd %ymm15,%ebx shlb $4,%bl orb %bl,%cl addb %r9b,%r14b adcb %r8b,%r13b adcb %dl,%r12b adcb %cl,%r11b xorb %r9b,%r14b xorb %r8b,%r13b xorb %dl,%r12b xorb %cl,%r11b leaq .Lkmasklut(%rip),%rdx movb %r14b,%r10b andq 
$0xf,%r14 vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 shlq $5,%r14 vmovapd (%rdx,%r14,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 movb %r13b,%r10b andq $0xf,%r13 vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 shlq $5,%r13 vmovapd (%rdx,%r13,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 movb %r12b,%r10b andq $0xf,%r12 vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 shlq $5,%r12 vmovapd (%rdx,%r12,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 movb %r11b,%r10b andq $0xf,%r11 vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 shlq $5,%r11 vmovapd (%rdx,%r11,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 vpand .Lmask52x4(%rip),%ymm3,%ymm3 vpand .Lmask52x4(%rip),%ymm4,%ymm4 vpand .Lmask52x4(%rip),%ymm5,%ymm5 vpand .Lmask52x4(%rip),%ymm6,%ymm6 vpand .Lmask52x4(%rip),%ymm7,%ymm7 vpand .Lmask52x4(%rip),%ymm8,%ymm8 vpand .Lmask52x4(%rip),%ymm9,%ymm9 vpand .Lmask52x4(%rip),%ymm10,%ymm10 vmovdqu %ymm3,0(%rdi) vmovdqu %ymm4,32(%rdi) vmovdqu %ymm5,64(%rdi) vmovdqu %ymm6,96(%rdi) vmovdqu %ymm7,128(%rdi) vmovdqu %ymm8,160(%rdi) vmovdqu %ymm9,192(%rdi) vmovdqu %ymm10,224(%rdi) vzeroupper leaq (%rsp),%rax .cfi_def_cfa_register %rax movq 0(%rax),%r15 .cfi_restore %r15 movq 8(%rax),%r14 .cfi_restore %r14 movq 16(%rax),%r13 .cfi_restore %r13 movq 24(%rax),%r12 .cfi_restore %r12 movq 32(%rax),%rbp .cfi_restore %rbp movq 40(%rax),%rbx .cfi_restore %rbx leaq 48(%rax),%rsp .cfi_def_cfa %rsp,8 .Lossl_rsaz_amm52x30_x1_avxifma256_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size 
ossl_rsaz_amm52x30_x1_avxifma256, .-ossl_rsaz_amm52x30_x1_avxifma256 .section .rodata .align 32 .Lmask52x4: .quad 0xfffffffffffff .quad 0xfffffffffffff .quad 0xfffffffffffff .quad 0xfffffffffffff .Lhigh64x3: .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .Lkmasklut: .quad 0x0 .quad 0x0 .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0x0 .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0x0 .quad 0x0 .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0x0 .quad 0x0 .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0x0 .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0x0 .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .quad 0xffffffffffffffff .text .globl ossl_rsaz_amm52x30_x2_avxifma256 .type ossl_rsaz_amm52x30_x2_avxifma256,@function .align 32 ossl_rsaz_amm52x30_x2_avxifma256: .cfi_startproc .byte 243,15,30,250 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 vpxor %ymm0,%ymm0,%ymm0 vmovapd %ymm0,%ymm3 vmovapd 
%ymm0,%ymm4 vmovapd %ymm0,%ymm5 vmovapd %ymm0,%ymm6 vmovapd %ymm0,%ymm7 vmovapd %ymm0,%ymm8 vmovapd %ymm0,%ymm9 vmovapd %ymm0,%ymm10 xorl %r9d,%r9d movq %rdx,%r11 movq $0xfffffffffffff,%rax movl $30,%ebx .align 32 .Lloop30: movq 0(%r11),%r13 vpbroadcastq 0(%r11),%ymm1 movq 0(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq (%r8),%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 0(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 vmovdqu %ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu %ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu %ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 
32(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp leaq 8(%r11),%r11 decl %ebx jne .Lloop30 pushq %r11 pushq %rsi pushq %rcx pushq %r8 vmovq %r9,%xmm0 vpbroadcastq %xmm0,%ymm0 vpblendd $3,%ymm0,%ymm3,%ymm3 vpsrlq $52,%ymm3,%ymm0 vpsrlq $52,%ymm4,%ymm1 vpsrlq $52,%ymm5,%ymm2 vpsrlq $52,%ymm6,%ymm11 vpsrlq $52,%ymm7,%ymm12 vpsrlq $52,%ymm8,%ymm13 vpsrlq $52,%ymm9,%ymm14 vpsrlq $52,%ymm10,%ymm15 leaq -32(%rsp),%rsp vmovupd %ymm3,(%rsp) vpermq $144,%ymm15,%ymm15 vpermq $3,%ymm14,%ymm3 vblendpd $1,%ymm3,%ymm15,%ymm15 vpermq $144,%ymm14,%ymm14 vpermq $3,%ymm13,%ymm3 vblendpd $1,%ymm3,%ymm14,%ymm14 vpermq $144,%ymm13,%ymm13 vpermq $3,%ymm12,%ymm3 vblendpd $1,%ymm3,%ymm13,%ymm13 vpermq $144,%ymm12,%ymm12 vpermq $3,%ymm11,%ymm3 vblendpd $1,%ymm3,%ymm12,%ymm12 vpermq $144,%ymm11,%ymm11 vpermq $3,%ymm2,%ymm3 vblendpd $1,%ymm3,%ymm11,%ymm11 vpermq $144,%ymm2,%ymm2 vpermq $3,%ymm1,%ymm3 vblendpd $1,%ymm3,%ymm2,%ymm2 vpermq $144,%ymm1,%ymm1 vpermq $3,%ymm0,%ymm3 vblendpd $1,%ymm3,%ymm1,%ymm1 vpermq $144,%ymm0,%ymm0 vpand .Lhigh64x3(%rip),%ymm0,%ymm0 vmovupd (%rsp),%ymm3 leaq 32(%rsp),%rsp vpand .Lmask52x4(%rip),%ymm3,%ymm3 vpand .Lmask52x4(%rip),%ymm4,%ymm4 vpand .Lmask52x4(%rip),%ymm5,%ymm5 vpand .Lmask52x4(%rip),%ymm6,%ymm6 vpand .Lmask52x4(%rip),%ymm7,%ymm7 vpand .Lmask52x4(%rip),%ymm8,%ymm8 vpand .Lmask52x4(%rip),%ymm9,%ymm9 vpand .Lmask52x4(%rip),%ymm10,%ymm10 vpaddq %ymm0,%ymm3,%ymm3 vpaddq %ymm1,%ymm4,%ymm4 vpaddq %ymm2,%ymm5,%ymm5 vpaddq %ymm11,%ymm6,%ymm6 vpaddq %ymm12,%ymm7,%ymm7 vpaddq %ymm13,%ymm8,%ymm8 vpaddq %ymm14,%ymm9,%ymm9 vpaddq %ymm15,%ymm10,%ymm10 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1 vmovmskpd %ymm0,%r14d vmovmskpd %ymm1,%r13d shlb $4,%r13b orb %r13b,%r14b vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 
vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 vmovmskpd %ymm2,%r13d vmovmskpd %ymm11,%r12d shlb $4,%r12b orb %r12b,%r13b vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 vmovmskpd %ymm12,%r12d vmovmskpd %ymm13,%r11d shlb $4,%r11b orb %r11b,%r12b vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 vmovmskpd %ymm14,%r11d vmovmskpd %ymm15,%r10d shlb $4,%r10b orb %r10b,%r11b addb %r14b,%r14b adcb %r13b,%r13b adcb %r12b,%r12b adcb %r11b,%r11b vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 vmovmskpd %ymm0,%r9d vmovmskpd %ymm1,%r8d shlb $4,%r8b orb %r8b,%r9b vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 vmovmskpd %ymm2,%r8d vmovmskpd %ymm11,%edx shlb $4,%dl orb %dl,%r8b vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 vmovmskpd %ymm12,%edx vmovmskpd %ymm13,%ecx shlb $4,%cl orb %cl,%dl vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 vmovmskpd %ymm14,%ecx vmovmskpd %ymm15,%ebx shlb $4,%bl orb %bl,%cl addb %r9b,%r14b adcb %r8b,%r13b adcb %dl,%r12b adcb %cl,%r11b xorb %r9b,%r14b xorb %r8b,%r13b xorb %dl,%r12b xorb %cl,%r11b leaq .Lkmasklut(%rip),%rdx movb %r14b,%r10b andq $0xf,%r14 vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 shlq $5,%r14 vmovapd (%rdx,%r14,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 movb %r13b,%r10b andq $0xf,%r13 vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 shlq $5,%r13 vmovapd (%rdx,%r13,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 movb %r12b,%r10b andq $0xf,%r12 vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 shlq $5,%r12 vmovapd (%rdx,%r12,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 
shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 movb %r11b,%r10b andq $0xf,%r11 vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 shlq $5,%r11 vmovapd (%rdx,%r11,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 vpand .Lmask52x4(%rip),%ymm3,%ymm3 vpand .Lmask52x4(%rip),%ymm4,%ymm4 vpand .Lmask52x4(%rip),%ymm5,%ymm5 vpand .Lmask52x4(%rip),%ymm6,%ymm6 vpand .Lmask52x4(%rip),%ymm7,%ymm7 vpand .Lmask52x4(%rip),%ymm8,%ymm8 vpand .Lmask52x4(%rip),%ymm9,%ymm9 vpand .Lmask52x4(%rip),%ymm10,%ymm10 popq %r8 popq %rcx popq %rsi popq %r11 vmovdqu %ymm3,0(%rdi) vmovdqu %ymm4,32(%rdi) vmovdqu %ymm5,64(%rdi) vmovdqu %ymm6,96(%rdi) vmovdqu %ymm7,128(%rdi) vmovdqu %ymm8,160(%rdi) vmovdqu %ymm9,192(%rdi) vmovdqu %ymm10,224(%rdi) xorl %r15d,%r15d leaq 16(%r11),%r11 movq $0xfffffffffffff,%rax movl $30,%ebx vpxor %ymm0,%ymm0,%ymm0 vmovapd %ymm0,%ymm3 vmovapd %ymm0,%ymm4 vmovapd %ymm0,%ymm5 vmovapd %ymm0,%ymm6 vmovapd %ymm0,%ymm7 vmovapd %ymm0,%ymm8 vmovapd %ymm0,%ymm9 vmovapd %ymm0,%ymm10 .align 32 .Lloop40: movq 0(%r11),%r13 vpbroadcastq 0(%r11),%ymm1 movq 256(%rsi),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 movq %r12,%r10 adcq $0,%r10 movq 8(%r8),%r13 imulq %r9,%r13 andq %rax,%r13 vmovq %r13,%xmm2 vpbroadcastq %xmm2,%ymm2 movq 256(%rcx),%rdx mulxq %r13,%r13,%r12 addq %r13,%r9 adcq %r12,%r10 shrq $52,%r9 salq $12,%r10 orq %r10,%r9 leaq -264(%rsp),%rsp {vex} vpmadd52luq 256(%rsi),%ymm1,%ymm3 {vex} vpmadd52luq 288(%rsi),%ymm1,%ymm4 {vex} vpmadd52luq 320(%rsi),%ymm1,%ymm5 {vex} vpmadd52luq 352(%rsi),%ymm1,%ymm6 {vex} vpmadd52luq 384(%rsi),%ymm1,%ymm7 {vex} vpmadd52luq 416(%rsi),%ymm1,%ymm8 {vex} vpmadd52luq 448(%rsi),%ymm1,%ymm9 {vex} vpmadd52luq 480(%rsi),%ymm1,%ymm10 {vex} vpmadd52luq 256(%rcx),%ymm2,%ymm3 {vex} vpmadd52luq 288(%rcx),%ymm2,%ymm4 {vex} vpmadd52luq 320(%rcx),%ymm2,%ymm5 {vex} vpmadd52luq 352(%rcx),%ymm2,%ymm6 {vex} vpmadd52luq 
384(%rcx),%ymm2,%ymm7 {vex} vpmadd52luq 416(%rcx),%ymm2,%ymm8 {vex} vpmadd52luq 448(%rcx),%ymm2,%ymm9 {vex} vpmadd52luq 480(%rcx),%ymm2,%ymm10 vmovdqu %ymm3,0(%rsp) vmovdqu %ymm4,32(%rsp) vmovdqu %ymm5,64(%rsp) vmovdqu %ymm6,96(%rsp) vmovdqu %ymm7,128(%rsp) vmovdqu %ymm8,160(%rsp) vmovdqu %ymm9,192(%rsp) vmovdqu %ymm10,224(%rsp) movq $0,256(%rsp) vmovdqu 8(%rsp),%ymm3 vmovdqu 40(%rsp),%ymm4 vmovdqu 72(%rsp),%ymm5 vmovdqu 104(%rsp),%ymm6 vmovdqu 136(%rsp),%ymm7 vmovdqu 168(%rsp),%ymm8 vmovdqu 200(%rsp),%ymm9 vmovdqu 232(%rsp),%ymm10 addq 8(%rsp),%r9 {vex} vpmadd52huq 256(%rsi),%ymm1,%ymm3 {vex} vpmadd52huq 288(%rsi),%ymm1,%ymm4 {vex} vpmadd52huq 320(%rsi),%ymm1,%ymm5 {vex} vpmadd52huq 352(%rsi),%ymm1,%ymm6 {vex} vpmadd52huq 384(%rsi),%ymm1,%ymm7 {vex} vpmadd52huq 416(%rsi),%ymm1,%ymm8 {vex} vpmadd52huq 448(%rsi),%ymm1,%ymm9 {vex} vpmadd52huq 480(%rsi),%ymm1,%ymm10 {vex} vpmadd52huq 256(%rcx),%ymm2,%ymm3 {vex} vpmadd52huq 288(%rcx),%ymm2,%ymm4 {vex} vpmadd52huq 320(%rcx),%ymm2,%ymm5 {vex} vpmadd52huq 352(%rcx),%ymm2,%ymm6 {vex} vpmadd52huq 384(%rcx),%ymm2,%ymm7 {vex} vpmadd52huq 416(%rcx),%ymm2,%ymm8 {vex} vpmadd52huq 448(%rcx),%ymm2,%ymm9 {vex} vpmadd52huq 480(%rcx),%ymm2,%ymm10 leaq 264(%rsp),%rsp leaq 8(%r11),%r11 decl %ebx jne .Lloop40 vmovq %r9,%xmm0 vpbroadcastq %xmm0,%ymm0 vpblendd $3,%ymm0,%ymm3,%ymm3 vpsrlq $52,%ymm3,%ymm0 vpsrlq $52,%ymm4,%ymm1 vpsrlq $52,%ymm5,%ymm2 vpsrlq $52,%ymm6,%ymm11 vpsrlq $52,%ymm7,%ymm12 vpsrlq $52,%ymm8,%ymm13 vpsrlq $52,%ymm9,%ymm14 vpsrlq $52,%ymm10,%ymm15 leaq -32(%rsp),%rsp vmovupd %ymm3,(%rsp) vpermq $144,%ymm15,%ymm15 vpermq $3,%ymm14,%ymm3 vblendpd $1,%ymm3,%ymm15,%ymm15 vpermq $144,%ymm14,%ymm14 vpermq $3,%ymm13,%ymm3 vblendpd $1,%ymm3,%ymm14,%ymm14 vpermq $144,%ymm13,%ymm13 vpermq $3,%ymm12,%ymm3 vblendpd $1,%ymm3,%ymm13,%ymm13 vpermq $144,%ymm12,%ymm12 vpermq $3,%ymm11,%ymm3 vblendpd $1,%ymm3,%ymm12,%ymm12 vpermq $144,%ymm11,%ymm11 vpermq $3,%ymm2,%ymm3 vblendpd $1,%ymm3,%ymm11,%ymm11 vpermq $144,%ymm2,%ymm2 vpermq 
$3,%ymm1,%ymm3 vblendpd $1,%ymm3,%ymm2,%ymm2 vpermq $144,%ymm1,%ymm1 vpermq $3,%ymm0,%ymm3 vblendpd $1,%ymm3,%ymm1,%ymm1 vpermq $144,%ymm0,%ymm0 vpand .Lhigh64x3(%rip),%ymm0,%ymm0 vmovupd (%rsp),%ymm3 leaq 32(%rsp),%rsp vpand .Lmask52x4(%rip),%ymm3,%ymm3 vpand .Lmask52x4(%rip),%ymm4,%ymm4 vpand .Lmask52x4(%rip),%ymm5,%ymm5 vpand .Lmask52x4(%rip),%ymm6,%ymm6 vpand .Lmask52x4(%rip),%ymm7,%ymm7 vpand .Lmask52x4(%rip),%ymm8,%ymm8 vpand .Lmask52x4(%rip),%ymm9,%ymm9 vpand .Lmask52x4(%rip),%ymm10,%ymm10 vpaddq %ymm0,%ymm3,%ymm3 vpaddq %ymm1,%ymm4,%ymm4 vpaddq %ymm2,%ymm5,%ymm5 vpaddq %ymm11,%ymm6,%ymm6 vpaddq %ymm12,%ymm7,%ymm7 vpaddq %ymm13,%ymm8,%ymm8 vpaddq %ymm14,%ymm9,%ymm9 vpaddq %ymm15,%ymm10,%ymm10 vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1 vmovmskpd %ymm0,%r14d vmovmskpd %ymm1,%r13d shlb $4,%r13b orb %r13b,%r14b vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 vmovmskpd %ymm2,%r13d vmovmskpd %ymm11,%r12d shlb $4,%r12b orb %r12b,%r13b vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 vmovmskpd %ymm12,%r12d vmovmskpd %ymm13,%r11d shlb $4,%r11b orb %r11b,%r12b vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 vmovmskpd %ymm14,%r11d vmovmskpd %ymm15,%r10d shlb $4,%r10b orb %r10b,%r11b addb %r14b,%r14b adcb %r13b,%r13b adcb %r12b,%r12b adcb %r11b,%r11b vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 vmovmskpd %ymm0,%r9d vmovmskpd %ymm1,%r8d shlb $4,%r8b orb %r8b,%r9b vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 vmovmskpd %ymm2,%r8d vmovmskpd %ymm11,%edx shlb $4,%dl orb %dl,%r8b vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 vmovmskpd %ymm12,%edx vmovmskpd %ymm13,%ecx shlb $4,%cl orb %cl,%dl vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 vmovmskpd %ymm14,%ecx vmovmskpd %ymm15,%ebx shlb $4,%bl orb %bl,%cl addb %r9b,%r14b adcb 
%r8b,%r13b adcb %dl,%r12b adcb %cl,%r11b xorb %r9b,%r14b xorb %r8b,%r13b xorb %dl,%r12b xorb %cl,%r11b leaq .Lkmasklut(%rip),%rdx movb %r14b,%r10b andq $0xf,%r14 vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 shlq $5,%r14 vmovapd (%rdx,%r14,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 movb %r13b,%r10b andq $0xf,%r13 vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 shlq $5,%r13 vmovapd (%rdx,%r13,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 movb %r12b,%r10b andq $0xf,%r12 vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 shlq $5,%r12 vmovapd (%rdx,%r12,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 movb %r11b,%r10b andq $0xf,%r11 vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 shlq $5,%r11 vmovapd (%rdx,%r11,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 shrb $4,%r10b andq $0xf,%r10 vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 shlq $5,%r10 vmovapd (%rdx,%r10,1),%ymm2 vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 vpand .Lmask52x4(%rip),%ymm3,%ymm3 vpand .Lmask52x4(%rip),%ymm4,%ymm4 vpand .Lmask52x4(%rip),%ymm5,%ymm5 vpand .Lmask52x4(%rip),%ymm6,%ymm6 vpand .Lmask52x4(%rip),%ymm7,%ymm7 vpand .Lmask52x4(%rip),%ymm8,%ymm8 vpand .Lmask52x4(%rip),%ymm9,%ymm9 vpand .Lmask52x4(%rip),%ymm10,%ymm10 vmovdqu %ymm3,256(%rdi) vmovdqu %ymm4,288(%rdi) vmovdqu %ymm5,320(%rdi) vmovdqu %ymm6,352(%rdi) vmovdqu %ymm7,384(%rdi) vmovdqu %ymm8,416(%rdi) vmovdqu %ymm9,448(%rdi) vmovdqu %ymm10,480(%rdi) vzeroupper leaq (%rsp),%rax .cfi_def_cfa_register %rax movq 0(%rax),%r15 .cfi_restore %r15 movq 8(%rax),%r14 .cfi_restore %r14 movq 16(%rax),%r13 .cfi_restore %r13 movq 24(%rax),%r12 .cfi_restore %r12 movq 32(%rax),%rbp .cfi_restore %rbp movq 40(%rax),%rbx 
.cfi_restore %rbx
	# Tail of ossl_rsaz_amm52x30_x2_avxifma256 (the body starts earlier in
	# this file): step %rsp past the 48 bytes addressed through %rax and
	# return to the caller.
	leaq	48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lossl_rsaz_amm52x30_x2_avxifma256_epilogue:
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_rsaz_amm52x30_x2_avxifma256, .-ossl_rsaz_amm52x30_x2_avxifma256

#-----------------------------------------------------------------------
# ossl_extract_multiplier_2x30_win5_avx
#
# Selects data from a 32-entry table of 512-byte entries and writes a
# 512-byte result.  The result is assembled in two passes:
#   pass 1: bytes   0..255 of the entry whose number equals %rdx
#   pass 2: bytes 256..511 of the entry whose number equals %rcx
#
# ABI:   System V AMD64
# In:    %rdi = destination buffer (512 bytes are written)
#        %rsi = table base (16384 bytes are read)
#        %rdx = entry number used by pass 1
#        %rcx = entry number used by pass 2
#        NOTE(review): presumably called from C as
#        f(out, table, idx1, idx2) - confirm against the C declaration.
# Out:   nothing in registers; result stored at (%rdi)
# Clobb: %rax, %rsi, %ymm0-%ymm13
#
# Every table entry is loaded on every call and the wanted one is kept
# with a lane mask, so the addresses touched do not depend on the values
# in %rdx / %rcx.
#
# All 64 bits of %rdx / %rcx take part in the comparison; a value that
# matches none of the entry numbers 0..31 (including one with stray high
# bits) yields an all-zero half in the output.
#
# The upper halves of the YMM registers are cleared with vzeroupper
# before returning, as ossl_rsaz_amm52x30_x2_avxifma256 does, so the
# caller is not left with dirty upper state.
#-----------------------------------------------------------------------
.text
.align	32
.globl	ossl_extract_multiplier_2x30_win5_avx
.type	ossl_extract_multiplier_2x30_win5_avx,@function
ossl_extract_multiplier_2x30_win5_avx:
.cfi_startproc
	.byte	243,15,30,250			# endbr64
	vmovapd	.Lones(%rip),%ymm12		# ymm12 = {1,1,1,1}, counter step
	vmovq	%rdx,%xmm8
	vpbroadcastq	%xmm8,%ymm10		# ymm10 = pass-1 entry number, all lanes
	vmovq	%rcx,%xmm8
	vpbroadcastq	%xmm8,%ymm11		# ymm11 = pass-2 entry number, all lanes
	leaq	16384(%rsi),%rax		# rax = one past the last table entry

	# ---- pass 1: low 256 bytes of the entry selected by ymm10 ----
	vpxor	%xmm0,%xmm0,%xmm0		# accumulators ymm0..ymm7 start at zero
	vmovapd	%ymm0,%ymm9			# ymm9 = current entry number (starts at 0)
	vmovapd	%ymm0,%ymm1
	vmovapd	%ymm0,%ymm2
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7

.align	32
.Lloop:
	vpcmpeqq	%ymm9,%ymm10,%ymm13	# all-ones lanes iff this is the wanted entry
	vmovdqu	0(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm0,%ymm0	# keep loaded data only where mask is set
	vmovdqu	32(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm1,%ymm1
	vmovdqu	64(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm2,%ymm2
	vmovdqu	96(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm3,%ymm3
	vmovdqu	128(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm4,%ymm4
	vmovdqu	160(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm5,%ymm5
	vmovdqu	192(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm6,%ymm6
	vmovdqu	224(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm7,%ymm7
	vpaddq	%ymm12,%ymm9,%ymm9		# next entry number
	addq	$512,%rsi			# next 512-byte entry
	cmpq	%rsi,%rax
	jne	.Lloop

	vmovdqu	%ymm0,0(%rdi)
	vmovdqu	%ymm1,32(%rdi)
	vmovdqu	%ymm2,64(%rdi)
	vmovdqu	%ymm3,96(%rdi)
	vmovdqu	%ymm4,128(%rdi)
	vmovdqu	%ymm5,160(%rdi)
	vmovdqu	%ymm6,192(%rdi)
	vmovdqu	%ymm7,224(%rdi)

	# ---- pass 2: high 256 bytes of the entry selected by ymm11 ----
	leaq	-16384(%rax),%rsi		# rewind to the table base
	vpxor	%xmm0,%xmm0,%xmm0		# reset accumulators and entry counter
	vmovapd	%ymm0,%ymm9
	vmovapd	%ymm0,%ymm1
	vmovapd	%ymm0,%ymm2
	vmovapd	%ymm0,%ymm3
	vmovapd	%ymm0,%ymm4
	vmovapd	%ymm0,%ymm5
	vmovapd	%ymm0,%ymm6
	vmovapd	%ymm0,%ymm7

.align	32
.Lloop_8_15:
	vpcmpeqq	%ymm9,%ymm11,%ymm13	# all-ones lanes iff this is the wanted entry
	vmovdqu	256(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm0,%ymm0
	vmovdqu	288(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm1,%ymm1
	vmovdqu	320(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm2,%ymm2
	vmovdqu	352(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm3,%ymm3
	vmovdqu	384(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm4,%ymm4
	vmovdqu	416(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm5,%ymm5
	vmovdqu	448(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm6,%ymm6
	vmovdqu	480(%rsi),%ymm8
	vblendvpd	%ymm13,%ymm8,%ymm7,%ymm7
	vpaddq	%ymm12,%ymm9,%ymm9		# next entry number
	addq	$512,%rsi			# next 512-byte entry
	cmpq	%rsi,%rax
	jne	.Lloop_8_15

	vmovdqu	%ymm0,256(%rdi)
	vmovdqu	%ymm1,288(%rdi)
	vmovdqu	%ymm2,320(%rdi)
	vmovdqu	%ymm3,352(%rdi)
	vmovdqu	%ymm4,384(%rdi)
	vmovdqu	%ymm5,416(%rdi)
	vmovdqu	%ymm6,448(%rdi)
	vmovdqu	%ymm7,480(%rdi)

	vzeroupper				# results are in memory; clear upper YMM state
	.byte	0xf3,0xc3			# rep ret
.cfi_endproc
.size	ossl_extract_multiplier_2x30_win5_avx, .-ossl_extract_multiplier_2x30_win5_avx

# Constants used by the extraction routine.
.section .rodata
.align	32
.Lones:
.quad	1,1,1,1
# .Lzeros is not referenced by the code in this part of the file.
.Lzeros:
.quad	0,0,0,0

# GNU property note.  Type 5 is NT_GNU_PROPERTY_TYPE_0; property
# 0xc0000002 is GNU_PROPERTY_X86_FEATURE_1_AND with value 3 (IBT | SHSTK).
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f				# size of the name field
	.long 4f - 1f				# size of the descriptor
	.long 5					# note type
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002			# property type
	.long 3f - 2f				# size of the property data
2:
	.long 3					# property data
3:
	.p2align 3
4: