# -----------------------------------------------------------------------
# uint64_t mp_mod_384 (uint64_t r[6], uint64_t a[12])
#
# Reduces the 768-bit little-endian integer held in a[0..11] modulo
#     p384 = 2^384 - 2^128 - 2^96 + 2^32 - 1
# and stores the 384-bit result in r[0..5].
#
# Syntax : GNU as, AT&T operand order (source, destination).
# ABI    : System V AMD64 (ELF).
# In     : rdi = r  (output, 6 qwords)
#          rsi = a  (input, 12 qwords)
#          Memory is accessed with movdqu, so 16-byte alignment is not
#          required.  All of a[] is loaded before r[] is stored.
# Out    : rax = 0 (always).
# Clobb  : rax, rcx, rdx, rsi, r8-r11, xmm0-xmm8, xmm14, xmm15, flags.
#          r12-r15 are used as accumulator limbs but are preserved
#          (parked in xmm14/xmm15 for the duration of the call).
# Stack  : not used.
# Needs  : SSE4.1 (pinsrd / pinsrq / pextrd / pextrq).
# Timing : the final subtract loop runs a data-dependent number of times.
#
# Notation used in the diagrams below
#   aNN    = the NN-th 32-bit word of a[] (a00 is the least significant).
#   |x|y|  = consecutive 32-bit columns, least significant on the LEFT.
#   Each diagram row is added to (+) or subtracted from (-) the running
#   accumulator, whose 64-bit limbs are, from least to most significant,
#       r10, r11, r12, r13, r14, r15, rdx
#   (rdx collects the carries/borrows that leave the low 384 bits).
# -----------------------------------------------------------------------
	.file	"mp_mod_384.c"
	.text
	.p2align 4,,15
	.globl	mp_mod_384
	.type	mp_mod_384, @function
mp_mod_384:
.LFB0:
	.cfi_startproc

# load (xmm0 ~ xmm5) = (a[0] ~ a[11]); unaligned-safe
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	64(%rsi), %xmm4
	movdqu	80(%rsi), %xmm5

# park the callee-saved registers r12-r15 in xmm14 / xmm15
	movq	%r12, %xmm14
	movq	%r14, %xmm15
	pinsrq	$1, %r13, %xmm14
	pinsrq	$1, %r15, %xmm15

# clear the three upper accumulator limbs
	xorl	%r14d, %r14d
	xorl	%r15d, %r15d
	xorl	%edx, %edx

# xmm6 = |a21|a22|a20|a23|
# rsi  = | 0 |a20|
# r8   = |a21|a22|
# r9   = a23 doubled (one 64-bit column)
	pshufd	$0xc9, %xmm5, %xmm6
	pextrd	$3, %xmm5, %r9d
	movd	%xmm5, %esi
	movq	%xmm6, %r8
	shlq	$1, %r9
	shlq	$32, %rsi

# |a21|a22| 0 |a20|a21|a22| a23*2 |(=)   initial value of r10..r13
	movq	%r8, %r10
	movq	%rsi, %r11
	movq	%r8, %r12
	movq	%r9, %r13

# | 0 |a20|a21|a22| a23*2 | 0 | 0 |(-)
	subq	%rsi, %r10
	sbbq	%r8, %r11
	sbbq	%r9, %r12
	sbbq	$0, %r13

# rsi = |a20|a23|
	pextrq	$1, %xmm6, %rsi

# | 0 | 0 |a20|a23| 0 | 0 | 0 | 0 |(-)
	subq	%rsi, %r11
	sbbq	$0, %r12
	sbbq	$0, %r13

# | 0 | 0 | 0 | 0 |a21|a22| 0 | 0 |(+)
	addq	%r8, %r12
	adcq	$0, %r13

# r8 = |a20|a21|
# r9 = |a22|a23|
	movq	%xmm5, %r8
	pextrq	$1, %xmm5, %r9

# |a20|a23|a20|a21|a22|a23| 0 | 0 |(+)
	addq	%rsi, %r10
	adcq	%r8, %r11
	adcq	%r9, %r12
	adcq	$0, %r13

# | 0 | 0 | 0 | 0 |a20|a21|a22|a23|a20|a21|a22|a23|(+)
	addq	%r8, %r12
	adcq	%r9, %r13
	adcq	%r8, %r14
	adcq	%r9, %r15
	adcq	$0, %rdx

# rax = |a12|a13|
# rsi = |a14|a15|
# r8  = |a16|a17|
# r9  = |a18|a19|
	movq	%xmm3, %rax
	pextrq	$1, %xmm3, %rsi
	movq	%xmm4, %r8
	pextrq	$1, %xmm4, %r9

# |a12|a13|a14|a15|a16|a17|a18|a19| 0 | 0 | 0 | 0 |(+)
	addq	%rax, %r10
	adcq	%rsi, %r11
	adcq	%r8, %r12
	adcq	%r9, %r13
	adcq	$0, %r14
	adcq	$0, %r15
	adcq	$0, %rdx

# | 0 | 0 | 0 | 0 |a12|a13|a14|a15|a16|a17|a18|a19|(+)
	addq	%rax, %r12
	adcq	%rsi, %r13
	adcq	%r8, %r14
	adcq	%r9, %r15
	adcq	$0, %rdx

# |a00|a01|a02|a03|a04|a05|a06|a07|a08|a09|a10|a11|(+)
# The movq / pextrq transfers between the adc steps leave the flags
# untouched, so the carry chain runs unbroken from r10 up to rdx.
	movq	%xmm0, %rax
	addq	%rax, %r10
	pextrq	$1, %xmm0, %rax
	adcq	%rax, %r11
# ----
	movq	%xmm1, %rax
	adcq	%rax, %r12
	pextrq	$1, %xmm1, %rax
	adcq	%rax, %r13
# ----
	movq	%xmm2, %rax
	adcq	%rax, %r14
	pextrq	$1, %xmm2, %rax
	adcq	%rax, %r15
# ----
	adcq	$0, %rdx

# Rotate the upper half of the input by one 32-bit word:
#
#     |---------------|---------------|---------------|
#     |     xmm3      |     xmm4      |     xmm5      |
# |---|---|---|---|---|---|---|---|---|---|---|---|---|
# |a23|a12|a13|a14|a15|a16|a17|a18|a19|a20|a21|a22|
# |---|---|---|---|---|---|---|---|---|---|---|---|
# |     xmm0      |     xmm1      |     xmm2      |
# |---------------|---------------|---------------|
#
# pshufd $0x93 rotates each register up by one word; pinsrd then moves the
# word that wrapped around into the neighbouring register's lowest lane.
	pshufd	$0x93, %xmm3, %xmm0
	pshufd	$0x93, %xmm4, %xmm1
	pshufd	$0x93, %xmm5, %xmm2
	pextrd	$3, %xmm3, %r8d
	pextrd	$3, %xmm4, %r9d
	pextrd	$3, %xmm5, %esi
	pinsrd	$0, %r8d, %xmm1
	pinsrd	$0, %r9d, %xmm2
	pinsrd	$0, %esi, %xmm0

# rax = |a23|a12|
# rsi = |a13|a14|
# r8  = |a15|a16|
# r9  = |a17|a18|
# rcx = |a19|a20|
	movq	%xmm0, %rax
	pextrq	$1, %xmm0, %rsi
	movq	%xmm1, %r8
	pextrq	$1, %xmm1, %r9
	movq	%xmm2, %rcx

# | 0 | 0 |a23|a12|a13|a14|a15|a16|a17|a18|a19|a20|(+)
	addq	%rax, %r11
	adcq	%rsi, %r12
	adcq	%r8, %r13
	adcq	%r9, %r14
	adcq	%rcx, %r15
	adcq	$0, %rdx

# |a23|a12|a13|a14|a15|a16|a17|a18|a19|a20| 0 | 0 |(-)
	subq	%rax, %r10
	sbbq	%rsi, %r11
	sbbq	%r8, %r12
	sbbq	%r9, %r13
	sbbq	%rcx, %r14
	sbbq	$0, %r15
	sbbq	$0, %rdx

# ----
# Build the six limbs of p384, least significant first:
#   r8  = 0x00000000FFFFFFFF
#   r9  = 0xFFFFFFFF00000000
#   rsi = 0xFFFFFFFFFFFFFFFE
#   rax = 0xFFFFFFFFFFFFFFFF   (serves as limbs 3, 4 and 5)
# ----
	movq	$-1, %rax
	movq	%rax, %r8
	movq	%rax, %r9
	movq	%rax, %rsi
	shrq	$32, %r8
	shlq	$32, %r9
	decq	%rsi

# | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |a21|a22|(-)
	movq	%xmm6, %rcx
	subq	%rcx, %r15
	sbbq	$0, %rdx
# A borrow out of rdx here means the running total went negative:
# skip the subtract loop and add p384 back once.
# NOTE(review): a single add-back assumes the total is never below
# -p384 at this point; that bound is not checked by the code.
	jc	.Ladd_p

# Keep subtracting p384 until the subtraction borrows out of rdx ...
.Lsub_p:
	subq	%r8, %r10
	sbbq	%r9, %r11
	sbbq	%rsi, %r12
	sbbq	%rax, %r13
	sbbq	%rax, %r14
	sbbq	%rax, %r15
	sbbq	$0, %rdx
	jnc	.Lsub_p

# ... then add p384 once to undo the final (borrowing) subtraction.
# Only the low 384 bits are kept; rdx is no longer needed.
.Ladd_p:
	addq	%r8, %r10
	adcq	%r9, %r11
	adcq	%rsi, %r12
	adcq	%rax, %r13
	adcq	%rax, %r14
	adcq	%rax, %r15

# pack the result: (xmm6, xmm7, xmm8) = (r10:r11, r12:r13, r14:r15)
	movq	%r10, %xmm6
	movq	%r12, %xmm7
	movq	%r14, %xmm8
	pinsrq	$1, %r11, %xmm6
	pinsrq	$1, %r13, %xmm7
	pinsrq	$1, %r15, %xmm8

# restore (r12, r13, r14, r15)
	movq	%xmm14, %r12
	movq	%xmm15, %r14
	pextrq	$1, %xmm14, %r13
	pextrq	$1, %xmm15, %r15

# output (xmm6, xmm7, xmm8) -> r[0..5]; unaligned-safe
	movdqu	%xmm6, (%rdi)
	movdqu	%xmm7, 16(%rdi)
	movdqu	%xmm8, 32(%rdi)

# return 0.  No emms is issued: the routine never touches an MMX register.
	xorl	%eax, %eax
	xorl	%edx, %edx		# scratch carry limb cleared; not part of the return value
	ret
	.cfi_endproc
.LFE0:
	.size	mp_mod_384, .-mp_mod_384
	.ident	"GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-11)"
	.section	.note.GNU-stack,"",@progbits
测试时,将上面的汇编程序保存为 mp_mod_384.s;对应的 C 头文件为 mp_mod_384.h,其内容只需下面这一行函数声明(uint64_t 类型由 stdint.h 提供,需先行包含):
uint64_t mp_mod_384 (uint64_t r[6], uint64_t a[12]);
在 C 程序中调用此汇编函数时,必须保证输出数组 r 和输入数组 a 的首地址按 16 字节对齐;CPU 须支持 SSE4.1 指令集(函数用到 pinsrd/pinsrq/pextrd/pextrq 指令)。本函数只使用寄存器,不访问任何全局或静态数据,满足可重入要求,无须加锁即可在多线程环境中以最高效能运行。