Monero & slow hash minor changes + add inline asm #5677

base: bleeding-jumbo
This may make the optimizer's job easier (and it actually made the code generated by gcc 11 slightly smaller)
Compiler-generated code for the main loop:

```
2f8: 48 8b 9d 48 fe ff ff mov -0x1b8(%rbp),%rbx
2ff: 4c 89 d9 mov %r11,%rcx
302: 81 e1 f0 ff 1f 00 and $0x1ffff0,%ecx
308: 48 01 d9 add %rbx,%rcx
30b: c5 f9 6f 19 vmovdqa (%rcx),%xmm3
30f: c4 62 61 dc bd 60 fe vaesenc -0x1a0(%rbp),%xmm3,%xmm15
316: ff ff
318: c4 41 f9 7e fa vmovq %xmm15,%r10
31d: c4 c1 11 ef d7 vpxor %xmm15,%xmm13,%xmm2
322: c4 61 f9 7e f8 vmovq %xmm15,%rax
327: 41 81 e2 f0 ff 1f 00 and $0x1ffff0,%r10d
32e: c5 f9 7f 11 vmovdqa %xmm2,(%rcx)
332: 49 01 da add %rbx,%r10
335: c4 c1 79 6f 0a vmovdqa (%r10),%xmm1
33a: c4 e1 f9 7e cf vmovq %xmm1,%rdi
33f: c4 c3 f9 16 cf 01 vpextrq $0x1,%xmm1,%r15
345: 48 f7 e7 mul %rdi
348: 4e 8d 04 1a lea (%rdx,%r11,1),%r8
34c: 48 03 85 68 fe ff ff add -0x198(%rbp),%rax
353: 4d 89 02 mov %r8,(%r10)
356: 49 31 f8 xor %rdi,%r8
359: 49 31 c7 xor %rax,%r15
35c: 4d 89 c1 mov %r8,%r9
35f: 4c 89 85 60 fe ff ff mov %r8,-0x1a0(%rbp)
366: 41 81 e1 f0 ff 1f 00 and $0x1ffff0,%r9d
36d: 4c 89 bd 68 fe ff ff mov %r15,-0x198(%rbp)
374: 49 01 d9 add %rbx,%r9
377: 49 89 42 08 mov %rax,0x8(%r10)
37b: c4 41 79 6f 09 vmovdqa (%r9),%xmm9
380: c4 62 31 dc ad 60 fe vaesenc -0x1a0(%rbp),%xmm9,%xmm13
387: ff ff
389: c4 41 f9 7e ed vmovq %xmm13,%r13
38e: c4 41 01 ef dd vpxor %xmm13,%xmm15,%xmm11
393: c4 61 f9 7e e8 vmovq %xmm13,%rax
398: 41 81 e5 f0 ff 1f 00 and $0x1ffff0,%r13d
39f: c4 41 79 7f 19 vmovdqa %xmm11,(%r9)
3a4: 49 01 dd add %rbx,%r13
3a7: c4 c1 79 6f 6d 00 vmovdqa 0x0(%r13),%xmm5
3ad: c4 e1 f9 7e e9 vmovq %xmm5,%rcx
3b2: c4 e3 f9 16 eb 01 vpextrq $0x1,%xmm5,%rbx
3b8: c5 f9 7f ad 70 fe ff vmovdqa %xmm5,-0x190(%rbp)
3bf: ff
3c0: 48 f7 e1 mul %rcx
3c3: 4e 8d 1c 02 lea (%rdx,%r8,1),%r11
3c7: 4c 01 f8 add %r15,%rax
3ca: 4d 89 5d 00 mov %r11,0x0(%r13)
3ce: 48 31 c3 xor %rax,%rbx
3d1: 49 31 cb xor %rcx,%r11
3d4: 49 89 45 08 mov %rax,0x8(%r13)
3d8: 4c 89 9d 60 fe ff ff mov %r11,-0x1a0(%rbp)
3df: 48 89 9d 68 fe ff ff mov %rbx,-0x198(%rbp)
3e6: 48 83 ee 02 sub $0x2,%rsi
3ea: 0f 85 08 ff ff ff jne 2f8 <cn_slow_hash+0x218>
```

This PR's inline asm:

```
305: c4 e1 f9 7e c0 vmovq %xmm0,%rax
30a: c4 e2 40 f2 c0 andn %eax,%edi,%eax
30f: c5 f9 6f 14 03 vmovdqa (%rbx,%rax,1),%xmm2
314: c4 e2 69 dc d0 vaesenc %xmm0,%xmm2,%xmm2
319: c5 f1 ef ca vpxor %xmm2,%xmm1,%xmm1
31d: c5 f9 7f 0c 03 vmovdqa %xmm1,(%rbx,%rax,1)
322: c4 e1 f9 7e d0 vmovq %xmm2,%rax
327: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
32c: c5 f9 6f 0c 0b vmovdqa (%rbx,%rcx,1),%xmm1
331: 48 8b 14 0b mov (%rbx,%rcx,1),%rdx
335: 48 f7 e2 mul %rdx
338: c4 e1 f9 6e da vmovq %rdx,%xmm3
33d: c4 e3 e1 22 d8 01 vpinsrq $0x1,%rax,%xmm3,%xmm3
343: c5 f9 d4 c3 vpaddq %xmm3,%xmm0,%xmm0
347: c5 f9 7f 04 0b vmovdqa %xmm0,(%rbx,%rcx,1)
34c: c5 f9 ef c1 vpxor %xmm1,%xmm0,%xmm0
350: c4 e1 f9 7e c0 vmovq %xmm0,%rax
355: c4 e2 40 f2 c0 andn %eax,%edi,%eax
35a: c5 f9 6f 0c 03 vmovdqa (%rbx,%rax,1),%xmm1
35f: c4 e2 71 dc c8 vaesenc %xmm0,%xmm1,%xmm1
364: c5 e9 ef d1 vpxor %xmm1,%xmm2,%xmm2
368: c5 f9 7f 14 03 vmovdqa %xmm2,(%rbx,%rax,1)
36d: c4 e1 f9 7e c8 vmovq %xmm1,%rax
372: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
377: c5 f9 6f 14 0b vmovdqa (%rbx,%rcx,1),%xmm2
37c: 48 8b 14 0b mov (%rbx,%rcx,1),%rdx
380: 48 f7 e2 mul %rdx
383: c4 e1 f9 6e da vmovq %rdx,%xmm3
388: c4 e3 e1 22 d8 01 vpinsrq $0x1,%rax,%xmm3,%xmm3
38e: c5 f9 d4 c3 vpaddq %xmm3,%xmm0,%xmm0
392: c5 f9 7f 04 0b vmovdqa %xmm0,(%rbx,%rcx,1)
397: c5 f9 ef c2 vpxor %xmm2,%xmm0,%xmm0
39b: ff ce dec %esi
39d: 0f 85 62 ff ff ff jne 305 <cn_slow_hash+0x225>
```

Same extent of loop unrolling (4 iterations).
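For context, both listings implement the CryptoNight (Monero slow hash) half-iteration pair. Here's a minimal C sketch with my own naming (`MASK`, `cn_inner`); `MEMORY` and `AES_BLOCK_SIZE` are from the constraint list quoted in the review comment further down:

```c
#include <stdint.h>
#include <immintrin.h>  /* needs -maes (or -march=native) */

#define MEMORY         (1 << 21)                  /* 2 MiB scratchpad */
#define AES_BLOCK_SIZE 16
#define MASK           (MEMORY - AES_BLOCK_SIZE)  /* the 0x1ffff0 masks above */

/* long_state must be 16-byte aligned; a and b are the two 16-byte state
 * blocks; iters would be ITER / 2 in the real code. */
static void cn_inner(uint8_t *long_state, __m128i a, __m128i b, uint64_t iters)
{
    while (iters--) {
        /* First half: one AES round keyed by a, then XOR-store with b */
        uint64_t j = (uint64_t)_mm_cvtsi128_si64(a) & MASK;
        __m128i c = _mm_aesenc_si128(
            _mm_load_si128((const __m128i *)&long_state[j]), a);
        _mm_store_si128((__m128i *)&long_state[j], _mm_xor_si128(b, c));

        /* Second half: 64x64->128 scalar multiply, accumulate into a */
        j = (uint64_t)_mm_cvtsi128_si64(c) & MASK;
        __m128i t = _mm_load_si128((const __m128i *)&long_state[j]);
        unsigned __int128 p = (unsigned __int128)_mm_cvtsi128_si64(c) *
                              (uint64_t)_mm_cvtsi128_si64(t);
        /* low qword += high product half, high qword += low half,
         * matching the VMOVQ/VPINSRQ pair in the listings */
        a = _mm_add_epi64(a, _mm_set_epi64x((int64_t)(uint64_t)p,
                                            (int64_t)(p >> 64)));
        _mm_store_si128((__m128i *)&long_state[j], a);
        a = _mm_xor_si128(a, t);
        b = c;
    }
}
```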
Force-pushed from 8dd21e7 to 266c749.
Made some tweaks, such as to squeeze out some tiny bits of parallelism. It's now the same speed as the larger compiler-generated code, and I've enabled it, hoping for more stable good speeds that don't depend on compiler version.

```
305: c4 e1 f9 7e c0 vmovq %xmm0,%rax
30a: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
30f: c5 f9 6f 14 0b vmovdqa (%rbx,%rcx,1),%xmm2
314: c4 e2 69 dc d0 vaesenc %xmm0,%xmm2,%xmm2
319: c5 f1 ef ca vpxor %xmm2,%xmm1,%xmm1
31d: c4 e1 f9 7e d0 vmovq %xmm2,%rax
322: c5 f9 7f 0c 0b vmovdqa %xmm1,(%rbx,%rcx,1)
327: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
32c: 48 8b 14 0b mov (%rbx,%rcx,1),%rdx
330: 48 f7 e2 mul %rdx
333: c4 e1 f9 6e da vmovq %rdx,%xmm3
338: c4 e3 e1 22 d8 01 vpinsrq $0x1,%rax,%xmm3,%xmm3
33e: c5 f9 d4 c3 vpaddq %xmm3,%xmm0,%xmm0
342: c5 f9 ef 1c 0b vpxor (%rbx,%rcx,1),%xmm0,%xmm3
347: c5 f9 7f 04 0b vmovdqa %xmm0,(%rbx,%rcx,1)
34c: c4 e1 f9 7e d8 vmovq %xmm3,%rax
351: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
356: c5 f9 6f 0c 0b vmovdqa (%rbx,%rcx,1),%xmm1
35b: c4 e2 71 dc cb vaesenc %xmm3,%xmm1,%xmm1
360: c5 e9 ef d1 vpxor %xmm1,%xmm2,%xmm2
364: c4 e1 f9 7e c8 vmovq %xmm1,%rax
369: c5 f9 7f 14 0b vmovdqa %xmm2,(%rbx,%rcx,1)
36e: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
373: 48 8b 14 0b mov (%rbx,%rcx,1),%rdx
377: 48 f7 e2 mul %rdx
37a: c4 e1 f9 6e c2 vmovq %rdx,%xmm0
37f: c4 e3 f9 22 c0 01 vpinsrq $0x1,%rax,%xmm0,%xmm0
385: c5 e1 d4 d8 vpaddq %xmm0,%xmm3,%xmm3
389: c5 e1 ef 04 0b vpxor (%rbx,%rcx,1),%xmm3,%xmm0
38e: c5 f9 7f 1c 0b vmovdqa %xmm3,(%rbx,%rcx,1)
393: ff ce dec %esi
395: 0f 85 6a ff ff ff jne 305 <cn_slow_hash+0x225>
```
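The visible changes versus the previous listing: the VMOVQ address extraction is hoisted above the store, and the scratchpad reload is folded into a memory-operand VPXOR, roughly:

```
# before: separate reload, store, then XOR in registers
vmovdqa (%rbx,%rcx,1),%xmm1
...
vmovdqa %xmm0,(%rbx,%rcx,1)
vpxor   %xmm1,%xmm0,%xmm0
# after: XOR straight from memory into the next a, then store the old a
vpxor   (%rbx,%rcx,1),%xmm0,%xmm3
vmovdqa %xmm0,(%rbx,%rcx,1)
```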
Force-pushed from 266c749 to d3127dd.
Even smaller code, now 14 instructions per 2 iterations:

```
305: c4 e1 f9 7e c0 vmovq %xmm0,%rax
30a: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
30f: c5 f9 6f 14 0b vmovdqa (%rbx,%rcx,1),%xmm2
314: c4 e2 69 dc d0 vaesenc %xmm0,%xmm2,%xmm2
319: c5 f1 ef ca vpxor %xmm2,%xmm1,%xmm1
31d: c4 e1 f9 7e d0 vmovq %xmm2,%rax
322: c5 f9 7f 0c 0b vmovdqa %xmm1,(%rbx,%rcx,1)
327: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
32c: 48 f7 24 0b mulq (%rbx,%rcx,1)
330: c4 e1 f9 6e da vmovq %rdx,%xmm3
335: c4 e3 e1 22 d8 01 vpinsrq $0x1,%rax,%xmm3,%xmm3
33b: c5 f9 d4 c3 vpaddq %xmm3,%xmm0,%xmm0
33f: c5 f9 ef 1c 0b vpxor (%rbx,%rcx,1),%xmm0,%xmm3
344: c5 f9 7f 04 0b vmovdqa %xmm0,(%rbx,%rcx,1)
349: c4 e1 f9 7e d8 vmovq %xmm3,%rax
34e: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
353: c5 f9 6f 0c 0b vmovdqa (%rbx,%rcx,1),%xmm1
358: c4 e2 71 dc cb vaesenc %xmm3,%xmm1,%xmm1
35d: c5 e9 ef d1 vpxor %xmm1,%xmm2,%xmm2
361: c4 e1 f9 7e c8 vmovq %xmm1,%rax
366: c5 f9 7f 14 0b vmovdqa %xmm2,(%rbx,%rcx,1)
36b: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
370: 48 f7 24 0b mulq (%rbx,%rcx,1)
374: c4 e1 f9 6e c2 vmovq %rdx,%xmm0
379: c4 e3 f9 22 c0 01 vpinsrq $0x1,%rax,%xmm0,%xmm0
37f: c5 e1 d4 d8 vpaddq %xmm0,%xmm3,%xmm3
383: c5 e1 ef 04 0b vpxor (%rbx,%rcx,1),%xmm3,%xmm0
388: c5 f9 7f 1c 0b vmovdqa %xmm3,(%rbx,%rcx,1)
38d: ff ce dec %esi
38f: 0f 85 70 ff ff ff jne 305 <cn_slow_hash+0x225>
```
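The instruction saved in each unrolled half comes from folding the scratchpad load into the multiply, as a comparison with the previous listing shows:

```
# before: explicit load, then a register-operand multiply
mov  (%rbx,%rcx,1),%rdx
mul  %rdx                # RDX:RAX = RAX * RDX
# after: the memory-operand form does the load itself
mulq (%rbx,%rcx,1)       # RDX:RAX = RAX * m64
```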
Force-pushed from d3127dd to fa175d6.
And smaller yet (by 2 bytes):

```
305: c5 f9 7e c0 vmovd %xmm0,%eax
309: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
30e: c5 f9 6f 14 0b vmovdqa (%rbx,%rcx,1),%xmm2
313: c4 e2 69 dc d0 vaesenc %xmm0,%xmm2,%xmm2
318: c5 f1 ef ca vpxor %xmm2,%xmm1,%xmm1
31c: c4 e1 f9 7e d0 vmovq %xmm2,%rax
321: c5 f9 7f 0c 0b vmovdqa %xmm1,(%rbx,%rcx,1)
326: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
32b: 48 f7 24 0b mulq (%rbx,%rcx,1)
32f: c4 e1 f9 6e da vmovq %rdx,%xmm3
334: c4 e3 e1 22 d8 01 vpinsrq $0x1,%rax,%xmm3,%xmm3
33a: c5 f9 d4 c3 vpaddq %xmm3,%xmm0,%xmm0
33e: c5 f9 ef 1c 0b vpxor (%rbx,%rcx,1),%xmm0,%xmm3
343: c5 f9 7f 04 0b vmovdqa %xmm0,(%rbx,%rcx,1)
348: c5 f9 7e d8 vmovd %xmm3,%eax
34c: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
351: c5 f9 6f 0c 0b vmovdqa (%rbx,%rcx,1),%xmm1
356: c4 e2 71 dc cb vaesenc %xmm3,%xmm1,%xmm1
35b: c5 e9 ef d1 vpxor %xmm1,%xmm2,%xmm2
35f: c4 e1 f9 7e c8 vmovq %xmm1,%rax
364: c5 f9 7f 14 0b vmovdqa %xmm2,(%rbx,%rcx,1)
369: c4 e2 40 f2 c8 andn %eax,%edi,%ecx
36e: 48 f7 24 0b mulq (%rbx,%rcx,1)
372: c4 e1 f9 6e c2 vmovq %rdx,%xmm0
377: c4 e3 f9 22 c0 01 vpinsrq $0x1,%rax,%xmm0,%xmm0
37d: c5 e1 d4 d8 vpaddq %xmm0,%xmm3,%xmm3
381: c5 e1 ef 04 0b vpxor (%rbx,%rcx,1),%xmm3,%xmm0
386: c5 f9 7f 1c 0b vmovdqa %xmm3,(%rbx,%rcx,1)
38b: ff ce dec %esi
38d: 0f 85 72 ff ff ff jne 305 <cn_slow_hash+0x225>
```

I think the next step may be to use more registers to avoid "anti-dependencies" like:

```
"vmovdqa %%xmm1,(%%rbx,%%rcx)\n\t"
"andnl %%eax,%4,%%ecx\n\t"
```

which would allow placing the MOV instruction further down the code. Perhaps the CPU already optimizes this through register renaming and out-of-order execution, but it's worth trying to help it anyway.
Using memory allocated by tests slows multi-threaded benchmarks down on a certain CentOS 7 system (but not on a newer system).
"jnz 1b\n\t" | ||
: | ||
: "m" (a), "m" (b), "b" (long_state), "S" (ITER / 4), "D" (~(uint32_t)(MEMORY - AES_BLOCK_SIZE)) | ||
: "ax", "cx", "dx", "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"); |
Note to self: I need to add `si` to the clobbered registers (here, and ditto below).
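Since the loop decrements the count in %esi while it is passed in via the "S" input constraint, simply listing "si" as a clobber would conflict with that constraint (GCC forbids clobbers that overlap operands). A minimal sketch of the usual alternative, a read-write operand:

```c
#include <stddef.h>

/* Demonstration of the pattern only, not this PR's actual loop: making
 * the count a "+S" in/out operand tells GCC that %rsi is modified,
 * which a clobber of "si" could not express alongside the "S" input. */
static void spin(size_t count)
{
    __asm__ __volatile__(
        "1:\n\t"
        "dec %%esi\n\t"
        "jnz 1b\n\t"
        : "+S" (count)   /* %rsi: live counter, read and written */
        :
        : "cc");
}
```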
The inline asm didn't work out yet - it's smaller, but slower (in my testing, YMMV). Perhaps the whole idea of keeping the `a` and `b` blocks in SIMD registers isn't a great fit for the 2nd iteration with its scalar MUL, which may be why compiler-generated code is sometimes faster (after the C source had been tweaked). Perhaps keeping them in memory can be faster. By "keeping" I mean their "primary" location across the remaining iterations of the unrolled loop; what happens inside the loop's body can then be optimized arbitrarily.
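A sketch of that memory-primary variant, building on the earlier C sketch (same `MASK`; helper naming is mine): `a` and `b` stay as plain `uint64_t` pairs, SIMD is used only transiently for the AES half, and the MUL half runs entirely in general-purpose registers with no GPR/XMM round-trips:

```c
static void cn_inner_scalar(uint8_t *long_state, uint64_t a[2],
                            uint64_t b[2], uint64_t iters)
{
    while (iters--) {
        uint64_t j = a[0] & MASK;

        /* SIMD only transiently, for the AES round and the XOR-store */
        __m128i av = _mm_set_epi64x((int64_t)a[1], (int64_t)a[0]);
        __m128i bv = _mm_set_epi64x((int64_t)b[1], (int64_t)b[0]);
        __m128i cv = _mm_aesenc_si128(
            _mm_load_si128((const __m128i *)&long_state[j]), av);
        _mm_store_si128((__m128i *)&long_state[j], _mm_xor_si128(bv, cv));

        uint64_t c0 = (uint64_t)_mm_cvtsi128_si64(cv);
        uint64_t c1 = (uint64_t)_mm_extract_epi64(cv, 1); /* SSE4.1 */

        /* Scalar half: MUL and the XOR-swap stay in GPRs */
        j = c0 & MASK;
        uint64_t *line = (uint64_t *)&long_state[j];
        uint64_t d0 = line[0], d1 = line[1];
        unsigned __int128 p = (unsigned __int128)c0 * d0;
        a[0] += (uint64_t)(p >> 64);
        a[1] += (uint64_t)p;
        line[0] = a[0];
        line[1] = a[1];
        a[0] ^= d0;
        a[1] ^= d1;
        b[0] = c0;
        b[1] = c1;
    }
}
```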