
Monero & slow hash minor changes + add inline asm #5677

Open · solardiz wants to merge 7 commits into bleeding-jumbo

Conversation

solardiz (Member)

The inline asm didn't work out yet: it's smaller, but slower (in my testing, YMMV). Perhaps the whole idea of keeping the a and b blocks in SIMD registers isn't a great fit for the 2nd iteration with its scalar MUL, which may be why compiler-generated code is sometimes faster (after the C source had been tweaked). Perhaps keeping them in memory can be faster, where by "keeping" I mean their "primary" location across remaining iterations of the unrolled loop; what happens inside the loop's body can then be optimized arbitrarily.
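
For reference, here is roughly what each iteration of the CryptoNight main loop computes, as a C intrinsics sketch (illustrative only, reconstructed from the disassembly below; cn_iteration and INDEX_MASK are made-up names, and this assumes AES-NI and SSE4.1):

 #include <immintrin.h>
 #include <stdint.h>

 #define MEMORY (1 << 21)                      /* 2 MiB scratchpad */
 #define AES_BLOCK_SIZE 16
 #define INDEX_MASK (MEMORY - AES_BLOCK_SIZE)  /* the 0x1ffff0 seen below */

 static void cn_iteration(uint8_t *long_state, __m128i *a, __m128i *b)
 {
         /* 1st half: one AES round keyed by a; everything fits in SIMD regs */
         uint32_t i = (uint32_t)_mm_cvtsi128_si32(*a) & INDEX_MASK;
         __m128i c = _mm_load_si128((const __m128i *)&long_state[i]);
         c = _mm_aesenc_si128(c, *a);
         _mm_store_si128((__m128i *)&long_state[i], _mm_xor_si128(*b, c));

         /* 2nd half: scalar 64x64->128 MUL, which wants the data in
            general-purpose registers instead */
         uint32_t j = (uint32_t)_mm_cvtsi128_si32(c) & INDEX_MASK;
         uint64_t *p = (uint64_t *)&long_state[j];
         uint64_t b0 = p[0], b1 = p[1];
         unsigned __int128 prod =
             (unsigned __int128)(uint64_t)_mm_cvtsi128_si64(c) * b0;
         uint64_t a0 = (uint64_t)_mm_cvtsi128_si64(*a) + (uint64_t)(prod >> 64);
         uint64_t a1 = (uint64_t)_mm_extract_epi64(*a, 1) + (uint64_t)prod;
         p[0] = a0;                             /* store updated a, then mix */
         p[1] = a1;
         *a = _mm_set_epi64x((int64_t)(a1 ^ b1), (int64_t)(a0 ^ b0));
         *b = c;
 }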

@solardiz (Member Author)

Compiler-generated code for the main loop:

 2f8:   48 8b 9d 48 fe ff ff    mov    -0x1b8(%rbp),%rbx
 2ff:   4c 89 d9                mov    %r11,%rcx
 302:   81 e1 f0 ff 1f 00       and    $0x1ffff0,%ecx
 308:   48 01 d9                add    %rbx,%rcx
 30b:   c5 f9 6f 19             vmovdqa (%rcx),%xmm3
 30f:   c4 62 61 dc bd 60 fe    vaesenc -0x1a0(%rbp),%xmm3,%xmm15
 316:   ff ff 
 318:   c4 41 f9 7e fa          vmovq  %xmm15,%r10
 31d:   c4 c1 11 ef d7          vpxor  %xmm15,%xmm13,%xmm2
 322:   c4 61 f9 7e f8          vmovq  %xmm15,%rax
 327:   41 81 e2 f0 ff 1f 00    and    $0x1ffff0,%r10d
 32e:   c5 f9 7f 11             vmovdqa %xmm2,(%rcx)
 332:   49 01 da                add    %rbx,%r10
 335:   c4 c1 79 6f 0a          vmovdqa (%r10),%xmm1
 33a:   c4 e1 f9 7e cf          vmovq  %xmm1,%rdi
 33f:   c4 c3 f9 16 cf 01       vpextrq $0x1,%xmm1,%r15
 345:   48 f7 e7                mul    %rdi
 348:   4e 8d 04 1a             lea    (%rdx,%r11,1),%r8
 34c:   48 03 85 68 fe ff ff    add    -0x198(%rbp),%rax
 353:   4d 89 02                mov    %r8,(%r10)
 356:   49 31 f8                xor    %rdi,%r8
 359:   49 31 c7                xor    %rax,%r15
 35c:   4d 89 c1                mov    %r8,%r9
 35f:   4c 89 85 60 fe ff ff    mov    %r8,-0x1a0(%rbp)
 366:   41 81 e1 f0 ff 1f 00    and    $0x1ffff0,%r9d
 36d:   4c 89 bd 68 fe ff ff    mov    %r15,-0x198(%rbp)
 374:   49 01 d9                add    %rbx,%r9
 377:   49 89 42 08             mov    %rax,0x8(%r10)
 37b:   c4 41 79 6f 09          vmovdqa (%r9),%xmm9
 380:   c4 62 31 dc ad 60 fe    vaesenc -0x1a0(%rbp),%xmm9,%xmm13
 387:   ff ff 
 389:   c4 41 f9 7e ed          vmovq  %xmm13,%r13
 38e:   c4 41 01 ef dd          vpxor  %xmm13,%xmm15,%xmm11
 393:   c4 61 f9 7e e8          vmovq  %xmm13,%rax
 398:   41 81 e5 f0 ff 1f 00    and    $0x1ffff0,%r13d
 39f:   c4 41 79 7f 19          vmovdqa %xmm11,(%r9)
 3a4:   49 01 dd                add    %rbx,%r13
 3a7:   c4 c1 79 6f 6d 00       vmovdqa 0x0(%r13),%xmm5
 3ad:   c4 e1 f9 7e e9          vmovq  %xmm5,%rcx
 3b2:   c4 e3 f9 16 eb 01       vpextrq $0x1,%xmm5,%rbx
 3b8:   c5 f9 7f ad 70 fe ff    vmovdqa %xmm5,-0x190(%rbp)
 3bf:   ff 
 3c0:   48 f7 e1                mul    %rcx
 3c3:   4e 8d 1c 02             lea    (%rdx,%r8,1),%r11
 3c7:   4c 01 f8                add    %r15,%rax
 3ca:   4d 89 5d 00             mov    %r11,0x0(%r13)
 3ce:   48 31 c3                xor    %rax,%rbx
 3d1:   49 31 cb                xor    %rcx,%r11
 3d4:   49 89 45 08             mov    %rax,0x8(%r13)
 3d8:   4c 89 9d 60 fe ff ff    mov    %r11,-0x1a0(%rbp)
 3df:   48 89 9d 68 fe ff ff    mov    %rbx,-0x198(%rbp)
 3e6:   48 83 ee 02             sub    $0x2,%rsi
 3ea:   0f 85 08 ff ff ff       jne    2f8 <cn_slow_hash+0x218>

This PR's inline asm:

 305:   c4 e1 f9 7e c0          vmovq  %xmm0,%rax
 30a:   c4 e2 40 f2 c0          andn   %eax,%edi,%eax
 30f:   c5 f9 6f 14 03          vmovdqa (%rbx,%rax,1),%xmm2
 314:   c4 e2 69 dc d0          vaesenc %xmm0,%xmm2,%xmm2
 319:   c5 f1 ef ca             vpxor  %xmm2,%xmm1,%xmm1
 31d:   c5 f9 7f 0c 03          vmovdqa %xmm1,(%rbx,%rax,1)
 322:   c4 e1 f9 7e d0          vmovq  %xmm2,%rax
 327:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 32c:   c5 f9 6f 0c 0b          vmovdqa (%rbx,%rcx,1),%xmm1
 331:   48 8b 14 0b             mov    (%rbx,%rcx,1),%rdx
 335:   48 f7 e2                mul    %rdx
 338:   c4 e1 f9 6e da          vmovq  %rdx,%xmm3
 33d:   c4 e3 e1 22 d8 01       vpinsrq $0x1,%rax,%xmm3,%xmm3
 343:   c5 f9 d4 c3             vpaddq %xmm3,%xmm0,%xmm0
 347:   c5 f9 7f 04 0b          vmovdqa %xmm0,(%rbx,%rcx,1)
 34c:   c5 f9 ef c1             vpxor  %xmm1,%xmm0,%xmm0
 350:   c4 e1 f9 7e c0          vmovq  %xmm0,%rax
 355:   c4 e2 40 f2 c0          andn   %eax,%edi,%eax
 35a:   c5 f9 6f 0c 03          vmovdqa (%rbx,%rax,1),%xmm1
 35f:   c4 e2 71 dc c8          vaesenc %xmm0,%xmm1,%xmm1
 364:   c5 e9 ef d1             vpxor  %xmm1,%xmm2,%xmm2
 368:   c5 f9 7f 14 03          vmovdqa %xmm2,(%rbx,%rax,1)
 36d:   c4 e1 f9 7e c8          vmovq  %xmm1,%rax
 372:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 377:   c5 f9 6f 14 0b          vmovdqa (%rbx,%rcx,1),%xmm2
 37c:   48 8b 14 0b             mov    (%rbx,%rcx,1),%rdx
 380:   48 f7 e2                mul    %rdx
 383:   c4 e1 f9 6e da          vmovq  %rdx,%xmm3
 388:   c4 e3 e1 22 d8 01       vpinsrq $0x1,%rax,%xmm3,%xmm3
 38e:   c5 f9 d4 c3             vpaddq %xmm3,%xmm0,%xmm0
 392:   c5 f9 7f 04 0b          vmovdqa %xmm0,(%rbx,%rcx,1)
 397:   c5 f9 ef c2             vpxor  %xmm2,%xmm0,%xmm0
 39b:   ff ce                   dec    %esi
 39d:   0f 85 62 ff ff ff       jne    305 <cn_slow_hash+0x225>

Both versions have the same extent of loop unrolling (4 iterations).

solardiz changed the title from "Monero & slow hash minor changes + add inline asm (disabled for now)" to "Monero & slow hash minor changes + add inline asm" on Feb 19, 2025
@solardiz (Member Author)

Made some tweaks, such as squeezing out some tiny bits of parallelism. The asm is now the same speed as the larger compiler-generated code, and I've enabled it, hoping for more stable good speeds that don't depend on compiler version.

 305:   c4 e1 f9 7e c0          vmovq  %xmm0,%rax
 30a:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 30f:   c5 f9 6f 14 0b          vmovdqa (%rbx,%rcx,1),%xmm2
 314:   c4 e2 69 dc d0          vaesenc %xmm0,%xmm2,%xmm2
 319:   c5 f1 ef ca             vpxor  %xmm2,%xmm1,%xmm1
 31d:   c4 e1 f9 7e d0          vmovq  %xmm2,%rax
 322:   c5 f9 7f 0c 0b          vmovdqa %xmm1,(%rbx,%rcx,1)
 327:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 32c:   48 8b 14 0b             mov    (%rbx,%rcx,1),%rdx
 330:   48 f7 e2                mul    %rdx
 333:   c4 e1 f9 6e da          vmovq  %rdx,%xmm3
 338:   c4 e3 e1 22 d8 01       vpinsrq $0x1,%rax,%xmm3,%xmm3
 33e:   c5 f9 d4 c3             vpaddq %xmm3,%xmm0,%xmm0
 342:   c5 f9 ef 1c 0b          vpxor  (%rbx,%rcx,1),%xmm0,%xmm3
 347:   c5 f9 7f 04 0b          vmovdqa %xmm0,(%rbx,%rcx,1)
 34c:   c4 e1 f9 7e d8          vmovq  %xmm3,%rax
 351:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 356:   c5 f9 6f 0c 0b          vmovdqa (%rbx,%rcx,1),%xmm1
 35b:   c4 e2 71 dc cb          vaesenc %xmm3,%xmm1,%xmm1
 360:   c5 e9 ef d1             vpxor  %xmm1,%xmm2,%xmm2
 364:   c4 e1 f9 7e c8          vmovq  %xmm1,%rax
 369:   c5 f9 7f 14 0b          vmovdqa %xmm2,(%rbx,%rcx,1)
 36e:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 373:   48 8b 14 0b             mov    (%rbx,%rcx,1),%rdx
 377:   48 f7 e2                mul    %rdx
 37a:   c4 e1 f9 6e c2          vmovq  %rdx,%xmm0
 37f:   c4 e3 f9 22 c0 01       vpinsrq $0x1,%rax,%xmm0,%xmm0
 385:   c5 e1 d4 d8             vpaddq %xmm0,%xmm3,%xmm3
 389:   c5 e1 ef 04 0b          vpxor  (%rbx,%rcx,1),%xmm3,%xmm0
 38e:   c5 f9 7f 1c 0b          vmovdqa %xmm3,(%rbx,%rcx,1)
 393:   ff ce                   dec    %esi
 395:   0f 85 6a ff ff ff       jne    305 <cn_slow_hash+0x225>

@solardiz (Member Author)

Even smaller code, now 14 instructions per 2 iterations (the separate load and MUL are folded into a single mulq with a memory operand):

 305:   c4 e1 f9 7e c0          vmovq  %xmm0,%rax
 30a:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 30f:   c5 f9 6f 14 0b          vmovdqa (%rbx,%rcx,1),%xmm2
 314:   c4 e2 69 dc d0          vaesenc %xmm0,%xmm2,%xmm2
 319:   c5 f1 ef ca             vpxor  %xmm2,%xmm1,%xmm1
 31d:   c4 e1 f9 7e d0          vmovq  %xmm2,%rax
 322:   c5 f9 7f 0c 0b          vmovdqa %xmm1,(%rbx,%rcx,1)
 327:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 32c:   48 f7 24 0b             mulq   (%rbx,%rcx,1)
 330:   c4 e1 f9 6e da          vmovq  %rdx,%xmm3
 335:   c4 e3 e1 22 d8 01       vpinsrq $0x1,%rax,%xmm3,%xmm3
 33b:   c5 f9 d4 c3             vpaddq %xmm3,%xmm0,%xmm0
 33f:   c5 f9 ef 1c 0b          vpxor  (%rbx,%rcx,1),%xmm0,%xmm3
 344:   c5 f9 7f 04 0b          vmovdqa %xmm0,(%rbx,%rcx,1)
 349:   c4 e1 f9 7e d8          vmovq  %xmm3,%rax
 34e:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 353:   c5 f9 6f 0c 0b          vmovdqa (%rbx,%rcx,1),%xmm1
 358:   c4 e2 71 dc cb          vaesenc %xmm3,%xmm1,%xmm1
 35d:   c5 e9 ef d1             vpxor  %xmm1,%xmm2,%xmm2
 361:   c4 e1 f9 7e c8          vmovq  %xmm1,%rax
 366:   c5 f9 7f 14 0b          vmovdqa %xmm2,(%rbx,%rcx,1)
 36b:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 370:   48 f7 24 0b             mulq   (%rbx,%rcx,1)
 374:   c4 e1 f9 6e c2          vmovq  %rdx,%xmm0
 379:   c4 e3 f9 22 c0 01       vpinsrq $0x1,%rax,%xmm0,%xmm0
 37f:   c5 e1 d4 d8             vpaddq %xmm0,%xmm3,%xmm3
 383:   c5 e1 ef 04 0b          vpxor  (%rbx,%rcx,1),%xmm3,%xmm0
 388:   c5 f9 7f 1c 0b          vmovdqa %xmm3,(%rbx,%rcx,1)
 38d:   ff ce                   dec    %esi
 38f:   0f 85 70 ff ff ff       jne    305 <cn_slow_hash+0x225>

@solardiz (Member Author)

And smaller yet (by 2 bytes): vmovd replaces vmovq where only the low 32 bits are needed for the index computation (the 32-bit andn masks off the high bits anyway; the other two moves stay 64-bit because rax also feeds the MUL):

 305:   c5 f9 7e c0             vmovd  %xmm0,%eax
 309:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 30e:   c5 f9 6f 14 0b          vmovdqa (%rbx,%rcx,1),%xmm2
 313:   c4 e2 69 dc d0          vaesenc %xmm0,%xmm2,%xmm2
 318:   c5 f1 ef ca             vpxor  %xmm2,%xmm1,%xmm1
 31c:   c4 e1 f9 7e d0          vmovq  %xmm2,%rax
 321:   c5 f9 7f 0c 0b          vmovdqa %xmm1,(%rbx,%rcx,1)
 326:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 32b:   48 f7 24 0b             mulq   (%rbx,%rcx,1)
 32f:   c4 e1 f9 6e da          vmovq  %rdx,%xmm3
 334:   c4 e3 e1 22 d8 01       vpinsrq $0x1,%rax,%xmm3,%xmm3
 33a:   c5 f9 d4 c3             vpaddq %xmm3,%xmm0,%xmm0
 33e:   c5 f9 ef 1c 0b          vpxor  (%rbx,%rcx,1),%xmm0,%xmm3
 343:   c5 f9 7f 04 0b          vmovdqa %xmm0,(%rbx,%rcx,1)
 348:   c5 f9 7e d8             vmovd  %xmm3,%eax
 34c:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 351:   c5 f9 6f 0c 0b          vmovdqa (%rbx,%rcx,1),%xmm1
 356:   c4 e2 71 dc cb          vaesenc %xmm3,%xmm1,%xmm1
 35b:   c5 e9 ef d1             vpxor  %xmm1,%xmm2,%xmm2
 35f:   c4 e1 f9 7e c8          vmovq  %xmm1,%rax
 364:   c5 f9 7f 14 0b          vmovdqa %xmm2,(%rbx,%rcx,1)
 369:   c4 e2 40 f2 c8          andn   %eax,%edi,%ecx
 36e:   48 f7 24 0b             mulq   (%rbx,%rcx,1)
 372:   c4 e1 f9 6e c2          vmovq  %rdx,%xmm0
 377:   c4 e3 f9 22 c0 01       vpinsrq $0x1,%rax,%xmm0,%xmm0
 37d:   c5 e1 d4 d8             vpaddq %xmm0,%xmm3,%xmm3
 381:   c5 e1 ef 04 0b          vpxor  (%rbx,%rcx,1),%xmm3,%xmm0
 386:   c5 f9 7f 1c 0b          vmovdqa %xmm3,(%rbx,%rcx,1)
 38b:   ff ce                   dec    %esi
 38d:   0f 85 72 ff ff ff       jne    305 <cn_slow_hash+0x225>

I think the next step may be to use more registers to avoid "anti-dependencies" like:

                "vmovdqa %%xmm1,(%%rbx,%%rcx)\n\t"
                "andnl %%eax,%4,%%ecx\n\t"

which would allow placing the MOV instruction further down the code. Perhaps the CPU already optimizes this through register renaming and out-of-order execution, but it's worth trying to help it anyway.
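
For example, with a spare register (say, r8, which would then also need adding to the clobber list), the next index can be computed before the store, and the store moved further down among independent instructions. A sketch (untested):

                "andnl %%eax,%4,%%r8d\n\t"         /* next index into a spare reg */
                /* ... independent instructions can be scheduled here ... */
                "vmovdqa %%xmm1,(%%rbx,%%rcx)\n\t" /* store via the old index */
                "vmovdqa (%%rbx,%%r8),%%xmm2\n\t"  /* next load via the new one */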

On the commit "Using memory allocated by tests slows multi-threaded benchmarks down on a certain CentOS 7 system (but not on a newer system)", a review comment on these lines of the inline asm:

 "jnz 1b\n\t"
 :
 : "m" (a), "m" (b), "b" (long_state), "S" (ITER / 4), "D" (~(uint32_t)(MEMORY - AES_BLOCK_SIZE))
 : "ax", "cx", "dx", "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc");
@solardiz (Member Author)

Note to self: I need to add si to clobbered registers (here, and ditto below).
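
One way to handle that (a sketch, since GCC doesn't allow a clobber that overlaps an explicit-register input operand: make the counter a dummy in/out operand instead; note this shifts the %n numbers of the a and b memory operands up by one, while the mask conveniently stays %4):

        unsigned long iters = ITER / 4;  /* hypothetical local variable */
        __asm__ __volatile__(
                /* ... loop body as before ... */
                "jnz 1b\n\t"
                : "+S" (iters)  /* decremented by the loop */
                : "m" (a), "m" (b), "b" (long_state),
                  "D" (~(uint32_t)(MEMORY - AES_BLOCK_SIZE))
                : "ax", "cx", "dx", "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc");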
