@ -134,7 +134,7 @@ $code.=<<___ if ($win64);
# Stores %xmm10-%xmm15 to the stack. The hunk header attributes this text to a
# here-document that is appended only "if ($win64)".
# %xmm10-%xmm12 use %rsp-relative slots (0x40..0x60); %xmm13-%xmm15 use
# negative offsets from %rax. movaps requires a 16-byte aligned memory operand.
# NOTE(review): how %rax and %rsp are set up is outside this view -- confirm
# that both bases keep these slots 16-byte aligned.
movaps % xmm10 , 0x40 ( % rsp )
movaps % xmm11 , 0x50 ( % rsp )
movaps % xmm12 , 0x60 ( % rsp )
# NOTE(review): the %xmm13 line appears twice in a row. It looks like the
# removed/added pair of a whitespace-only diff whose +/- markers were lost,
# not two stores -- confirm against the real file before deduplicating.
movaps % xmm13 , - 0x68 ( % rax ) # not used, saved to share se_handler
movaps % xmm13 , - 0x68 ( % rax ) # not used, saved to share se_handler
movaps % xmm14 , - 0x58 ( % rax )
movaps % xmm15 , - 0x48 ( % rax )
___
@ -308,9 +308,9 @@ $code.=<<___;
# Lanes 0..3: store @out[i] to -16(@outptr[i],$offset) with an unaligned movups,
# then XOR @inp[i] into @out[i] (AT&T order: source first, destination last).
# Each pxor follows the store of the same register, so the value written to
# memory is the one held before the XOR.
movups @ out [ 0 ] , - 16 ( @ outptr [ 0 ] , $ offset )
pxor @ inp [ 0 ] , @ out [ 0 ]
# NOTE(review): the lane-1 and lane-2 stores each appear twice in a row. They
# look like removed/added pairs of a whitespace-only diff whose +/- markers
# were lost -- confirm against the real file before deduplicating.
movups @ out [ 1 ] , - 16 ( @ outptr [ 1 ] , $ offset )
movups @ out [ 1 ] , - 16 ( @ outptr [ 1 ] , $ offset )
pxor @ inp [ 1 ] , @ out [ 1 ]
movups @ out [ 2 ] , - 16 ( @ outptr [ 2 ] , $ offset )
movups @ out [ 2 ] , - 16 ( @ outptr [ 2 ] , $ offset )
pxor @ inp [ 2 ] , @ out [ 2 ]
movups @ out [ 3 ] , - 16 ( @ outptr [ 3 ] , $ offset )
pxor @ inp [ 3 ] , @ out [ 3 ]
@ -393,7 +393,7 @@ $code.=<<___ if ($win64);
# Stores %xmm10-%xmm15 to the stack. The hunk header attributes this text to a
# here-document that is appended only "if ($win64)".
# The slot layout matches the existing comment on the %xmm13 line: the same
# offsets are used so that one se_handler can be shared.
# movaps requires a 16-byte aligned memory operand.
# NOTE(review): how %rax and %rsp are set up is outside this view -- confirm
# that both bases keep these slots 16-byte aligned.
movaps % xmm10 , 0x40 ( % rsp )
movaps % xmm11 , 0x50 ( % rsp )
movaps % xmm12 , 0x60 ( % rsp )
# NOTE(review): the %xmm13 line appears twice in a row. It looks like the
# removed/added pair of a whitespace-only diff whose +/- markers were lost,
# not two stores -- confirm against the real file before deduplicating.
movaps % xmm13 , - 0x68 ( % rax ) # not used, saved to share se_handler
movaps % xmm13 , - 0x68 ( % rax ) # not used, saved to share se_handler
movaps % xmm14 , - 0x58 ( % rax )
movaps % xmm15 , - 0x48 ( % rax )
___
@ -563,10 +563,10 @@ $code.=<<___;
# Per lane: store @out[i] to -16(@outptr[i],$offset), then reload @out[i] from
# (@inptr[i],$offset) with an unaligned movdqu.
# The "pxor $zero" into a lane is issued one lane later than that lane's load
# (lane 0 is XORed after the lane-1 load, lane 1 after the lane-2 load).
# NOTE(review): what $zero holds is not visible in this hunk -- confirm.
movups @ out [ 0 ] , - 16 ( @ outptr [ 0 ] , $ offset )
movdqu ( @ inptr [ 0 ] , $ offset ) , @ out [ 0 ]
# NOTE(review): the lane-1 and lane-2 stores each appear twice in a row. They
# look like removed/added pairs of a whitespace-only diff whose +/- markers
# were lost -- confirm against the real file before deduplicating.
movups @ out [ 1 ] , - 16 ( @ outptr [ 1 ] , $ offset )
movups @ out [ 1 ] , - 16 ( @ outptr [ 1 ] , $ offset )
movdqu ( @ inptr [ 1 ] , $ offset ) , @ out [ 1 ]
pxor $ zero , @ out [ 0 ]
movups @ out [ 2 ] , - 16 ( @ outptr [ 2 ] , $ offset )
movups @ out [ 2 ] , - 16 ( @ outptr [ 2 ] , $ offset )
movdqu ( @ inptr [ 2 ] , $ offset ) , @ out [ 2 ]
pxor $ zero , @ out [ 1 ]
movups @ out [ 3 ] , - 16 ( @ outptr [ 3 ] , $ offset )
@ -835,10 +835,10 @@ $code.=<<___;
# Lanes 0..3, three steps per lane:
#   1. vmovups stores @out[i] to -16(@ptr[i]);
#   2. sub subtracts a per-lane amount from @ptr[i]; for lane 0 it is $offset,
#      for the other lanes it is read from a stack slot at 64+i*8 off %rsp;
#   3. vpxor XORs the 16 bytes at 0x10*i($offload) into @out[i].
# NOTE(review): the backquoted expressions are presumably evaluated by the
# generator script before assembly -- confirm. The spaces inside them ("6 4")
# look like an extraction artifact of this view.
vmovups @ out [ 0 ] , - 16 ( @ ptr [ 0 ] ) # write output
sub $ offset , @ ptr [ 0 ] # switch to input
vpxor 0x00 ( $ offload ) , @ out [ 0 ] , @ out [ 0 ]
# NOTE(review): the lane-1 and lane-2 stores each appear twice in a row. They
# look like removed/added pairs of a whitespace-only diff whose +/- markers
# were lost -- confirm against the real file before deduplicating.
vmovups @ out [ 1 ] , - 16 ( @ ptr [ 1 ] )
vmovups @ out [ 1 ] , - 16 ( @ ptr [ 1 ] )
sub ` 6 4 + 1 * 8 ` ( % rsp ) , @ ptr [ 1 ]
vpxor 0x10 ( $ offload ) , @ out [ 1 ] , @ out [ 1 ]
vmovups @ out [ 2 ] , - 16 ( @ ptr [ 2 ] )
vmovups @ out [ 2 ] , - 16 ( @ ptr [ 2 ] )
sub ` 6 4 + 2 * 8 ` ( % rsp ) , @ ptr [ 2 ]
vpxor 0x20 ( $ offload ) , @ out [ 2 ] , @ out [ 2 ]
vmovups @ out [ 3 ] , - 16 ( @ ptr [ 3 ] )
@ -847,10 +847,10 @@ $code.=<<___;
# Lanes 4..7, three steps per lane:
#   1. vmovups stores @out[i] to -16(@ptr[i]);
#   2. sub subtracts the value held in the stack slot at 64+i*8 off %rsp from
#      @ptr[i];
#   3. vpxor XORs a register into @out[i]; the visible pairing is
#      @inp[0] -> @out[4], @inp[1] -> @out[5], @inp[2] -> @out[6].
# NOTE(review): the backquoted expressions are presumably evaluated by the
# generator script before assembly -- confirm. The spaces inside them ("6 4")
# look like an extraction artifact of this view.
vmovups @ out [ 4 ] , - 16 ( @ ptr [ 4 ] )
sub ` 6 4 + 4 * 8 ` ( % rsp ) , @ ptr [ 4 ]
vpxor @ inp [ 0 ] , @ out [ 4 ] , @ out [ 4 ]
# NOTE(review): the lane-5 and lane-6 stores each appear twice in a row. They
# look like removed/added pairs of a whitespace-only diff whose +/- markers
# were lost -- confirm against the real file before deduplicating.
vmovups @ out [ 5 ] , - 16 ( @ ptr [ 5 ] )
vmovups @ out [ 5 ] , - 16 ( @ ptr [ 5 ] )
sub ` 6 4 + 5 * 8 ` ( % rsp ) , @ ptr [ 5 ]
vpxor @ inp [ 1 ] , @ out [ 5 ] , @ out [ 5 ]
vmovups @ out [ 6 ] , - 16 ( @ ptr [ 6 ] )
vmovups @ out [ 6 ] , - 16 ( @ ptr [ 6 ] )
sub ` 6 4 + 6 * 8 ` ( % rsp ) , @ ptr [ 6 ]
vpxor @ inp [ 2 ] , @ out [ 6 ] , @ out [ 6 ]
vmovups @ out [ 7 ] , - 16 ( @ ptr [ 7 ] )
@ -1128,12 +1128,12 @@ $code.=<<___;
# Starts mid-sequence, at the pointer adjustment for lane 0.
# Visible pattern for lanes 0 and 1:
#   - vmovdqu loads a fresh @out[i] from the stack at 128+16*i(%rsp);
#   - that value is copied unmodified to 0x10*i($offload);
#   - only then is $zero XORed into @out[i].
# In between, the next lane's @out is stored to -16(@ptr[..]) and its @ptr is
# reduced by the value in the stack slot at 64+i*8 off %rsp.
# The vpxor on @out[7] XORs in the 16 bytes at 0x70($offload).
# NOTE(review): what $zero holds is not visible in this hunk -- confirm.
# NOTE(review): the backquoted expressions are presumably evaluated by the
# generator script before assembly -- confirm. The spaces inside them ("6 4")
# look like an extraction artifact of this view.
sub $ offset , @ ptr [ 0 ] # switch to input
vmovdqu 128 + 0 ( % rsp ) , @ out [ 0 ]
vpxor 0x70 ( $ offload ) , @ out [ 7 ] , @ out [ 7 ]
# NOTE(review): the lane-1 and lane-2 stores each appear twice in a row. They
# look like removed/added pairs of a whitespace-only diff whose +/- markers
# were lost -- confirm against the real file before deduplicating.
vmovups @ out [ 1 ] , - 16 ( @ ptr [ 1 ] )
vmovups @ out [ 1 ] , - 16 ( @ ptr [ 1 ] )
sub ` 6 4 + 1 * 8 ` ( % rsp ) , @ ptr [ 1 ]
vmovdqu @ out [ 0 ] , 0x00 ( $ offload )
vpxor $ zero , @ out [ 0 ] , @ out [ 0 ]
vmovdqu 128 + 16 ( % rsp ) , @ out [ 1 ]
vmovups @ out [ 2 ] , - 16 ( @ ptr [ 2 ] )
vmovups @ out [ 2 ] , - 16 ( @ ptr [ 2 ] )
sub ` 6 4 + 2 * 8 ` ( % rsp ) , @ ptr [ 2 ]
vmovdqu @ out [ 1 ] , 0x10 ( $ offload )
vpxor $ zero , @ out [ 1 ] , @ out [ 1 ]
@ -1149,11 +1149,11 @@ $code.=<<___;
# Starts mid-sequence, with the $zero XOR for lane 3.
# For lanes 4..6 the visible pattern is:
#   - @inp[k] is copied unmodified to 0x40+0x10*k($offload);
#   - @out[4+k] is then set to @inp[k] XOR $zero (three-operand vpxor, so
#     @inp[k] itself is left unchanged).
# The old @out[5] and @out[6] are stored to -16(@ptr[..]) before being
# overwritten, and each @ptr is reduced by the value in the stack slot at
# 64+i*8 off %rsp.
# NOTE(review): what $zero holds is not visible in this hunk -- confirm.
# NOTE(review): the backquoted expressions are presumably evaluated by the
# generator script before assembly -- confirm. The spaces inside them ("6 4")
# look like an extraction artifact of this view.
vpxor $ zero , @ out [ 3 ] , @ out [ 3 ]
vmovdqu @ inp [ 0 ] , 0x40 ( $ offload )
vpxor @ inp [ 0 ] , $ zero , @ out [ 4 ]
# NOTE(review): the lane-5 and lane-6 stores each appear twice in a row. They
# look like removed/added pairs of a whitespace-only diff whose +/- markers
# were lost -- confirm against the real file before deduplicating.
vmovups @ out [ 5 ] , - 16 ( @ ptr [ 5 ] )
vmovups @ out [ 5 ] , - 16 ( @ ptr [ 5 ] )
sub ` 6 4 + 5 * 8 ` ( % rsp ) , @ ptr [ 5 ]
vmovdqu @ inp [ 1 ] , 0x50 ( $ offload )
vpxor @ inp [ 1 ] , $ zero , @ out [ 5 ]
vmovups @ out [ 6 ] , - 16 ( @ ptr [ 6 ] )
vmovups @ out [ 6 ] , - 16 ( @ ptr [ 6 ] )
sub ` 6 4 + 6 * 8 ` ( % rsp ) , @ ptr [ 6 ]
vmovdqu @ inp [ 2 ] , 0x60 ( $ offload )
vpxor @ inp [ 2 ] , $ zero , @ out [ 6 ]