Remove trailing whitespace from some files.

The prevailing style seems not to have trailing whitespace, but a few
lines do. This is mostly in the perlasm files, though a few C files
picked some up after the reformat. This is the result of:

  find . -name '*.pl' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.c' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.h' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'

bn_prime.h was then excluded, since it is a generated file.

Note that mkerr.pl has some changes inside a heredoc for help output,
but the other lines there lack trailing whitespace too.
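
Note the -i '' form of sed's in-place flag is the BSD/macOS spelling;
GNU sed takes a bare -i. A possible follow-up check for stragglers, as a
sketch (assuming bash and a git checkout; the pathspecs and the
bn_prime.h path below are illustrative, not taken from this commit):

  # Report any lines still ending in spaces or tabs, skipping the
  # generated bn_prime.h.
  git grep -nE '( |'$'\t'')+$' -- '*.pl' '*.c' '*.h' ':!crypto/bn/bn_prime.h'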

Reviewed-by: Kurt Roeckx <kurt@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
Branch: master
Author: David Benjamin
Committed by: Matt Caswell
Parent: 11542af65a
Commit: 609b0852e4

@ -146,7 +146,7 @@ OPTIONS cms_options[] = {
"Do not load certificates from the default certificates directory"},
{"content", OPT_CONTENT, '<',
"Supply or override content for detached signature"},
{"print", OPT_PRINT, '-',
{"print", OPT_PRINT, '-',
"For the -cmsout operation print out all fields of the CMS structure"},
{"secretkey", OPT_SECRETKEY, 's'},
{"secretkeyid", OPT_SECRETKEYID, 's'},

@ -89,7 +89,7 @@ OPTIONS smime_options[] = {
{"no-CApath", OPT_NOCAPATH, '-',
"Do not load certificates from the default certificates directory"},
{"resign", OPT_RESIGN, '-', "Resign a signed message"},
{"nochain", OPT_NOCHAIN, '-',
{"nochain", OPT_NOCHAIN, '-',
"set PKCS7_NOCHAIN so certificates contained in the message are not used as untrusted CAs" },
{"nosmimecap", OPT_NOSMIMECAP, '-', "Omit the SMIMECapabilities attribute"},
{"stream", OPT_STREAM, '-', "Enable CMS streaming" },

@ -1187,8 +1187,8 @@ static int run_benchmark(int async_jobs,
continue;
#endif
-ret = ASYNC_start_job(&loopargs[i].inprogress_job,
-loopargs[i].wait_ctx, &job_op_count, loop_function,
+ret = ASYNC_start_job(&loopargs[i].inprogress_job,
+loopargs[i].wait_ctx, &job_op_count, loop_function,
(void *)(loopargs + i), sizeof(loopargs_t));
switch (ret) {
case ASYNC_PAUSE:

@ -123,7 +123,7 @@
# words every cache-line is *guaranteed* to be accessed within ~50
# cycles window. Why just SSE? Because it's needed on hyper-threading
# CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)
+# part is that it has no negative effect on performance:-)
#
# Version 4.3 implements switch between compact and non-compact block
# functions in AES_cbc_encrypt depending on how much data was asked
@ -585,7 +585,7 @@ sub enctransform()
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# | mm4 | mm0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# | s3 | s2 | s1 | s0 |
+# | s3 | s2 | s1 | s0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
@ -805,7 +805,7 @@ sub encstep()
if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
-else { &mov ($tmp,$s[3]);
+else { &mov ($tmp,$s[3]);
&shr ($tmp,24) }
&xor ($out,&DWP(1,$te,$tmp,8));
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
@ -1558,7 +1558,7 @@ sub sse_deccompact()
&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
@ -2028,7 +2028,7 @@ sub declast()
{
# stack frame layout
# -4(%esp) # return address 0(%esp)
-# 0(%esp) # s0 backing store 4(%esp)
+# 0(%esp) # s0 backing store 4(%esp)
# 4(%esp) # s1 backing store 8(%esp)
# 8(%esp) # s2 backing store 12(%esp)
# 12(%esp) # s3 backing store 16(%esp)
@ -2738,7 +2738,7 @@ sub enckey()
&mov (&DWP(80,"edi"),10); # setup number of rounds
&xor ("eax","eax");
&jmp (&label("exit"));
&set_label("12rounds");
&mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
&mov ("ebx",&DWP(4,"esi"));

@ -1433,10 +1433,10 @@ $code.=<<___;
xor $s1,$s1,$acc05
xor $s2,$s2,$acc06
xor $s3,$s3,$acc07
-xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
-xor $s1,$s1,$acc09
-xor $s2,$s2,$acc10
-xor $s3,$s3,$acc11
+xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
+xor $s1,$s1,$acc09
+xor $s2,$s2,$acc10
+xor $s3,$s3,$acc11
b Ldec_compact_loop
.align 4

@ -404,7 +404,7 @@ _s390x_AES_encrypt:
or $s1,$t1
or $t2,$i2
or $t3,$i3
srlg $i1,$s2,`8-3` # i0
srlg $i2,$s2,`16-3` # i1
nr $i1,$mask
@ -457,7 +457,7 @@ _s390x_AES_encrypt:
x $s2,24($key)
x $s3,28($key)
-br $ra
+br $ra
.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
___
@ -779,7 +779,7 @@ _s390x_AES_decrypt:
x $s2,24($key)
x $s3,28($key)
-br $ra
+br $ra
.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
___
@ -1297,7 +1297,7 @@ $code.=<<___;
.Lcbc_enc_done:
l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
-st $s1,4($ivp)
+st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
@ -1635,7 +1635,7 @@ $code.=<<___ if(1);
llgc $len,2*$SIZE_T-1($sp)
nill $len,0x0f # $len%=16
br $ra
.align 16
.Lxts_km_vanilla:
___
@ -1862,7 +1862,7 @@ $code.=<<___;
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
-srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
@ -1913,7 +1913,7 @@ $code.=<<___;
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
-srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
@ -2105,7 +2105,7 @@ $code.=<<___;
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
-srlg $s0,$s1,32 # smash the tweak to 4x32-bits
+srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32

@ -1298,7 +1298,7 @@ $code.=<<___;
AES_set_encrypt_key:
push %rbx
push %rbp
-push %r12 # redundant, but allows to share
+push %r12 # redundant, but allows to share
push %r13 # exception handler...
push %r14
push %r15
@ -1424,7 +1424,7 @@ $code.=<<___;
xor %rax,%rax
jmp .Lexit
-.L14rounds:
+.L14rounds:
mov 0(%rsi),%rax # copy first 8 dwords
mov 8(%rsi),%rbx
mov 16(%rsi),%rcx

@ -134,7 +134,7 @@ $code.=<<___ if ($win64);
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,0x60(%rsp)
-movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
___
@ -308,9 +308,9 @@ $code.=<<___;
movups @out[0],-16(@outptr[0],$offset)
pxor @inp[0],@out[0]
-movups @out[1],-16(@outptr[1],$offset)
+movups @out[1],-16(@outptr[1],$offset)
pxor @inp[1],@out[1]
-movups @out[2],-16(@outptr[2],$offset)
+movups @out[2],-16(@outptr[2],$offset)
pxor @inp[2],@out[2]
movups @out[3],-16(@outptr[3],$offset)
pxor @inp[3],@out[3]
@ -393,7 +393,7 @@ $code.=<<___ if ($win64);
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,0x60(%rsp)
-movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
+movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
___
@ -563,10 +563,10 @@ $code.=<<___;
movups @out[0],-16(@outptr[0],$offset)
movdqu (@inptr[0],$offset),@out[0]
-movups @out[1],-16(@outptr[1],$offset)
+movups @out[1],-16(@outptr[1],$offset)
movdqu (@inptr[1],$offset),@out[1]
pxor $zero,@out[0]
-movups @out[2],-16(@outptr[2],$offset)
+movups @out[2],-16(@outptr[2],$offset)
movdqu (@inptr[2],$offset),@out[2]
pxor $zero,@out[1]
movups @out[3],-16(@outptr[3],$offset)
@ -835,10 +835,10 @@ $code.=<<___;
vmovups @out[0],-16(@ptr[0]) # write output
sub $offset,@ptr[0] # switch to input
vpxor 0x00($offload),@out[0],@out[0]
-vmovups @out[1],-16(@ptr[1])
+vmovups @out[1],-16(@ptr[1])
sub `64+1*8`(%rsp),@ptr[1]
vpxor 0x10($offload),@out[1],@out[1]
-vmovups @out[2],-16(@ptr[2])
+vmovups @out[2],-16(@ptr[2])
sub `64+2*8`(%rsp),@ptr[2]
vpxor 0x20($offload),@out[2],@out[2]
vmovups @out[3],-16(@ptr[3])
@ -847,10 +847,10 @@ $code.=<<___;
vmovups @out[4],-16(@ptr[4])
sub `64+4*8`(%rsp),@ptr[4]
vpxor @inp[0],@out[4],@out[4]
-vmovups @out[5],-16(@ptr[5])
+vmovups @out[5],-16(@ptr[5])
sub `64+5*8`(%rsp),@ptr[5]
vpxor @inp[1],@out[5],@out[5]
-vmovups @out[6],-16(@ptr[6])
+vmovups @out[6],-16(@ptr[6])
sub `64+6*8`(%rsp),@ptr[6]
vpxor @inp[2],@out[6],@out[6]
vmovups @out[7],-16(@ptr[7])
@ -1128,12 +1128,12 @@ $code.=<<___;
sub $offset,@ptr[0] # switch to input
vmovdqu 128+0(%rsp),@out[0]
vpxor 0x70($offload),@out[7],@out[7]
-vmovups @out[1],-16(@ptr[1])
+vmovups @out[1],-16(@ptr[1])
sub `64+1*8`(%rsp),@ptr[1]
vmovdqu @out[0],0x00($offload)
vpxor $zero,@out[0],@out[0]
vmovdqu 128+16(%rsp),@out[1]
-vmovups @out[2],-16(@ptr[2])
+vmovups @out[2],-16(@ptr[2])
sub `64+2*8`(%rsp),@ptr[2]
vmovdqu @out[1],0x10($offload)
vpxor $zero,@out[1],@out[1]
@ -1149,11 +1149,11 @@ $code.=<<___;
vpxor $zero,@out[3],@out[3]
vmovdqu @inp[0],0x40($offload)
vpxor @inp[0],$zero,@out[4]
-vmovups @out[5],-16(@ptr[5])
+vmovups @out[5],-16(@ptr[5])
sub `64+5*8`(%rsp),@ptr[5]
vmovdqu @inp[1],0x50($offload)
vpxor @inp[1],$zero,@out[5]
-vmovups @out[6],-16(@ptr[6])
+vmovups @out[6],-16(@ptr[6])
sub `64+6*8`(%rsp),@ptr[6]
vmovdqu @inp[2],0x60($offload)
vpxor @inp[2],$zero,@out[6]

@ -793,7 +793,7 @@ sub body_00_19_dec () { # ((c^d)&b)^d
sub body_20_39_dec () { # b^d^c
# on entry @T[0]=b^d
return &body_40_59_dec() if ($rx==39);
my @r=@body_20_39;
unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);

@ -884,7 +884,7 @@ if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
-my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
+my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;

@ -1051,7 +1051,7 @@ if ($PREFIX eq "aesni") {
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }

@ -34,7 +34,7 @@
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
-# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
#
@ -118,7 +118,7 @@
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
-# instructions' interleave factor. Westmere can execute at most 3
+# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
@ -312,7 +312,7 @@ ___
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
-# * Bridge and "super-optimal" for other Intel CPUs...
+# * Bridge and "super-optimal" for other Intel CPUs...
sub aesni_generate2 {
my $dir=shift;
@ -1271,7 +1271,7 @@ $code.=<<___;
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
-mov OPENSSL_ia32cap_P+4(%rip),%r10d
+mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
@ -1551,7 +1551,7 @@ $code.=<<___;
.Lctr32_tail:
# note that at this point $inout0..5 are populated with
-# counter values xor-ed with 0-round key
+# counter values xor-ed with 0-round key
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_loop3

@ -3773,7 +3773,7 @@ foreach(split("\n",$code)) {
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
-/\?rev/ && do { @bytes=reverse(@bytes); last; };
+/\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}

@ -961,21 +961,21 @@ if ($flavour =~ /64/) { ######## 64-bit code
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {

@ -91,7 +91,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[2], @b[2], @b[1]

@ -129,7 +129,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
@ -379,7 +379,7 @@ $code.=<<___;
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
-pxor @s[3], @t[0]
+pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

@ -769,7 +769,7 @@ _vpaes_schedule_core:
ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform // input transform
mov $inp, #7 // mov \$7, %esi
.Loop_schedule_256:
sub $inp, $inp, #1 // dec %esi
bl _vpaes_schedule_mangle // output low result
@ -778,7 +778,7 @@ _vpaes_schedule_core:
// high round
bl _vpaes_schedule_round
cbz $inp, .Lschedule_mangle_last
-bl _vpaes_schedule_mangle
+bl _vpaes_schedule_mangle
// low round. swap xmm7 and xmm6
dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
@ -787,7 +787,7 @@ _vpaes_schedule_core:
mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
b .Loop_schedule_256
##
@ -814,7 +814,7 @@ _vpaes_schedule_core:
.Lschedule_mangle_last_dec:
ld1 {v20.2d-v21.2d}, [x11] // reload constants
-sub $out, $out, #16 // add \$-16, %rdx
+sub $out, $out, #16 // add \$-16, %rdx
eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform // output transform
st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key

@ -1074,7 +1074,7 @@ Loop_schedule_256:
# high round
bl _vpaes_schedule_round
bdz Lschedule_mangle_last # dec %esi
-bl _vpaes_schedule_mangle
+bl _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
@ -1082,7 +1082,7 @@ Loop_schedule_256:
vmr v7, v6 # vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
vmr v7, v5 # vmovdqa %xmm5, %xmm7
b Loop_schedule_256
##
## .aes_schedule_mangle_last
@ -1130,7 +1130,7 @@ Lschedule_mangle_last:
Lschedule_mangle_last_dec:
lvx $iptlo, r11, r12 # reload $ipt
lvx $ipthi, r9, r12
-addi $out, $out, -16 # add \$-16, %rdx
+addi $out, $out, -16 # add \$-16, %rdx
vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform # output transform
@ -1565,7 +1565,7 @@ foreach (split("\n",$code)) {
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
-/\?rev/ && do { @bytes=reverse(@bytes); last; };
+/\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}

@ -445,7 +445,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
@ -476,7 +476,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
@ -487,7 +487,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
@ -610,7 +610,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k

@ -171,7 +171,7 @@ _vpaes_encrypt_core:
pshufb %xmm1, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
##
## Decryption core
##
@ -332,7 +332,7 @@ _vpaes_schedule_core:
##
.Lschedule_128:
mov \$10, %esi
.Loop_schedule_128:
call _vpaes_schedule_round
dec %rsi
@ -366,7 +366,7 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
-palignr \$8,%xmm6,%xmm0
+palignr \$8,%xmm6,%xmm0
call _vpaes_schedule_mangle # save key n
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle # save key n+1
@ -392,7 +392,7 @@ _vpaes_schedule_core:
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
call _vpaes_schedule_transform # input transform
mov \$7, %esi
.Loop_schedule_256:
call _vpaes_schedule_mangle # output low result
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
@ -401,7 +401,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
dec %rsi
jz .Lschedule_mangle_last
-call _vpaes_schedule_mangle
+call _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
pshufd \$0xFF, %xmm0, %xmm0
@ -409,10 +409,10 @@ _vpaes_schedule_core:
movdqa %xmm6, %xmm7
call _vpaes_schedule_low_round
movdqa %xmm5, %xmm7
jmp .Loop_schedule_256
##
## .aes_schedule_mangle_last
##
@ -511,9 +511,9 @@ _vpaes_schedule_round:
# rotate
pshufd \$0xFF, %xmm0, %xmm0
palignr \$1, %xmm0, %xmm0
# fall through...
# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
# smear xmm7
@ -552,7 +552,7 @@ _vpaes_schedule_low_round:
pxor %xmm4, %xmm0 # 0 = sbox output
# add in smeared stuff
-pxor %xmm7, %xmm0
+pxor %xmm7, %xmm0
movdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round

@ -36,7 +36,7 @@
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
-#
+#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
$flavour = shift;

@ -23,7 +23,7 @@
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
-# instructions. The latter means that this code didn't really have an
+# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less

@ -54,7 +54,7 @@ sub bn_mul_add_words
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
@ -675,20 +675,20 @@ sub bn_sub_part_words
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
@ -705,9 +705,9 @@ sub bn_sub_part_words
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
@ -722,18 +722,18 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
@ -754,17 +754,17 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a

@ -47,7 +47,7 @@ sub mul_add_c
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
-# is pos > 1, it means it is the last loop
+# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
@ -76,7 +76,7 @@ sub sqr_add_c
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
-# is pos > 1, it means it is the last loop
+# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
@ -127,7 +127,7 @@ sub bn_mul_comba
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
@ -142,9 +142,9 @@ sub bn_mul_comba
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
@ -152,7 +152,7 @@ sub bn_mul_comba
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{

@ -80,7 +80,7 @@ $code=<<___;
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
-// const BN_ULONG *n0p,int num);
+// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
@ -203,7 +203,7 @@ bn_mul_mont_general:
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
-nop.i 0 }
+nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;

@ -603,13 +603,13 @@ $code.=<<___;
sltu $v0,$t2,$ta2
$ST $t2,-2*$BNSZ($a0)
$ADDU $v0,$t8
$ADDU $ta3,$t3
sltu $t9,$ta3,$t3
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
.set noreorder
bgtz $at,.L_bn_add_words_loop
$ADDU $v0,$t9
@ -808,7 +808,7 @@ bn_div_3_words:
# so that we can save two arguments
# and return address in registers
# instead of stack:-)
$LD $a0,($a3)
move $ta2,$a1
bne $a0,$a2,bn_div_3_words_internal

@ -546,7 +546,7 @@ L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
-std,ma $hi0,8($rp)
+std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
@ -868,7 +868,7 @@ L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
-stws,ma $hi0,4($rp)
+stws,ma $hi0,4($rp)
nop ; alignment
L\$done

@ -26,7 +26,7 @@
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
-# 512-bit +65%
+# 512-bit +65%
# 1024-bit +35%
# 2048-bit +18%
# 4096-bit +4%
@ -49,7 +49,7 @@ if ($flavour =~ /32/) {
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
@ -69,7 +69,7 @@ if ($flavour =~ /32/) {
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }

@ -38,7 +38,7 @@
#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
-#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
+#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
#
# Same bechmark with this assembler code:
#
@ -74,7 +74,7 @@
#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
-#
+#
# Again, performance increases by at about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
@ -125,7 +125,7 @@ if ($flavour =~ /32/) {
$CNTLZ= "cntlzw"; # count leading zeros
$SHL= "slw"; # shift left
$SHR= "srw"; # unsigned shift right
$SHRI= "srwi"; # unsigned shift right by immediate
$SHRI= "srwi"; # unsigned shift right by immediate
$SHLI= "slwi"; # shift left by immediate
$CLRU= "clrlwi"; # clear upper bits
$INSR= "insrwi"; # insert right
@ -149,10 +149,10 @@ if ($flavour =~ /32/) {
$CNTLZ= "cntlzd"; # count leading zeros
$SHL= "sld"; # shift left
$SHR= "srd"; # unsigned shift right
$SHRI= "srdi"; # unsigned shift right by immediate
$SHRI= "srdi"; # unsigned shift right by immediate
$SHLI= "sldi"; # shift left by immediate
$CLRU= "clrldi"; # clear upper bits
$INSR= "insrdi"; # insert right
$INSR= "insrdi"; # insert right
$ROTL= "rotldi"; # rotate left by immediate
$TR= "td"; # conditional trap
} else { die "nonsense $flavour"; }
@ -189,7 +189,7 @@ $data=<<EOF;
# below.
# 12/05/03 Suresh Chari
# (with lots of help from) Andy Polyakov
-##
+##
# 1. Initial version 10/20/02 Suresh Chari
#
#
@ -202,7 +202,7 @@ $data=<<EOF;
# be done in the build process.
#
# Hand optimized assembly code for the following routines
-#
+#
# bn_sqr_comba4
# bn_sqr_comba8
# bn_mul_comba4
@ -225,10 +225,10 @@ $data=<<EOF;
#--------------------------------------------------------------------------
#
# Defines to be used in the assembly code.
-#
+#
#.set r0,0 # we use it as storage for value of 0
#.set SP,1 # preserved
-#.set RTOC,2 # preserved
+#.set RTOC,2 # preserved
#.set r3,3 # 1st argument/return value
#.set r4,4 # 2nd argument/volatile register
#.set r5,5 # 3rd argument/volatile register
@ -246,7 +246,7 @@ $data=<<EOF;
# the first . i.e. for example change ".bn_sqr_comba4"
# to "bn_sqr_comba4". This should be automatically done
# in the build.
.globl .bn_sqr_comba4
.globl .bn_sqr_comba8
.globl .bn_mul_comba4
@ -257,9 +257,9 @@ $data=<<EOF;
.globl .bn_sqr_words
.globl .bn_mul_words
.globl .bn_mul_add_words
# .text section
.machine "any"
#
@ -278,8 +278,8 @@ $data=<<EOF;
# r3 contains r
# r4 contains a
#
-# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
-#
+# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
+#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
@ -288,10 +288,10 @@ $data=<<EOF;
#
xor r0,r0,r0 # set r0 = 0. Used in the addze
# instructions below
#sqr_add_c(a,0,c1,c2,c3)
-$LD r5,`0*$BNSZ`(r4)
-$UMULL r9,r5,r5
+$LD r5,`0*$BNSZ`(r4)
+$UMULL r9,r5,r5
$UMULH r10,r5,r5 #in first iteration. No need
#to add since c1=c2=c3=0.
# Note c3(r11) is NOT set to 0
@ -299,20 +299,20 @@ $data=<<EOF;
$ST r9,`0*$BNSZ`(r3) # r[0]=c1;
# sqr_add_c2(a,1,0,c2,c3,c1);
-$LD r6,`1*$BNSZ`(r4)
+$LD r6,`1*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
adde r8,r8,r8
addze r9,r0 # catch carry if any.
-# r9= r0(=0) and carry
+# r9= r0(=0) and carry
addc r10,r7,r10 # now add to temp result.
-addze r11,r8 # r8 added to r11 which is 0
+addze r11,r8 # r8 added to r11 which is 0
addze r9,r9
-$ST r10,`1*$BNSZ`(r3) #r[1]=c2;
+$ST r10,`1*$BNSZ`(r3) #r[1]=c2;
#sqr_add_c(a,1,c3,c1,c2)
$UMULL r7,r6,r6
$UMULH r8,r6,r6
@ -323,23 +323,23 @@ $data=<<EOF;
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r10,r10
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
-$ST r11,`2*$BNSZ`(r3) #r[2]=c3
+$ST r11,`2*$BNSZ`(r3) #r[2]=c3
#sqr_add_c2(a,3,0,c1,c2,c3);
-$LD r6,`3*$BNSZ`(r4)
+$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r11,r0
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
@ -348,7 +348,7 @@ $data=<<EOF;
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r11,r11
@ -363,31 +363,31 @@ $data=<<EOF;
adde r11,r8,r11
addze r9,r0
#sqr_add_c2(a,3,1,c2,c3,c1);
-$LD r6,`3*$BNSZ`(r4)
+$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r9,r9
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
$ST r10,`4*$BNSZ`(r3) #r[4]=c2
#sqr_add_c2(a,3,2,c3,c1,c2);
-$LD r5,`2*$BNSZ`(r4)
+$LD r5,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r10,r0
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$ST r11,`5*$BNSZ`(r3) #r[5] = c3
#sqr_add_c(a,3,c1,c2,c3);
-$UMULL r7,r6,r6
+$UMULL r7,r6,r6
$UMULH r8,r6,r6
addc r9,r7,r9