#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# Specific modes implementations for SPARC Architecture 2011. There
# is a T4 dependency, though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that a processor capable of executing the crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep our eyes open when new processors emerge...
#
# As for the above-mentioned ASI: it's the so-called "block initializing
# store", which cancels the "read" in "read-update-write" on cache lines.
# This is a "cooperative" optimization, as it reduces overall pressure
# on the memory interface. The benefit can't be observed/quantified with
# the usual benchmarks; on the contrary, you may notice that single-thread
# performance for parallelizable modes is ~1.5% worse for the largest
# block sizes [though a few percent better for not so long ones]. All
# this is based on suggestions from David Miller.
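#
# A concrete instance of the idiom (taken from the bulk-store paths in
# the generators below): each 8-byte half is written through ASI 0xe2
# instead of a plain std, e.g.
#
#	stda %f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
#	add $out, 8, $out
#	stda %f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
#
# followed by a "membar #StoreLoad|#StoreStore" once the run of
# block-initializing stores is complete.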
$::bias="STACK_BIAS";
$::frame="STACK_FRAME";
$::size_t_cc="SIZE_T_CC";

sub asm_init {		# to be called with @ARGV as argument
    for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
    if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
    else		{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}
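# A minimal usage sketch (the file name shown is illustrative): a
# consumer script forwards its own command line, so the ABI parameters
# are derived from whatever compiler flags it was invoked with, e.g.
#
#	require "sparcv9_modes.pl";
#	&asm_init(@ARGV);	# -m64/-xarch=v9 selects the 64-bit ABI:
#				# $::bias=2047, $::frame=192, %xcc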
# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save	%sp, -$::frame, %sp
	cmp	$len, 0
	be,pn	$::size_t_cc, .L${bits}_cbc_enc_abort
	sub	$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc	$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd	[$ivec + 0], %f0	! load ivec
	bz,pt	%icc, 1f
	ldd	[$ivec + 8], %f2
	ldd	[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld	[$ivec + 0], %f0
	ld	[$ivec + 4], %f1
	ld	[$ivec + 8], %f2
	ld	[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call	_${alg}${bits}_load_enckey
	and	$inp, 7, $ileft
	andn	$inp, 7, $inp
	sll	$ileft, 3, $ileft
	mov	64, $iright
	mov	0xff, $omask
	sub	$iright, $ileft, $iright
	and	$out, 7, $ooff
	cmp	$len, 127
	movrnz	$ooff, 0, $blk_init		! if ( $out&7 ||
	movleu	$::size_t_cc, 0, $blk_init	!      $len<128 ||
	brnz,pn	$blk_init, .L${bits}cbc_enc_blk	!      $inp==$out)
	srl	$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx	$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx	[$inp + 0], %o0
	brz,pt	$ileft, 4f
	ldx	[$inp + 8], %o1

	ldx	[$inp + 16], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1
4:
	xor	%g4, %o0, %o0	! ^= rk[0]
	xor	%g5, %o1, %o1
	movxtod	%o0, %f12
	movxtod	%o1, %f14

	fxor	%f12, %f0, %f0	! ^= ivec
	fxor	%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call	_${alg}${bits}_encrypt_1x
	add	$inp, 16, $inp

	brnz,pn	$ooff, 2f
	sub	$len, 1, $len

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	brnz,pt	$len, .L${bits}_cbc_enc_loop
	add	$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st	%f0, [$ivec + 0]
	st	%f1, [$ivec + 4]
	st	%f2, [$ivec + 8]
	st	%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, 3f
	nop

	std	%f0, [$ivec + 0]	! write out ivec
	std	%f2, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_enc_abort:
	ret
	restore

.align	16
2:	ldxa	[$inp]0x82, %o0		! avoid read-after-write hazard
					! and ~3x deterioration
					! in inp==out case
	faligndata	%f0, %f0, %f4	! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda	%f4, [$out + $omask]0xc0	! partial store
	std	%f6, [$out + 8]
	add	$out, 16, $out
	orn	%g0, $omask, $omask
	stda	%f8, [$out + $omask]0xc0	! partial store

	brnz,pt	$len, .L${bits}_cbc_enc_loop+4
	orn	%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st	%f0, [$ivec + 0]
	st	%f1, [$ivec + 4]
	st	%f2, [$ivec + 8]
	st	%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, 3f
	nop

	std	%f0, [$ivec + 0]	! write out ivec
	std	%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov	0xff, $omask
	srl	$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda	%f4, [$ivec + $omask]0xc0
	std	%f6, [$ivec + 8]
	add	$ivec, 16, $ivec
	orn	%g0, $omask, $omask
	stda	%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx	[$inp + 0], %o0
	brz,pt	$ileft, 5f
	ldx	[$inp + 8], %o1

	ldx	[$inp + 16], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1
5:
	xor	%g4, %o0, %o0	! ^= rk[0]
	xor	%g5, %o1, %o1
	movxtod	%o0, %f12
	movxtod	%o1, %f14

	fxor	%f12, %f0, %f0	! ^= ivec
	fxor	%f14, %f2, %f2

	prefetch	[$inp + 16+63], 20
	call	_${alg}${bits}_encrypt_1x
	add	$inp, 16, $inp
	sub	$len, 1, $len

	stda	%f0, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f2, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	brnz,pt	$len, .L${bits}_cbc_enc_blk_loop
	add	$out, 8, $out

	membar	#StoreLoad|#StoreStore
	brnz,pt	$blk_init, .L${bits}_cbc_enc_loop
	mov	$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st	%f0, [$ivec + 0]
	st	%f1, [$ivec + 4]
	st	%f2, [$ivec + 8]
	st	%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, 3b
	nop

	std	%f0, [$ivec + 0]	! write out ivec
	std	%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}
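# A sketch of how the generator above might be driven ("aes" and 128
# are illustrative values; the real callers are the cipher modules that
# require this file). The same pattern applies to the decrypt, ctr32
# and xts generators below:
#
#	$::evp = 1;				# emit EVP-style ivec load/store
#	alg_cbc_encrypt_implement("aes", 128);	# appends aes128_t4_cbc_encrypt
#						# to $::code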
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save	%sp, -$::frame, %sp
	cmp	$len, 0
	be,pn	$::size_t_cc, .L${bits}_cbc_dec_abort
	sub	$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc	$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd	[$ivec + 0], %f12	! load ivec
	bz,pt	%icc, 1f
	ldd	[$ivec + 8], %f14
	ldd	[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld	[$ivec + 0], %f12	! load ivec
	ld	[$ivec + 4], %f13
	ld	[$ivec + 8], %f14
	ld	[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call	_${alg}${bits}_load_deckey
	and	$inp, 7, $ileft
	andn	$inp, 7, $inp
	sll	$ileft, 3, $ileft
	mov	64, $iright
	mov	0xff, $omask
	sub	$iright, $ileft, $iright
	and	$out, 7, $ooff
	cmp	$len, 255
	movrnz	$ooff, 0, $blk_init		! if ( $out&7 ||
	movleu	$::size_t_cc, 0, $blk_init	!      $len<256 ||
	brnz,pn	$blk_init, .L${bits}cbc_dec_blk	!      $inp==$out)
	srl	$omask, $ooff, $omask

	andcc	$len, 16, %g0	! is number of blocks even?
	srlx	$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz	%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22

.L${bits}_cbc_dec_loop:
	ldx	[$inp + 0], %o0
	brz,pt	$ileft, 4f
	ldx	[$inp + 8], %o1

	ldx	[$inp + 16], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1
4:
	xor	%g4, %o0, %o2	! ^= rk[0]
	xor	%g5, %o1, %o3
	movxtod	%o2, %f0
	movxtod	%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call	_${alg}${bits}_decrypt_1x
	add	$inp, 16, $inp

	fxor	%f12, %f0, %f0	! ^= ivec
	fxor	%f14, %f2, %f2
	movxtod	%o0, %f12
	movxtod	%o1, %f14

	brnz,pn	$ooff, 2f
	sub	$len, 1, $len

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	brnz,pt	$len, .L${bits}_cbc_dec_loop2x
	add	$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st	%f12, [$ivec + 0]
	st	%f13, [$ivec + 4]
	st	%f14, [$ivec + 8]
	st	%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std	%f12, [$ivec + 0]	! write out ivec
	std	%f14, [$ivec + 8]
___
$::code.=<<___;
.L${bits}_cbc_dec_abort:
	ret
	restore

.align	16
2:	ldxa	[$inp]0x82, %o0		! avoid read-after-write hazard
					! and ~3x deterioration
					! in inp==out case
	faligndata	%f0, %f0, %f4	! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda	%f4, [$out + $omask]0xc0	! partial store
	std	%f6, [$out + 8]
	add	$out, 16, $out
	orn	%g0, $omask, $omask
	stda	%f8, [$out + $omask]0xc0	! partial store

	brnz,pt	$len, .L${bits}_cbc_dec_loop2x+4
	orn	%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st	%f12, [$ivec + 0]
	st	%f13, [$ivec + 4]
	st	%f14, [$ivec + 8]
	st	%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std	%f12, [$ivec + 0]	! write out ivec
	std	%f14, [$ivec + 8]
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx	[$inp + 0], %o0
	ldx	[$inp + 8], %o1
	ldx	[$inp + 16], %o2
	brz,pt	$ileft, 4f
	ldx	[$inp + 24], %o3

	ldx	[$inp + 32], %o4
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	or	%g1, %o0, %o0
	sllx	%o1, $ileft, %o1
	srlx	%o2, $iright, %g1
	or	%g1, %o1, %o1
	sllx	%o2, $ileft, %o2
	srlx	%o3, $iright, %g1
	or	%g1, %o2, %o2
	sllx	%o3, $ileft, %o3
	srlx	%o4, $iright, %o4
	or	%o4, %o3, %o3
4:
	xor	%g4, %o0, %o4	! ^= rk[0]
	xor	%g5, %o1, %o5
	movxtod	%o4, %f0
	movxtod	%o5, %f2
	xor	%g4, %o2, %o4
	xor	%g5, %o3, %o5
	movxtod	%o4, %f4
	movxtod	%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call	_${alg}${bits}_decrypt_2x
	add	$inp, 32, $inp

	movxtod	%o0, %f8
	movxtod	%o1, %f10
	fxor	%f12, %f0, %f0	! ^= ivec
	fxor	%f14, %f2, %f2
	movxtod	%o2, %f12
	movxtod	%o3, %f14
	fxor	%f8, %f4, %f4
	fxor	%f10, %f6, %f6

	brnz,pn	$ooff, 2f
	sub	$len, 2, $len

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	std	%f4, [$out + 16]
	std	%f6, [$out + 24]
	brnz,pt	$len, .L${bits}_cbc_dec_loop2x
	add	$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st	%f12, [$ivec + 0]
	st	%f13, [$ivec + 4]
	st	%f14, [$ivec + 8]
	st	%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std	%f12, [$ivec + 0]	! write out ivec
	std	%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa	[$inp]0x82, %o0		! avoid read-after-write hazard
					! and ~3x deterioration
					! in inp==out case
	faligndata	%f0, %f0, %f8	! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda	%f8, [$out + $omask]0xc0	! partial store
	std	%f0, [$out + 8]
	std	%f2, [$out + 16]
	std	%f4, [$out + 24]
	add	$out, 32, $out
	orn	%g0, $omask, $omask
	stda	%f6, [$out + $omask]0xc0	! partial store

	brnz,pt	$len, .L${bits}_cbc_dec_loop2x+4
	orn	%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st	%f12, [$ivec + 0]
	st	%f13, [$ivec + 4]
	st	%f14, [$ivec + 8]
	st	%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std	%f12, [$ivec + 0]	! write out ivec
	std	%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov	0xff, $omask
	srl	$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda	%f0, [$ivec + $omask]0xc0
	std	%f2, [$ivec + 8]
	add	$ivec, 16, $ivec
	orn	%g0, $omask, $omask
	stda	%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx	[$inp + 0], %o0
	ldx	[$inp + 8], %o1
	ldx	[$inp + 16], %o2
	brz,pt	$ileft, 5f
	ldx	[$inp + 24], %o3

	ldx	[$inp + 32], %o4
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	or	%g1, %o0, %o0
	sllx	%o1, $ileft, %o1
	srlx	%o2, $iright, %g1
	or	%g1, %o1, %o1
	sllx	%o2, $ileft, %o2
	srlx	%o3, $iright, %g1
	or	%g1, %o2, %o2
	sllx	%o3, $ileft, %o3
	srlx	%o4, $iright, %o4
	or	%o4, %o3, %o3
5:
	xor	%g4, %o0, %o4	! ^= rk[0]
	xor	%g5, %o1, %o5
	movxtod	%o4, %f0
	movxtod	%o5, %f2
	xor	%g4, %o2, %o4
	xor	%g5, %o3, %o5
	movxtod	%o4, %f4
	movxtod	%o5, %f6

	prefetch	[$inp + 32+63], 20
	call	_${alg}${bits}_decrypt_2x
	add	$inp, 32, $inp
	subcc	$len, 2, $len

	movxtod	%o0, %f8
	movxtod	%o1, %f10
	fxor	%f12, %f0, %f0	! ^= ivec
	fxor	%f14, %f2, %f2
	movxtod	%o2, %f12
	movxtod	%o3, %f14
	fxor	%f8, %f4, %f4
	fxor	%f10, %f6, %f6

	stda	%f0, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f2, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f4, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f6, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	bgu,pt	$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add	$out, 8, $out

	add	$blk_init, $len, $len
	andcc	$len, 1, %g0	! is number of blocks even?
	membar	#StoreLoad|#StoreStore
	bnz,pt	%icc, .L${bits}_cbc_dec_loop
	srl	$len, 0, $len
	brnz,pn	$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st	%f12, [$ivec + 0]	! write out ivec
	st	%f13, [$ivec + 4]
	st	%f14, [$ivec + 8]
	st	%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn	$ivoff, 3b
	nop

	std	%f12, [$ivec + 0]	! write out ivec
	std	%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}
sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save	%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call	_${alg}${bits}_load_enckey
	sllx	$len, 4, $len

	ld	[$ivec + 0], %l4	! counter
	ld	[$ivec + 4], %l5
	ld	[$ivec + 8], %l6
	ld	[$ivec + 12], %l7

	sllx	%l4, 32, %o5
	or	%l5, %o5, %o5
	sllx	%l6, 32, %g1
	xor	%o5, %g4, %g4	! ^= rk[0]
	xor	%g1, %g5, %g5
	movxtod	%g4, %f14	! most significant 64 bits

	sub	$inp, $out, $blk_init	! $inp!=$out
	and	$inp, 7, $ileft
	andn	$inp, 7, $inp
	sll	$ileft, 3, $ileft
	mov	64, $iright
	mov	0xff, $omask
	sub	$iright, $ileft, $iright
	and	$out, 7, $ooff
	cmp	$len, 255
	movrnz	$ooff, 0, $blk_init		! if ( $out&7 ||
	movleu	$::size_t_cc, 0, $blk_init	!      $len<256 ||
	brnz,pn	$blk_init, .L${bits}_ctr32_blk	!      $inp==$out)
	srl	$omask, $ooff, $omask

	andcc	$len, 16, %g0	! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz	%icc, .L${bits}_ctr32_loop2x
	srlx	$len, 4, $len

.L${bits}_ctr32_loop:
	ldx	[$inp + 0], %o0
	brz,pt	$ileft, 4f
	ldx	[$inp + 8], %o1

	ldx	[$inp + 16], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1
4:
	xor	%g5, %l7, %g1	! ^= rk[0]
	add	%l7, 1, %l7
	movxtod	%g1, %f2
	srl	%l7, 0, %l7	! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call	_${alg}${bits}_encrypt_1x+8
	add	$inp, 16, $inp

	movxtod	%o0, %f10
	movxtod	%o1, %f12
	fxor	%f10, %f0, %f0	! ^= inp
	fxor	%f12, %f2, %f2

	brnz,pn	$ooff, 2f
	sub	$len, 1, $len

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	brnz,pt	$len, .L${bits}_ctr32_loop2x
	add	$out, 16, $out

	ret
	restore

.align	16
2:	ldxa	[$inp]0x82, %o0		! avoid read-after-write hazard
					! and ~3x deterioration
					! in inp==out case
	faligndata	%f0, %f0, %f4	! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda	%f4, [$out + $omask]0xc0	! partial store
	std	%f6, [$out + 8]
	add	$out, 16, $out
	orn	%g0, $omask, $omask
	stda	%f8, [$out + $omask]0xc0	! partial store

	brnz,pt	$len, .L${bits}_ctr32_loop2x+4
	orn	%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx	[$inp + 0], %o0
	ldx	[$inp + 8], %o1
	ldx	[$inp + 16], %o2
	brz,pt	$ileft, 4f
	ldx	[$inp + 24], %o3

	ldx	[$inp + 32], %o4
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	or	%g1, %o0, %o0
	sllx	%o1, $ileft, %o1
	srlx	%o2, $iright, %g1
	or	%g1, %o1, %o1
	sllx	%o2, $ileft, %o2
	srlx	%o3, $iright, %g1
	or	%g1, %o2, %o2
	sllx	%o3, $ileft, %o3
	srlx	%o4, $iright, %o4
	or	%o4, %o3, %o3
4:
	xor	%g5, %l7, %g1	! ^= rk[0]
	add	%l7, 1, %l7
	movxtod	%g1, %f2
	srl	%l7, 0, %l7	! clruw
	xor	%g5, %l7, %g1
	add	%l7, 1, %l7
	movxtod	%g1, %f6
	srl	%l7, 0, %l7	! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call	_${alg}${bits}_encrypt_2x+16
	add	$inp, 32, $inp

	movxtod	%o0, %f8
	movxtod	%o1, %f10
	movxtod	%o2, %f12
	fxor	%f8, %f0, %f0	! ^= inp
	movxtod	%o3, %f8
	fxor	%f10, %f2, %f2
	fxor	%f12, %f4, %f4
	fxor	%f8, %f6, %f6

	brnz,pn	$ooff, 2f
	sub	$len, 2, $len

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	std	%f4, [$out + 16]
	std	%f6, [$out + 24]
	brnz,pt	$len, .L${bits}_ctr32_loop2x
	add	$out, 32, $out

	ret
	restore

.align	16
2:	ldxa	[$inp]0x82, %o0		! avoid read-after-write hazard
					! and ~3x deterioration
					! in inp==out case
	faligndata	%f0, %f0, %f8	! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda	%f8, [$out + $omask]0xc0	! partial store
	std	%f0, [$out + 8]
	std	%f2, [$out + 16]
	std	%f4, [$out + 24]
	add	$out, 32, $out
	orn	%g0, $omask, $omask
	stda	%f6, [$out + $omask]0xc0	! partial store

	brnz,pt	$len, .L${bits}_ctr32_loop2x+4
	orn	%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx	[$inp + 0], %o0
	ldx	[$inp + 8], %o1
	ldx	[$inp + 16], %o2
	brz,pt	$ileft, 5f
	ldx	[$inp + 24], %o3

	ldx	[$inp + 32], %o4
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	or	%g1, %o0, %o0
	sllx	%o1, $ileft, %o1
	srlx	%o2, $iright, %g1
	or	%g1, %o1, %o1
	sllx	%o2, $ileft, %o2
	srlx	%o3, $iright, %g1
	or	%g1, %o2, %o2
	sllx	%o3, $ileft, %o3
	srlx	%o4, $iright, %o4
	or	%o4, %o3, %o3
5:
	xor	%g5, %l7, %g1	! ^= rk[0]
	add	%l7, 1, %l7
	movxtod	%g1, %f2
	srl	%l7, 0, %l7	! clruw
	xor	%g5, %l7, %g1
	add	%l7, 1, %l7
	movxtod	%g1, %f6
	srl	%l7, 0, %l7	! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call	_${alg}${bits}_encrypt_2x+16
	add	$inp, 32, $inp
	subcc	$len, 2, $len

	movxtod	%o0, %f8
	movxtod	%o1, %f10
	movxtod	%o2, %f12
	fxor	%f8, %f0, %f0	! ^= inp
	movxtod	%o3, %f8
	fxor	%f10, %f2, %f2
	fxor	%f12, %f4, %f4
	fxor	%f8, %f6, %f6

	stda	%f0, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f2, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f4, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f6, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	bgu,pt	$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add	$out, 8, $out

	add	$blk_init, $len, $len
	andcc	$len, 1, %g0	! is number of blocks even?
	membar	#StoreLoad|#StoreStore
	bnz,pt	%icc, .L${bits}_ctr32_loop
	srl	$len, 0, $len
	brnz,pn	$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}
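# Note on the counter arithmetic in the loops above: only the least
# significant 32-bit word of the counter block (%l7) is incremented,
# and the "srl %l7, 0, %l7" (clruw) after each "add %l7, 1, %l7" clears
# the upper half of the register, so the counter wraps modulo 2^32,
# e.g. 0xffffffff + 1 becomes 0x00000000 with no carry into %l6.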
sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save	%sp, -$::frame-16, %sp

	mov	$ivec, %o0
	add	%fp, $::bias-16, %o1
	call	${alg}_t4_encrypt
	mov	$key2, %o2

	add	%fp, $::bias-16, %l7
	ldxa	[%l7]0x88, %g2
	add	%fp, $::bias-8, %l7
	ldxa	[%l7]0x88, %g3	! %g3:%g2 is tweak

	sethi	%hi(0x76543210), %l7
	or	%l7, %lo(0x76543210), %l7
	bmask	%l7, %g0, %g0	! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call	_${alg}${bits}_load_${dir}ckey
	and	$len, 15, $rem
	and	$len, -16, $len
___
$code.=<<___ if ($dir eq "de");
	mov	0, %l7
	movrnz	$rem, 16, %l7
	sub	$len, %l7, $len
___
$code.=<<___;
	sub	$inp, $out, $blk_init	! $inp!=$out
	and	$inp, 7, $ileft
	andn	$inp, 7, $inp
	sll	$ileft, 3, $ileft
	mov	64, $iright
	mov	0xff, $omask
	sub	$iright, $ileft, $iright
	and	$out, 7, $ooff
	cmp	$len, 255
	movrnz	$ooff, 0, $blk_init			! if ( $out&7 ||
	movleu	$::size_t_cc, 0, $blk_init		!      $len<256 ||
	brnz,pn	$blk_init, .L${bits}_xts_${dir}blk	!      $inp==$out)
	srl	$omask, $ooff, $omask

	andcc	$len, 16, %g0	! is number of blocks even?
___
$code.=<<___ if ($dir eq "de");
	brz,pn	$len, .L${bits}_xts_${dir}steal
___
$code.=<<___;
	alignaddrl	$out, %g0, $out
	bz	%icc, .L${bits}_xts_${dir}loop2x
	srlx	$len, 4, $len

.L${bits}_xts_${dir}loop:
	ldx	[$inp + 0], %o0
	brz,pt	$ileft, 4f
	ldx	[$inp + 8], %o1

	ldx	[$inp + 16], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1
4:
	movxtod	%g2, %f12
	movxtod	%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor	%g4, %o0, %o0	! ^= rk[0]
	xor	%g5, %o1, %o1
	movxtod	%o0, %f0
	movxtod	%o1, %f2

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call	_${alg}${bits}_${dir}crypt_1x
	add	$inp, 16, $inp

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2

	srax	%g3, 63, %l7	! next tweak value
	addcc	%g2, %g2, %g2
	and	%l7, 0x87, %l7
	addxc	%g3, %g3, %g3
	xor	%l7, %g2, %g2

	brnz,pn	$ooff, 2f
	sub	$len, 1, $len

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	brnz,pt	$len, .L${bits}_xts_${dir}loop2x
	add	$out, 16, $out

	brnz,pn	$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa	[$inp]0x82, %o0		! avoid read-after-write hazard
					! and ~3x deterioration
					! in inp==out case
	faligndata	%f0, %f0, %f4	! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda	%f4, [$out + $omask]0xc0	! partial store
	std	%f6, [$out + 8]
	add	$out, 16, $out
	orn	%g0, $omask, $omask
	stda	%f8, [$out + $omask]0xc0	! partial store

	brnz,pt	$len, .L${bits}_xts_${dir}loop2x+4
	orn	%g0, $omask, $omask

	brnz,pn	$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx	[$inp + 0], %o0
	ldx	[$inp + 8], %o1
	ldx	[$inp + 16], %o2
	brz,pt	$ileft, 4f
	ldx	[$inp + 24], %o3

	ldx	[$inp + 32], %o4
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	or	%g1, %o0, %o0
	sllx	%o1, $ileft, %o1
	srlx	%o2, $iright, %g1
	or	%g1, %o1, %o1
	sllx	%o2, $ileft, %o2
	srlx	%o3, $iright, %g1
	or	%g1, %o2, %o2
	sllx	%o3, $ileft, %o3
	srlx	%o4, $iright, %o4
	or	%o4, %o3, %o3
4:
	movxtod	%g2, %f12
	movxtod	%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax	%g3, 63, %l7	! next tweak value
	addcc	%g2, %g2, %g2
	and	%l7, 0x87, %l7
	addxc	%g3, %g3, %g3
	xor	%l7, %g2, %g2

	movxtod	%g2, %f8
	movxtod	%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor	%g4, %o0, %o0	! ^= rk[0]
	xor	%g5, %o1, %o1
	xor	%g4, %o2, %o2	! ^= rk[0]
	xor	%g5, %o3, %o3
	movxtod	%o0, %f0
	movxtod	%o1, %f2
	movxtod	%o2, %f4
	movxtod	%o3, %f6

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2
	fxor	%f8, %f4, %f4	! ^= tweak[0]
	fxor	%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call	_${alg}${bits}_${dir}crypt_2x
	add	$inp, 32, $inp

	movxtod	%g2, %f8
	movxtod	%g3, %f10

	srax	%g3, 63, %l7	! next tweak value
	addcc	%g2, %g2, %g2
	and	%l7, 0x87, %l7
	addxc	%g3, %g3, %g3
	xor	%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2
	fxor	%f8, %f4, %f4
	fxor	%f10, %f6, %f6

	brnz,pn	$ooff, 2f
	sub	$len, 2, $len

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	std	%f4, [$out + 16]
	std	%f6, [$out + 24]
	brnz,pt	$len, .L${bits}_xts_${dir}loop2x
	add	$out, 32, $out

	fsrc2	%f4, %f0
	fsrc2	%f6, %f2
	brnz,pn	$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa	[$inp]0x82, %o0		! avoid read-after-write hazard
					! and ~3x deterioration
					! in inp==out case
	faligndata	%f0, %f0, %f8	! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda	%f8, [$out + $omask]0xc0	! partial store
	std	%f10, [$out + 8]
	std	%f12, [$out + 16]
	std	%f14, [$out + 24]
	add	$out, 32, $out
	orn	%g0, $omask, $omask
	stda	%f0, [$out + $omask]0xc0	! partial store

	brnz,pt	$len, .L${bits}_xts_${dir}loop2x+4
	orn	%g0, $omask, $omask

	fsrc2	%f4, %f0
	fsrc2	%f6, %f2
	brnz,pn	$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx	[$inp + 0], %o0
	ldx	[$inp + 8], %o1
	ldx	[$inp + 16], %o2
	brz,pt	$ileft, 5f
	ldx	[$inp + 24], %o3

	ldx	[$inp + 32], %o4
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	or	%g1, %o0, %o0
	sllx	%o1, $ileft, %o1
	srlx	%o2, $iright, %g1
	or	%g1, %o1, %o1
	sllx	%o2, $ileft, %o2
	srlx	%o3, $iright, %g1
	or	%g1, %o2, %o2
	sllx	%o3, $ileft, %o3
	srlx	%o4, $iright, %o4
	or	%o4, %o3, %o3
5:
	movxtod	%g2, %f12
	movxtod	%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax	%g3, 63, %l7	! next tweak value
	addcc	%g2, %g2, %g2
	and	%l7, 0x87, %l7
	addxc	%g3, %g3, %g3
	xor	%l7, %g2, %g2

	movxtod	%g2, %f8
	movxtod	%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor	%g4, %o0, %o0	! ^= rk[0]
	xor	%g5, %o1, %o1
	xor	%g4, %o2, %o2	! ^= rk[0]
	xor	%g5, %o3, %o3
	movxtod	%o0, %f0
	movxtod	%o1, %f2
	movxtod	%o2, %f4
	movxtod	%o3, %f6

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2
	fxor	%f8, %f4, %f4	! ^= tweak[0]
	fxor	%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call	_${alg}${bits}_${dir}crypt_2x
	add	$inp, 32, $inp

	movxtod	%g2, %f8
	movxtod	%g3, %f10

	srax	%g3, 63, %l7	! next tweak value
	addcc	%g2, %g2, %g2
	and	%l7, 0x87, %l7
	addxc	%g3, %g3, %g3
	xor	%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2
	fxor	%f8, %f4, %f4
	fxor	%f10, %f6, %f6

	subcc	$len, 2, $len
	stda	%f0, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f2, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f4, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	add	$out, 8, $out
	stda	%f6, [$out]0xe2	! ASI_BLK_INIT, T4-specific
	bgu,pt	$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add	$out, 8, $out

	add	$blk_init, $len, $len
	andcc	$len, 1, %g0	! is number of blocks even?
	membar	#StoreLoad|#StoreStore
	bnz,pt	%icc, .L${bits}_xts_${dir}loop
	srl	$len, 0, $len
	brnz,pn	$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2	%f4, %f0
	fsrc2	%f6, %f2
	brnz,pn	$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std	%f0, [%fp + $::bias-16]	! copy of output
	std	%f2, [%fp + $::bias-8]

	srl	$ileft, 3, $ileft
	add	%fp, $::bias-16, %l7
	add	$inp, $ileft, $inp	! original $inp+$len&-15
	add	$out, $ooff, $out	! original $out+$len&-15
	mov	0, $ileft
	nop	! align

.L${bits}_xts_${dir}stealing:
	ldub	[$inp + $ileft], %o0
	ldub	[%l7 + $ileft], %o1
	dec	$rem
	stb	%o0, [%l7 + $ileft]
	stb	%o1, [$out + $ileft]
	brnz	$rem, .L${bits}_xts_${dir}stealing
	inc	$ileft

	mov	%l7, $inp
	sub	$out, 16, $out
	mov	0, $ileft
	sub	$out, $ooff, $out
	ba	.L${bits}_xts_${dir}loop	! one more time
	mov	1, $len				! $rem is 0
___
$code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx	[$inp + 0], %o0
	brz,pt	$ileft, 8f
	ldx	[$inp + 8], %o1

	ldx	[$inp + 16], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1
8:
	srax	%g3, 63, %l7	! next tweak value
	addcc	%g2, %g2, %o2
	and	%l7, 0x87, %l7
	addxc	%g3, %g3, %o3
	xor	%l7, %o2, %o2

	movxtod	%o2, %f12
	movxtod	%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor	%g4, %o0, %o0	! ^= rk[0]
	xor	%g5, %o1, %o1
	movxtod	%o0, %f0
	movxtod	%o1, %f2

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2

	call	_${alg}${bits}_${dir}crypt_1x
	add	$inp, 16, $inp

	fxor	%f12, %f0, %f0	! ^= tweak[0]
	fxor	%f14, %f2, %f2

	std	%f0, [%fp + $::bias-16]
	std	%f2, [%fp + $::bias-8]

	srl	$ileft, 3, $ileft
	add	%fp, $::bias-16, %l7
	add	$inp, $ileft, $inp	! original $inp+$len&-15
	add	$out, $ooff, $out	! original $out+$len&-15
	mov	0, $ileft
	add	$out, 16, $out
	nop	! align

.L${bits}_xts_${dir}stealing:
	ldub	[$inp + $ileft], %o0
	ldub	[%l7 + $ileft], %o1
	dec	$rem
	stb	%o0, [%l7 + $ileft]
	stb	%o1, [$out + $ileft]
	brnz	$rem, .L${bits}_xts_${dir}stealing
	inc	$ileft

	mov	%l7, $inp
	sub	$out, 16, $out
	mov	0, $ileft
	sub	$out, $ooff, $out
	ba	.L${bits}_xts_${dir}loop	! one more time
	mov	1, $len				! $rem is 0
___
$code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}
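# Note on the recurring tweak update in alg_xts_implement: with the
# 128-bit tweak held in %g3:%g2 (%g2 the arithmetically low half), the
# sequence
#
#	srax %g3, 63, %l7	! 0 or -1, replicating the tweak's top bit
#	addcc %g2, %g2, %g2	! double the low half, carry out
#	and %l7, 0x87, %l7	! reduction constant iff the top bit was set
#	addxc %g3, %g3, %g3	! double the high half, carry in
#	xor %l7, %g2, %g2
#
# is the standard XTS multiplication of the tweak by x in GF(2^128)
# modulo x^128 + x^7 + x^2 + x + 1 (hence the constant 0x87).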
# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to preserve the option of producing a
# "universal" binary and let the programmer detect at run-time whether
# the current CPU is VIS-capable.
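#
# For example, a hypothetical call
#
#	unvis("fxor", "%f12", "%f0", "%f0")
#
# should yield ".word 0x81b30d80" followed by the original mnemonic as
# an assembler comment: rd=0, rs1=12, rs2=0 and opf 0x06c packed into
# the VIS opcode template, so no VIS support is required of the
# assembler.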
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
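# Worked example of the "upper double register" re-encoding above:
# %f32 is register number 32 (0b100000), and (32|32>>5)&31 = 33&31 = 1,
# i.e. bit 5 is folded into bit 0, which is how SPARC encodes the
# double-precision registers %f32..%f62 in a 5-bit register field.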
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8 );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119 );

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}
sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (	"des_round"	=> 0b1001,
		"des_ip"	=> 0b100110100,
		"des_iip"	=> 0b100110101,
		"des_kexpand"	=> 0b100110110 );

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
	if ($mnemonic eq "des_round") {
	    foreach (@args[0..3]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return sprintf ".word\t0x%08x !%s",
			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
			    $ref;
	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
	    foreach (@args[0..2]) {
		return $ref if (!/(%f)?([0-9]{1,2})/);
		$_=$2;
		if ($2>=32) {
		    return $ref if ($2&1);
		    # re-encode for upper double register addressing
		    $_=($2|$2>>5)&31;
		}
	    }
	    return sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
			    $ref;
	} else {				# 2-arg
	    foreach (@args[0..1]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
			    $ref;
	}
    } else {
	return $ref;
    }
}
sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	/geo	or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	/geo	or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	/geo	or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	/geo	or
	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
		&undes($1,$2,$3,$4,$5)
	/geo	or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	/geo	or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	/geo	or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	/geo	or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	/geo;

	print $_,"\n";
    }
}
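# A minimal sketch of a consumer's final step (hypothetical driver,
# shown for illustration only):
#
#	&asm_init(@ARGV);
#	alg_ctr32_implement("aes", 256);	# ...plus the other generators
#	&emit_assembler();			# post-process $::code to stdout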
1;