List:       nettle-bugs
Subject:    [PATCH 3/6] PowerPC64: Add optimized GHASH
From:       Maamoun TK <maamoun.tk () googlemail ! com>
Date:       2020-07-14 11:43:44
Message-ID: CA+1QVtQM7oB6jpaSV0Q+kF2feb3-XxsD+J_JSdBkUKjpSHBsSA () mail ! gmail ! com

---
 gcm.c                     |  82 +++-
 powerpc64/P8/gcm-hash.asm | 998 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1066 insertions(+), 14 deletions(-)
 create mode 100644 powerpc64/P8/gcm-hash.asm

diff --git a/gcm.c b/gcm.c
index cf615daf..935d4420 100644
--- a/gcm.c
+++ b/gcm.c
@@ -140,6 +140,26 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
   memcpy (x->b, Z.b, sizeof(Z));
 }
 # elif GCM_TABLE_BITS == 8
+#  if HAVE_NATIVE_gcm_init_key
+
+#define gcm_init_key _nettle_gcm_init_key
+void
+_nettle_gcm_init_key (union nettle_block16 *table);
+/* For fat builds */
+void
+_nettle_gcm_init_key_c (union nettle_block16 *table);
+#  endif /* HAVE_NATIVE_gcm_init_key */
+#  if HAVE_NATIVE_gcm_hash
+
+#define gcm_hash _nettle_gcm_hash
+void
+_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
+   size_t length, const uint8_t *data);
+/* For fat builds */
+void
+_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x,
+   size_t length, const uint8_t *data);
+#  endif /* HAVE_NATIVE_gcm_hash */
 #  if HAVE_NATIVE_gcm_hash8

 #define gcm_hash _nettle_gcm_hash8
@@ -225,9 +245,45 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)

 #endif /* GCM_TABLE_BITS */

+#if HAVE_NATIVE_gcm_fill
+
+#define gcm_fill _nettle_gcm_fill
+void
+_nettle_gcm_fill (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer);
+/* For fat builds */
+void
+_nettle_gcm_fill_c (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer);
+#endif /* HAVE_NATIVE_gcm_fill */
+
 /* Increment the rightmost 32 bits. */
 #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)

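+/* Key-table setup is split out of gcm_set_key so that a native
+   implementation (HAVE_NATIVE_gcm_init_key) can replace it; the _c
+   variant remains callable for fat builds. */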
+#ifdef gcm_init_key
+void
+_nettle_gcm_init_key_c(union nettle_block16 *table)
+#else
+static void
+gcm_init_key(union nettle_block16 *table)
+#endif /* !gcm_init_key */
+{
+#if GCM_TABLE_BITS
+  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
+     element */
+  unsigned i = (1<<GCM_TABLE_BITS)/2;
+
+  /* Algorithm 3 from the gcm paper. First do powers of two, then do
+     the rest by adding. */
+  while (i /= 2)
+    block16_mulx_ghash(&table[i], &table[2*i]);
+  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
+    {
+      unsigned j;
+      for (j = 1; j < i; j++)
+ block16_xor3(&table[i+j], &table[i], &table[j]);
+    }
+#endif
+}
+
 /* Initialization of GCM.
  * @ctx: The context of GCM
  * @cipher: The context of the underlying block cipher
@@ -245,24 +301,18 @@ gcm_set_key(struct gcm_key *key,
   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);

-#if GCM_TABLE_BITS
-  /* Algorithm 3 from the gcm paper. First do powers of two, then do
-     the rest by adding. */
-  while (i /= 2)
-    block16_mulx_ghash(&key->h[i], &key->h[2*i]);
-  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
-    {
-      unsigned j;
-      for (j = 1; j < i; j++)
- block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]);
-    }
-#endif
+  gcm_init_key(key->h);
 }

-#ifndef gcm_hash
+#ifdef gcm_hash
+void
+_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x,
+ size_t length, const uint8_t *data)
+#else
 static void
 gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
  size_t length, const uint8_t *data)
+#endif /* !gcm_hash */
 {
   for (; length >= GCM_BLOCK_SIZE;
        length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
@@ -276,7 +326,6 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
       gcm_gf_mul (x, key->h);
     }
 }
-#endif /* !gcm_hash */

 static void
 gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
@@ -333,9 +382,14 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key,
   ctx->auth_size += length;
 }

+#ifdef gcm_fill
+void
+_nettle_gcm_fill_c(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+#else
 static nettle_fill16_func gcm_fill;
 static void
 gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+#endif /* !gcm_fill */
 {
   uint32_t c;

diff --git a/powerpc64/P8/gcm-hash.asm b/powerpc64/P8/gcm-hash.asm
new file mode 100644
index 00000000..b8f2178e
--- /dev/null
+++ b/powerpc64/P8/gcm-hash.asm
@@ -0,0 +1,998 @@
+C powerpc64/P8/gcm-hash.asm
+
+ifelse(<
+   Copyright (C) 2020 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Alignment of gcm_key table elements, which is declared in gcm.h
+define(<TableElemAlign>, <0x100>)
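+
+C Table layout (one entry per TableElemAlign bytes), as written by
+C gcm_init_key and read back by gcm_hash:
+C   0..2:   H^1 (low doubleword, doubleword-swapped H, high doubleword)
+C   3..5:   [H^2.Hi⊕H^2.Lo : H^1.Hi⊕H^1.Lo] (high, combined, low)
+C   6..8:   likewise for H^4:H^3
+C   9..11:  likewise for H^6:H^5
+C   12..14: likewise for H^8:H^7
+C gcm_init_key reads the initial H from offset 8*TableElemAlign, the
+C middle table element that gcm_set_key fills in.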
+
+C Register usage:
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<TABLE>, <3>)
+define(<X>, <4>)
+define(<LENGTH>, <5>)
+define(<DATA>, <6>)
+
+define(<zero>, <0>)
+define(<swap_mask>, <1>)
+define(<hidw_mask>, <2>)
+define(<lodw_mask>, <3>)
+define(<poly>, <4>)
+define(<poly_h>, <4>)
+define(<poly_l>, <5>)
+define(<RP>, <6>)
+define(<Mh>, <7>)
+define(<Ml>, <8>)
+define(<H>, <9>)
+define(<Hh>, <10>)
+define(<Hl>, <11>)
+define(<RP2>, <9>)
+define(<M2h>, <10>)
+define(<M2l>, <11>)
+
+define(<HX>, <41>)
+define(<HhX>, <42>)
+define(<HlX>, <43>)
+define(<H_HhX>, <44>)
+define(<H_HX>, <45>)
+define(<H_HlX>, <46>)
+
+define(<sl1>, <1>)
+define(<msb>, <5>)
+define(<H2>, <6>)
+define(<H2h>, <7>)
+define(<H2l>, <8>)
+define(<H_h>, <12>)
+define(<H_m>, <13>)
+define(<H_l>, <14>)
+define(<H_Hh>, <12>)
+define(<H_H>, <13>)
+define(<H_Hl>, <14>)
+define(<H_t>, <15>)
+define(<H2_h>, <16>)
+define(<H2_m>, <17>)
+define(<H2_l>, <18>)
+define(<H2_t>, <19>)
+
+define(<C0X>, <38>)
+define(<C1X>, <39>)
+define(<C2X>, <40>)
+define(<C3X>, <44>)
+define(<C4X>, <38>)
+define(<C5X>, <39>)
+define(<C6X>, <40>)
+define(<C7X>, <44>)
+
+define(<CX>, <45>)
+
+define(<C0>, <6>)
+define(<C1>, <7>)
+define(<C2>, <8>)
+define(<C3>, <12>)
+define(<C4>, <6>)
+define(<C5>, <7>)
+define(<C6>, <8>)
+define(<C7>, <12>)
+
+define(<C>, <13>)
+
+define(<Ch>, <14>)
+define(<Cl>, <15>)
+define(<Cm>, <16>)
+
+define(<C01h>, <14>)
+define(<C01l>, <15>)
+define(<C01>, <16>)
+define(<C23h>, <17>)
+define(<C23l>, <18>)
+define(<C23>, <19>)
+define(<C45h>, <20>)
+define(<C45l>, <21>)
+define(<C45>, <22>)
+define(<C67h>, <6>)
+define(<C67l>, <7>)
+define(<C67>, <8>)
+
+define(<H21>, <9>)
+define(<H21h>, <10>)
+define(<H21l>, <11>)
+define(<H43>, <23>)
+define(<H43h>, <24>)
+define(<H43l>, <25>)
+define(<H65>, <26>)
+define(<H65h>, <27>)
+define(<H65l>, <28>)
+define(<H87>, <29>)
+define(<H87h>, <30>)
+define(<H87l>, <31>)
+
+define(<H21X>, <41>)
+define(<H21hX>, <42>)
+define(<H21lX>, <43>)
+define(<H43X>, <55>)
+define(<H43hX>, <56>)
+define(<H43lX>, <57>)
+define(<H65X>, <58>)
+define(<H65hX>, <59>)
+define(<H65lX>, <60>)
+define(<H87X>, <61>)
+define(<H87hX>, <62>)
+define(<H87lX>, <63>)
+
+# gcm_fill registers:
+
+define(<CTR>, <3>)
+define(<BLOCKS>, <4>)
+define(<BUFFER>, <5>)
+
+define(<CTR0>, <2>)
+define(<CTR0S>, <3>)
+define(<CTR1>, <4>)
+define(<CTR2>, <5>)
+define(<CTR3>, <6>)
+define(<CTR4>, <7>)
+define(<CTR5>, <8>)
+define(<CTR6>, <9>)
+define(<CTR7>, <10>)
+
+define(<CTR0X>, <34>)
+define(<CTR0SX>, <35>)
+define(<CTR1X>, <36>)
+define(<CTR2X>, <37>)
+define(<CTR3X>, <38>)
+define(<CTR4X>, <39>)
+define(<CTR5X>, <40>)
+define(<CTR6X>, <41>)
+define(<CTR7X>, <42>)
+
+define(<I1>, <11>)
+define(<I2>, <12>)
+define(<I3>, <13>)
+define(<I4>, <14>)
+define(<I5>, <15>)
+define(<I6>, <16>)
+define(<I7>, <17>)
+define(<I8>, <18>)
+
+.file "gcm-hash.asm"
+
+IF_LE(<.abiversion 2>)
+.text
+
+ # void gcm_init_key (union nettle_block16 *table)
+
+define(<FUNC_ALIGN>, <5>)
+PROLOGUE(_nettle_gcm_init_key)
+ DATA_LOAD_VEC(poly,.polynomial,7)
+IF_LE(<DATA_LOAD_VEC(swap_mask,.swap_mask,7)>)
+ DATA_LOAD_VEC(hidw_mask,.hidw_mask,7)
+ DATA_LOAD_VEC(lodw_mask,.lodw_mask,7)
+
+ li    10,8*TableElemAlign
+  lxvd2x HX,10,TABLE # load H
+IF_LE(<vperm H,H,H,swap_mask>)
+
+ # --- calculate H = H shift left 1 modulo polynomial ---
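+ #
+ # (vupkhsw/vspltw replicate the sign bit of H's most significant word
+ # across msb, so and-ing with poly yields the polynomial exactly when
+ # the bit shifted out by vsl is set.)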
+
+ vupkhsw    msb,H # most significant bit word-extend
+ vspltisb sl1,1 # splat 1 for shift left
+ vspltw      msb,msb,0 # most significant bit extend
+ vsl    H,H,sl1 # H shift left 1
+ vand msb,msb,poly
+ vxor zero,zero,zero
+ vxor H_t,H,msb
+
+ vsldoi H,H_t,H_t,8 # doubleword swap
+ vsldoi Hh,H,zero,8
+ vsldoi Hl,zero,H,8
+
+ # --- calculate H^2 = H*H ---
+
+ # reduction pre-processing
+ vsldoi poly_h,zero,poly,8
+ vsldoi poly_l,poly_h,poly_h,8
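+
+ # (Tags: instructions marked [1] belong to the two-phase reduction of
+ # the 256-bit product; those marked [2] fold the middle partial
+ # product into the high and low halves. The two chains are
+ # interleaved to keep the vector pipeline busy.)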
+
+ # polynomial multiplication "classical"
+ vpmsumd H_h,H_t,Hh # H^1h*H^1h
+ vpmsumd H_l,H_t,Hl # H^1l*H^1l
+ vpmsumd H_m,H_t,H # H^1h*H^1l⊕H^1l*H^1h
+
+ # reduction first phase     # [1]
+ vpmsumd RP,H_l,poly_h   # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8   # [2]
+ vsldoi Ml,H_m,zero,8 # [2]
+ vsldoi RP,RP,RP,8     # [1]
+ vxor H_h,H_h,Mh       # [2]
+ vxor H_l,H_l,Ml       # [2]
+ vxor H_l,H_l,RP       # [1]
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l
+ vxor H_h,H_l,H_h
+ vxor H2_t,H_h,RP
+
+ vsldoi H2,H2_t,H2_t,8
+ vsldoi H2h,H2,zero,8
+ vsldoi H2l,zero,H2,8
+
+ # --- calculate [H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
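+ # (H_Hh and H_Hl pack the corresponding doubleword halves of H^2 and
+ # H^1 side by side; H_H, their xor, is the Karatsuba middle-term
+ # multiplier used by gcm_hash.)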
+
+ # --- store H,[H^2.Hi⊕H^2.Lo:H^1.Hi⊕H^1.Lo] ---
+
+ li    8,0*TableElemAlign
+ li    9,1*TableElemAlign
+ li    10,2*TableElemAlign
+ stxvd2x HlX,8,TABLE
+ stxvd2x HX,9,TABLE
+ stxvd2x HhX,10,TABLE
+
+ li    8,3*TableElemAlign
+ li    9,4*TableElemAlign
+ li    10,5*TableElemAlign
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+ # --- calculate H^3,H^4 ---
+
+ # polynomial multiplication "classical"
+ vpmsumd H_l,H_t,H2l # H^1l*H^2l
+ vpmsumd H_m,H_t,H2 # H^1h*H^2l⊕H^1l*H^2h
+ vpmsumd H_h,H_t,H2h # H^1h*H^2h
+ vpmsumd H2_l,H2_t,H2l # H^2l*H^2l
+ vpmsumd H2_m,H2_t,H2 # H^2h*H^2l⊕H^2l*H^2h
+ vpmsumd H2_h,H2_t,H2h # H^2h*H^2h
+
+ # reduction first phase     # [1]
+ vpmsumd RP,H_l,poly_h # [1] H^3
+ vpmsumd    RP2,H2_l,poly_h # [1] H^4
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8 # [2] H^3
+ vsldoi M2h,zero,H2_m,8 # [2] H^4
+ vsldoi Ml,H_m,zero,8 # [2] H^3
+ vsldoi M2l,H2_m,zero,8 # [2] H^4
+ vsldoi RP,RP,RP,8 # [1] H^3
+ vsldoi RP2,RP2,RP2,8 # [1] H^4
+ vxor H_h,H_h,Mh # [2] H^3
+ vxor H2_h,H2_h,M2h # [2] H^4
+ vxor H_l,H_l,Ml # [2] H^3
+ vxor H2_l,H2_l,M2l # [2] H^4
+ vxor H_l,H_l,RP # [1] H^3
+ vxor H2_l,H2_l,RP2 # [1] H^4
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l # H^3
+ vpmsumd RP2,H2_l,poly_l # H^4
+ vxor H_h,H_l,H_h # H^3
+ vxor H2_h,H2_l,H2_h # H^4
+ vxor H_h,H_h,RP # H^3
+ vxor H2_h,H2_h,RP2 # H^4
+
+ vsldoi H2,H2_h,H2_h,8 # H^4
+ vsldoi H,H_h,H_h,8 # H^3
+ vsldoi H2l,zero,H2,8 # H^4
+ vsldoi H2h,H2,zero,8 # H^4
+
+ # --- calculate [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
+
+ # --- store [H^4.Hi⊕H^4.Lo:H^3.Hi⊕H^3.Lo] ---
+
+ li    8,6*TableElemAlign
+ li    9,7*TableElemAlign
+ li    10,8*TableElemAlign
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+ # --- calculate H^5,H^6 ---
+
+ # polynomial multiplication "classical"
+ vpmsumd H_l,H_t,H2l # H^1l*H^4l
+ vpmsumd H_m,H_t,H2 # H^1h*H^4l⊕H^1l*H^4h
+ vpmsumd H_h,H_t,H2h # H^1h*H^4h
+ vpmsumd H2_l,H2_t,H2l # H^2l*H^4l
+ vpmsumd H2_m,H2_t,H2 # H^2h*H^4l⊕H^2l*H^4h
+ vpmsumd H2_h,H2_t,H2h # H^2h*H^4h
+
+ # reduction first phase     # [1]
+ vpmsumd RP,H_l,poly_h # [1] H^5
+ vpmsumd    RP2,H2_l,poly_h # [1] H^6
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8 # [2] H^5
+ vsldoi M2h,zero,H2_m,8 # [2] H^6
+ vsldoi Ml,H_m,zero,8 # [2] H^5
+ vsldoi M2l,H2_m,zero,8 # [2] H^6
+ vsldoi RP,RP,RP,8 # [1] H^5
+ vsldoi RP2,RP2,RP2,8 # [1] H^6
+ vxor H_h,H_h,Mh # [2] H^5
+ vxor H2_h,H2_h,M2h # [2] H^6
+ vxor H_l,H_l,Ml # [2] H^5
+ vxor H2_l,H2_l,M2l # [2] H^6
+ vxor H_l,H_l,RP # [1] H^5
+ vxor H2_l,H2_l,RP2 # [1] H^6
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l # H^5
+ vpmsumd RP2,H2_l,poly_l # H^6
+ vxor H_h,H_l,H_h # H^5
+ vxor H2_h,H2_l,H2_h # H^6
+ vxor H_h,H_h,RP # H^5
+ vxor H2_h,H2_h,RP2 # H^6
+
+ vsldoi H2,H2_h,H2_h,8 # H^6
+ vsldoi H,H_h,H_h,8 # H^5
+ vsldoi H2l,zero,H2,8 # H^6
+ vsldoi H2h,H2,zero,8 # H^6
+
+ # --- calculate [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
+
+ # --- store [H^6.Hi⊕H^6.Lo:H^5.Hi⊕H^5.Lo] ---
+
+ li    8,9*TableElemAlign
+ li    9,10*TableElemAlign
+ li    10,11*TableElemAlign
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+ # --- calculate H^7,H^8 ---
+
+ # polynomial multiplication "classical"
+ vpmsumd H_l,H_t,H2l # H^1l*H^6l
+ vpmsumd H_m,H_t,H2 # H^1h*H^6l⊕H^1l*H^6h
+ vpmsumd H_h,H_t,H2h # H^1h*H^6h
+ vpmsumd H2_l,H2_t,H2l # H^2l*H^6l
+ vpmsumd H2_m,H2_t,H2 # H^2h*H^6l⊕H^2l*H^6h
+ vpmsumd H2_h,H2_t,H2h # H^2h*H^6h
+
+ # reduction first phase     # [1]
+ vpmsumd RP,H_l,poly_h # [1] H^7
+ vpmsumd    RP2,H2_l,poly_h # [1] H^8
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,H_m,8 # [2] H^7
+ vsldoi M2h,zero,H2_m,8 # [2] H^8
+ vsldoi Ml,H_m,zero,8 # [2] H^7
+ vsldoi M2l,H2_m,zero,8 # [2] H^8
+ vsldoi RP,RP,RP,8 # [1] H^7
+ vsldoi RP2,RP2,RP2,8 # [1] H^8
+ vxor H_h,H_h,Mh # [2] H^7
+ vxor H2_h,H2_h,M2h # [2] H^8
+ vxor H_l,H_l,Ml # [2] H^7
+ vxor H2_l,H2_l,M2l # [2] H^8
+ vxor H_l,H_l,RP # [1] H^7
+ vxor H2_l,H2_l,RP2 # [1] H^8
+
+ # reduction second phase
+ vpmsumd RP,H_l,poly_l # H^7
+ vpmsumd RP2,H2_l,poly_l # H^8
+ vxor H_h,H_l,H_h # H^7
+ vxor H2_h,H2_l,H2_h # H^8
+ vxor H_h,H_h,RP # H^7
+ vxor H2_h,H2_h,RP2 # H^8
+
+ vsldoi H,H_h,H_h,8 # H^7
+ vsldoi H2,H2_h,H2_h,8 # H^8
+
+ # --- calculate [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] ---
+
+ vperm H_Hh,H2,H,lodw_mask
+ vperm H_Hl,H2,H,hidw_mask
+ vxor H_H,H_Hh,H_Hl
+
+ # --- store [H^8.Hi⊕H^8.Lo:H^7.Hi⊕H^7.Lo] ---
+
+ li    8,12*TableElemAlign
+ li    9,13*TableElemAlign
+ li    10,14*TableElemAlign
+ stxvd2x H_HhX,8,TABLE
+ stxvd2x H_HX,9,TABLE
+ stxvd2x H_HlX,10,TABLE
+
+  blr
+EPILOGUE(_nettle_gcm_init_key)
+
+ # void gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
+ #                size_t length, const uint8_t *data)
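+ #
+ # Blocks are processed eight at a time against the stored powers
+ # H^8..H^1 (L8x_loop), then two at a time against H^2:H^1 (L2x_loop),
+ # then singly against H (L1x), and a final zero-padded partial block
+ # (Lrem).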
+
+define(<FUNC_ALIGN>, <5>)
+PROLOGUE(_nettle_gcm_hash)
+ vxor zero,zero,zero
+
+ DATA_LOAD_VEC(poly,.polynomial,7)
+IF_LE(<DATA_LOAD_VEC(swap_mask,.swap_mask,7)>)
+ DATA_LOAD_VEC(hidw_mask,.hidw_mask,7)
+ DATA_LOAD_VEC(lodw_mask,.lodw_mask,7)
+
+ vsldoi poly_h,zero,poly,8
+ vsldoi poly_l,poly_h,poly_h,8
+
+ lxvd2x CX,0,X # load X
+IF_LE(<vperm C,C,C,swap_mask>)
+
+ srdi 7,LENGTH,7 # 8x loop count
+ cmpldi 7,0
+ beq L2x
+
+ # backup registers
+ stdu SP,-224(SP)
+ std 28,216(SP)
+ std 29,208(SP)
+ std 30,200(SP)
+ std 31,192(SP)
+    li 8,176
+    stvx 20,8,SP
+    subi 8,8,16
+    stvx 21,8,SP
+    subi 8,8,16
+    stvx 22,8,SP
+    subi 8,8,16
+    stvx 23,8,SP
+    subi 8,8,16
+    stvx 24,8,SP
+    subi 8,8,16
+    stvx 25,8,SP
+    subi 8,8,16
+    stvx 26,8,SP
+    subi 8,8,16
+    stvx 27,8,SP
+    subi 8,8,16
+    stvx 28,8,SP
+    subi 8,8,16
+    stvx 29,8,SP
+    subi 8,8,16
+    stvx 30,8,SP
+    subi 8,8,16
+    stvx 31,8,SP
+
+ # table loading
+ li 8,3*TableElemAlign
+ li 9,4*TableElemAlign
+ li 10,5*TableElemAlign
+ lxvd2x H21hX,8,TABLE
+ lxvd2x H21X,9,TABLE
+ lxvd2x H21lX,10,TABLE
+ li 8,6*TableElemAlign
+ li 9,7*TableElemAlign
+ li 10,8*TableElemAlign
+ lxvd2x H43hX,8,TABLE
+ lxvd2x H43X,9,TABLE
+ lxvd2x H43lX,10,TABLE
+ li 8,9*TableElemAlign
+ li 9,10*TableElemAlign
+ li 10,11*TableElemAlign
+ lxvd2x H65hX,8,TABLE
+ lxvd2x H65X,9,TABLE
+ lxvd2x H65lX,10,TABLE
+ li 8,12*TableElemAlign
+ li 9,13*TableElemAlign
+ li 10,14*TableElemAlign
+ lxvd2x H87hX,8,TABLE
+ lxvd2x H87X,9,TABLE
+ lxvd2x H87lX,10,TABLE
+
+ li 8,0x10
+ li 9,0x20
+ li 10,0x30
+ li 28,0x40
+ li 29,0x50
+ li 30,0x60
+ li 31,0x70
+
+ mtctr     7
+.align 5
+L8x_loop:
+ # input loading
+ lxvd2x C0X,0,DATA # load C0
+ lxvd2x C1X,8,DATA # load C1
+ lxvd2x C2X,9,DATA # load C2
+ lxvd2x C3X,10,DATA # load C3
+
+ # swap permuting
+IF_LE(<vperm C0,C0,C0,swap_mask
+ vperm C1,C1,C1,swap_mask
+ vperm C2,C2,C2,swap_mask
+ vperm C3,C3,C3,swap_mask>)
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ # polynomial multiplication "karatsuba" pre-processing
+ vperm C23h,C2,C3,hidw_mask
+ vperm C23l,C2,C3,lodw_mask
+ vperm C01h,C0,C1,hidw_mask
+ vperm C01l,C0,C1,lodw_mask
+
+ # input loading
+ lxvd2x C4X,28,DATA # load C4
+ lxvd2x C5X,29,DATA # load C5
+ lxvd2x C6X,30,DATA # load C6
+ lxvd2x C7X,31,DATA # load C7
+
+ # swap permuting
+IF_LE(<vperm C4,C4,C4,swap_mask
+ vperm C5,C5,C5,swap_mask
+ vperm C6,C6,C6,swap_mask
+ vperm C7,C7,C7,swap_mask>)
+
+ # polynomial multiplication "karatsuba" pre-processing
+ vperm C45h,C4,C5,hidw_mask
+ vperm C45l,C4,C5,lodw_mask
+ vperm C67h,C6,C7,hidw_mask
+ vperm C67l,C6,C7,lodw_mask
+ vxor C23,C23h,C23l
+ vxor C01,C01h,C01l
+ vxor C45,C45h,C45l
+ vxor C67,C67h,C67l
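+
+ # (Karatsuba: each pair of blocks needs three vpmsumd multiplies
+ # rather than four - high*high, low*low, and (high⊕low)*(high⊕low)
+ # for the middle term.)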
+
+ # polynomial multiplication "karatsuba"
+ vpmsumd C23h,C23h,H65h # H23 = H^6h*C2h⊕H^5h*C3h
+ vpmsumd C23l,C23l,H65l # L23 = H^6l*C2l⊕H^5l*C3l
+ vpmsumd C01h,C01h,H87h # H01 = H^8h*C0h⊕H^7h*C1h
+ vpmsumd C01l,C01l,H87l # L01 = H^8l*C0l⊕H^7l*C1l
+ vpmsumd C67h,C67h,H21h # H67 = H^2h*C6h⊕H^1h*C7h
+ vpmsumd C67l,C67l,H21l # L67 = H^2l*C6l⊕H^1l*C7l
+ vpmsumd C45h,C45h,H43h # H45 = H^4h*C4h⊕H^3h*C5h
+ vpmsumd C45l,C45l,H43l # L45 = H^4l*C4l⊕H^3l*C5l
+ vpmsumd C23,C23,H65 # M23 = (H^6h⊕H^5h)*(C2h⊕C3h)⊕(H^6l⊕H^5l)*(C2l⊕C3l)
+ vpmsumd C01,C01,H87 # M01 = (H^8h⊕H^7h)*(C0h⊕C1h)⊕(H^8l⊕H^7l)*(C0l⊕C1l)
+ vpmsumd C45,C45,H43 # M45 = (H^4h⊕H^3h)*(C4h⊕C5h)⊕(H^4l⊕H^3l)*(C4l⊕C5l)
+ vpmsumd C67,C67,H21 # M67 = (H^2h⊕H^1h)*(C6h⊕C7h)⊕(H^2l⊕H^1l)*(C6l⊕C7l)
+
+ # polynomial multiplication "karatsuba" post-processing
+ vxor C23,C23,C23h
+ vxor C01,C01,C01h
+ vxor C45,C45,C45h
+ vxor C67,C67,C67h
+ vxor C23,C23,C23l
+ vxor C01,C01,C01l
+ vxor C45,C45,C45l
+ vxor C67,C67,C67l
+
+ # deferred recombination of partial products
+ vxor C01h,C01h,C23h # H0 = H01⊕H23
+ vxor C45h,C45h,C67h # H1 = H45⊕H67
+ vxor C01l,C01l,C23l # L0 = L01⊕L23
+ vxor C45l,C45l,C67l # L1 = L45⊕L67
+ vxor C01,C01,C23 # M0 = M01⊕M23
+ vxor C45,C45,C67 # M1 = M45⊕M67
+ vxor C01h,C01h,C45h # H = H0⊕H1
+ vxor C01l,C01l,C45l # L = L0⊕L1
+ vxor C01,C01,C45 # M = M0⊕M1
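+
+ # C01h and C01l now hold the unreduced 256-bit digest of all eight
+ # blocks, with the middle term C01 still to be folded in.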
+
+ # reduction first phase # [1]
+ vpmsumd RP,C01l,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,C01,8   # [2]
+ vsldoi Ml,C01,zero,8   # [2]
+ vsldoi RP,RP,RP,8     # [1]
+ vxor C01h,C01h,Mh     # [2]
+ vxor C01l,C01l,Ml     # [2]
+ vxor C01l,C01l,RP     # [1]
+
+ # reduction second phase
+ vpmsumd RP,C01l,poly_l
+ vxor C01h,C01l,C01h
+ vxor C,C01h,RP
+
+ addi DATA,DATA,0x80
+ bdnz L8x_loop
+
+    # restore registers
+ li 8,0
+    lvx 31,8,SP
+    addi 8,8,16
+    lvx 30,8,SP
+    addi 8,8,16
+    lvx 29,8,SP
+    addi 8,8,16
+    lvx 28,8,SP
+    addi 8,8,16
+    lvx 27,8,SP
+    addi 8,8,16
+    lvx 26,8,SP
+    addi 8,8,16
+    lvx 25,8,SP
+    addi 8,8,16
+    lvx 24,8,SP
+    addi 8,8,16
+    lvx 23,8,SP
+    addi 8,8,16
+    lvx 22,8,SP
+    addi 8,8,16
+    lvx 21,8,SP
+    addi 8,8,16
+    lvx 20,8,SP
+ ld 31,192(SP)
+ ld 30,200(SP)
+ ld 29,208(SP)
+ ld 28,216(SP)
+ addi SP,SP,224
+
+ clrldi   LENGTH,LENGTH,57
+L2x:
+ srdi 7,LENGTH,5
+ cmpldi 7,0
+ beq L1x
+
+ # table loading
+ li 8,3*TableElemAlign
+ li 9,4*TableElemAlign
+ li 10,5*TableElemAlign
+ lxvd2x H21hX,8,TABLE
+ lxvd2x H21X,9,TABLE
+ lxvd2x H21lX,10,TABLE
+
+ li 10,0x10
+
+ mtctr     7
+.align 5
+L2x_loop:
+ # input loading
+ lxvd2x C0X,0,DATA # load C0
+ lxvd2x C1X,10,DATA # load C1
+
+ # swap permuting
+IF_LE(<vperm C0,C0,C0,swap_mask
+ vperm C1,C1,C1,swap_mask>)
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ # polynomial multiplication "karatsuba" pre-processing
+ vperm C01h,C0,C1,hidw_mask
+ vperm C01l,C0,C1,lodw_mask
+ vxor C01,C01h,C01l
+
+ # polynomial multiplication "karatsuba"
+ vpmsumd C01h,C01h,H21h # H01 = H^2h*C0h⊕H^1h*C1h
+ vpmsumd C01l,C01l,H21l # L01 = H^2l*C0l⊕H^1l*C1l
+ vpmsumd C01,C01,H21 # M01 = (H^2h⊕H^1h)*(C0h⊕C1h)⊕(H^2l⊕H^1l)*(C0l⊕C1l)
+
+ # polynomial multiplication "karatsuba" post-processing
+ vxor C01,C01,C01h
+ vxor C01,C01,C01l
+
+ # reduction first phase # [1]
+ vpmsumd RP,C01l,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,C01,8   # [2]
+ vsldoi Ml,C01,zero,8   # [2]
+ vsldoi RP,RP,RP,8     # [1]
+ vxor C01h,C01h,Mh     # [2]
+ vxor C01l,C01l,Ml     # [2]
+ vxor C01l,C01l,RP     # [1]
+
+ # reduction second phase
+ vpmsumd RP,C01l,poly_l
+ vxor C01h,C01l,C01h
+ vxor C,C01h,RP
+
+ addi DATA,DATA,0x20
+ bdnz L2x_loop
+
+ clrldi   LENGTH,LENGTH,59
+L1x:
+ srdi 7,LENGTH,4
+ cmpldi 7,0
+ beq Lrem
+
+ # table loading
+ li 9,1*TableElemAlign
+ li 10,2*TableElemAlign
+ lxvd2x HlX,0,TABLE
+ lxvd2x HX, 9,TABLE
+ lxvd2x HhX,10,TABLE
+
+ # input loading
+ lxvd2x C0X,0,DATA # load C0
+
+ # swap permuting
+IF_LE(<vperm C0,C0,C0,swap_mask>)
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ vpmsumd Cl,C0,Hl # L = Hl*Cl
+ vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch
+ vpmsumd Ch,C0,Hh # H = Hh*Ch
+
+ # reduction first phase # [1]
+ vpmsumd RP,Cl,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,Cm,8   # [2]
+ vsldoi Ml,Cm,zero,8   # [2]
+ vsldoi RP,RP,RP,8     # [1]
+ vxor Ch,Ch,Mh   # [2]
+ vxor Cl,Cl,Ml       # [2]
+ vxor Cl,Cl,RP   # [1]
+
+ # reduction second phase
+ vpmsumd RP,Cl,poly_l
+ vxor Ch,Cl,Ch
+ vxor C,Ch,RP
+
+ addi DATA,DATA,0x10
+ clrldi   LENGTH,LENGTH,60
+Lrem:
+ cmpldi LENGTH,0
+ beq Ldone
+
+ # table loading
+ li 9,1*TableElemAlign
+ li 10,2*TableElemAlign
+ lxvd2x HlX,0,TABLE
+ lxvd2x HX, 9,TABLE
+ lxvd2x HhX,10,TABLE
+
+ # load the final partial block via a zero-padded stack buffer
+ stdu SP,-16(SP)
+ stvx zero,0,SP
+Lst_loop:
+ subic.      LENGTH,LENGTH,1
+ lbzx 7,LENGTH,DATA
+ stbx 7,LENGTH,SP
+ bne Lst_loop
+ lxvd2x   C0X,0,SP
+ addi SP,SP,16
+
+ # swap permuting
+IF_LE(<vperm C0,C0,C0,swap_mask>)
+
+ # previous digest combining
+ vxor C0,C0,C
+
+ vpmsumd Cl,C0,Hl # L = Hl*Cl
+ vpmsumd Cm,C0,H # M = Hh*Cl⊕Hl*Ch
+ vpmsumd Ch,C0,Hh # H = Hh*Ch
+
+ # reduction first phase # [1]
+ vpmsumd RP,Cl,poly_h # [1]
+
+ # polynomial multiplication post-processing # [2]
+ vsldoi Mh,zero,Cm,8   # [2]
+ vsldoi Ml,Cm,zero,8   # [2]
+ vsldoi RP,RP,RP,8     # [1]
+ vxor Ch,Ch,Mh   # [2]
+ vxor Cl,Cl,Ml     # [2]
+ vxor Cl,Cl,RP   # [1]
+
+ # reduction second phase
+ vpmsumd RP,Cl,poly_l
+ vxor Ch,Cl,Ch
+ vxor C,Ch,RP
+
+Ldone:
+IF_LE(<vperm C,C,C,swap_mask>)
+ stxvd2x CX,0,X # store C
+ blr
+EPILOGUE(_nettle_gcm_hash)
+
+ # void gcm_fill (uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
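+ #
+ # Fills BUFFER with BLOCKS counter blocks, incrementing the rightmost
+ # 32 bits of CTR per block (matching INC32 in gcm.c), eight blocks
+ # per iteration where possible.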
+
+define(<FUNC_ALIGN>, <5>)
+PROLOGUE(_nettle_gcm_fill)
+IF_LE(<DATA_LOAD_VEC(swap_mask,.swap_mask,6)>)
+
+ vxor zero,zero,zero
+ vspltisb I1,1
+ vspltisb I2,2
+ vspltisb I3,3
+ vspltisb I4,4
+ vspltisb I5,5
+ vspltisb I6,6
+ vspltisb I7,7
+ vspltisb I8,8
+ vsldoi I1,zero,I1,1
+ vsldoi I2,zero,I2,1
+ vsldoi I3,zero,I3,1
+ vsldoi I4,zero,I4,1
+ vsldoi I5,zero,I5,1
+ vsldoi I6,zero,I6,1
+ vsldoi I7,zero,I7,1
+ vsldoi I8,zero,I8,1
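+
+ # (vspltisb sets every byte of Ik to k; shifting in fifteen zero bytes
+ # with vsldoi leaves k only in the last byte, so I1..I8 add 1..8 to
+ # the rightmost 32-bit counter word via vadduwm.)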
+
+ lxvd2x CTR0X,0,CTR
+ IF_LE(<vperm CTR0,CTR0,CTR0,swap_mask>)
+
+ srdi 6,BLOCKS,3 # 8x loop count
+ cmpldi 6,0
+ beq Lfill_4x
+
+ std 25,-56(SP);
+ std 26,-48(SP);
+ std 27,-40(SP);
+ std 28,-32(SP);
+ std 29,-24(SP);
+ std 30,-16(SP);
+ std 31,-8(SP);
+
+ li 25,0x10
+ li 26,0x20
+ li 27,0x30
+ li 28,0x40
+ li 29,0x50
+ li 30,0x60
+ li 31,0x70
+
+ mtctr 6
+ L8x_fill_loop:
+ vadduwm CTR1,CTR0,I1
+ vadduwm CTR2,CTR0,I2
+ vadduwm CTR3,CTR0,I3
+ vadduwm CTR4,CTR0,I4
+ vadduwm CTR5,CTR0,I5
+ vadduwm CTR6,CTR0,I6
+ vadduwm CTR7,CTR0,I7
+
+ IF_LE(<vperm CTR0S,CTR0,CTR0,swap_mask
+ vperm CTR1,CTR1,CTR1,swap_mask
+ vperm CTR2,CTR2,CTR2,swap_mask
+ vperm CTR3,CTR3,CTR3,swap_mask
+ vperm CTR4,CTR4,CTR4,swap_mask
+ vperm CTR5,CTR5,CTR5,swap_mask
+ vperm CTR6,CTR6,CTR6,swap_mask
+ vperm CTR7,CTR7,CTR7,swap_mask>)
+
+ IF_LE(<stxvd2x CTR0SX,0,BUFFER>)
+ IF_BE(<stxvd2x CTR0X,0,BUFFER>)
+ stxvd2x CTR1X,25,BUFFER
+ stxvd2x CTR2X,26,BUFFER
+ stxvd2x CTR3X,27,BUFFER
+ stxvd2x CTR4X,28,BUFFER
+ stxvd2x CTR5X,29,BUFFER
+ stxvd2x CTR6X,30,BUFFER
+ stxvd2x CTR7X,31,BUFFER
+
+ vadduwm CTR0,CTR0,I8
+ addi BUFFER,BUFFER,0x80
+ bdnz L8x_fill_loop
+
+ ld 25,-56(SP);
+ ld 26,-48(SP);
+ ld 27,-40(SP);
+ ld 28,-32(SP);
+ ld 29,-24(SP);
+ ld 30,-16(SP);
+ ld 31,-8(SP);
+
+ clrldi BLOCKS,BLOCKS,61
+
+ Lfill_4x:
+ srdi 6,BLOCKS,2
+ cmpldi 6,0
+ beq Lfill_2x
+
+ li 8,0x10
+ li 9,0x20
+ li 10,0x30
+
+ vadduwm CTR1,CTR0,I1
+ vadduwm CTR2,CTR0,I2
+ vadduwm CTR3,CTR0,I3
+
+ IF_LE(<vperm CTR0S,CTR0,CTR0,swap_mask
+ vperm CTR1,CTR1,CTR1,swap_mask
+ vperm CTR2,CTR2,CTR2,swap_mask
+ vperm CTR3,CTR3,CTR3,swap_mask>)
+
+ IF_LE(<stxvd2x CTR0SX,0,BUFFER>)
+ IF_BE(<stxvd2x CTR0X,0,BUFFER>)
+ stxvd2x CTR1X,8,BUFFER
+ stxvd2x CTR2X,9,BUFFER
+ stxvd2x CTR3X,10,BUFFER
+
+ vadduwm CTR0,CTR0,I4
+ addi BUFFER,BUFFER,0x40
+
+ clrldi BLOCKS,BLOCKS,62
+
+ Lfill_2x:
+ srdi 6,BLOCKS,1
+ cmpldi 6,0
+ beq Lfill_1x
+
+ li 10,0x10
+
+ vadduwm CTR1,CTR0,I1
+
+ IF_LE(<vperm CTR0S,CTR0,CTR0,swap_mask
+ vperm CTR1,CTR1,CTR1,swap_mask>)
+
+ IF_LE(<stxvd2x CTR0SX,0,BUFFER>)
+ IF_BE(<stxvd2x CTR0X,0,BUFFER>)
+ stxvd2x CTR1X,10,BUFFER
+
+ vadduwm CTR0,CTR0,I2
+ addi BUFFER,BUFFER,0x20
+
+ clrldi BLOCKS,BLOCKS,63
+
+ Lfill_1x:
+ cmpldi BLOCKS,0
+ beq Lfill_done
+
+ IF_LE(<vperm CTR0S,CTR0,CTR0,swap_mask>)
+
+ IF_LE(<stxvd2x CTR0SX,0,BUFFER>)
+ IF_BE(<stxvd2x CTR0X,0,BUFFER>)
+
+ vadduwm CTR0,CTR0,I1
+
+ Lfill_done:
+ IF_LE(<vperm CTR0,CTR0,CTR0,swap_mask>)
+ stxvd2x CTR0X,0,CTR
+
+ blr
+EPILOGUE(_nettle_gcm_fill)
+
+    .data
+IF_LE(<.align 4
+.polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+    .align 4
+.swap_mask:
+ .byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7
+    .align 4
+.hidw_mask:
+ .byte 23,22,21,20,19,18,17,16,7,6,5,4,3,2,1,0
+    .align 4
+.lodw_mask:
+ .byte 31,30,29,28,27,26,25,24,15,14,13,12,11,10,9,8>)
+IF_BE(<.align 4
+.polynomial:
+ .byte 0xc2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+    .align 4
+.hidw_mask:
+ .byte 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23
+    .align 4
+.lodw_mask:
+ .byte 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31>)
-- 
2.17.1
_______________________________________________
nettle-bugs mailing list
nettle-bugs@lists.lysator.liu.se
http://lists.lysator.liu.se/mailman/listinfo/nettle-bugs
