shithub: blake2

Download patch

ref: 22a1ce9b2f81115068688989a1c325662e026b52
parent: 802c795b4340ff69e2e14cdd326565d0a8372cf4
author: Samuel Neves <sneves@dei.uc.pt>
date: Tue Jan 14 14:02:48 EST 2014

Use unaligned load/store instructions for non-speed-critical memory accesses

--- a/sse/blake2b.c
+++ b/sse/blake2b.c
@@ -284,14 +284,14 @@
   const uint64_t m14 = ( ( uint64_t * )block )[14];
   const uint64_t m15 = ( ( uint64_t * )block )[15];
 #endif
-  row1l = LOAD( &S->h[0] );
-  row1h = LOAD( &S->h[2] );
-  row2l = LOAD( &S->h[4] );
-  row2h = LOAD( &S->h[6] );
-  row3l = LOAD( &blake2b_IV[0] );
-  row3h = LOAD( &blake2b_IV[2] );
-  row4l = _mm_xor_si128( LOAD( &blake2b_IV[4] ), LOAD( &S->t[0] ) );
-  row4h = _mm_xor_si128( LOAD( &blake2b_IV[6] ), LOAD( &S->f[0] ) );
+  row1l = LOADU( &S->h[0] );
+  row1h = LOADU( &S->h[2] );
+  row2l = LOADU( &S->h[4] );
+  row2h = LOADU( &S->h[6] );
+  row3l = LOADU( &blake2b_IV[0] );
+  row3h = LOADU( &blake2b_IV[2] );
+  row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
+  row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
   ROUND( 0 );
   ROUND( 1 );
   ROUND( 2 );
@@ -306,12 +306,12 @@
   ROUND( 11 );
   row1l = _mm_xor_si128( row3l, row1l );
   row1h = _mm_xor_si128( row3h, row1h );
-  STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) );
-  STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) );
+  STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
+  STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
   row2l = _mm_xor_si128( row4l, row2l );
   row2h = _mm_xor_si128( row4h, row2h );
-  STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) );
-  STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) );
+  STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
+  STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
   return 0;
 }
 
--- a/sse/blake2s.c
+++ b/sse/blake2s.c
@@ -274,10 +274,10 @@
   const uint32_t m14 = ( ( uint32_t * )block )[14];
   const uint32_t m15 = ( ( uint32_t * )block )[15];
 #endif
-  row1 = ff0 = LOAD( &S->h[0] );
-  row2 = ff1 = LOAD( &S->h[4] );
+  row1 = ff0 = LOADU( &S->h[0] );
+  row2 = ff1 = LOADU( &S->h[4] );
   row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
-  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOAD( &S->t[0] ) );
+  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) );
   ROUND( 0 );
   ROUND( 1 );
   ROUND( 2 );
@@ -288,8 +288,8 @@
   ROUND( 7 );
   ROUND( 8 );
   ROUND( 9 );
-  STORE( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
-  STORE( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
+  STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
+  STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
   return 0;
 }
 
--