00001
00002
00003
00004
00005
00006
00007
00008 #include <botan/aes_intel.h>
00009 #include <botan/loadstor.h>
00010 #include <wmmintrin.h>
00011
00012 namespace Botan {
00013
00014 namespace {
00015
/*
* Derive the next 128-bit round key from the previous one.
*
* key           - the previous round key
* key_with_rcon - result of _mm_aeskeygenassist_si128 for this step; only
*                 its highest 32-bit word is used
*
* Also used by the AES-256 key schedule for its even-numbered round keys.
*/
__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon)
   {
   // Copy the highest 32-bit word of the assist value into all four words
   const __m128i assist =
      _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3));

   // Three rounds of "shift left one word, XOR" leave word i of key equal
   // to the XOR of the original words 0..i
   for(int pass = 0; pass != 3; ++pass)
      key = _mm_xor_si128(key, _mm_slli_si128(key, 4));

   return _mm_xor_si128(key, assist);
   }
00024
00025 void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon,
00026 u32bit out[], bool last)
00027 {
00028 __m128i key1 = *K1;
00029 __m128i key2 = *K2;
00030
00031 key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1));
00032 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
00033 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
00034 key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4));
00035 key1 = _mm_xor_si128(key1, key2_with_rcon);
00036
00037 *K1 = key1;
00038 _mm_storeu_si128((__m128i*)out, key1);
00039
00040 if(last)
00041 return;
00042
00043 key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4));
00044 key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3)));
00045
00046 *K2 = key2;
00047 out[4] = _mm_cvtsi128_si32(key2);
00048 out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4));
00049 }
00050
00051
00052
00053
00054 __m128i aes_256_key_expansion(__m128i key, __m128i key2)
00055 {
00056 __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00);
00057 key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2));
00058
00059 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
00060 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
00061 key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
00062 return _mm_xor_si128(key, key_with_rcon);
00063 }
00064
00065 }
00066
/*
* Helper for the four-blocks-at-a-time code paths: applies the two-operand
* operation OP, with round key K, to each of the variables B0, B1, B2 and
* B3. Those four variables must be declared in the scope where the macro
* is expanded. K is expanded once per block, so it should be an expression
* without side effects.
*/
#define AES_4_BLOCKS(OP, K)   \
   do                         \
      {                       \
      B0 = OP(B0, K);         \
      B1 = OP(B1, K);         \
      B2 = OP(B2, K);         \
      B3 = OP(B3, K);         \
      } while(0)

// _mm_aesenc_si128 on B0..B3 with round key K
#define AES_ENC_4_ROUNDS(K)      AES_4_BLOCKS(_mm_aesenc_si128, K)

// _mm_aesenclast_si128 on B0..B3 with round key K
#define AES_ENC_4_LAST_ROUNDS(K) AES_4_BLOCKS(_mm_aesenclast_si128, K)

// _mm_aesdec_si128 on B0..B3 with round key K
#define AES_DEC_4_ROUNDS(K)      AES_4_BLOCKS(_mm_aesdec_si128, K)

// _mm_aesdeclast_si128 on B0..B3 with round key K
#define AES_DEC_4_LAST_ROUNDS(K) AES_4_BLOCKS(_mm_aesdeclast_si128, K)
00102
00103
00104
00105
00106 void AES_128_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
00107 {
00108 const __m128i* in_mm = (const __m128i*)in;
00109 __m128i* out_mm = (__m128i*)out;
00110
00111 const __m128i* key_mm = (const __m128i*)&EK[0];
00112
00113 __m128i K0 = _mm_loadu_si128(key_mm);
00114 __m128i K1 = _mm_loadu_si128(key_mm + 1);
00115 __m128i K2 = _mm_loadu_si128(key_mm + 2);
00116 __m128i K3 = _mm_loadu_si128(key_mm + 3);
00117 __m128i K4 = _mm_loadu_si128(key_mm + 4);
00118 __m128i K5 = _mm_loadu_si128(key_mm + 5);
00119 __m128i K6 = _mm_loadu_si128(key_mm + 6);
00120 __m128i K7 = _mm_loadu_si128(key_mm + 7);
00121 __m128i K8 = _mm_loadu_si128(key_mm + 8);
00122 __m128i K9 = _mm_loadu_si128(key_mm + 9);
00123 __m128i K10 = _mm_loadu_si128(key_mm + 10);
00124
00125 while(blocks >= 4)
00126 {
00127 __m128i B0 = _mm_loadu_si128(in_mm + 0);
00128 __m128i B1 = _mm_loadu_si128(in_mm + 1);
00129 __m128i B2 = _mm_loadu_si128(in_mm + 2);
00130 __m128i B3 = _mm_loadu_si128(in_mm + 3);
00131
00132 B0 = _mm_xor_si128(B0, K0);
00133 B1 = _mm_xor_si128(B1, K0);
00134 B2 = _mm_xor_si128(B2, K0);
00135 B3 = _mm_xor_si128(B3, K0);
00136
00137 AES_ENC_4_ROUNDS(K1);
00138 AES_ENC_4_ROUNDS(K2);
00139 AES_ENC_4_ROUNDS(K3);
00140 AES_ENC_4_ROUNDS(K4);
00141 AES_ENC_4_ROUNDS(K5);
00142 AES_ENC_4_ROUNDS(K6);
00143 AES_ENC_4_ROUNDS(K7);
00144 AES_ENC_4_ROUNDS(K8);
00145 AES_ENC_4_ROUNDS(K9);
00146 AES_ENC_4_LAST_ROUNDS(K10);
00147
00148 _mm_storeu_si128(out_mm + 0, B0);
00149 _mm_storeu_si128(out_mm + 1, B1);
00150 _mm_storeu_si128(out_mm + 2, B2);
00151 _mm_storeu_si128(out_mm + 3, B3);
00152
00153 blocks -= 4;
00154 in_mm += 4;
00155 out_mm += 4;
00156 }
00157
00158 for(u32bit i = 0; i != blocks; ++i)
00159 {
00160 __m128i B = _mm_loadu_si128(in_mm + i);
00161
00162 B = _mm_xor_si128(B, K0);
00163
00164 B = _mm_aesenc_si128(B, K1);
00165 B = _mm_aesenc_si128(B, K2);
00166 B = _mm_aesenc_si128(B, K3);
00167 B = _mm_aesenc_si128(B, K4);
00168 B = _mm_aesenc_si128(B, K5);
00169 B = _mm_aesenc_si128(B, K6);
00170 B = _mm_aesenc_si128(B, K7);
00171 B = _mm_aesenc_si128(B, K8);
00172 B = _mm_aesenc_si128(B, K9);
00173 B = _mm_aesenclast_si128(B, K10);
00174
00175 _mm_storeu_si128(out_mm + i, B);
00176 }
00177 }
00178
00179
00180
00181
00182 void AES_128_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
00183 {
00184 const __m128i* in_mm = (const __m128i*)in;
00185 __m128i* out_mm = (__m128i*)out;
00186
00187 const __m128i* key_mm = (const __m128i*)&DK[0];
00188
00189 __m128i K0 = _mm_loadu_si128(key_mm);
00190 __m128i K1 = _mm_loadu_si128(key_mm + 1);
00191 __m128i K2 = _mm_loadu_si128(key_mm + 2);
00192 __m128i K3 = _mm_loadu_si128(key_mm + 3);
00193 __m128i K4 = _mm_loadu_si128(key_mm + 4);
00194 __m128i K5 = _mm_loadu_si128(key_mm + 5);
00195 __m128i K6 = _mm_loadu_si128(key_mm + 6);
00196 __m128i K7 = _mm_loadu_si128(key_mm + 7);
00197 __m128i K8 = _mm_loadu_si128(key_mm + 8);
00198 __m128i K9 = _mm_loadu_si128(key_mm + 9);
00199 __m128i K10 = _mm_loadu_si128(key_mm + 10);
00200
00201 while(blocks >= 4)
00202 {
00203 __m128i B0 = _mm_loadu_si128(in_mm + 0);
00204 __m128i B1 = _mm_loadu_si128(in_mm + 1);
00205 __m128i B2 = _mm_loadu_si128(in_mm + 2);
00206 __m128i B3 = _mm_loadu_si128(in_mm + 3);
00207
00208 B0 = _mm_xor_si128(B0, K0);
00209 B1 = _mm_xor_si128(B1, K0);
00210 B2 = _mm_xor_si128(B2, K0);
00211 B3 = _mm_xor_si128(B3, K0);
00212
00213 AES_DEC_4_ROUNDS(K1);
00214 AES_DEC_4_ROUNDS(K2);
00215 AES_DEC_4_ROUNDS(K3);
00216 AES_DEC_4_ROUNDS(K4);
00217 AES_DEC_4_ROUNDS(K5);
00218 AES_DEC_4_ROUNDS(K6);
00219 AES_DEC_4_ROUNDS(K7);
00220 AES_DEC_4_ROUNDS(K8);
00221 AES_DEC_4_ROUNDS(K9);
00222 AES_DEC_4_LAST_ROUNDS(K10);
00223
00224 _mm_storeu_si128(out_mm + 0, B0);
00225 _mm_storeu_si128(out_mm + 1, B1);
00226 _mm_storeu_si128(out_mm + 2, B2);
00227 _mm_storeu_si128(out_mm + 3, B3);
00228
00229 blocks -= 4;
00230 in_mm += 4;
00231 out_mm += 4;
00232 }
00233
00234 for(u32bit i = 0; i != blocks; ++i)
00235 {
00236 __m128i B = _mm_loadu_si128(in_mm + i);
00237
00238 B = _mm_xor_si128(B, K0);
00239
00240 B = _mm_aesdec_si128(B, K1);
00241 B = _mm_aesdec_si128(B, K2);
00242 B = _mm_aesdec_si128(B, K3);
00243 B = _mm_aesdec_si128(B, K4);
00244 B = _mm_aesdec_si128(B, K5);
00245 B = _mm_aesdec_si128(B, K6);
00246 B = _mm_aesdec_si128(B, K7);
00247 B = _mm_aesdec_si128(B, K8);
00248 B = _mm_aesdec_si128(B, K9);
00249 B = _mm_aesdeclast_si128(B, K10);
00250
00251 _mm_storeu_si128(out_mm + i, B);
00252 }
00253 }
00254
00255
00256
00257
00258 void AES_128_Intel::key_schedule(const byte key[], u32bit)
00259 {
00260 #define AES_128_key_exp(K, RCON) \
00261 aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON))
00262
00263 __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
00264 __m128i K1 = AES_128_key_exp(K0, 0x01);
00265 __m128i K2 = AES_128_key_exp(K1, 0x02);
00266 __m128i K3 = AES_128_key_exp(K2, 0x04);
00267 __m128i K4 = AES_128_key_exp(K3, 0x08);
00268 __m128i K5 = AES_128_key_exp(K4, 0x10);
00269 __m128i K6 = AES_128_key_exp(K5, 0x20);
00270 __m128i K7 = AES_128_key_exp(K6, 0x40);
00271 __m128i K8 = AES_128_key_exp(K7, 0x80);
00272 __m128i K9 = AES_128_key_exp(K8, 0x1B);
00273 __m128i K10 = AES_128_key_exp(K9, 0x36);
00274
00275 __m128i* EK_mm = (__m128i*)&EK[0];
00276 _mm_storeu_si128(EK_mm , K0);
00277 _mm_storeu_si128(EK_mm + 1, K1);
00278 _mm_storeu_si128(EK_mm + 2, K2);
00279 _mm_storeu_si128(EK_mm + 3, K3);
00280 _mm_storeu_si128(EK_mm + 4, K4);
00281 _mm_storeu_si128(EK_mm + 5, K5);
00282 _mm_storeu_si128(EK_mm + 6, K6);
00283 _mm_storeu_si128(EK_mm + 7, K7);
00284 _mm_storeu_si128(EK_mm + 8, K8);
00285 _mm_storeu_si128(EK_mm + 9, K9);
00286 _mm_storeu_si128(EK_mm + 10, K10);
00287
00288
00289
00290 __m128i* DK_mm = (__m128i*)&DK[0];
00291 _mm_storeu_si128(DK_mm , K10);
00292 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9));
00293 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8));
00294 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7));
00295 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6));
00296 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5));
00297 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4));
00298 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3));
00299 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2));
00300 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1));
00301 _mm_storeu_si128(DK_mm + 10, K0);
00302 }
00303
00304
00305
00306
00307 void AES_128_Intel::clear()
00308 {
00309 EK.clear();
00310 DK.clear();
00311 }
00312
00313
00314
00315
00316 void AES_192_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
00317 {
00318 const __m128i* in_mm = (const __m128i*)in;
00319 __m128i* out_mm = (__m128i*)out;
00320
00321 const __m128i* key_mm = (const __m128i*)&EK[0];
00322
00323 __m128i K0 = _mm_loadu_si128(key_mm);
00324 __m128i K1 = _mm_loadu_si128(key_mm + 1);
00325 __m128i K2 = _mm_loadu_si128(key_mm + 2);
00326 __m128i K3 = _mm_loadu_si128(key_mm + 3);
00327 __m128i K4 = _mm_loadu_si128(key_mm + 4);
00328 __m128i K5 = _mm_loadu_si128(key_mm + 5);
00329 __m128i K6 = _mm_loadu_si128(key_mm + 6);
00330 __m128i K7 = _mm_loadu_si128(key_mm + 7);
00331 __m128i K8 = _mm_loadu_si128(key_mm + 8);
00332 __m128i K9 = _mm_loadu_si128(key_mm + 9);
00333 __m128i K10 = _mm_loadu_si128(key_mm + 10);
00334 __m128i K11 = _mm_loadu_si128(key_mm + 11);
00335 __m128i K12 = _mm_loadu_si128(key_mm + 12);
00336
00337 while(blocks >= 4)
00338 {
00339 __m128i B0 = _mm_loadu_si128(in_mm + 0);
00340 __m128i B1 = _mm_loadu_si128(in_mm + 1);
00341 __m128i B2 = _mm_loadu_si128(in_mm + 2);
00342 __m128i B3 = _mm_loadu_si128(in_mm + 3);
00343
00344 B0 = _mm_xor_si128(B0, K0);
00345 B1 = _mm_xor_si128(B1, K0);
00346 B2 = _mm_xor_si128(B2, K0);
00347 B3 = _mm_xor_si128(B3, K0);
00348
00349 AES_ENC_4_ROUNDS(K1);
00350 AES_ENC_4_ROUNDS(K2);
00351 AES_ENC_4_ROUNDS(K3);
00352 AES_ENC_4_ROUNDS(K4);
00353 AES_ENC_4_ROUNDS(K5);
00354 AES_ENC_4_ROUNDS(K6);
00355 AES_ENC_4_ROUNDS(K7);
00356 AES_ENC_4_ROUNDS(K8);
00357 AES_ENC_4_ROUNDS(K9);
00358 AES_ENC_4_ROUNDS(K10);
00359 AES_ENC_4_ROUNDS(K11);
00360 AES_ENC_4_LAST_ROUNDS(K12);
00361
00362 _mm_storeu_si128(out_mm + 0, B0);
00363 _mm_storeu_si128(out_mm + 1, B1);
00364 _mm_storeu_si128(out_mm + 2, B2);
00365 _mm_storeu_si128(out_mm + 3, B3);
00366
00367 blocks -= 4;
00368 in_mm += 4;
00369 out_mm += 4;
00370 }
00371
00372 for(u32bit i = 0; i != blocks; ++i)
00373 {
00374 __m128i B = _mm_loadu_si128(in_mm + i);
00375
00376 B = _mm_xor_si128(B, K0);
00377
00378 B = _mm_aesenc_si128(B, K1);
00379 B = _mm_aesenc_si128(B, K2);
00380 B = _mm_aesenc_si128(B, K3);
00381 B = _mm_aesenc_si128(B, K4);
00382 B = _mm_aesenc_si128(B, K5);
00383 B = _mm_aesenc_si128(B, K6);
00384 B = _mm_aesenc_si128(B, K7);
00385 B = _mm_aesenc_si128(B, K8);
00386 B = _mm_aesenc_si128(B, K9);
00387 B = _mm_aesenc_si128(B, K10);
00388 B = _mm_aesenc_si128(B, K11);
00389 B = _mm_aesenclast_si128(B, K12);
00390
00391 _mm_storeu_si128(out_mm + i, B);
00392 }
00393 }
00394
00395
00396
00397
00398 void AES_192_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
00399 {
00400 const __m128i* in_mm = (const __m128i*)in;
00401 __m128i* out_mm = (__m128i*)out;
00402
00403 const __m128i* key_mm = (const __m128i*)&DK[0];
00404
00405 __m128i K0 = _mm_loadu_si128(key_mm);
00406 __m128i K1 = _mm_loadu_si128(key_mm + 1);
00407 __m128i K2 = _mm_loadu_si128(key_mm + 2);
00408 __m128i K3 = _mm_loadu_si128(key_mm + 3);
00409 __m128i K4 = _mm_loadu_si128(key_mm + 4);
00410 __m128i K5 = _mm_loadu_si128(key_mm + 5);
00411 __m128i K6 = _mm_loadu_si128(key_mm + 6);
00412 __m128i K7 = _mm_loadu_si128(key_mm + 7);
00413 __m128i K8 = _mm_loadu_si128(key_mm + 8);
00414 __m128i K9 = _mm_loadu_si128(key_mm + 9);
00415 __m128i K10 = _mm_loadu_si128(key_mm + 10);
00416 __m128i K11 = _mm_loadu_si128(key_mm + 11);
00417 __m128i K12 = _mm_loadu_si128(key_mm + 12);
00418
00419 while(blocks >= 4)
00420 {
00421 __m128i B0 = _mm_loadu_si128(in_mm + 0);
00422 __m128i B1 = _mm_loadu_si128(in_mm + 1);
00423 __m128i B2 = _mm_loadu_si128(in_mm + 2);
00424 __m128i B3 = _mm_loadu_si128(in_mm + 3);
00425
00426 B0 = _mm_xor_si128(B0, K0);
00427 B1 = _mm_xor_si128(B1, K0);
00428 B2 = _mm_xor_si128(B2, K0);
00429 B3 = _mm_xor_si128(B3, K0);
00430
00431 AES_DEC_4_ROUNDS(K1);
00432 AES_DEC_4_ROUNDS(K2);
00433 AES_DEC_4_ROUNDS(K3);
00434 AES_DEC_4_ROUNDS(K4);
00435 AES_DEC_4_ROUNDS(K5);
00436 AES_DEC_4_ROUNDS(K6);
00437 AES_DEC_4_ROUNDS(K7);
00438 AES_DEC_4_ROUNDS(K8);
00439 AES_DEC_4_ROUNDS(K9);
00440 AES_DEC_4_ROUNDS(K10);
00441 AES_DEC_4_ROUNDS(K11);
00442 AES_DEC_4_LAST_ROUNDS(K12);
00443
00444 _mm_storeu_si128(out_mm + 0, B0);
00445 _mm_storeu_si128(out_mm + 1, B1);
00446 _mm_storeu_si128(out_mm + 2, B2);
00447 _mm_storeu_si128(out_mm + 3, B3);
00448
00449 blocks -= 4;
00450 in_mm += 4;
00451 out_mm += 4;
00452 }
00453
00454 for(u32bit i = 0; i != blocks; ++i)
00455 {
00456 __m128i B = _mm_loadu_si128(in_mm + i);
00457
00458 B = _mm_xor_si128(B, K0);
00459
00460 B = _mm_aesdec_si128(B, K1);
00461 B = _mm_aesdec_si128(B, K2);
00462 B = _mm_aesdec_si128(B, K3);
00463 B = _mm_aesdec_si128(B, K4);
00464 B = _mm_aesdec_si128(B, K5);
00465 B = _mm_aesdec_si128(B, K6);
00466 B = _mm_aesdec_si128(B, K7);
00467 B = _mm_aesdec_si128(B, K8);
00468 B = _mm_aesdec_si128(B, K9);
00469 B = _mm_aesdec_si128(B, K10);
00470 B = _mm_aesdec_si128(B, K11);
00471 B = _mm_aesdeclast_si128(B, K12);
00472
00473 _mm_storeu_si128(out_mm + i, B);
00474 }
00475 }
00476
00477
00478
00479
00480 void AES_192_Intel::key_schedule(const byte key[], u32bit)
00481 {
00482 __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
00483 __m128i K1 = _mm_loadu_si128((const __m128i*)(key + 8));
00484 K1 = _mm_srli_si128(K1, 8);
00485
00486 load_le(&EK[0], key, 6);
00487
00488 #define AES_192_key_exp(RCON, EK_OFF) \
00489 aes_192_key_expansion(&K0, &K1, \
00490 _mm_aeskeygenassist_si128(K1, RCON), \
00491 EK + EK_OFF, EK_OFF == 48)
00492
00493 AES_192_key_exp(0x01, 6);
00494 AES_192_key_exp(0x02, 12);
00495 AES_192_key_exp(0x04, 18);
00496 AES_192_key_exp(0x08, 24);
00497 AES_192_key_exp(0x10, 30);
00498 AES_192_key_exp(0x20, 36);
00499 AES_192_key_exp(0x40, 42);
00500 AES_192_key_exp(0x80, 48);
00501
00502
00503 const __m128i* EK_mm = (const __m128i*)&EK[0];
00504 __m128i* DK_mm = (__m128i*)&DK[0];
00505 _mm_storeu_si128(DK_mm , EK_mm[12]);
00506 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(EK_mm[11]));
00507 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(EK_mm[10]));
00508 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(EK_mm[9]));
00509 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(EK_mm[8]));
00510 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(EK_mm[7]));
00511 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(EK_mm[6]));
00512 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(EK_mm[5]));
00513 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(EK_mm[4]));
00514 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(EK_mm[3]));
00515 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(EK_mm[2]));
00516 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(EK_mm[1]));
00517 _mm_storeu_si128(DK_mm + 12, EK_mm[0]);
00518 }
00519
00520
00521
00522
00523 void AES_192_Intel::clear()
00524 {
00525 EK.clear();
00526 DK.clear();
00527 }
00528
00529
00530
00531
00532 void AES_256_Intel::encrypt_n(const byte in[], byte out[], u32bit blocks) const
00533 {
00534 const __m128i* in_mm = (const __m128i*)in;
00535 __m128i* out_mm = (__m128i*)out;
00536
00537 const __m128i* key_mm = (const __m128i*)&EK[0];
00538
00539 __m128i K0 = _mm_loadu_si128(key_mm);
00540 __m128i K1 = _mm_loadu_si128(key_mm + 1);
00541 __m128i K2 = _mm_loadu_si128(key_mm + 2);
00542 __m128i K3 = _mm_loadu_si128(key_mm + 3);
00543 __m128i K4 = _mm_loadu_si128(key_mm + 4);
00544 __m128i K5 = _mm_loadu_si128(key_mm + 5);
00545 __m128i K6 = _mm_loadu_si128(key_mm + 6);
00546 __m128i K7 = _mm_loadu_si128(key_mm + 7);
00547 __m128i K8 = _mm_loadu_si128(key_mm + 8);
00548 __m128i K9 = _mm_loadu_si128(key_mm + 9);
00549 __m128i K10 = _mm_loadu_si128(key_mm + 10);
00550 __m128i K11 = _mm_loadu_si128(key_mm + 11);
00551 __m128i K12 = _mm_loadu_si128(key_mm + 12);
00552 __m128i K13 = _mm_loadu_si128(key_mm + 13);
00553 __m128i K14 = _mm_loadu_si128(key_mm + 14);
00554
00555 while(blocks >= 4)
00556 {
00557 __m128i B0 = _mm_loadu_si128(in_mm + 0);
00558 __m128i B1 = _mm_loadu_si128(in_mm + 1);
00559 __m128i B2 = _mm_loadu_si128(in_mm + 2);
00560 __m128i B3 = _mm_loadu_si128(in_mm + 3);
00561
00562 B0 = _mm_xor_si128(B0, K0);
00563 B1 = _mm_xor_si128(B1, K0);
00564 B2 = _mm_xor_si128(B2, K0);
00565 B3 = _mm_xor_si128(B3, K0);
00566
00567 AES_ENC_4_ROUNDS(K1);
00568 AES_ENC_4_ROUNDS(K2);
00569 AES_ENC_4_ROUNDS(K3);
00570 AES_ENC_4_ROUNDS(K4);
00571 AES_ENC_4_ROUNDS(K5);
00572 AES_ENC_4_ROUNDS(K6);
00573 AES_ENC_4_ROUNDS(K7);
00574 AES_ENC_4_ROUNDS(K8);
00575 AES_ENC_4_ROUNDS(K9);
00576 AES_ENC_4_ROUNDS(K10);
00577 AES_ENC_4_ROUNDS(K11);
00578 AES_ENC_4_ROUNDS(K12);
00579 AES_ENC_4_ROUNDS(K13);
00580 AES_ENC_4_LAST_ROUNDS(K14);
00581
00582 _mm_storeu_si128(out_mm + 0, B0);
00583 _mm_storeu_si128(out_mm + 1, B1);
00584 _mm_storeu_si128(out_mm + 2, B2);
00585 _mm_storeu_si128(out_mm + 3, B3);
00586
00587 blocks -= 4;
00588 in_mm += 4;
00589 out_mm += 4;
00590 }
00591
00592 for(u32bit i = 0; i != blocks; ++i)
00593 {
00594 __m128i B = _mm_loadu_si128(in_mm + i);
00595
00596 B = _mm_xor_si128(B, K0);
00597
00598 B = _mm_aesenc_si128(B, K1);
00599 B = _mm_aesenc_si128(B, K2);
00600 B = _mm_aesenc_si128(B, K3);
00601 B = _mm_aesenc_si128(B, K4);
00602 B = _mm_aesenc_si128(B, K5);
00603 B = _mm_aesenc_si128(B, K6);
00604 B = _mm_aesenc_si128(B, K7);
00605 B = _mm_aesenc_si128(B, K8);
00606 B = _mm_aesenc_si128(B, K9);
00607 B = _mm_aesenc_si128(B, K10);
00608 B = _mm_aesenc_si128(B, K11);
00609 B = _mm_aesenc_si128(B, K12);
00610 B = _mm_aesenc_si128(B, K13);
00611 B = _mm_aesenclast_si128(B, K14);
00612
00613 _mm_storeu_si128(out_mm + i, B);
00614 }
00615 }
00616
00617
00618
00619
00620 void AES_256_Intel::decrypt_n(const byte in[], byte out[], u32bit blocks) const
00621 {
00622 const __m128i* in_mm = (const __m128i*)in;
00623 __m128i* out_mm = (__m128i*)out;
00624
00625 const __m128i* key_mm = (const __m128i*)&DK[0];
00626
00627 __m128i K0 = _mm_loadu_si128(key_mm);
00628 __m128i K1 = _mm_loadu_si128(key_mm + 1);
00629 __m128i K2 = _mm_loadu_si128(key_mm + 2);
00630 __m128i K3 = _mm_loadu_si128(key_mm + 3);
00631 __m128i K4 = _mm_loadu_si128(key_mm + 4);
00632 __m128i K5 = _mm_loadu_si128(key_mm + 5);
00633 __m128i K6 = _mm_loadu_si128(key_mm + 6);
00634 __m128i K7 = _mm_loadu_si128(key_mm + 7);
00635 __m128i K8 = _mm_loadu_si128(key_mm + 8);
00636 __m128i K9 = _mm_loadu_si128(key_mm + 9);
00637 __m128i K10 = _mm_loadu_si128(key_mm + 10);
00638 __m128i K11 = _mm_loadu_si128(key_mm + 11);
00639 __m128i K12 = _mm_loadu_si128(key_mm + 12);
00640 __m128i K13 = _mm_loadu_si128(key_mm + 13);
00641 __m128i K14 = _mm_loadu_si128(key_mm + 14);
00642
00643 while(blocks >= 4)
00644 {
00645 __m128i B0 = _mm_loadu_si128(in_mm + 0);
00646 __m128i B1 = _mm_loadu_si128(in_mm + 1);
00647 __m128i B2 = _mm_loadu_si128(in_mm + 2);
00648 __m128i B3 = _mm_loadu_si128(in_mm + 3);
00649
00650 B0 = _mm_xor_si128(B0, K0);
00651 B1 = _mm_xor_si128(B1, K0);
00652 B2 = _mm_xor_si128(B2, K0);
00653 B3 = _mm_xor_si128(B3, K0);
00654
00655 AES_DEC_4_ROUNDS(K1);
00656 AES_DEC_4_ROUNDS(K2);
00657 AES_DEC_4_ROUNDS(K3);
00658 AES_DEC_4_ROUNDS(K4);
00659 AES_DEC_4_ROUNDS(K5);
00660 AES_DEC_4_ROUNDS(K6);
00661 AES_DEC_4_ROUNDS(K7);
00662 AES_DEC_4_ROUNDS(K8);
00663 AES_DEC_4_ROUNDS(K9);
00664 AES_DEC_4_ROUNDS(K10);
00665 AES_DEC_4_ROUNDS(K11);
00666 AES_DEC_4_ROUNDS(K12);
00667 AES_DEC_4_ROUNDS(K13);
00668 AES_DEC_4_LAST_ROUNDS(K14);
00669
00670 _mm_storeu_si128(out_mm + 0, B0);
00671 _mm_storeu_si128(out_mm + 1, B1);
00672 _mm_storeu_si128(out_mm + 2, B2);
00673 _mm_storeu_si128(out_mm + 3, B3);
00674
00675 blocks -= 4;
00676 in_mm += 4;
00677 out_mm += 4;
00678 }
00679
00680 for(u32bit i = 0; i != blocks; ++i)
00681 {
00682 __m128i B = _mm_loadu_si128(in_mm + i);
00683
00684 B = _mm_xor_si128(B, K0);
00685
00686 B = _mm_aesdec_si128(B, K1);
00687 B = _mm_aesdec_si128(B, K2);
00688 B = _mm_aesdec_si128(B, K3);
00689 B = _mm_aesdec_si128(B, K4);
00690 B = _mm_aesdec_si128(B, K5);
00691 B = _mm_aesdec_si128(B, K6);
00692 B = _mm_aesdec_si128(B, K7);
00693 B = _mm_aesdec_si128(B, K8);
00694 B = _mm_aesdec_si128(B, K9);
00695 B = _mm_aesdec_si128(B, K10);
00696 B = _mm_aesdec_si128(B, K11);
00697 B = _mm_aesdec_si128(B, K12);
00698 B = _mm_aesdec_si128(B, K13);
00699 B = _mm_aesdeclast_si128(B, K14);
00700
00701 _mm_storeu_si128(out_mm + i, B);
00702 }
00703 }
00704
00705
00706
00707
00708 void AES_256_Intel::key_schedule(const byte key[], u32bit)
00709 {
00710 __m128i K0 = _mm_loadu_si128((const __m128i*)(key));
00711 __m128i K1 = _mm_loadu_si128((const __m128i*)(key + 16));
00712
00713 __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01));
00714 __m128i K3 = aes_256_key_expansion(K1, K2);
00715
00716 __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02));
00717 __m128i K5 = aes_256_key_expansion(K3, K4);
00718
00719 __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04));
00720 __m128i K7 = aes_256_key_expansion(K5, K6);
00721
00722 __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08));
00723 __m128i K9 = aes_256_key_expansion(K7, K8);
00724
00725 __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10));
00726 __m128i K11 = aes_256_key_expansion(K9, K10);
00727
00728 __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20));
00729 __m128i K13 = aes_256_key_expansion(K11, K12);
00730
00731 __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40));
00732
00733 __m128i* EK_mm = (__m128i*)&EK[0];
00734 _mm_storeu_si128(EK_mm , K0);
00735 _mm_storeu_si128(EK_mm + 1, K1);
00736 _mm_storeu_si128(EK_mm + 2, K2);
00737 _mm_storeu_si128(EK_mm + 3, K3);
00738 _mm_storeu_si128(EK_mm + 4, K4);
00739 _mm_storeu_si128(EK_mm + 5, K5);
00740 _mm_storeu_si128(EK_mm + 6, K6);
00741 _mm_storeu_si128(EK_mm + 7, K7);
00742 _mm_storeu_si128(EK_mm + 8, K8);
00743 _mm_storeu_si128(EK_mm + 9, K9);
00744 _mm_storeu_si128(EK_mm + 10, K10);
00745 _mm_storeu_si128(EK_mm + 11, K11);
00746 _mm_storeu_si128(EK_mm + 12, K12);
00747 _mm_storeu_si128(EK_mm + 13, K13);
00748 _mm_storeu_si128(EK_mm + 14, K14);
00749
00750
00751
00752 __m128i* DK_mm = (__m128i*)&DK[0];
00753 _mm_storeu_si128(DK_mm , K14);
00754 _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13));
00755 _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12));
00756 _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11));
00757 _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10));
00758 _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9));
00759 _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8));
00760 _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7));
00761 _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6));
00762 _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5));
00763 _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4));
00764 _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3));
00765 _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2));
00766 _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1));
00767 _mm_storeu_si128(DK_mm + 14, K0);
00768 }
00769
00770
00771
00772
00773 void AES_256_Intel::clear()
00774 {
00775 EK.clear();
00776 DK.clear();
00777 }
00778
00779 }