sha256_sse4.cpp
1// Copyright (c) 2017 The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4//
5// This is a translation to GCC extended asm syntax from YASM code by Intel
6// (available at the bottom of this file).
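//
// Usage sketch (illustrative only, not part of the original file): Transform
// applies the SHA-256 compression function to `blocks` consecutive 64-byte
// chunks, updating the eight 32-bit state words pointed to by `s` in place.
// The caller below and its initial-state constants (the standard SHA-256 IV)
// are shown purely as an example:
//
//   uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//                        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
//   uint8_t block[64] = {/* one padded 64-byte message block */};
//   sha256_sse4::Transform(state, block, 1);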
7
8#include <cstdint>
9#include <cstdlib>
10
11#if defined(__x86_64__) || defined(__amd64__)
12
13namespace sha256_sse4 {
14void Transform(uint32_t *s, const uint8_t *chunk, size_t blocks) {
15 static const uint32_t K256 alignas(16)[] = {
16 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
17 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
18 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
19 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
20 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
21 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
22 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
23 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
24 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
25 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
26 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
27 };
28 static const uint32_t FLIP_MASK alignas(16)[] = {0x00010203, 0x04050607,
29 0x08090a0b, 0x0c0d0e0f};
30 static const uint32_t SHUF_00BA alignas(16)[] = {0x03020100, 0x0b0a0908,
31 0xffffffff, 0xffffffff};
32 static const uint32_t SHUF_DC00 alignas(16)[] = {0xffffffff, 0xffffffff,
33 0x03020100, 0x0b0a0908};
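    // Notes on the tables above (derived from the YASM original reproduced at
    // the bottom of this file): K256 holds the 64 SHA-256 round constants;
    // FLIP_MASK is a pshufb mask that byte-swaps each 32-bit word of the
    // big-endian message; SHUF_00BA and SHUF_DC00 shuffle the packed sigma1
    // results from xBxA -> 00BA and xDxC -> DC00 during message scheduling.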
34 uint32_t a, b, c, d, f, g, h, y0, y1, y2;
35 uint64_t tbl;
36 uint64_t inp_end, inp;
37 uint32_t xfer alignas(16)[4];
38
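    // Operand map for the asm below (taken from the constraint lists at the
    // end of the statement): %0 = s, %1 = chunk, %2 = blocks (its low half
    // %k2 doubles as the working variable e), %3..%9 = a, b, c, d, f, g, h,
    // %10..%12 = y0, y1, y2, %13 = tbl, %14 = inp_end, %15 = inp, %16 = xfer,
    // and %17..%20 = K256, FLIP_MASK, SHUF_00BA, SHUF_DC00. Lloop0 iterates
    // once per 64-byte block, Lloop1 performs the 48 scheduled rounds
    // (3 x 16), and Lloop2 the final 16 rounds (2 x 8).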
39 __asm__ __volatile__(
40 "shl $0x6,%2;"
41 "je Ldone_hash_%=;"
42 "add %1,%2;"
43 "mov %2,%14;"
44 "mov (%0),%3;"
45 "mov 0x4(%0),%4;"
46 "mov 0x8(%0),%5;"
47 "mov 0xc(%0),%6;"
48 "mov 0x10(%0),%k2;"
49 "mov 0x14(%0),%7;"
50 "mov 0x18(%0),%8;"
51 "mov 0x1c(%0),%9;"
52 "movdqa %18,%%xmm12;"
53 "movdqa %19,%%xmm10;"
54 "movdqa %20,%%xmm11;"
55
56 "Lloop0_%=:"
57 "lea %17,%13;"
58 "movdqu (%1),%%xmm4;"
59 "pshufb %%xmm12,%%xmm4;"
60 "movdqu 0x10(%1),%%xmm5;"
61 "pshufb %%xmm12,%%xmm5;"
62 "movdqu 0x20(%1),%%xmm6;"
63 "pshufb %%xmm12,%%xmm6;"
64 "movdqu 0x30(%1),%%xmm7;"
65 "pshufb %%xmm12,%%xmm7;"
66 "mov %1,%15;"
67 "mov $3,%1;"
68
69 "Lloop1_%=:"
70 "movdqa 0x0(%13),%%xmm9;"
71 "paddd %%xmm4,%%xmm9;"
72 "movdqa %%xmm9,%16;"
73 "movdqa %%xmm7,%%xmm0;"
74 "mov %k2,%10;"
75 "ror $0xe,%10;"
76 "mov %3,%11;"
77 "palignr $0x4,%%xmm6,%%xmm0;"
78 "ror $0x9,%11;"
79 "xor %k2,%10;"
80 "mov %7,%12;"
81 "ror $0x5,%10;"
82 "movdqa %%xmm5,%%xmm1;"
83 "xor %3,%11;"
84 "xor %8,%12;"
85 "paddd %%xmm4,%%xmm0;"
86 "xor %k2,%10;"
87 "and %k2,%12;"
88 "ror $0xb,%11;"
89 "palignr $0x4,%%xmm4,%%xmm1;"
90 "xor %3,%11;"
91 "ror $0x6,%10;"
92 "xor %8,%12;"
93 "movdqa %%xmm1,%%xmm2;"
94 "ror $0x2,%11;"
95 "add %10,%12;"
96 "add %16,%12;"
97 "movdqa %%xmm1,%%xmm3;"
98 "mov %3,%10;"
99 "add %12,%9;"
100 "mov %3,%12;"
101 "pslld $0x19,%%xmm1;"
102 "or %5,%10;"
103 "add %9,%6;"
104 "and %5,%12;"
105 "psrld $0x7,%%xmm2;"
106 "and %4,%10;"
107 "add %11,%9;"
108 "por %%xmm2,%%xmm1;"
109 "or %12,%10;"
110 "add %10,%9;"
111 "movdqa %%xmm3,%%xmm2;"
112 "mov %6,%10;"
113 "mov %9,%11;"
114 "movdqa %%xmm3,%%xmm8;"
115 "ror $0xe,%10;"
116 "xor %6,%10;"
117 "mov %k2,%12;"
118 "ror $0x9,%11;"
119 "pslld $0xe,%%xmm3;"
120 "xor %9,%11;"
121 "ror $0x5,%10;"
122 "xor %7,%12;"
123 "psrld $0x12,%%xmm2;"
124 "ror $0xb,%11;"
125 "xor %6,%10;"
126 "and %6,%12;"
127 "ror $0x6,%10;"
128 "pxor %%xmm3,%%xmm1;"
129 "xor %9,%11;"
130 "xor %7,%12;"
131 "psrld $0x3,%%xmm8;"
132 "add %10,%12;"
133 "add 4+%16,%12;"
134 "ror $0x2,%11;"
135 "pxor %%xmm2,%%xmm1;"
136 "mov %9,%10;"
137 "add %12,%8;"
138 "mov %9,%12;"
139 "pxor %%xmm8,%%xmm1;"
140 "or %4,%10;"
141 "add %8,%5;"
142 "and %4,%12;"
143 "pshufd $0xfa,%%xmm7,%%xmm2;"
144 "and %3,%10;"
145 "add %11,%8;"
146 "paddd %%xmm1,%%xmm0;"
147 "or %12,%10;"
148 "add %10,%8;"
149 "movdqa %%xmm2,%%xmm3;"
150 "mov %5,%10;"
151 "mov %8,%11;"
152 "ror $0xe,%10;"
153 "movdqa %%xmm2,%%xmm8;"
154 "xor %5,%10;"
155 "ror $0x9,%11;"
156 "mov %6,%12;"
157 "xor %8,%11;"
158 "ror $0x5,%10;"
159 "psrlq $0x11,%%xmm2;"
160 "xor %k2,%12;"
161 "psrlq $0x13,%%xmm3;"
162 "xor %5,%10;"
163 "and %5,%12;"
164 "psrld $0xa,%%xmm8;"
165 "ror $0xb,%11;"
166 "xor %8,%11;"
167 "xor %k2,%12;"
168 "ror $0x6,%10;"
169 "pxor %%xmm3,%%xmm2;"
170 "add %10,%12;"
171 "ror $0x2,%11;"
172 "add 8+%16,%12;"
173 "pxor %%xmm2,%%xmm8;"
174 "mov %8,%10;"
175 "add %12,%7;"
176 "mov %8,%12;"
177 "pshufb %%xmm10,%%xmm8;"
178 "or %3,%10;"
179 "add %7,%4;"
180 "and %3,%12;"
181 "paddd %%xmm8,%%xmm0;"
182 "and %9,%10;"
183 "add %11,%7;"
184 "pshufd $0x50,%%xmm0,%%xmm2;"
185 "or %12,%10;"
186 "add %10,%7;"
187 "movdqa %%xmm2,%%xmm3;"
188 "mov %4,%10;"
189 "ror $0xe,%10;"
190 "mov %7,%11;"
191 "movdqa %%xmm2,%%xmm4;"
192 "ror $0x9,%11;"
193 "xor %4,%10;"
194 "mov %5,%12;"
195 "ror $0x5,%10;"
196 "psrlq $0x11,%%xmm2;"
197 "xor %7,%11;"
198 "xor %6,%12;"
199 "psrlq $0x13,%%xmm3;"
200 "xor %4,%10;"
201 "and %4,%12;"
202 "ror $0xb,%11;"
203 "psrld $0xa,%%xmm4;"
204 "xor %7,%11;"
205 "ror $0x6,%10;"
206 "xor %6,%12;"
207 "pxor %%xmm3,%%xmm2;"
208 "ror $0x2,%11;"
209 "add %10,%12;"
210 "add 12+%16,%12;"
211 "pxor %%xmm2,%%xmm4;"
212 "mov %7,%10;"
213 "add %12,%k2;"
214 "mov %7,%12;"
215 "pshufb %%xmm11,%%xmm4;"
216 "or %9,%10;"
217 "add %k2,%3;"
218 "and %9,%12;"
219 "paddd %%xmm0,%%xmm4;"
220 "and %8,%10;"
221 "add %11,%k2;"
222 "or %12,%10;"
223 "add %10,%k2;"
224 "movdqa 0x10(%13),%%xmm9;"
225 "paddd %%xmm5,%%xmm9;"
226 "movdqa %%xmm9,%16;"
227 "movdqa %%xmm4,%%xmm0;"
228 "mov %3,%10;"
229 "ror $0xe,%10;"
230 "mov %k2,%11;"
231 "palignr $0x4,%%xmm7,%%xmm0;"
232 "ror $0x9,%11;"
233 "xor %3,%10;"
234 "mov %4,%12;"
235 "ror $0x5,%10;"
236 "movdqa %%xmm6,%%xmm1;"
237 "xor %k2,%11;"
238 "xor %5,%12;"
239 "paddd %%xmm5,%%xmm0;"
240 "xor %3,%10;"
241 "and %3,%12;"
242 "ror $0xb,%11;"
243 "palignr $0x4,%%xmm5,%%xmm1;"
244 "xor %k2,%11;"
245 "ror $0x6,%10;"
246 "xor %5,%12;"
247 "movdqa %%xmm1,%%xmm2;"
248 "ror $0x2,%11;"
249 "add %10,%12;"
250 "add %16,%12;"
251 "movdqa %%xmm1,%%xmm3;"
252 "mov %k2,%10;"
253 "add %12,%6;"
254 "mov %k2,%12;"
255 "pslld $0x19,%%xmm1;"
256 "or %8,%10;"
257 "add %6,%9;"
258 "and %8,%12;"
259 "psrld $0x7,%%xmm2;"
260 "and %7,%10;"
261 "add %11,%6;"
262 "por %%xmm2,%%xmm1;"
263 "or %12,%10;"
264 "add %10,%6;"
265 "movdqa %%xmm3,%%xmm2;"
266 "mov %9,%10;"
267 "mov %6,%11;"
268 "movdqa %%xmm3,%%xmm8;"
269 "ror $0xe,%10;"
270 "xor %9,%10;"
271 "mov %3,%12;"
272 "ror $0x9,%11;"
273 "pslld $0xe,%%xmm3;"
274 "xor %6,%11;"
275 "ror $0x5,%10;"
276 "xor %4,%12;"
277 "psrld $0x12,%%xmm2;"
278 "ror $0xb,%11;"
279 "xor %9,%10;"
280 "and %9,%12;"
281 "ror $0x6,%10;"
282 "pxor %%xmm3,%%xmm1;"
283 "xor %6,%11;"
284 "xor %4,%12;"
285 "psrld $0x3,%%xmm8;"
286 "add %10,%12;"
287 "add 4+%16,%12;"
288 "ror $0x2,%11;"
289 "pxor %%xmm2,%%xmm1;"
290 "mov %6,%10;"
291 "add %12,%5;"
292 "mov %6,%12;"
293 "pxor %%xmm8,%%xmm1;"
294 "or %7,%10;"
295 "add %5,%8;"
296 "and %7,%12;"
297 "pshufd $0xfa,%%xmm4,%%xmm2;"
298 "and %k2,%10;"
299 "add %11,%5;"
300 "paddd %%xmm1,%%xmm0;"
301 "or %12,%10;"
302 "add %10,%5;"
303 "movdqa %%xmm2,%%xmm3;"
304 "mov %8,%10;"
305 "mov %5,%11;"
306 "ror $0xe,%10;"
307 "movdqa %%xmm2,%%xmm8;"
308 "xor %8,%10;"
309 "ror $0x9,%11;"
310 "mov %9,%12;"
311 "xor %5,%11;"
312 "ror $0x5,%10;"
313 "psrlq $0x11,%%xmm2;"
314 "xor %3,%12;"
315 "psrlq $0x13,%%xmm3;"
316 "xor %8,%10;"
317 "and %8,%12;"
318 "psrld $0xa,%%xmm8;"
319 "ror $0xb,%11;"
320 "xor %5,%11;"
321 "xor %3,%12;"
322 "ror $0x6,%10;"
323 "pxor %%xmm3,%%xmm2;"
324 "add %10,%12;"
325 "ror $0x2,%11;"
326 "add 8+%16,%12;"
327 "pxor %%xmm2,%%xmm8;"
328 "mov %5,%10;"
329 "add %12,%4;"
330 "mov %5,%12;"
331 "pshufb %%xmm10,%%xmm8;"
332 "or %k2,%10;"
333 "add %4,%7;"
334 "and %k2,%12;"
335 "paddd %%xmm8,%%xmm0;"
336 "and %6,%10;"
337 "add %11,%4;"
338 "pshufd $0x50,%%xmm0,%%xmm2;"
339 "or %12,%10;"
340 "add %10,%4;"
341 "movdqa %%xmm2,%%xmm3;"
342 "mov %7,%10;"
343 "ror $0xe,%10;"
344 "mov %4,%11;"
345 "movdqa %%xmm2,%%xmm5;"
346 "ror $0x9,%11;"
347 "xor %7,%10;"
348 "mov %8,%12;"
349 "ror $0x5,%10;"
350 "psrlq $0x11,%%xmm2;"
351 "xor %4,%11;"
352 "xor %9,%12;"
353 "psrlq $0x13,%%xmm3;"
354 "xor %7,%10;"
355 "and %7,%12;"
356 "ror $0xb,%11;"
357 "psrld $0xa,%%xmm5;"
358 "xor %4,%11;"
359 "ror $0x6,%10;"
360 "xor %9,%12;"
361 "pxor %%xmm3,%%xmm2;"
362 "ror $0x2,%11;"
363 "add %10,%12;"
364 "add 12+%16,%12;"
365 "pxor %%xmm2,%%xmm5;"
366 "mov %4,%10;"
367 "add %12,%3;"
368 "mov %4,%12;"
369 "pshufb %%xmm11,%%xmm5;"
370 "or %6,%10;"
371 "add %3,%k2;"
372 "and %6,%12;"
373 "paddd %%xmm0,%%xmm5;"
374 "and %5,%10;"
375 "add %11,%3;"
376 "or %12,%10;"
377 "add %10,%3;"
378 "movdqa 0x20(%13),%%xmm9;"
379 "paddd %%xmm6,%%xmm9;"
380 "movdqa %%xmm9,%16;"
381 "movdqa %%xmm5,%%xmm0;"
382 "mov %k2,%10;"
383 "ror $0xe,%10;"
384 "mov %3,%11;"
385 "palignr $0x4,%%xmm4,%%xmm0;"
386 "ror $0x9,%11;"
387 "xor %k2,%10;"
388 "mov %7,%12;"
389 "ror $0x5,%10;"
390 "movdqa %%xmm7,%%xmm1;"
391 "xor %3,%11;"
392 "xor %8,%12;"
393 "paddd %%xmm6,%%xmm0;"
394 "xor %k2,%10;"
395 "and %k2,%12;"
396 "ror $0xb,%11;"
397 "palignr $0x4,%%xmm6,%%xmm1;"
398 "xor %3,%11;"
399 "ror $0x6,%10;"
400 "xor %8,%12;"
401 "movdqa %%xmm1,%%xmm2;"
402 "ror $0x2,%11;"
403 "add %10,%12;"
404 "add %16,%12;"
405 "movdqa %%xmm1,%%xmm3;"
406 "mov %3,%10;"
407 "add %12,%9;"
408 "mov %3,%12;"
409 "pslld $0x19,%%xmm1;"
410 "or %5,%10;"
411 "add %9,%6;"
412 "and %5,%12;"
413 "psrld $0x7,%%xmm2;"
414 "and %4,%10;"
415 "add %11,%9;"
416 "por %%xmm2,%%xmm1;"
417 "or %12,%10;"
418 "add %10,%9;"
419 "movdqa %%xmm3,%%xmm2;"
420 "mov %6,%10;"
421 "mov %9,%11;"
422 "movdqa %%xmm3,%%xmm8;"
423 "ror $0xe,%10;"
424 "xor %6,%10;"
425 "mov %k2,%12;"
426 "ror $0x9,%11;"
427 "pslld $0xe,%%xmm3;"
428 "xor %9,%11;"
429 "ror $0x5,%10;"
430 "xor %7,%12;"
431 "psrld $0x12,%%xmm2;"
432 "ror $0xb,%11;"
433 "xor %6,%10;"
434 "and %6,%12;"
435 "ror $0x6,%10;"
436 "pxor %%xmm3,%%xmm1;"
437 "xor %9,%11;"
438 "xor %7,%12;"
439 "psrld $0x3,%%xmm8;"
440 "add %10,%12;"
441 "add 4+%16,%12;"
442 "ror $0x2,%11;"
443 "pxor %%xmm2,%%xmm1;"
444 "mov %9,%10;"
445 "add %12,%8;"
446 "mov %9,%12;"
447 "pxor %%xmm8,%%xmm1;"
448 "or %4,%10;"
449 "add %8,%5;"
450 "and %4,%12;"
451 "pshufd $0xfa,%%xmm5,%%xmm2;"
452 "and %3,%10;"
453 "add %11,%8;"
454 "paddd %%xmm1,%%xmm0;"
455 "or %12,%10;"
456 "add %10,%8;"
457 "movdqa %%xmm2,%%xmm3;"
458 "mov %5,%10;"
459 "mov %8,%11;"
460 "ror $0xe,%10;"
461 "movdqa %%xmm2,%%xmm8;"
462 "xor %5,%10;"
463 "ror $0x9,%11;"
464 "mov %6,%12;"
465 "xor %8,%11;"
466 "ror $0x5,%10;"
467 "psrlq $0x11,%%xmm2;"
468 "xor %k2,%12;"
469 "psrlq $0x13,%%xmm3;"
470 "xor %5,%10;"
471 "and %5,%12;"
472 "psrld $0xa,%%xmm8;"
473 "ror $0xb,%11;"
474 "xor %8,%11;"
475 "xor %k2,%12;"
476 "ror $0x6,%10;"
477 "pxor %%xmm3,%%xmm2;"
478 "add %10,%12;"
479 "ror $0x2,%11;"
480 "add 8+%16,%12;"
481 "pxor %%xmm2,%%xmm8;"
482 "mov %8,%10;"
483 "add %12,%7;"
484 "mov %8,%12;"
485 "pshufb %%xmm10,%%xmm8;"
486 "or %3,%10;"
487 "add %7,%4;"
488 "and %3,%12;"
489 "paddd %%xmm8,%%xmm0;"
490 "and %9,%10;"
491 "add %11,%7;"
492 "pshufd $0x50,%%xmm0,%%xmm2;"
493 "or %12,%10;"
494 "add %10,%7;"
495 "movdqa %%xmm2,%%xmm3;"
496 "mov %4,%10;"
497 "ror $0xe,%10;"
498 "mov %7,%11;"
499 "movdqa %%xmm2,%%xmm6;"
500 "ror $0x9,%11;"
501 "xor %4,%10;"
502 "mov %5,%12;"
503 "ror $0x5,%10;"
504 "psrlq $0x11,%%xmm2;"
505 "xor %7,%11;"
506 "xor %6,%12;"
507 "psrlq $0x13,%%xmm3;"
508 "xor %4,%10;"
509 "and %4,%12;"
510 "ror $0xb,%11;"
511 "psrld $0xa,%%xmm6;"
512 "xor %7,%11;"
513 "ror $0x6,%10;"
514 "xor %6,%12;"
515 "pxor %%xmm3,%%xmm2;"
516 "ror $0x2,%11;"
517 "add %10,%12;"
518 "add 12+%16,%12;"
519 "pxor %%xmm2,%%xmm6;"
520 "mov %7,%10;"
521 "add %12,%k2;"
522 "mov %7,%12;"
523 "pshufb %%xmm11,%%xmm6;"
524 "or %9,%10;"
525 "add %k2,%3;"
526 "and %9,%12;"
527 "paddd %%xmm0,%%xmm6;"
528 "and %8,%10;"
529 "add %11,%k2;"
530 "or %12,%10;"
531 "add %10,%k2;"
532 "movdqa 0x30(%13),%%xmm9;"
533 "paddd %%xmm7,%%xmm9;"
534 "movdqa %%xmm9,%16;"
535 "add $0x40,%13;"
536 "movdqa %%xmm6,%%xmm0;"
537 "mov %3,%10;"
538 "ror $0xe,%10;"
539 "mov %k2,%11;"
540 "palignr $0x4,%%xmm5,%%xmm0;"
541 "ror $0x9,%11;"
542 "xor %3,%10;"
543 "mov %4,%12;"
544 "ror $0x5,%10;"
545 "movdqa %%xmm4,%%xmm1;"
546 "xor %k2,%11;"
547 "xor %5,%12;"
548 "paddd %%xmm7,%%xmm0;"
549 "xor %3,%10;"
550 "and %3,%12;"
551 "ror $0xb,%11;"
552 "palignr $0x4,%%xmm7,%%xmm1;"
553 "xor %k2,%11;"
554 "ror $0x6,%10;"
555 "xor %5,%12;"
556 "movdqa %%xmm1,%%xmm2;"
557 "ror $0x2,%11;"
558 "add %10,%12;"
559 "add %16,%12;"
560 "movdqa %%xmm1,%%xmm3;"
561 "mov %k2,%10;"
562 "add %12,%6;"
563 "mov %k2,%12;"
564 "pslld $0x19,%%xmm1;"
565 "or %8,%10;"
566 "add %6,%9;"
567 "and %8,%12;"
568 "psrld $0x7,%%xmm2;"
569 "and %7,%10;"
570 "add %11,%6;"
571 "por %%xmm2,%%xmm1;"
572 "or %12,%10;"
573 "add %10,%6;"
574 "movdqa %%xmm3,%%xmm2;"
575 "mov %9,%10;"
576 "mov %6,%11;"
577 "movdqa %%xmm3,%%xmm8;"
578 "ror $0xe,%10;"
579 "xor %9,%10;"
580 "mov %3,%12;"
581 "ror $0x9,%11;"
582 "pslld $0xe,%%xmm3;"
583 "xor %6,%11;"
584 "ror $0x5,%10;"
585 "xor %4,%12;"
586 "psrld $0x12,%%xmm2;"
587 "ror $0xb,%11;"
588 "xor %9,%10;"
589 "and %9,%12;"
590 "ror $0x6,%10;"
591 "pxor %%xmm3,%%xmm1;"
592 "xor %6,%11;"
593 "xor %4,%12;"
594 "psrld $0x3,%%xmm8;"
595 "add %10,%12;"
596 "add 4+%16,%12;"
597 "ror $0x2,%11;"
598 "pxor %%xmm2,%%xmm1;"
599 "mov %6,%10;"
600 "add %12,%5;"
601 "mov %6,%12;"
602 "pxor %%xmm8,%%xmm1;"
603 "or %7,%10;"
604 "add %5,%8;"
605 "and %7,%12;"
606 "pshufd $0xfa,%%xmm6,%%xmm2;"
607 "and %k2,%10;"
608 "add %11,%5;"
609 "paddd %%xmm1,%%xmm0;"
610 "or %12,%10;"
611 "add %10,%5;"
612 "movdqa %%xmm2,%%xmm3;"
613 "mov %8,%10;"
614 "mov %5,%11;"
615 "ror $0xe,%10;"
616 "movdqa %%xmm2,%%xmm8;"
617 "xor %8,%10;"
618 "ror $0x9,%11;"
619 "mov %9,%12;"
620 "xor %5,%11;"
621 "ror $0x5,%10;"
622 "psrlq $0x11,%%xmm2;"
623 "xor %3,%12;"
624 "psrlq $0x13,%%xmm3;"
625 "xor %8,%10;"
626 "and %8,%12;"
627 "psrld $0xa,%%xmm8;"
628 "ror $0xb,%11;"
629 "xor %5,%11;"
630 "xor %3,%12;"
631 "ror $0x6,%10;"
632 "pxor %%xmm3,%%xmm2;"
633 "add %10,%12;"
634 "ror $0x2,%11;"
635 "add 8+%16,%12;"
636 "pxor %%xmm2,%%xmm8;"
637 "mov %5,%10;"
638 "add %12,%4;"
639 "mov %5,%12;"
640 "pshufb %%xmm10,%%xmm8;"
641 "or %k2,%10;"
642 "add %4,%7;"
643 "and %k2,%12;"
644 "paddd %%xmm8,%%xmm0;"
645 "and %6,%10;"
646 "add %11,%4;"
647 "pshufd $0x50,%%xmm0,%%xmm2;"
648 "or %12,%10;"
649 "add %10,%4;"
650 "movdqa %%xmm2,%%xmm3;"
651 "mov %7,%10;"
652 "ror $0xe,%10;"
653 "mov %4,%11;"
654 "movdqa %%xmm2,%%xmm7;"
655 "ror $0x9,%11;"
656 "xor %7,%10;"
657 "mov %8,%12;"
658 "ror $0x5,%10;"
659 "psrlq $0x11,%%xmm2;"
660 "xor %4,%11;"
661 "xor %9,%12;"
662 "psrlq $0x13,%%xmm3;"
663 "xor %7,%10;"
664 "and %7,%12;"
665 "ror $0xb,%11;"
666 "psrld $0xa,%%xmm7;"
667 "xor %4,%11;"
668 "ror $0x6,%10;"
669 "xor %9,%12;"
670 "pxor %%xmm3,%%xmm2;"
671 "ror $0x2,%11;"
672 "add %10,%12;"
673 "add 12+%16,%12;"
674 "pxor %%xmm2,%%xmm7;"
675 "mov %4,%10;"
676 "add %12,%3;"
677 "mov %4,%12;"
678 "pshufb %%xmm11,%%xmm7;"
679 "or %6,%10;"
680 "add %3,%k2;"
681 "and %6,%12;"
682 "paddd %%xmm0,%%xmm7;"
683 "and %5,%10;"
684 "add %11,%3;"
685 "or %12,%10;"
686 "add %10,%3;"
687 "sub $0x1,%1;"
688 "jne Lloop1_%=;"
689 "mov $0x2,%1;"
690
691 "Lloop2_%=:"
692 "paddd 0x0(%13),%%xmm4;"
693 "movdqa %%xmm4,%16;"
694 "mov %k2,%10;"
695 "ror $0xe,%10;"
696 "mov %3,%11;"
697 "xor %k2,%10;"
698 "ror $0x9,%11;"
699 "mov %7,%12;"
700 "xor %3,%11;"
701 "ror $0x5,%10;"
702 "xor %8,%12;"
703 "xor %k2,%10;"
704 "ror $0xb,%11;"
705 "and %k2,%12;"
706 "xor %3,%11;"
707 "ror $0x6,%10;"
708 "xor %8,%12;"
709 "add %10,%12;"
710 "ror $0x2,%11;"
711 "add %16,%12;"
712 "mov %3,%10;"
713 "add %12,%9;"
714 "mov %3,%12;"
715 "or %5,%10;"
716 "add %9,%6;"
717 "and %5,%12;"
718 "and %4,%10;"
719 "add %11,%9;"
720 "or %12,%10;"
721 "add %10,%9;"
722 "mov %6,%10;"
723 "ror $0xe,%10;"
724 "mov %9,%11;"
725 "xor %6,%10;"
726 "ror $0x9,%11;"
727 "mov %k2,%12;"
728 "xor %9,%11;"
729 "ror $0x5,%10;"
730 "xor %7,%12;"
731 "xor %6,%10;"
732 "ror $0xb,%11;"
733 "and %6,%12;"
734 "xor %9,%11;"
735 "ror $0x6,%10;"
736 "xor %7,%12;"
737 "add %10,%12;"
738 "ror $0x2,%11;"
739 "add 4+%16,%12;"
740 "mov %9,%10;"
741 "add %12,%8;"
742 "mov %9,%12;"
743 "or %4,%10;"
744 "add %8,%5;"
745 "and %4,%12;"
746 "and %3,%10;"
747 "add %11,%8;"
748 "or %12,%10;"
749 "add %10,%8;"
750 "mov %5,%10;"
751 "ror $0xe,%10;"
752 "mov %8,%11;"
753 "xor %5,%10;"
754 "ror $0x9,%11;"
755 "mov %6,%12;"
756 "xor %8,%11;"
757 "ror $0x5,%10;"
758 "xor %k2,%12;"
759 "xor %5,%10;"
760 "ror $0xb,%11;"
761 "and %5,%12;"
762 "xor %8,%11;"
763 "ror $0x6,%10;"
764 "xor %k2,%12;"
765 "add %10,%12;"
766 "ror $0x2,%11;"
767 "add 8+%16,%12;"
768 "mov %8,%10;"
769 "add %12,%7;"
770 "mov %8,%12;"
771 "or %3,%10;"
772 "add %7,%4;"
773 "and %3,%12;"
774 "and %9,%10;"
775 "add %11,%7;"
776 "or %12,%10;"
777 "add %10,%7;"
778 "mov %4,%10;"
779 "ror $0xe,%10;"
780 "mov %7,%11;"
781 "xor %4,%10;"
782 "ror $0x9,%11;"
783 "mov %5,%12;"
784 "xor %7,%11;"
785 "ror $0x5,%10;"
786 "xor %6,%12;"
787 "xor %4,%10;"
788 "ror $0xb,%11;"
789 "and %4,%12;"
790 "xor %7,%11;"
791 "ror $0x6,%10;"
792 "xor %6,%12;"
793 "add %10,%12;"
794 "ror $0x2,%11;"
795 "add 12+%16,%12;"
796 "mov %7,%10;"
797 "add %12,%k2;"
798 "mov %7,%12;"
799 "or %9,%10;"
800 "add %k2,%3;"
801 "and %9,%12;"
802 "and %8,%10;"
803 "add %11,%k2;"
804 "or %12,%10;"
805 "add %10,%k2;"
806 "paddd 0x10(%13),%%xmm5;"
807 "movdqa %%xmm5,%16;"
808 "add $0x20,%13;"
809 "mov %3,%10;"
810 "ror $0xe,%10;"
811 "mov %k2,%11;"
812 "xor %3,%10;"
813 "ror $0x9,%11;"
814 "mov %4,%12;"
815 "xor %k2,%11;"
816 "ror $0x5,%10;"
817 "xor %5,%12;"
818 "xor %3,%10;"
819 "ror $0xb,%11;"
820 "and %3,%12;"
821 "xor %k2,%11;"
822 "ror $0x6,%10;"
823 "xor %5,%12;"
824 "add %10,%12;"
825 "ror $0x2,%11;"
826 "add %16,%12;"
827 "mov %k2,%10;"
828 "add %12,%6;"
829 "mov %k2,%12;"
830 "or %8,%10;"
831 "add %6,%9;"
832 "and %8,%12;"
833 "and %7,%10;"
834 "add %11,%6;"
835 "or %12,%10;"
836 "add %10,%6;"
837 "mov %9,%10;"
838 "ror $0xe,%10;"
839 "mov %6,%11;"
840 "xor %9,%10;"
841 "ror $0x9,%11;"
842 "mov %3,%12;"
843 "xor %6,%11;"
844 "ror $0x5,%10;"
845 "xor %4,%12;"
846 "xor %9,%10;"
847 "ror $0xb,%11;"
848 "and %9,%12;"
849 "xor %6,%11;"
850 "ror $0x6,%10;"
851 "xor %4,%12;"
852 "add %10,%12;"
853 "ror $0x2,%11;"
854 "add 4+%16,%12;"
855 "mov %6,%10;"
856 "add %12,%5;"
857 "mov %6,%12;"
858 "or %7,%10;"
859 "add %5,%8;"
860 "and %7,%12;"
861 "and %k2,%10;"
862 "add %11,%5;"
863 "or %12,%10;"
864 "add %10,%5;"
865 "mov %8,%10;"
866 "ror $0xe,%10;"
867 "mov %5,%11;"
868 "xor %8,%10;"
869 "ror $0x9,%11;"
870 "mov %9,%12;"
871 "xor %5,%11;"
872 "ror $0x5,%10;"
873 "xor %3,%12;"
874 "xor %8,%10;"
875 "ror $0xb,%11;"
876 "and %8,%12;"
877 "xor %5,%11;"
878 "ror $0x6,%10;"
879 "xor %3,%12;"
880 "add %10,%12;"
881 "ror $0x2,%11;"
882 "add 8+%16,%12;"
883 "mov %5,%10;"
884 "add %12,%4;"
885 "mov %5,%12;"
886 "or %k2,%10;"
887 "add %4,%7;"
888 "and %k2,%12;"
889 "and %6,%10;"
890 "add %11,%4;"
891 "or %12,%10;"
892 "add %10,%4;"
893 "mov %7,%10;"
894 "ror $0xe,%10;"
895 "mov %4,%11;"
896 "xor %7,%10;"
897 "ror $0x9,%11;"
898 "mov %8,%12;"
899 "xor %4,%11;"
900 "ror $0x5,%10;"
901 "xor %9,%12;"
902 "xor %7,%10;"
903 "ror $0xb,%11;"
904 "and %7,%12;"
905 "xor %4,%11;"
906 "ror $0x6,%10;"
907 "xor %9,%12;"
908 "add %10,%12;"
909 "ror $0x2,%11;"
910 "add 12+%16,%12;"
911 "mov %4,%10;"
912 "add %12,%3;"
913 "mov %4,%12;"
914 "or %6,%10;"
915 "add %3,%k2;"
916 "and %6,%12;"
917 "and %5,%10;"
918 "add %11,%3;"
919 "or %12,%10;"
920 "add %10,%3;"
921 "movdqa %%xmm6,%%xmm4;"
922 "movdqa %%xmm7,%%xmm5;"
923 "sub $0x1,%1;"
924 "jne Lloop2_%=;"
925 "add (%0),%3;"
926 "mov %3,(%0);"
927 "add 0x4(%0),%4;"
928 "mov %4,0x4(%0);"
929 "add 0x8(%0),%5;"
930 "mov %5,0x8(%0);"
931 "add 0xc(%0),%6;"
932 "mov %6,0xc(%0);"
933 "add 0x10(%0),%k2;"
934 "mov %k2,0x10(%0);"
935 "add 0x14(%0),%7;"
936 "mov %7,0x14(%0);"
937 "add 0x18(%0),%8;"
938 "mov %8,0x18(%0);"
939 "add 0x1c(%0),%9;"
940 "mov %9,0x1c(%0);"
941 "mov %15,%1;"
942 "add $0x40,%1;"
943 "cmp %14,%1;"
944 "jne Lloop0_%=;"
945
946 "Ldone_hash_%=:"
947
948 : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c),
949 "=r"(d), /* e is carried in %2 (blocks) */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0),
950 "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
951 : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
952 : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
953 "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12");
954}
955} // namespace sha256_sse4
956
957/*
958;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
959; Copyright (c) 2012, Intel Corporation
960;
961; All rights reserved.
962;
963; Redistribution and use in source and binary forms, with or without
964; modification, are permitted provided that the following conditions are
965; met:
966;
967; * Redistributions of source code must retain the above copyright
968; notice, this list of conditions and the following disclaimer.
969;
970; * Redistributions in binary form must reproduce the above copyright
971; notice, this list of conditions and the following disclaimer in the
972; documentation and/or other materials provided with the
973; distribution.
974;
975; * Neither the name of the Intel Corporation nor the names of its
976; contributors may be used to endorse or promote products derived from
977; this software without specific prior written permission.
978;
979;
980; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
981; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
982; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
983; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
984; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
985; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
986; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
987; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
988; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
989; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
990; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
991;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
992;
993; Example YASM command lines:
994; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8
995;   sha256_sse4.asm
996; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o
997;   sha256_sse4.asm
998;
999;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1000;
1001; This code is described in an Intel White-Paper:
1002; "Fast SHA-256 Implementations on Intel Architecture Processors"
1003;
1004; To find it, surf to http://www.intel.com/p/en_US/embedded
1005; and search for that title.
1006; The paper is expected to be released roughly at the end of April, 2012
1007;
1008;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1009; This code schedules 1 block at a time, with 4 lanes per block
1010;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1011
1012%define MOVDQ movdqu ;; assume buffers not aligned
1013
1014;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1015
1016; addm [mem], reg
1017; Add reg to mem using reg-mem add and store
1018%macro addm 2
1019 add %2, %1
1020 mov %1, %2
1021%endm
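; For instance, "addm [4*0 + CTX], a" (as used after the round loops below)
; expands to:
;     add a, [4*0 + CTX]
;     mov [4*0 + CTX], a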
1022
1023;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1024
1025; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1026; Load xmm with mem and byte swap each dword
1027%macro COPY_XMM_AND_BSWAP 3
1028 MOVDQ %1, %2
1029 pshufb %1, %3
1030%endmacro
1031
1032;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1033
1034%define X0 xmm4
1035%define X1 xmm5
1036%define X2 xmm6
1037%define X3 xmm7
1038
1039%define XTMP0 xmm0
1040%define XTMP1 xmm1
1041%define XTMP2 xmm2
1042%define XTMP3 xmm3
1043%define XTMP4 xmm8
1044%define XFER xmm9
1045
1046%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1047%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1048%define BYTE_FLIP_MASK xmm12
1049
1050%ifdef LINUX
1051%define NUM_BLKS rdx ; 3rd arg
1052%define CTX rsi ; 2nd arg
1053%define INP rdi ; 1st arg
1054
1055%define SRND rdi ; clobbers INP
1056%define c ecx
1057%define d r8d
1058%define e edx
1059%else
1060%define NUM_BLKS r8 ; 3rd arg
1061%define CTX rdx ; 2nd arg
1062%define INP rcx ; 1st arg
1063
1064%define SRND rcx ; clobbers INP
1065%define c edi
1066%define d esi
1067%define e r8d
1068
1069%endif
1070%define TBL rbp
1071%define a eax
1072%define b ebx
1073
1074%define f r9d
1075%define g r10d
1076%define h r11d
1077
1078%define y0 r13d
1079%define y1 r14d
1080%define y2 r15d
1081
1082
1083
1084_INP_END_SIZE equ 8
1085_INP_SIZE equ 8
1086_XFER_SIZE equ 8
1087%ifdef LINUX
1088_XMM_SAVE_SIZE equ 0
1089%else
1090_XMM_SAVE_SIZE equ 7*16
1091%endif
1092; STACK_SIZE plus pushes must be an odd multiple of 8
1093_ALIGN_SIZE equ 8
1094
1095_INP_END equ 0
1096_INP equ _INP_END + _INP_END_SIZE
1097_XFER equ _INP + _INP_SIZE
1098_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
1099STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
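; Worked check of the "odd multiple of 8" rule above, using these equates:
;   LINUX:   STACK_SIZE = 8+8+8+8+0    = 32,  plus 5 pushes (40 bytes) -> 72  = 9*8
;   Windows: STACK_SIZE = 8+8+8+8+7*16 = 144, plus 7 pushes (56 bytes) -> 200 = 25*8
; Both totals are odd multiples of 8, so after "sub rsp, STACK_SIZE" the stack
; is 16-byte aligned again for the movdqa accesses to _XFER and _XMM_SAVE.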
1100
1101; rotate_Xs
1102; Rotate values of symbols X0...X3
1103%macro rotate_Xs 0
1104%xdefine X_ X0
1105%xdefine X0 X1
1106%xdefine X1 X2
1107%xdefine X2 X3
1108%xdefine X3 X_
1109%endm
1110
1111; ROTATE_ARGS
1112; Rotate values of symbols a...h
1113%macro ROTATE_ARGS 0
1114%xdefine TMP_ h
1115%xdefine h g
1116%xdefine g f
1117%xdefine f e
1118%xdefine e d
1119%xdefine d c
1120%xdefine c b
1121%xdefine b a
1122%xdefine a TMP_
1123%endm
1124
1125%macro FOUR_ROUNDS_AND_SCHED 0
1126 ;; compute s0 four at a time and s1 two at a time
1127 ;; compute W[-16] + W[-7] 4 at a time
1128 movdqa XTMP0, X3
1129 mov y0, e ; y0 = e
1130 ror y0, (25-11) ; y0 = e >> (25-11)
1131 mov y1, a ; y1 = a
1132 palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
1133 ror y1, (22-13) ; y1 = a >> (22-13)
1134 xor y0, e ; y0 = e ^ (e >> (25-11))
1135 mov y2, f ; y2 = f
1136 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1137 movdqa XTMP1, X1
1138 xor y1, a ; y1 = a ^ (a >> (22-13)
1139 xor y2, g ; y2 = f^g
1140 paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1141 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1142 and y2, e ; y2 = (f^g)&e
1143 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1144 ;; compute s0
1145 palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
1146 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1147 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1148 xor y2, g ; y2 = CH = ((f^g)&e)^g
1149 movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
1150 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1151 add y2, y0 ; y2 = S1 + CH
1152 add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1153 movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
1154 mov y0, a ; y0 = a
1155 add h, y2 ; h = h + S1 + CH + k + w
1156 mov y2, a ; y2 = a
1157 pslld XTMP1, (32-7)
1158 or y0, c ; y0 = a|c
1159 add d, h ; d = d + h + S1 + CH + k + w
1160 and y2, c ; y2 = a&c
1161 psrld XTMP2, 7
1162 and y0, b ; y0 = (a|c)&b
1163 add h, y1 ; h = h + S1 + CH + k + w + S0
1164 por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
1165 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1166 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1167
1168ROTATE_ARGS
1169 movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
1170 mov y0, e ; y0 = e
1171 mov y1, a ; y1 = a
1172 movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
1173 ror y0, (25-11) ; y0 = e >> (25-11)
1174 xor y0, e ; y0 = e ^ (e >> (25-11))
1175 mov y2, f ; y2 = f
1176 ror y1, (22-13) ; y1 = a >> (22-13)
1177 pslld XTMP3, (32-18)
1178 xor y1, a ; y1 = a ^ (a >> (22-13)
1179 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1180 xor y2, g ; y2 = f^g
1181 psrld XTMP2, 18
1182 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1183 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1184 and y2, e ; y2 = (f^g)&e
1185 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1186 pxor XTMP1, XTMP3
1187 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1188 xor y2, g ; y2 = CH = ((f^g)&e)^g
1189 psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
1190 add y2, y0 ; y2 = S1 + CH
1191 add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1192 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1193 pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1194 mov y0, a ; y0 = a
1195 add h, y2 ; h = h + S1 + CH + k + w
1196 mov y2, a ; y2 = a
1197 pxor XTMP1, XTMP4 ; XTMP1 = s0
1198 or y0, c ; y0 = a|c
1199 add d, h ; d = d + h + S1 + CH + k + w
1200 and y2, c ; y2 = a&c
1201 ;; compute low s1
1202 pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
1203 and y0, b ; y0 = (a|c)&b
1204 add h, y1 ; h = h + S1 + CH + k + w + S0
1205 paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
1206 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1207 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1208
1209ROTATE_ARGS
1210 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
1211 mov y0, e ; y0 = e
1212 mov y1, a ; y1 = a
1213 ror y0, (25-11) ; y0 = e >> (25-11)
1214 movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
1215 xor y0, e ; y0 = e ^ (e >> (25-11))
1216 ror y1, (22-13) ; y1 = a >> (22-13)
1217 mov y2, f ; y2 = f
1218 xor y1, a ; y1 = a ^ (a >> (22-13)
1219 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1220 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1221 xor y2, g ; y2 = f^g
1222 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1223 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1224 and y2, e ; y2 = (f^g)&e
1225 psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1226 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1227 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1228 xor y2, g ; y2 = CH = ((f^g)&e)^g
1229 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1230 pxor XTMP2, XTMP3
1231 add y2, y0 ; y2 = S1 + CH
1232 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1233 add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1234 pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
1235 mov y0, a ; y0 = a
1236 add h, y2 ; h = h + S1 + CH + k + w
1237 mov y2, a ; y2 = a
1238 pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
1239 or y0, c ; y0 = a|c
1240 add d, h ; d = d + h + S1 + CH + k + w
1241 and y2, c ; y2 = a&c
1242 paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
1243 and y0, b ; y0 = (a|c)&b
1244 add h, y1 ; h = h + S1 + CH + k + w + S0
1245 ;; compute high s1
1246 pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1247 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1248 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1249
1250ROTATE_ARGS
1251 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
1252 mov y0, e ; y0 = e
1253 ror y0, (25-11) ; y0 = e >> (25-11)
1254 mov y1, a ; y1 = a
1255 movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
1256 ror y1, (22-13) ; y1 = a >> (22-13)
1257 xor y0, e ; y0 = e ^ (e >> (25-11))
1258 mov y2, f ; y2 = f
1259 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1260 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1261 xor y1, a ; y1 = a ^ (a >> (22-13)
1262 xor y2, g ; y2 = f^g
1263 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1264 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1265 and y2, e ; y2 = (f^g)&e
1266 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1267 psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
1268 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1269 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1270 xor y2, g ; y2 = CH = ((f^g)&e)^g
1271 pxor XTMP2, XTMP3
1272 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1273 add y2, y0 ; y2 = S1 + CH
1274 add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1275 pxor X0, XTMP2 ; X0 = s1 {xDxC}
1276 mov y0, a ; y0 = a
1277 add h, y2 ; h = h + S1 + CH + k + w
1278 mov y2, a ; y2 = a
1279 pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
1280 or y0, c ; y0 = a|c
1281 add d, h ; d = d + h + S1 + CH + k + w
1282 and y2, c ; y2 = a&c
1283 paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1284 and y0, b ; y0 = (a|c)&b
1285 add h, y1 ; h = h + S1 + CH + k + w + S0
1286 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1287 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1288
1289ROTATE_ARGS
1290rotate_Xs
1291%endm
1292
1293;; input is [rsp + _XFER + %1 * 4]
1294%macro DO_ROUND 1
1295 mov y0, e ; y0 = e
1296 ror y0, (25-11) ; y0 = e >> (25-11)
1297 mov y1, a ; y1 = a
1298 xor y0, e ; y0 = e ^ (e >> (25-11))
1299 ror y1, (22-13) ; y1 = a >> (22-13)
1300 mov y2, f ; y2 = f
1301 xor y1, a ; y1 = a ^ (a >> (22-13)
1302 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1303 xor y2, g ; y2 = f^g
1304 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1305 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1306 and y2, e ; y2 = (f^g)&e
1307 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1308 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1309 xor y2, g ; y2 = CH = ((f^g)&e)^g
1310 add y2, y0 ; y2 = S1 + CH
1311 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1312 add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
1313 mov y0, a ; y0 = a
1314 add h, y2 ; h = h + S1 + CH + k + w
1315 mov y2, a ; y2 = a
1316 or y0, c ; y0 = a|c
1317 add d, h ; d = d + h + S1 + CH + k + w
1318 and y2, c ; y2 = a&c
1319 and y0, b ; y0 = (a|c)&b
1320 add h, y1 ; h = h + S1 + CH + k + w + S0
1321 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1322 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1323 ROTATE_ARGS
1324%endm
1325
1326;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1327;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1328;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1329;; arg 1 : pointer to input data
1330;; arg 2 : pointer to digest
1331;; arg 3 : Num blocks
1332section .text
1333global sha256_sse4
1334align 32
1335sha256_sse4:
1336 push rbx
1337%ifndef LINUX
1338 push rsi
1339 push rdi
1340%endif
1341 push rbp
1342 push r13
1343 push r14
1344 push r15
1345
1346 sub rsp,STACK_SIZE
1347%ifndef LINUX
1348 movdqa [rsp + _XMM_SAVE + 0*16],xmm6
1349 movdqa [rsp + _XMM_SAVE + 1*16],xmm7
1350 movdqa [rsp + _XMM_SAVE + 2*16],xmm8
1351 movdqa [rsp + _XMM_SAVE + 3*16],xmm9
1352 movdqa [rsp + _XMM_SAVE + 4*16],xmm10
1353 movdqa [rsp + _XMM_SAVE + 5*16],xmm11
1354 movdqa [rsp + _XMM_SAVE + 6*16],xmm12
1355%endif
1356
1357 shl NUM_BLKS, 6 ; convert to bytes
1358 jz done_hash
1359 add NUM_BLKS, INP ; pointer to end of data
1360 mov [rsp + _INP_END], NUM_BLKS
1361
1362 ;; load initial digest
1363 mov a,[4*0 + CTX]
1364 mov b,[4*1 + CTX]
1365 mov c,[4*2 + CTX]
1366 mov d,[4*3 + CTX]
1367 mov e,[4*4 + CTX]
1368 mov f,[4*5 + CTX]
1369 mov g,[4*6 + CTX]
1370 mov h,[4*7 + CTX]
1371
1372 movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1373 movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
1374 movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
1375
1376loop0:
1377 lea TBL,[K256 wrt rip]
1378
1379 ;; byte swap first 16 dwords
1380 COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
1381 COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
1382 COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
1383 COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
1384
1385 mov [rsp + _INP], INP
1386
1387 ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1388 mov SRND, 3
1389align 16
1390loop1:
1391 movdqa XFER, [TBL + 0*16]
1392 paddd XFER, X0
1393 movdqa [rsp + _XFER], XFER
1394 FOUR_ROUNDS_AND_SCHED
1395
1396 movdqa XFER, [TBL + 1*16]
1397 paddd XFER, X0
1398 movdqa [rsp + _XFER], XFER
1399 FOUR_ROUNDS_AND_SCHED
1400
1401 movdqa XFER, [TBL + 2*16]
1402 paddd XFER, X0
1403 movdqa [rsp + _XFER], XFER
1404 FOUR_ROUNDS_AND_SCHED
1405
1406 movdqa XFER, [TBL + 3*16]
1407 paddd XFER, X0
1408 movdqa [rsp + _XFER], XFER
1409 add TBL, 4*16
1410 FOUR_ROUNDS_AND_SCHED
1411
1412 sub SRND, 1
1413 jne loop1
1414
1415 mov SRND, 2
1416loop2:
1417 paddd X0, [TBL + 0*16]
1418 movdqa [rsp + _XFER], X0
1419 DO_ROUND 0
1420 DO_ROUND 1
1421 DO_ROUND 2
1422 DO_ROUND 3
1423 paddd X1, [TBL + 1*16]
1424 movdqa [rsp + _XFER], X1
1425 add TBL, 2*16
1426 DO_ROUND 0
1427 DO_ROUND 1
1428 DO_ROUND 2
1429 DO_ROUND 3
1430
1431 movdqa X0, X2
1432 movdqa X1, X3
1433
1434 sub SRND, 1
1435 jne loop2
1436
1437 addm [4*0 + CTX],a
1438 addm [4*1 + CTX],b
1439 addm [4*2 + CTX],c
1440 addm [4*3 + CTX],d
1441 addm [4*4 + CTX],e
1442 addm [4*5 + CTX],f
1443 addm [4*6 + CTX],g
1444 addm [4*7 + CTX],h
1445
1446 mov INP, [rsp + _INP]
1447 add INP, 64
1448 cmp INP, [rsp + _INP_END]
1449 jne loop0
1450
1451done_hash:
1452%ifndef LINUX
1453 movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
1454 movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
1455 movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
1456 movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
1457 movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
1458 movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
1459 movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
1460%endif
1461
1462 add rsp, STACK_SIZE
1463
1464 pop r15
1465 pop r14
1466 pop r13
1467 pop rbp
1468%ifndef LINUX
1469 pop rdi
1470 pop rsi
1471%endif
1472 pop rbx
1473
1474 ret
1475
1476
1477section .data
1478align 64
1479K256:
1480 dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1481 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1482 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1483 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1484 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1485 dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1486 dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1487 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1488 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1489 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1490 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1491 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1492 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1493 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1494 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1495 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1496
1497PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1498
1499; shuffle xBxA -> 00BA
1500_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1501
1502; shuffle xDxC -> DC00
1503_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1504*/
1505
1506#endif