1 /**
2 * Comba word operations
3 * 
4 * Copyright:
5 * (C) 1999-2010,2014 Jack Lloyd
6 * (C) 2014-2015 Etienne Cimon
7 *      2006 Luca Piccarreta
8 *
9 * License:
10 * Botan is released under the Simplified BSD License (see LICENSE.md)
11 */
12 module botan_math.mp_word;
13 import botan_math.mul128;
14 public import botan_math.mp_types;
/*
* Word Multiply/Add
*
* Computes the double-width value a * b + *c, stores its upper word
* back through c and returns its lower word.
*/
word word_madd2(word a, word b, word* c)
{
	static if (BOTAN_HAS_MP_DWORD) {
		// A double-width integer type exists: evaluate the whole sum in it.
		const dword wide = cast(dword)(a) * b + *c;
		const word low = cast(word)(wide);
		*c = cast(word)(wide >> BOTAN_MP_WORD_BITS);
		return low;
	}
	else version(D_InlineAsm_X86_64) {
		// The asm block writes the low word into parameter `a` through
		// this pointer; `a` is what gets returned afterwards.
		word* low_out = &a;
		asm pure nothrow @nogc {
			mov RAX, a;
			mov RBX, b;
			mul RBX;          // RDX:RAX = a * b
			mov RCX, c;
			add RAX, [RCX];   // low word += *c
			adc RDX, 0;       // fold the carry into the high word
			mov [RCX], RDX;   // *c = high word
			mov RBX, low_out;
			mov [RBX], RAX;   // a = low word
		}
		return a;
	}
	else {
		static assert(BOTAN_MP_WORD_BITS == 64, "Unexpected word size");

		word[2] prod;
		mul64x64_128(a, b, prod);

		// Add the incoming word to prod[0] and bump prod[1] if that wrapped.
		const word addend = *c;
		prod[0] += addend;
		prod[1] += (prod[0] < addend);

		*c = prod[1];
		return prod[0];
	}
}
56 
/*
* Word Multiply/Add
*
* Computes the double-width value a * b + c + *d, stores its upper word
* back through d and returns its lower word.
*/
word word_madd3(word a, word b, word c, word* d)
{
	static if (BOTAN_HAS_MP_DWORD) {
		// A double-width integer type exists: evaluate the whole sum in it.
		const dword wide = cast(dword)(a) * b + c + *d;
		const word low = cast(word)(wide);
		*d = cast(word)(wide >> BOTAN_MP_WORD_BITS);
		return low;
	}
	else version(D_InlineAsm_X86_64) {
		// The asm block writes the low word into parameter `a` through
		// this pointer; `a` is what gets returned afterwards.
		word* low_out = &a;
		asm pure nothrow @nogc {
			mov RAX, a;
			mov RBX, b;
			mul RBX;          // RDX:RAX = a * b
			mov RBX, d;
			add RAX, c;       // low word += c
			adc RDX, 0;
			add RAX, [RBX];   // low word += *d
			adc RDX, 0;
			mov [RBX], RDX;   // *d = high word
			mov RBX, low_out;
			mov [RBX], RAX;   // a = low word
		}
		return a;
	}
	else {
		static assert(BOTAN_MP_WORD_BITS == 64, "Unexpected word size");

		word[2] prod;
		mul64x64_128(a, b, prod);

		// First addend: c.
		prod[0] += c;
		prod[1] += (prod[0] < c);

		// Second addend: the incoming *d.
		const word addend = *d;
		prod[0] += addend;
		prod[1] += (prod[0] < addend);

		*d = prod[1];
		return prod[0];
	}
}
102 
103 
/*
* Word Addition
*
* Returns x + y + *carry (wrapping at the word size) and replaces *carry
* with 1 if either of the two additions wrapped, otherwise 0.
*/
word word_add(word x, word y, word* carry)
{
	const word carry_in = *carry;

	const word partial = x + y;
	const word wrapped_xy = (partial < x);   // x + y overflowed

	const word total = partial + carry_in;
	*carry = wrapped_xy | (total < carry_in); // or adding the carry overflowed

	return total;
}
115 
/*
* Eight Word Block Addition, Two Argument
*
* In-place addition over eight words: x += y + carry, propagating the
* carry from word 0 upwards. Returns the carry out of the last word.
*/
word word8_add2(ref word[8] x, const ref word[8] y, word carry)
{
	version (D_InlineAsm_X86_64) {
		word* dst = x.ptr;
		word* src = cast(word*)y.ptr;

		asm pure nothrow @nogc {
			mov RDI,dst;
			mov RSI,src;
			xor RAX,RAX;
			sub RAX,carry; // 0 - carry borrows, so CF is set iff carry != 0

			// mov leaves CF untouched, so the adc chain stays linked.
			mov RAX,[RSI];
			adc [RDI],RAX;
			mov RAX,[RSI+8];
			adc [RDI+8],RAX;
			mov RAX,[RSI+16];
			adc [RDI+16],RAX;
			mov RAX,[RSI+24];
			adc [RDI+24],RAX;
			mov RAX,[RSI+32];
			adc [RDI+32],RAX;
			mov RAX,[RSI+40];
			adc [RDI+40],RAX;
			mov RAX,[RSI+48];
			adc [RDI+48],RAX;
			mov RAX,[RSI+56];
			adc [RDI+56],RAX;

			// Turn the final CF into 0 or 1.
			sbb RAX,RAX;
			neg RAX;
			mov carry, RAX;
		}
		return carry;
	}
	else version (D_InlineAsm_X86) {
		word* dst = x.ptr;
		word* src = cast(word*)y.ptr;

		asm pure nothrow @nogc {
			mov EDI,dst;
			mov ESI,src;
			xor EAX,EAX;
			sub EAX,carry; // 0 - carry borrows, so CF is set iff carry != 0

			// mov leaves CF untouched, so the adc chain stays linked.
			mov EAX,[ESI];
			adc [EDI],EAX;
			mov EAX,[ESI+4];
			adc [EDI+4],EAX;
			mov EAX,[ESI+8];
			adc [EDI+8],EAX;
			mov EAX,[ESI+12];
			adc [EDI+12],EAX;
			mov EAX,[ESI+16];
			adc [EDI+16],EAX;
			mov EAX,[ESI+20];
			adc [EDI+20],EAX;
			mov EAX,[ESI+24];
			adc [EDI+24],EAX;
			mov EAX,[ESI+28];
			adc [EDI+28],EAX;

			// Turn the final CF into 0 or 1.
			sbb EAX,EAX;
			neg EAX;
			mov carry, EAX;
		}
		return carry;
	}
	else {
		// Portable path: one add-with-carry step per word.
		foreach (i; 0 .. 8) {
			const word partial = x[i] + y[i];
			const word wrapped_xy = (partial < x[i]);
			const word total = partial + carry;
			carry = wrapped_xy | (total < carry);
			x[i] = total;
		}
		return carry;
	}
}
201 
/*
* Eight Word Block Addition, Three Argument
*
* Eight-word addition into a separate destination: z = x + y + carry,
* propagating the carry from word 0 upwards. Returns the carry out of
* the last word.
*/
word word8_add3(ref word[8] z, const ref word[8] x, const ref word[8] y, word carry)
{
	version(D_InlineAsm_X86_64) {
		word* dst = z.ptr;
		word* lhs = cast(word*)x.ptr;
		word* rhs = cast(word*)y.ptr;

		asm pure nothrow @nogc {
			mov RBX,lhs;
			mov RSI,rhs;
			mov RDI,dst;
			xor RAX,RAX;
			sub RAX,carry; // 0 - carry borrows, so CF is set iff carry != 0

			// Each step: load x[i], add y[i] with carry, store z[i].
			// The mov instructions do not disturb CF between steps.
			mov RAX,[RBX];
			adc RAX,[RSI];
			mov [RDI],RAX;

			mov RAX,[RBX+8];
			adc RAX,[RSI+8];
			mov [RDI+8],RAX;

			mov RAX,[RBX+16];
			adc RAX,[RSI+16];
			mov [RDI+16],RAX;

			mov RAX,[RBX+24];
			adc RAX,[RSI+24];
			mov [RDI+24],RAX;

			mov RAX,[RBX+32];
			adc RAX,[RSI+32];
			mov [RDI+32],RAX;

			mov RAX,[RBX+40];
			adc RAX,[RSI+40];
			mov [RDI+40],RAX;

			mov RAX,[RBX+48];
			adc RAX,[RSI+48];
			mov [RDI+48],RAX;

			mov RAX,[RBX+56];
			adc RAX,[RSI+56];
			mov [RDI+56],RAX;

			// Turn the final CF into 0 or 1.
			sbb RAX,RAX;
			neg RAX;
			mov carry, RAX;
		}
		return carry;
	}
	else version (D_InlineAsm_X86) {
		word* dst = z.ptr;
		word* lhs = cast(word*)x.ptr;
		word* rhs = cast(word*)y.ptr;

		asm pure nothrow @nogc {
			mov EBX,lhs;
			mov ESI,rhs;
			mov EDI,dst;
			xor EAX,EAX;
			sub EAX,carry; // 0 - carry borrows, so CF is set iff carry != 0

			// Each step: load x[i], add y[i] with carry, store z[i].
			// The mov instructions do not disturb CF between steps.
			mov EAX,[EBX];
			adc EAX,[ESI];
			mov [EDI],EAX;

			mov EAX,[EBX+4];
			adc EAX,[ESI+4];
			mov [EDI+4],EAX;

			mov EAX,[EBX+8];
			adc EAX,[ESI+8];
			mov [EDI+8],EAX;

			mov EAX,[EBX+12];
			adc EAX,[ESI+12];
			mov [EDI+12],EAX;

			mov EAX,[EBX+16];
			adc EAX,[ESI+16];
			mov [EDI+16],EAX;

			mov EAX,[EBX+20];
			adc EAX,[ESI+20];
			mov [EDI+20],EAX;

			mov EAX,[EBX+24];
			adc EAX,[ESI+24];
			mov [EDI+24],EAX;

			mov EAX,[EBX+28];
			adc EAX,[ESI+28];
			mov [EDI+28],EAX;

			// Turn the final CF into 0 or 1.
			sbb EAX,EAX;
			neg EAX;
			mov carry, EAX;
		}
		return carry;
	}
	else {
		// Portable path: chain word_add across the eight positions.
		foreach (i; 0 .. 8)
			z[i] = word_add(x[i], y[i], &carry);
		return carry;
	}
}
317 
/*
* Word Subtraction
*
* Returns x - y - *carry (wrapping at the word size) and replaces *carry
* with 1 if either of the two subtractions wrapped, otherwise 0.
*/
word word_sub(word x, word y, word* carry)
{
	const word diff_xy = x - y;
	const word borrow_xy = (diff_xy > x);        // x - y underflowed

	const word result = diff_xy - *carry;
	*carry = borrow_xy | (result > diff_xy);     // or removing the borrow underflowed

	return result;
}
329 
/*
* Eight Word Block Subtraction, Two Argument
*
* In-place subtraction over eight words: x -= y + carry, propagating the
* borrow from word 0 upwards. Returns the borrow out of the last word.
*/
word word8_sub2(ref word[8] x, const ref word[8] y, word carry)
{
	version(D_InlineAsm_X86_64) {
		// The asm writes into a local buffer which is copied to x afterwards.
		word[8] diff;
		word* minuend = x.ptr;
		word* subtrahend = cast(word*)y.ptr;
		word* dst = diff.ptr;

		asm pure nothrow @nogc {
			mov RBX,minuend;
			mov RSI,subtrahend;
			mov RDI, dst;
			xor RAX,RAX;
			sub RAX,carry; // 0 - carry borrows, so CF is set iff carry != 0

			// Each step: load x[i], subtract y[i] with borrow, store diff[i].
			mov RAX,[RBX];
			sbb RAX,[RSI];
			mov [RDI],RAX;
			mov RAX,[RBX+8];
			sbb RAX,[RSI+8];
			mov [RDI+8],RAX;
			mov RAX,[RBX+16];
			sbb RAX,[RSI+16];
			mov [RDI+16],RAX;
			mov RAX,[RBX+24];
			sbb RAX,[RSI+24];
			mov [RDI+24],RAX;
			mov RAX,[RBX+32];
			sbb RAX,[RSI+32];
			mov [RDI+32],RAX;
			mov RAX,[RBX+40];
			sbb RAX,[RSI+40];
			mov [RDI+40],RAX;
			mov RAX,[RBX+48];
			sbb RAX,[RSI+48];
			mov [RDI+48],RAX;
			mov RAX,[RBX+56];
			sbb RAX,[RSI+56];
			mov [RDI+56],RAX;

			// Turn the final CF into 0 or 1.
			sbb RAX,RAX;
			neg RAX;
			mov carry, RAX;
		}
		x[] = diff[];
		return carry;
	}
	else version (D_InlineAsm_X86) {
		// The asm writes into a local buffer which is copied to x afterwards.
		word[8] diff;
		word* minuend = x.ptr;
		word* subtrahend = cast(word*)y.ptr;
		word* dst = diff.ptr;

		asm pure nothrow @nogc {
			mov EBX,minuend;
			mov EDI,dst;
			mov ESI,subtrahend;
			xor EAX,EAX;
			sub EAX,carry; // 0 - carry borrows, so CF is set iff carry != 0

			// Each step: load x[i], subtract y[i] with borrow, store diff[i].
			mov EAX,[EBX];
			sbb EAX,[ESI];
			mov [EDI],EAX;
			mov EAX,[EBX+4];
			sbb EAX,[ESI+4];
			mov [EDI+4],EAX;
			mov EAX,[EBX+8];
			sbb EAX,[ESI+8];
			mov [EDI+8],EAX;
			mov EAX,[EBX+12];
			sbb EAX,[ESI+12];
			mov [EDI+12],EAX;
			mov EAX,[EBX+16];
			sbb EAX,[ESI+16];
			mov [EDI+16],EAX;
			mov EAX,[EBX+20];
			sbb EAX,[ESI+20];
			mov [EDI+20],EAX;
			mov EAX,[EBX+24];
			sbb EAX,[ESI+24];
			mov [EDI+24],EAX;
			mov EAX,[EBX+28];
			sbb EAX,[ESI+28];
			mov [EDI+28],EAX;

			// Turn the final CF into 0 or 1.
			sbb EAX,EAX;
			neg EAX;
			mov carry, EAX;
		}
		x[] = diff[];
		return carry;
	}
	else {
		// Portable path: chain word_sub across the eight positions.
		foreach (i; 0 .. 8)
			x[i] = word_sub(x[i], y[i], &carry);
		return carry;
	}
}
432 
/*
* Eight Word Block Subtraction, Two Argument (reversed operands)
*
* In-place reversed subtraction over eight words: x = y - x - carry,
* propagating the borrow from word 0 upwards. Returns the borrow out of
* the last word.
*/
word word8_sub2_rev(ref word[8] x, const ref word[8] y, word carry)
{
	foreach (i; 0 .. 8)
		x[i] = word_sub(y[i], x[i], &carry);
	return carry;
}
448 
/*
* Eight Word Block Subtraction, Three Argument
*
* Eight-word subtraction into a separate destination: z = x - y - carry,
* propagating the borrow from word 0 upwards. Returns the borrow out of
* the last word.
*
* z is not cleared beforehand: every word of z is stored by the code
* below, and clearing it first would wipe the inputs whenever z refers to
* the same storage as x or y (the portable path handles that case).
*/
word word8_sub3(ref word[8] z, const ref word[8] x, const ref word[8] y, word carry)
{
	version(D_InlineAsm_X86_64) {
		word* _z = z.ptr;
		word* _x = cast(word*)x.ptr;
		word* _y = cast(word*)y.ptr;
		asm pure nothrow @nogc {
			mov RBX,_x;
			mov RSI,_y;
			xor RAX,RAX;
			sub RAX,carry; // 0 - carry borrows, so CF is set iff carry != 0
			mov RDI,_z;    // mov does not touch CF

			// Each step: load x[i], subtract y[i] with borrow, store z[i].
			mov RAX,[RBX];
			sbb RAX,[RSI];
			mov [RDI],RAX;
			mov RAX,[RBX+8];
			sbb RAX,[RSI+8];
			mov [RDI+8],RAX;
			mov RAX,[RBX+16];
			sbb RAX,[RSI+16];
			mov [RDI+16],RAX;
			mov RAX,[RBX+24];
			sbb RAX,[RSI+24];
			mov [RDI+24],RAX;
			mov RAX,[RBX+32];
			sbb RAX,[RSI+32];
			mov [RDI+32],RAX;
			mov RAX,[RBX+40];
			sbb RAX,[RSI+40];
			mov [RDI+40],RAX;
			mov RAX,[RBX+48];
			sbb RAX,[RSI+48];
			mov [RDI+48],RAX;
			mov RAX,[RBX+56];
			sbb RAX,[RSI+56];
			mov [RDI+56],RAX;

			// Turn the final CF into 0 or 1.
			sbb RAX,RAX;
			neg RAX;
			mov carry, RAX;
		}
		return carry;
	} else version (D_InlineAsm_X86) {

		word* _z = z.ptr;
		word* _x = cast(word*)x.ptr;
		word* _y = cast(word*)y.ptr;
		asm pure nothrow @nogc {
			mov EBX,_x;
			mov ESI,_y;
			xor EAX,EAX;
			sub EAX,carry; // 0 - carry borrows, so CF is set iff carry != 0
			mov EDI,_z;    // mov does not touch CF

			// Each step: load x[i], subtract y[i] with borrow, store z[i].
			mov EAX,[EBX];
			sbb EAX,[ESI];
			mov [EDI],EAX;
			mov EAX,[EBX+4];
			sbb EAX,[ESI+4];
			mov [EDI+4],EAX;
			mov EAX,[EBX+8];
			sbb EAX,[ESI+8];
			mov [EDI+8],EAX;
			mov EAX,[EBX+12];
			sbb EAX,[ESI+12];
			mov [EDI+12],EAX;
			mov EAX,[EBX+16];
			sbb EAX,[ESI+16];
			mov [EDI+16],EAX;
			mov EAX,[EBX+20];
			sbb EAX,[ESI+20];
			mov [EDI+20],EAX;
			mov EAX,[EBX+24];
			sbb EAX,[ESI+24];
			mov [EDI+24],EAX;
			mov EAX,[EBX+28];
			sbb EAX,[ESI+28];
			mov [EDI+28],EAX;

			// Turn the final CF into 0 or 1.
			sbb EAX,EAX;
			neg EAX;
			mov carry, EAX;
		}
		return carry;
	}
	else {
		// Portable path: chain word_sub across the eight positions.
		z[0] = word_sub(x[0], y[0], &carry);
		z[1] = word_sub(x[1], y[1], &carry);
		z[2] = word_sub(x[2], y[2], &carry);
		z[3] = word_sub(x[3], y[3], &carry);
		z[4] = word_sub(x[4], y[4], &carry);
		z[5] = word_sub(x[5], y[5], &carry);
		z[6] = word_sub(x[6], y[6], &carry);
		z[7] = word_sub(x[7], y[7], &carry);
		return carry;
	}
}
548 
/*
* Eight Word Block Linear Multiplication
*
* In-place multiplication of eight words by a single word:
* x[i] = low word of (x[i] * y + carry), with the high word becoming the
* carry for the next position. Returns the carry out of the last word.
*/
word word8_linmul2(ref word[8] x, word y, word carry)
{
	version(D_InlineAsm_X86_64) {
		// The asm writes into a local buffer which is copied to x afterwards.
		word[8] scaled;
		word* src = x.ptr;
		word* dst = scaled.ptr;

		asm pure nothrow @nogc {
			mov RSI, src;
			mov RDI, dst;
			mov RCX, carry;      // RCX carries the running high word

			// word 0
			mov RAX, [RSI];
			mov RBX, y;
			mul RBX;             // RDX:RAX = x[0] * y
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI], RAX;

			// word 1
			mov RAX, [RSI+8];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+8], RAX;

			// word 2
			mov RAX, [RSI+16];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+16], RAX;

			// word 3
			mov RAX, [RSI+24];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+24], RAX;

			// word 4
			mov RAX, [RSI+32];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+32], RAX;

			// word 5
			mov RAX, [RSI+40];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+40], RAX;

			// word 6
			mov RAX, [RSI+48];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+48], RAX;

			// word 7: the high word goes straight to the returned carry
			mov RAX, [RSI+56];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov carry, RDX;
			mov [RDI+56], RAX;
		}
		x[] = scaled[];
		return carry;
	}
	else {
		// Portable path: chain word_madd2 across the eight positions.
		foreach (i; 0 .. 8)
			x[i] = word_madd2(x[i], y, &carry);
		return carry;
	}
}
642 
/*
* Eight Word Block Linear Multiplication
*
* Multiplies eight words by a single word into a separate destination:
* z[i] = low word of (x[i] * y + carry), with the high word becoming the
* carry for the next position. Returns the carry out of the last word.
*
* z is not cleared beforehand: every word of z is stored by the code
* below, and clearing it first would wipe the input whenever z refers to
* the same storage as x (the portable path handles that case).
*/
word word8_linmul3(ref word[8] z, const ref word[8] x, word y, word carry)
{
	version(D_InlineAsm_X86_64) {
		word* _x = cast(word*)x.ptr;
		word* _z = z.ptr;
		asm pure nothrow @nogc {
			mov RSI, _x;
			mov RDI, _z;
			mov RCX, carry;      // RCX carries the running high word

			// word 0
			mov RAX, [RSI];
			mov RBX, y;
			mul RBX;             // RDX:RAX = x[0] * y
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI], RAX;

			// word 1
			mov RAX, [RSI+8];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+8], RAX;

			// word 2
			mov RAX, [RSI+16];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+16], RAX;

			// word 3
			mov RAX, [RSI+24];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+24], RAX;

			// word 4
			mov RAX, [RSI+32];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+32], RAX;

			// word 5
			mov RAX, [RSI+40];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+40], RAX;

			// word 6
			mov RAX, [RSI+48];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+48], RAX;

			// word 7: the high word goes straight to the returned carry
			mov RAX, [RSI+56];
			mov RBX, y;
			mul RBX;
			add RAX, RCX;
			adc RDX, 0;
			mov carry, RDX;
			mov [RDI+56], RAX;
		}
		return carry;
	}
	else {
		// Portable path: chain word_madd2 across the eight positions.
		z[0] = word_madd2(x[0], y, &carry);
		z[1] = word_madd2(x[1], y, &carry);
		z[2] = word_madd2(x[2], y, &carry);
		z[3] = word_madd2(x[3], y, &carry);
		z[4] = word_madd2(x[4], y, &carry);
		z[5] = word_madd2(x[5], y, &carry);
		z[6] = word_madd2(x[6], y, &carry);
		z[7] = word_madd2(x[7], y, &carry);
		return carry;
	}
}
736 
/*
* Eight Word Block Multiply/Add
*
* Multiply-accumulate over eight words:
* z[i] = low word of (x[i] * y + z[i] + carry), with the high word
* becoming the carry for the next position. Returns the carry out of the
* last word.
*/
word word8_madd3(ref word[8] z, const ref word[8] x, word y, word carry)
{
	version(D_InlineAsm_X86_64) {
		word* _x = cast(word*)x.ptr;
		word* _z = z.ptr;
		// Results are staged in a local buffer and copied to z afterwards.
		word[8] ret; word* _z1 = ret.ptr;
		asm pure nothrow @nogc {
			mov R8, _x;          // R8 walks over x, advanced by 8 per word
			mov RSI, _z;         // RSI = current accumulator words (read only)
			mov R10, y;
			mov RDI, _z1;        // RDI = staging buffer (written)
			mov RCX, carry;      // RCX carries the running high word

			// word 0
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;             // RDX:RAX = x[0] * y
			add RAX, [RSI];      // + z[0]
			adc RDX, 0;
			add RAX, RCX;        // + carry
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI], RAX;
			add R8, 8;

			// word 1
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;
			add RAX, [RSI+8];
			adc RDX, 0;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+8], RAX;
			add R8, 8;

			// word 2
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;
			add RAX, [RSI+16];
			adc RDX, 0;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+16], RAX;
			add R8, 8;

			// word 3
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;
			add RAX, [RSI+24];
			adc RDX, 0;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+24], RAX;
			add R8, 8;

			// word 4
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;
			add RAX, [RSI+32];
			adc RDX, 0;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+32], RAX;
			add R8, 8;

			// word 5
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;
			add RAX, [RSI+40];
			adc RDX, 0;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			mov [RDI+40], RAX;
			add R8, 8;

			// word 6
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;
			add RAX, [RSI+48];
			adc RDX, 0;
			add RAX, RCX;
			adc RDX, 0;
			mov RCX, RDX;
			// Must address through the full 64-bit RDI; a 32-bit EDI base
			// would truncate the staging-buffer pointer.
			mov [RDI+48], RAX;
			add R8, 8;

			// word 7: the high word goes straight to the returned carry
			mov RAX, [R8];
			mov RBX, R10;
			mul RBX;
			add RAX, [RSI+56];
			adc RDX, 0;
			add RAX, RCX;
			adc RDX, 0;
			mov carry, RDX;
			mov [RDI+56], RAX;
		}
		z[0 .. 8] = ret[0..8];
		return carry;
	} else {
		// Portable path: chain word_madd3 across the eight positions.
		z[0] = word_madd3(x[0], y, z[0], &carry);
		z[1] = word_madd3(x[1], y, z[1], &carry);
		z[2] = word_madd3(x[2], y, z[2], &carry);
		z[3] = word_madd3(x[3], y, z[3], &carry);
		z[4] = word_madd3(x[4], y, z[4], &carry);
		z[5] = word_madd3(x[5], y, z[5], &carry);
		z[6] = word_madd3(x[6], y, z[6], &carry);
		z[7] = word_madd3(x[7], y, z[7], &carry);
		return carry;
	}
}
855 
/*
* Multiply-Add Accumulator
*
* Adds the double-width product a * b to the three-word accumulator
* (*w2, *w1, *w0), where *w0 is the least significant word:
* the low product word is added to *w0, the high product word plus carry
* to *w1, and the remaining carry to *w2.
*/
void word3_muladd(word* w2, word* w1, word* w0, word a, word b)
{
	version (D_InlineAsm_X86_64) {

		asm pure nothrow @nogc {
			mov R13, w0;
			mov R14, w1;
			mov R15, w2;
			mov RAX, a;
			mov RBX, b;
			mul RBX;             // RDX:RAX = a * b

			add [R13], RAX;      // *w0 += low word
			adc [R14], RDX;      // *w1 += high word + carry
			// Memory/immediate form: state the operand size explicitly so
			// the carry is added to the whole 64-bit word.
			adc qword ptr [R15], 0;

		}
	} else {
		// word_madd2 returns low(a*b + *w0) and leaves the high word in carry.
		word carry = *w0;
		*w0 = word_madd2(a, b, &carry);
		*w1 += carry;
		// If *w1 wrapped, it is now smaller than what was added to it.
		*w2 += (*w1 < carry) ? 1 : 0;
	}
}
883 
/*
* Multiply-Add Accumulator
*
* Adds twice the double-width product a * b to the three-word
* accumulator (*w2, *w1, *w0), where *w0 is the least significant word.
*/
void word3_muladd_2(word* w2, word* w1, word* w0, word a, word b)
{
	version(D_InlineAsm_X86_64) {

		asm pure nothrow @nogc {
			mov R13, w0;
			mov R14, w1;
			mov R15, w2;

			mov RAX, a;
			mov RBX, b;
			mul RBX;             // RDX:RAX = a * b

			// First addition of the product. The memory/immediate adc
			// states its operand size explicitly so the carry is added
			// to the whole 64-bit word.
			add [R13], RAX;
			adc [R14], RDX;
			adc qword ptr [R15], 0;

			// Second addition of the same product.
			add [R13], RAX;
			adc [R14], RDX;
			adc qword ptr [R15], 0;
		}
	}
	else {
		// (b, a) = a * b as a high/low word pair.
		word carry = 0;
		a = word_madd2(a, b, &carry);
		b = carry;

		// Double the product: shift (b, a) left by one bit, keeping the
		// bit shifted out of the high word in `top`.
		word top = (b >> (BOTAN_MP_WORD_BITS-1));
		b <<= 1;
		b |= (a >> (BOTAN_MP_WORD_BITS-1));
		a <<= 1;

		// Add the three-word doubled product into the accumulator.
		carry = 0;
		*w0 = word_add(*w0, a, &carry);
		*w1 = word_add(*w1, b, &carry);
		*w2 = word_add(*w2, top, &carry);
	}
}