-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
crypto: twofish - add 3-way parallel x86_64 assembler implemention
Patch adds 3-way parallel x86_64 assembly implementation of twofish as new module. New assembler functions crypt data in three blocks chunks, improving cipher performance on out-of-order CPUs. Patch has been tested with tcrypt and automated filesystem tests. Summary of the tcrypt benchmarks: Twofish 3-way-asm vs twofish asm (128bit 8kb block ECB) encrypt: 1.3x speed decrypt: 1.3x speed Twofish 3-way-asm vs twofish asm (128bit 8kb block CBC) encrypt: 1.07x speed decrypt: 1.4x speed Twofish 3-way-asm vs twofish asm (128bit 8kb block CTR) encrypt: 1.4x speed Twofish 3-way-asm vs AES asm (128bit 8kb block ECB) encrypt: 1.0x speed decrypt: 1.0x speed Twofish 3-way-asm vs AES asm (128bit 8kb block CBC) encrypt: 0.84x speed decrypt: 1.09x speed Twofish 3-way-asm vs AES asm (128bit 8kb block CTR) encrypt: 1.15x speed Full output: http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-3way-asm-x86_64.txt http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-asm-x86_64.txt http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-aes-asm-x86_64.txt Tests were run on: vendor_id : AuthenticAMD cpu family : 16 model : 10 model name : AMD Phenom(tm) II X6 1055T Processor Also userspace test were run on: vendor_id : GenuineIntel cpu family : 6 model : 15 model name : Intel(R) Xeon(R) CPU E7330 @ 2.40GHz stepping : 11 Userspace test results: Encryption/decryption of twofish 3-way vs x86_64-asm on AMD Phenom II: encrypt: 1.27x decrypt: 1.25x Encryption/decryption of twofish 3-way vs x86_64-asm on Intel Xeon E7330: encrypt: 1.36x decrypt: 1.36x Signed-off-by: Jussi Kivilinna <[email protected]> Signed-off-by: Herbert Xu <[email protected]>
- Loading branch information
Showing
4 changed files
with
810 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,316 @@ | ||
/* | ||
* Twofish Cipher 3-way parallel algorithm (x86_64) | ||
* | ||
* Copyright (C) 2011 Jussi Kivilinna <[email protected]> | ||
* | ||
* This program is free software; you can redistribute it and/or modify | ||
* it under the terms of the GNU General Public License as published by | ||
* the Free Software Foundation; either version 2 of the License, or | ||
* (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
* GNU General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU General Public License | ||
* along with this program; if not, write to the Free Software | ||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | ||
* USA | ||
* | ||
*/ | ||
|
||
.file "twofish-x86_64-asm-3way.S" | ||
.text | ||
|
||
/* structure of crypto context */ | ||
#define s0 0 | ||
#define s1 1024 | ||
#define s2 2048 | ||
#define s3 3072 | ||
#define w 4096 | ||
#define k 4128 | ||
|
||
/********************************************************************** | ||
3-way twofish | ||
**********************************************************************/ | ||
#define CTX %rdi | ||
#define RIO %rdx | ||
|
||
#define RAB0 %rax | ||
#define RAB1 %rbx | ||
#define RAB2 %rcx | ||
|
||
#define RAB0d %eax | ||
#define RAB1d %ebx | ||
#define RAB2d %ecx | ||
|
||
#define RAB0bh %ah | ||
#define RAB1bh %bh | ||
#define RAB2bh %ch | ||
|
||
#define RAB0bl %al | ||
#define RAB1bl %bl | ||
#define RAB2bl %cl | ||
|
||
#define RCD0 %r8 | ||
#define RCD1 %r9 | ||
#define RCD2 %r10 | ||
|
||
#define RCD0d %r8d | ||
#define RCD1d %r9d | ||
#define RCD2d %r10d | ||
|
||
#define RX0 %rbp | ||
#define RX1 %r11 | ||
#define RX2 %r12 | ||
|
||
#define RX0d %ebp | ||
#define RX1d %r11d | ||
#define RX2d %r12d | ||
|
||
#define RY0 %r13 | ||
#define RY1 %r14 | ||
#define RY2 %r15 | ||
|
||
#define RY0d %r13d | ||
#define RY1d %r14d | ||
#define RY2d %r15d | ||
|
||
#define RT0 %rdx | ||
#define RT1 %rsi | ||
|
||
#define RT0d %edx | ||
#define RT1d %esi | ||
|
||
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ | ||
movzbl ab ## bl, tmp2 ## d; \ | ||
movzbl ab ## bh, tmp1 ## d; \ | ||
rorq $(rot), ab; \ | ||
op1##l T0(CTX, tmp2, 4), dst ## d; \ | ||
op2##l T1(CTX, tmp1, 4), dst ## d; | ||
|
||
/* | ||
* Combined G1 & G2 function. Reordered with help of rotates to have moves | ||
* at begining. | ||
*/ | ||
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ | ||
/* G1,1 && G2,1 */ \ | ||
do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ | ||
do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ | ||
\ | ||
do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ | ||
do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ | ||
\ | ||
do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ | ||
do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ | ||
\ | ||
/* G1,2 && G2,2 */ \ | ||
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ | ||
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ | ||
xchgq cd ## 0, ab ## 0; \ | ||
\ | ||
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ | ||
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ | ||
xchgq cd ## 1, ab ## 1; \ | ||
\ | ||
do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ | ||
do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ | ||
xchgq cd ## 2, ab ## 2; | ||
|
||
#define enc_round_end(ab, x, y, n) \ | ||
addl y ## d, x ## d; \ | ||
addl x ## d, y ## d; \ | ||
addl k+4*(2*(n))(CTX), x ## d; \ | ||
xorl ab ## d, x ## d; \ | ||
addl k+4*(2*(n)+1)(CTX), y ## d; \ | ||
shrq $32, ab; \ | ||
roll $1, ab ## d; \ | ||
xorl y ## d, ab ## d; \ | ||
shlq $32, ab; \ | ||
rorl $1, x ## d; \ | ||
orq x, ab; | ||
|
||
#define dec_round_end(ba, x, y, n) \ | ||
addl y ## d, x ## d; \ | ||
addl x ## d, y ## d; \ | ||
addl k+4*(2*(n))(CTX), x ## d; \ | ||
addl k+4*(2*(n)+1)(CTX), y ## d; \ | ||
xorl ba ## d, y ## d; \ | ||
shrq $32, ba; \ | ||
roll $1, ba ## d; \ | ||
xorl x ## d, ba ## d; \ | ||
shlq $32, ba; \ | ||
rorl $1, y ## d; \ | ||
orq y, ba; | ||
|
||
#define encrypt_round3(ab, cd, n) \ | ||
g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ | ||
\ | ||
enc_round_end(ab ## 0, RX0, RY0, n); \ | ||
enc_round_end(ab ## 1, RX1, RY1, n); \ | ||
enc_round_end(ab ## 2, RX2, RY2, n); | ||
|
||
#define decrypt_round3(ba, dc, n) \ | ||
g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ | ||
\ | ||
dec_round_end(ba ## 0, RX0, RY0, n); \ | ||
dec_round_end(ba ## 1, RX1, RY1, n); \ | ||
dec_round_end(ba ## 2, RX2, RY2, n); | ||
|
||
#define encrypt_cycle3(ab, cd, n) \ | ||
encrypt_round3(ab, cd, n*2); \ | ||
encrypt_round3(ab, cd, (n*2)+1); | ||
|
||
#define decrypt_cycle3(ba, dc, n) \ | ||
decrypt_round3(ba, dc, (n*2)+1); \ | ||
decrypt_round3(ba, dc, (n*2)); | ||
|
||
#define inpack3(in, n, xy, m) \ | ||
movq 4*(n)(in), xy ## 0; \ | ||
xorq w+4*m(CTX), xy ## 0; \ | ||
\ | ||
movq 4*(4+(n))(in), xy ## 1; \ | ||
xorq w+4*m(CTX), xy ## 1; \ | ||
\ | ||
movq 4*(8+(n))(in), xy ## 2; \ | ||
xorq w+4*m(CTX), xy ## 2; | ||
|
||
#define outunpack3(op, out, n, xy, m) \ | ||
xorq w+4*m(CTX), xy ## 0; \ | ||
op ## q xy ## 0, 4*(n)(out); \ | ||
\ | ||
xorq w+4*m(CTX), xy ## 1; \ | ||
op ## q xy ## 1, 4*(4+(n))(out); \ | ||
\ | ||
xorq w+4*m(CTX), xy ## 2; \ | ||
op ## q xy ## 2, 4*(8+(n))(out); | ||
|
||
#define inpack_enc3() \ | ||
inpack3(RIO, 0, RAB, 0); \ | ||
inpack3(RIO, 2, RCD, 2); | ||
|
||
#define outunpack_enc3(op) \ | ||
outunpack3(op, RIO, 2, RAB, 6); \ | ||
outunpack3(op, RIO, 0, RCD, 4); | ||
|
||
#define inpack_dec3() \ | ||
inpack3(RIO, 0, RAB, 4); \ | ||
rorq $32, RAB0; \ | ||
rorq $32, RAB1; \ | ||
rorq $32, RAB2; \ | ||
inpack3(RIO, 2, RCD, 6); \ | ||
rorq $32, RCD0; \ | ||
rorq $32, RCD1; \ | ||
rorq $32, RCD2; | ||
|
||
#define outunpack_dec3() \ | ||
rorq $32, RCD0; \ | ||
rorq $32, RCD1; \ | ||
rorq $32, RCD2; \ | ||
outunpack3(mov, RIO, 0, RCD, 0); \ | ||
rorq $32, RAB0; \ | ||
rorq $32, RAB1; \ | ||
rorq $32, RAB2; \ | ||
outunpack3(mov, RIO, 2, RAB, 2); | ||
|
||
.align 8 | ||
.global __twofish_enc_blk_3way | ||
.type __twofish_enc_blk_3way,@function; | ||
|
||
__twofish_enc_blk_3way: | ||
/* input: | ||
* %rdi: ctx, CTX | ||
* %rsi: dst | ||
* %rdx: src, RIO | ||
* %rcx: bool, if true: xor output | ||
*/ | ||
pushq %r15; | ||
pushq %r14; | ||
pushq %r13; | ||
pushq %r12; | ||
pushq %rbp; | ||
pushq %rbx; | ||
|
||
pushq %rcx; /* bool xor */ | ||
pushq %rsi; /* dst */ | ||
|
||
inpack_enc3(); | ||
|
||
encrypt_cycle3(RAB, RCD, 0); | ||
encrypt_cycle3(RAB, RCD, 1); | ||
encrypt_cycle3(RAB, RCD, 2); | ||
encrypt_cycle3(RAB, RCD, 3); | ||
encrypt_cycle3(RAB, RCD, 4); | ||
encrypt_cycle3(RAB, RCD, 5); | ||
encrypt_cycle3(RAB, RCD, 6); | ||
encrypt_cycle3(RAB, RCD, 7); | ||
|
||
popq RIO; /* dst */ | ||
popq %rbp; /* bool xor */ | ||
|
||
testb %bpl, %bpl; | ||
jnz __enc_xor3; | ||
|
||
outunpack_enc3(mov); | ||
|
||
popq %rbx; | ||
popq %rbp; | ||
popq %r12; | ||
popq %r13; | ||
popq %r14; | ||
popq %r15; | ||
ret; | ||
|
||
__enc_xor3: | ||
outunpack_enc3(xor); | ||
|
||
popq %rbx; | ||
popq %rbp; | ||
popq %r12; | ||
popq %r13; | ||
popq %r14; | ||
popq %r15; | ||
ret; | ||
|
||
.global twofish_dec_blk_3way | ||
.type twofish_dec_blk_3way,@function; | ||
|
||
twofish_dec_blk_3way: | ||
/* input: | ||
* %rdi: ctx, CTX | ||
* %rsi: dst | ||
* %rdx: src, RIO | ||
*/ | ||
pushq %r15; | ||
pushq %r14; | ||
pushq %r13; | ||
pushq %r12; | ||
pushq %rbp; | ||
pushq %rbx; | ||
|
||
pushq %rsi; /* dst */ | ||
|
||
inpack_dec3(); | ||
|
||
decrypt_cycle3(RAB, RCD, 7); | ||
decrypt_cycle3(RAB, RCD, 6); | ||
decrypt_cycle3(RAB, RCD, 5); | ||
decrypt_cycle3(RAB, RCD, 4); | ||
decrypt_cycle3(RAB, RCD, 3); | ||
decrypt_cycle3(RAB, RCD, 2); | ||
decrypt_cycle3(RAB, RCD, 1); | ||
decrypt_cycle3(RAB, RCD, 0); | ||
|
||
popq RIO; /* dst */ | ||
|
||
outunpack_dec3(); | ||
|
||
popq %rbx; | ||
popq %rbp; | ||
popq %r12; | ||
popq %r13; | ||
popq %r14; | ||
popq %r15; | ||
ret; | ||
|
Oops, something went wrong.