Skip to content

Commit

Permalink
crypto: serpent-sse2 - change transpose_4x4 to only use integer instr…
Browse files Browse the repository at this point in the history
…uctions

Matrix transpose macro in serpent-sse2 uses mix of SSE2 integer and SSE floating
point instructions, which might cause performance penality on some CPUs.

This patch replaces transpose_4x4 macro with version that uses only SSE2
integer instructions.

Signed-off-by: Jussi Kivilinna <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
  • Loading branch information
jkivilin authored and herbertx committed Jan 13, 2012
1 parent 4c58464 commit 847cb7e
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 32 deletions.
29 changes: 13 additions & 16 deletions arch/x86/crypto/serpent-sse2-i586-asm_32.S
Original file line number Diff line number Diff line change
Expand Up @@ -463,23 +463,20 @@
pand x0, x4; \
pxor x2, x4;

#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
movdqa x2, t3; \
movdqa x0, t1; \
unpcklps x3, t3; \
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
movdqa x0, t2; \
unpcklps x1, t1; \
unpckhps x1, t2; \
movdqa t3, x1; \
unpckhps x3, x2; \
movdqa t1, x0; \
movhlps t1, x1; \
movdqa t2, t1; \
movlhps t3, x0; \
movlhps x2, t1; \
movhlps t2, x2; \
movdqa x2, x3; \
movdqa t1, x2;
punpckldq x1, x0; \
punpckhdq x1, t2; \
movdqa x2, t1; \
punpckhdq x3, x2; \
punpckldq x3, t1; \
movdqa x0, x1; \
punpcklqdq t1, x0; \
punpckhqdq t1, x1; \
movdqa t2, x3; \
punpcklqdq x2, t2; \
punpckhqdq x2, x3; \
movdqa t2, x2;

#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
movdqu (0*4*4)(in), x0; \
Expand Down
29 changes: 13 additions & 16 deletions arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
Original file line number Diff line number Diff line change
Expand Up @@ -585,23 +585,20 @@
get_key(i, 1, RK1); \
SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
movdqa x2, t3; \
movdqa x0, t1; \
unpcklps x3, t3; \
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
movdqa x0, t2; \
unpcklps x1, t1; \
unpckhps x1, t2; \
movdqa t3, x1; \
unpckhps x3, x2; \
movdqa t1, x0; \
movhlps t1, x1; \
movdqa t2, t1; \
movlhps t3, x0; \
movlhps x2, t1; \
movhlps t2, x2; \
movdqa x2, x3; \
movdqa t1, x2;
punpckldq x1, x0; \
punpckhdq x1, t2; \
movdqa x2, t1; \
punpckhdq x3, x2; \
punpckldq x3, t1; \
movdqa x0, x1; \
punpcklqdq t1, x0; \
punpckhqdq t1, x1; \
movdqa t2, x3; \
punpcklqdq x2, t2; \
punpckhqdq x2, x3; \
movdqa t2, x2;

#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
movdqu (0*4*4)(in), x0; \
Expand Down

0 comments on commit 847cb7e

Please sign in to comment.