/*
 * Written by Solar Designer <solar at openwall.com> in 1998-2010.
 * No copyright is claimed, and the software is hereby placed in the public
 * domain. In case this attempt to disclaim copyright and place the software
 * in the public domain is deemed null and void, then the software is
 * Copyright (c) 1998-2010 Solar Designer and it is hereby released to the
 * general public under the following terms:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * There's ABSOLUTELY NO WARRANTY, express or implied.
 *
 * See crypt_blowfish.c for more information.
 */
/*
 * Everything from here down to the matching #endif near the end of the file
 * is 32-bit x86 code and is only assembled when the compiler targets i386.
 */
#ifdef __i386__

/*
 * Per-platform quirks.
 *
 * UNDERSCORES: the exported symbol gets one extra leading underscore
 *              (presumably because these toolchains prepend an underscore
 *              to C symbol names -- confirm against the C-side declaration).
 * ALIGN_LOG:   the assembler's .align takes a power-of-two exponent rather
 *              than a byte count.
 */
#if defined(__OpenBSD__) && !defined(__ELF__)
#define UNDERSCORES
#define ALIGN_LOG
#endif

#if defined(__CYGWIN32__) || defined(__MINGW32__)
#define UNDERSCORES
#endif

#ifdef __DJGPP__
#define UNDERSCORES
#define ALIGN_LOG
#endif

#ifdef UNDERSCORES
#define _BF_body_r			__BF_body_r
#endif

/*
 * DO_ALIGN(log): align to 2^log bytes, spelled the way the target assembler
 * expects.  DUMBAS selects a form without parentheses, presumably for
 * assemblers with a limited expression parser.
 */
#ifdef ALIGN_LOG
#define DO_ALIGN(log)			.align (log)
#elif defined(DUMBAS)
#define DO_ALIGN(log)			.align 1 << log
#else
#define DO_ALIGN(log)			.align (1 << (log))
#endif
/*
 * Addressing of the Blowfish context relative to the stack pointer.
 *
 * While the main routine runs, %esp (ctx) points BF_FRAME bytes below the
 * start of the context, so context data is reached with %esp-relative
 * displacements:
 *
 *   BF_ptr   the word at (%esp): cursor to the next pair of words to store
 *   S(N, r)  the 32-bit word at byte offset N + 4*r from the context start
 *            (N = 0, 0x400, 0x800, 0xC00 selects one of the four S-boxes)
 *   P(N)     the N-th 32-bit word of the P-array, which starts at byte
 *            offset 0x1000 of the context
 */
#define BF_FRAME			0x200
#define ctx				%esp

#define BF_ptr				(ctx)

#define S(N, r)				N+BF_FRAME(ctx,r,4)
#ifdef DUMBAS
/* Same value as below, written without a multiplication. */
#define P(N)				0x1000+N+N+N+N+BF_FRAME(ctx)
#else
#define P(N)				0x1000+4*N+BF_FRAME(ctx)
#endif
/*
 * This version of the assembly code is optimized primarily for the original
 * Intel Pentium but is also careful to avoid partial register stalls on the
 * Pentium Pro family of processors (tested up to Pentium III Coppermine).
 *
 * It is possible to do 15% faster on the Pentium Pro family and probably on
 * many non-Intel x86 processors, but, unfortunately, that would make things
 * twice as slow for the original Pentium.
 *
 * An additional 2% speedup may be achieved with non-reentrant code.
 */
/*
 * Register roles.
 *
 *   L, R     the two 32-bit halves of the block being encrypted
 *   tmp1..5  scratch registers used by BF_ROUND and the main loops
 *   *_lo     bits 0-7 of the corresponding 32-bit register
 *   *_hi     bits 8-15 of the corresponding 32-bit register
 *
 * tmp3 is only ever written through tmp3_lo inside a round, so its upper
 * 24 bits must already be zero when a round starts.
 */
#define L				%esi
#define R				%edi
#define tmp1				%eax
#define tmp1_lo				%al
#define tmp2				%ecx
#define tmp2_hi				%ch
#define tmp3				%edx
#define tmp3_lo				%dl
#define tmp4				%ebx
#define tmp4_hi				%bh
#define tmp5				%ebp

.text
/*
 * BF_ROUND(L, R, N): one Blowfish round.
 *
 * On entry tmp2 holds the P-array word for this round and the upper 24 bits
 * of tmp3 are zero.  The macro computes
 *
 *   L ^= tmp2
 *   R ^= ((S0[L >> 24] + S1[(L >> 16) & 0xFF]) ^ S2[(L >> 8) & 0xFF])
 *        + S3[L & 0xFF]
 *
 * and leaves P[N + 1] in tmp2 for the next round.
 *
 * The parameters L and R shadow the register aliases of the same name, so
 * callers swap the two halves simply by swapping the arguments.
 *
 * Clobbers: tmp1, tmp2, tmp4, tmp5, the low byte of tmp3, and the flags.
 */
#define BF_ROUND(L, R, N) \
	xorl L,tmp2; \
	xorl tmp1,tmp1; \
	movl tmp2,L; \
	shrl $16,tmp2; \
	movl L,tmp4; \
	movb tmp2_hi,tmp1_lo; \
	andl $0xFF,tmp2; \
	movb tmp4_hi,tmp3_lo; \
	andl $0xFF,tmp4; \
	movl S(0,tmp1),tmp1; \
	movl S(0x400,tmp2),tmp5; \
	addl tmp5,tmp1; \
	movl S(0x800,tmp3),tmp5; \
	xorl tmp5,tmp1; \
	movl S(0xC00,tmp4),tmp5; \
	addl tmp1,tmp5; \
	movl 4+P(N),tmp2; \
	xorl tmp5,R
/*
 * BF_ENCRYPT_START: the 16 rounds of one block encryption plus the first
 * half of the final step.
 *
 * On entry tmp2 must hold P[0] and the upper 24 bits of tmp3 must be zero.
 * After round 15, tmp2 holds P[16]; the tail then
 *   - reloads the store cursor from BF_ptr into tmp5,
 *   - forms L ^ P[16] in tmp2,
 *   - loads P[17] into L.
 * BF_ENCRYPT_END finishes the job.  The two halves are separate macros so a
 * caller can place an instruction that adjusts tmp5 between them.
 */
#define BF_ENCRYPT_START \
	BF_ROUND(L, R, 0); \
	BF_ROUND(R, L, 1); \
	BF_ROUND(L, R, 2); \
	BF_ROUND(R, L, 3); \
	BF_ROUND(L, R, 4); \
	BF_ROUND(R, L, 5); \
	BF_ROUND(L, R, 6); \
	BF_ROUND(R, L, 7); \
	BF_ROUND(L, R, 8); \
	BF_ROUND(R, L, 9); \
	BF_ROUND(L, R, 10); \
	BF_ROUND(R, L, 11); \
	BF_ROUND(L, R, 12); \
	BF_ROUND(R, L, 13); \
	BF_ROUND(L, R, 14); \
	BF_ROUND(R, L, 15); \
	movl BF_ptr,tmp5; \
	xorl L,tmp2; \
	movl P(17),L
/*
 * BF_ENCRYPT_END: second half of the final step of a block encryption.
 *
 * Expects the state left by BF_ENCRYPT_START: L holds P[17], tmp2 holds
 * (old L) ^ P[16].  Afterwards L = P[17] ^ (old R) and R = (old L) ^ P[16],
 * i.e. the encrypted block with its halves swapped into place.
 */
#define BF_ENCRYPT_END \
	xorl R,L; \
	movl tmp2,R
/*
 * _BF_body_r
 *
 * In:   4(%esp) = address of the Blowfish context (first stack argument).
 *       NOTE(review): the C prototype is not visible in this file;
 *       presumably it is declared in crypt_blowfish.c -- confirm there.
 *       Layout used here: four S-boxes in the first 0x1000 bytes, then the
 *       18-word P-array at offset 0x1000.
 * Out:  the P-array and all four S-boxes are overwritten in place with a
 *       chain of encryptions that starts from the all-zero block.
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx and
 * the flags.
 *
 * The routine repoints %esp to BF_FRAME bytes below the context so that all
 * context accesses use %esp-relative addressing.  The caller's %esp is kept
 * at 4(%esp) and the store cursor (BF_ptr) at (%esp).
 */
DO_ALIGN(5)
.globl _BF_body_r
_BF_body_r:
	movl 4(%esp),%eax		/* eax = context address */
	pushl %ebp
	pushl %ebx
	pushl %esi
	pushl %edi
	subl $BF_FRAME-8,%eax		/* new %esp before the two pushes below */
	xorl L,L			/* start from the all-zero block */
	cmpl %esp,%eax
	ja BF_die			/* new %esp would be above the current one */
	xchgl %eax,%esp			/* switch stacks; eax = caller's %esp */
	xorl R,R
	pushl %eax			/* caller's %esp, ends up at 4(%esp) */
	leal 0x1000+BF_FRAME-4(ctx),%eax	/* eax = &P[0] (one push still pending) */
	movl 0x1000+BF_FRAME-4(ctx),tmp2	/* tmp2 = P[0] */
	pushl %eax			/* BF_ptr = &P[0]; %esp = context - BF_FRAME */
	xorl tmp3,tmp3			/* rounds need tmp3's upper bits clear */

/*
 * Replace the P-array: each pass encrypts the running block and stores the
 * result into the next two P words, until the cursor reaches &P[18].
 */
BF_loop_P:
	BF_ENCRYPT_START
	addl $8,tmp5			/* advance cursor past the pair stored below */
	BF_ENCRYPT_END
	leal 0x1000+18*4+BF_FRAME(ctx),tmp1	/* tmp1 = end of the P-array */
	movl tmp5,BF_ptr
	cmpl tmp5,tmp1			/* flags consumed by ja; movl leaves them alone */
	movl L,-8(tmp5)
	movl R,-4(tmp5)
	movl P(0),tmp2			/* reload P[0] for the next encryption */
	ja BF_loop_P

	leal BF_FRAME(ctx),tmp5		/* cursor = start of the S-boxes */
	xorl tmp3,tmp3
	movl tmp5,BF_ptr

/*
 * Replace the S-boxes: four chained encryptions per pass fill 32 bytes,
 * until the cursor reaches the end of the 0x1000-byte S-box area.
 * BF_ptr is only updated once per pass, so the first three stores use
 * positive offsets from the unchanged cursor.
 */
BF_loop_S:
	BF_ENCRYPT_START
	BF_ENCRYPT_END
	movl P(0),tmp2
	movl L,(tmp5)
	movl R,4(tmp5)
	BF_ENCRYPT_START
	BF_ENCRYPT_END
	movl P(0),tmp2
	movl L,8(tmp5)
	movl R,12(tmp5)
	BF_ENCRYPT_START
	BF_ENCRYPT_END
	movl P(0),tmp2
	movl L,16(tmp5)
	movl R,20(tmp5)
	BF_ENCRYPT_START
	addl $32,tmp5			/* advance cursor by one full pass */
	BF_ENCRYPT_END
	leal 0x1000+BF_FRAME(ctx),tmp1	/* tmp1 = end of the S-boxes */
	movl tmp5,BF_ptr
	cmpl tmp5,tmp1
	movl P(0),tmp2
	movl L,-8(tmp5)
	movl R,-4(tmp5)
	ja BF_loop_S

	movl 4(%esp),%esp		/* back to the caller's stack */
	popl %edi
	popl %esi
	popl %ebx
	popl %ebp
	ret
/*
 * BF_die: reached from _BF_body_r when the stack pointer it wants to switch
 * to lies above the current one.  hlt is a privileged instruction, so in
 * user mode this is expected to fault; the jmp keeps control here should
 * execution ever resume.
 */
BF_die:
/* Oops, need to re-compile with a larger BF_FRAME. */
	hlt
	jmp BF_die
#endif

/*
 * On ELF/Linux, emit an empty .note.GNU-stack section.  GNU toolchains
 * conventionally read this as "this object does not need an executable
 * stack".
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif