/*

* Written by Solar Designer <solar at openwall.com> in 1998-2010.
* No copyright is claimed, and the software is hereby placed in the public
* domain.  In case this attempt to disclaim copyright and place the software
* in the public domain is deemed null and void, then the software is
* Copyright (c) 1998-2010 Solar Designer and it is hereby released to the
* general public under the following terms:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* There's ABSOLUTELY NO WARRANTY, express or implied.
*
* See crypt_blowfish.c for more information.
*/

ifdef i386

if defined(__OpenBSD__) && !defined(__ELF__) define UNDERSCORES define ALIGN_LOG endif

if defined(CYGWIN32) || defined(MINGW32) define UNDERSCORES endif

ifdef __DJGPP__ define UNDERSCORES define ALIGN_LOG endif

ifdef UNDERSCORES define _BF_body_r __BF_body_r endif

ifdef ALIGN_LOG define DO_ALIGN(log) .align (log) elif defined(DUMBAS) define DO_ALIGN(log) .align 1 << log else define DO_ALIGN(log) .align (1 << (log)) endif

define BF_FRAME 0x200 define ctx %esp

define BF_ptr (ctx)

define S(N, r) N+BF_FRAME(ctx,r,4) ifdef DUMBAS define P(N) 0x1000+N+N+N+N+BF_FRAME(ctx) else define P(N) 0x1000+4*N+BF_FRAME(ctx) endif

/*

* This version of the assembly code is optimized primarily for the original
* Intel Pentium but is also careful to avoid partial register stalls on the
* Pentium Pro family of processors (tested up to Pentium III Coppermine).
*
* It is possible to do 15% faster on the Pentium Pro family and probably on
* many non-Intel x86 processors, but, unfortunately, that would make things
* twice slower for the original Pentium.
*
* An additional 2% speedup may be achieved with non-reentrant code.
*/

define L %esi define R %edi define tmp1 %eax define tmp1_lo %al define tmp2 %ecx define tmp2_hi %ch define tmp3 %edx define tmp3_lo %dl define tmp4 %ebx define tmp4_hi %bh define tmp5 %ebp

.text

define BF_ROUND(L, R, N) \

xorl L,tmp2; \
xorl tmp1,tmp1; \
movl tmp2,L; \
shrl $16,tmp2; \
movl L,tmp4; \
movb tmp2_hi,tmp1_lo; \
andl $0xFF,tmp2; \
movb tmp4_hi,tmp3_lo; \
andl $0xFF,tmp4; \
movl S(0,tmp1),tmp1; \
movl S(0x400,tmp2),tmp5; \
addl tmp5,tmp1; \
movl S(0x800,tmp3),tmp5; \
xorl tmp5,tmp1; \
movl S(0xC00,tmp4),tmp5; \
addl tmp1,tmp5; \
movl 4+P(N),tmp2; \
xorl tmp5,R

define BF_ENCRYPT_START \

BF_ROUND(L, R, 0); \
BF_ROUND(R, L, 1); \
BF_ROUND(L, R, 2); \
BF_ROUND(R, L, 3); \
BF_ROUND(L, R, 4); \
BF_ROUND(R, L, 5); \
BF_ROUND(L, R, 6); \
BF_ROUND(R, L, 7); \
BF_ROUND(L, R, 8); \
BF_ROUND(R, L, 9); \
BF_ROUND(L, R, 10); \
BF_ROUND(R, L, 11); \
BF_ROUND(L, R, 12); \
BF_ROUND(R, L, 13); \
BF_ROUND(L, R, 14); \
BF_ROUND(R, L, 15); \
movl BF_ptr,tmp5; \
xorl L,tmp2; \
movl P(17),L

define BF_ENCRYPT_END \

xorl R,L; \
movl tmp2,R

DO_ALIGN(5) .globl _BF_body_r _BF_body_r:

movl 4(%esp),%eax
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
subl $BF_FRAME-8,%eax
xorl L,L
cmpl %esp,%eax
ja BF_die
xchgl %eax,%esp
xorl R,R
pushl %eax
leal 0x1000+BF_FRAME-4(ctx),%eax
movl 0x1000+BF_FRAME-4(ctx),tmp2
pushl %eax
xorl tmp3,tmp3

BF_loop_P:

BF_ENCRYPT_START
addl $8,tmp5
BF_ENCRYPT_END
leal 0x1000+18*4+BF_FRAME(ctx),tmp1
movl tmp5,BF_ptr
cmpl tmp5,tmp1
movl L,-8(tmp5)
movl R,-4(tmp5)
movl P(0),tmp2
ja BF_loop_P
leal BF_FRAME(ctx),tmp5
xorl tmp3,tmp3
movl tmp5,BF_ptr

BF_loop_S:

BF_ENCRYPT_START
BF_ENCRYPT_END
movl P(0),tmp2
movl L,(tmp5)
movl R,4(tmp5)
BF_ENCRYPT_START
BF_ENCRYPT_END
movl P(0),tmp2
movl L,8(tmp5)
movl R,12(tmp5)
BF_ENCRYPT_START
BF_ENCRYPT_END
movl P(0),tmp2
movl L,16(tmp5)
movl R,20(tmp5)
BF_ENCRYPT_START
addl $32,tmp5
BF_ENCRYPT_END
leal 0x1000+BF_FRAME(ctx),tmp1
movl tmp5,BF_ptr
cmpl tmp5,tmp1
movl P(0),tmp2
movl L,-8(tmp5)
movl R,-4(tmp5)
ja BF_loop_S
movl 4(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret

BF_die: /* Oops, need to re-compile with a larger BF_FRAME. */

hlt
jmp BF_die

endif

if defined(__ELF__) && defined(__linux__) .section .note.GNU-stack,“”,%progbits endif