SDL2_gfx 1.0.2
Graphics primitives and surface functions for SDL2
/builddir/build/BUILD/SDL2_gfx-1.0.4/SDL2_imageFilter.c
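A minimal usage sketch (illustrative, not part of this file): the filter routines defined below operate on plain byte buffers of equal length and return 0 on success, -1 on bad arguments. Buffer names and sizes here are assumptions.

#include <string.h>
#include "SDL2_imageFilter.h"

static void example_saturated_add(void)
{
	unsigned char a[16], b[16], out[16];
	memset(a, 200, sizeof(a));
	memset(b, 100, sizeof(b));
	/* out[i] = a[i] + b[i], saturated at 255 */
	if (SDL_imageFilterAdd(a, b, out, (unsigned int) sizeof(out)) == 0) {
		/* every out[i] is 255 here, since 200 + 100 saturates */
	}
}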
1/*
2
3SDL2_imageFilter.c: byte-image "filter" routines
4
5Copyright (C) 2012-2014 Andreas Schiffler
6Copyright (C) 2013 Sylvain Beucler
7
8This software is provided 'as-is', without any express or implied
9warranty. In no event will the authors be held liable for any damages
10arising from the use of this software.
11
12Permission is granted to anyone to use this software for any purpose,
13including commercial applications, and to alter it and redistribute it
14freely, subject to the following restrictions:
15
16 1. The origin of this software must not be misrepresented; you must not
17 claim that you wrote the original software. If you use this software
18 in a product, an acknowledgment in the product documentation would be
19 appreciated but is not required.
20
21 2. Altered source versions must be plainly marked as such, and must not be
22 misrepresented as being the original software.
23
24 3. This notice may not be removed or altered from any source
25 distribution.
26
27Andreas Schiffler -- aschiffler at ferzkopp dot net
28
29*/
30
31/*
32
33Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
34
35Note: Most of the MMX code is based on published routines
36by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to
37him for his work.
38
39*/
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <string.h>
44
45#include "SDL.h"
46
47/* Use GCC intrinsics if available: they support both i386 and x86_64,
48 provide ASM-grade performance, and avoid the PUSHA/POPA issues. */
49#ifdef __GNUC__
50# ifdef USE_MMX
51# include <mmintrin.h>
52# endif
53# include <SDL_cpuinfo.h>
54#endif
55
56#include "SDL2_imageFilter.h"
57
61#define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8) | (((x) & 0x0000ff00) << 8) | ((x) << 24))
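/* For example, SWAP_32(0x12345678) evaluates to 0x78563412 (byte order reversed);
   SDL_imageFilterAddUint() below uses it to pre-swap its per-byte constant for the MMX path. */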
62
63/* ------ Static variables ----- */
64
68static int SDL_imageFilterUseMMX = 1;
69
70/* Detect GCC */
71#if defined(__GNUC__)
72#define GCC__
73#endif
74
80int SDL_imageFilterMMXdetect(void)
81{
82 /* Check override flag */
83 if (SDL_imageFilterUseMMX == 0) {
84 return (0);
85 }
86
87 return SDL_HasMMX();
88}
89
93void SDL_imageFilterMMXoff(void)
94{
95 SDL_imageFilterUseMMX = 0;
96}
97
101void SDL_imageFilterMMXon(void)
102{
103 SDL_imageFilterUseMMX = 1;
104}
105
106/* ------------------------------------------------------------------------------------ */
107
118static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
119{
120#ifdef USE_MMX
121#if !defined(GCC__)
122 __asm
123 {
124 pusha
125 mov eax, Src1 /* load Src1 address into eax */
126 mov ebx, Src2 /* load Src2 address into ebx */
127 mov edi, Dest /* load Dest address into edi */
128 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
129 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
130 align 16 /* 16 byte alignment of the loop entry */
131L1010:
132 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
133 paddusb mm1, [ebx] /* mm1=Src1+Src2 (add 8 bytes with saturation) */
134 movq [edi], mm1 /* store result in Dest */
135 add eax, 8 /* increase Src1, Src2 and Dest */
136 add ebx, 8 /* register pointers by 8 */
137 add edi, 8
138 dec ecx /* decrease loop counter */
139 jnz L1010 /* check loop termination, proceed if required */
140 emms /* exit MMX state */
141 popa
142 }
143#else
144 /* i386 and x86_64 */
145 __m64 *mSrc1 = (__m64*)Src1;
146 __m64 *mSrc2 = (__m64*)Src2;
147 __m64 *mDest = (__m64*)Dest;
148 int i;
149 for (i = 0; i < SrcLength/8; i++) {
150 *mDest = _m_paddusb(*mSrc1, *mSrc2); /* Src1+Src2 (add 8 bytes with saturation) */
151 mSrc1++;
152 mSrc2++;
153 mDest++;
154 }
155 _m_empty(); /* clean MMX state */
156#endif
157 return (0);
158#else
159 return (-1);
160#endif
161}
162
173int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
174{
175 unsigned int i, istart;
176 unsigned char *cursrc1, *cursrc2, *curdst;
177 int result;
178
179 /* Validate input parameters */
180 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
181 return(-1);
182 if (length == 0)
183 return(0);
184
185 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
186
187 /* Use MMX assembly routine */
188 SDL_imageFilterAddMMX(Src1, Src2, Dest, length);
189
190 /* Check for unaligned bytes */
191 if ((length & 7) > 0) {
192 /* Setup to process unaligned bytes */
193 istart = length & 0xfffffff8;
194 cursrc1 = &Src1[istart];
195 cursrc2 = &Src2[istart];
196 curdst = &Dest[istart];
197 } else {
198 /* No unaligned bytes - we are done */
199 return (0);
200 }
201 } else {
202 /* Setup to process whole image */
203 istart = 0;
204 cursrc1 = Src1;
205 cursrc2 = Src2;
206 curdst = Dest;
207 }
208
209 /* C routine to process image */
210 for (i = istart; i < length; i++) {
211 result = (int) *cursrc1 + (int) *cursrc2;
212 if (result > 255)
213 result = 255;
214 *curdst = (unsigned char) result;
215 /* Advance pointers */
216 cursrc1++;
217 cursrc2++;
218 curdst++;
219 }
220
221 return (0);
222}
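/* Note: the structure above is shared by most filters in this file. The MMX routine
   processes the largest multiple of 8 bytes and the C loop finishes the remainder.
   For example, with length == 20 the MMX code handles bytes 0..15 and the loop runs
   from istart == (20 & 0xfffffff8) == 16 through byte 19. */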
223
235static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
236 unsigned char *Mask)
237{
238#ifdef USE_MMX
239#if !defined(GCC__)
240 __asm
241 {
242 pusha
243 mov edx, Mask /* load Mask address into edx */
244 movq mm0, [edx] /* load Mask into mm0 */
245 mov eax, Src1 /* load Src1 address into eax */
246 mov ebx, Src2 /* load Src2 address into ebx */
247 mov edi, Dest /* load Dest address into edi */
248 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
249 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
250 align 16 /* 16 byte alignment of the loop entry */
251L21011:
252 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
253 movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
254 /* --- Byte shift via Word shift --- */
255 psrlw mm1, 1 /* shift 4 WORDS of mm1 1 bit to the right */
256 psrlw mm2, 1 /* shift 4 WORDS of mm2 1 bit to the right */
257 pand mm1, mm0 /* apply Mask to 8 BYTES of mm1 */
258 /* byte 0x0f, 0xdb, 0xc8 */
259 pand mm2, mm0 /* apply Mask to 8 BYTES of mm2 */
260 /* byte 0x0f, 0xdb, 0xd0 */
261 paddusb mm1, mm2 /* mm1=mm1+mm2 (add 8 bytes with saturation) */
262 movq [edi], mm1 /* store result in Dest */
263 add eax, 8 /* increase Src1, Src2 and Dest */
264 add ebx, 8 /* register pointers by 8 */
265 add edi, 8
266 dec ecx /* decrease loop counter */
267 jnz L21011 /* check loop termination, proceed if required */
268 emms /* exit MMX state */
269 popa
270 }
271#else
272 /* i386 and x86_64 */
273 __m64 *mSrc1 = (__m64*)Src1;
274 __m64 *mSrc2 = (__m64*)Src2;
275 __m64 *mDest = (__m64*)Dest;
276 __m64 *mMask = (__m64*)Mask;
277 int i;
278 for (i = 0; i < SrcLength/8; i++) {
279 __m64 mm1 = *mSrc1,
280 mm2 = *mSrc2;
281 mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of mm1 1 bit to the right */
282 mm2 = _m_psrlwi(mm2, 1); /* shift 4 WORDS of mm2 1 bit to the right */
283 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of mm1 */
284 mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of mm2 */
285 *mDest = _m_paddusb(mm1, mm2); /* mm1+mm2 (add 8 bytes with saturation) */
286 mSrc1++;
287 mSrc2++;
288 mDest++;
289 }
290 _m_empty(); /* clean MMX state */
291#endif
292 return (0);
293#else
294 return (-1);
295#endif
296}
297
308int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
309{
310 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
311 unsigned int i, istart;
312 unsigned char *cursrc1, *cursrc2, *curdst;
313 int result;
314
315 /* Validate input parameters */
316 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
317 return(-1);
318 if (length == 0)
319 return(0);
320
321 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
322 /* MMX routine */
323 SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask);
324
325 /* Check for unaligned bytes */
326 if ((length & 7) > 0) {
327 /* Setup to process unaligned bytes */
328 istart = length & 0xfffffff8;
329 cursrc1 = &Src1[istart];
330 cursrc2 = &Src2[istart];
331 curdst = &Dest[istart];
332 } else {
333 /* No unaligned bytes - we are done */
334 return (0);
335 }
336 } else {
337 /* Setup to process whole image */
338 istart = 0;
339 cursrc1 = Src1;
340 cursrc2 = Src2;
341 curdst = Dest;
342 }
343
344 /* C routine to process image */
345 for (i = istart; i < length; i++) {
346 result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
347 *curdst = (unsigned char) result;
348 /* Advance pointers */
349 cursrc1++;
350 cursrc2++;
351 curdst++;
352 }
353
354 return (0);
355}
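/* Note: both paths halve each operand before adding, so the mean truncates:
   e.g. Mean(3, 3) gives 1 + 1 == 2 rather than 3, and Mean(255, 255) gives 254. */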
356
367static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
368{
369#ifdef USE_MMX
370#if !defined(GCC__)
371 __asm
372 {
373 pusha
374 mov eax, Src1 /* load Src1 address into eax */
375 mov ebx, Src2 /* load Src2 address into ebx */
376 mov edi, Dest /* load Dest address into edi */
377 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
378 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
379 align 16 /* 16 byte alignment of the loop entry */
380L1012:
381 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
382 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
383 movq [edi], mm1 /* store result in Dest */
384 add eax, 8 /* increase Src1, Src2 and Dest */
385 add ebx, 8 /* register pointers by 8 */
386 add edi, 8
387 dec ecx /* decrease loop counter */
388 jnz L1012 /* check loop termination, proceed if required */
389 emms /* exit MMX state */
390 popa
391 }
392#else
393 /* i386 and x86_64 */
394 __m64 *mSrc1 = (__m64*)Src1;
395 __m64 *mSrc2 = (__m64*)Src2;
396 __m64 *mDest = (__m64*)Dest;
397 int i;
398 for (i = 0; i < SrcLength/8; i++) {
399 *mDest = _m_psubusb(*mSrc1, *mSrc2); /* Src1-Src2 (sub 8 bytes with saturation) */
400 mSrc1++;
401 mSrc2++;
402 mDest++;
403 }
404 _m_empty(); /* clean MMX state */
405#endif
406 return (0);
407#else
408 return (-1);
409#endif
410}
411
422int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
423{
424 unsigned int i, istart;
425 unsigned char *cursrc1, *cursrc2, *curdst;
426 int result;
427
428 /* Validate input parameters */
429 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
430 return(-1);
431 if (length == 0)
432 return(0);
433
434 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
435 /* MMX routine */
436 SDL_imageFilterSubMMX(Src1, Src2, Dest, length);
437
438 /* Check for unaligned bytes */
439 if ((length & 7) > 0) {
440 /* Setup to process unaligned bytes */
441 istart = length & 0xfffffff8;
442 cursrc1 = &Src1[istart];
443 cursrc2 = &Src2[istart];
444 curdst = &Dest[istart];
445 } else {
446 /* No unaligned bytes - we are done */
447 return (0);
448 }
449 } else {
450 /* Setup to process whole image */
451 istart = 0;
452 cursrc1 = Src1;
453 cursrc2 = Src2;
454 curdst = Dest;
455 }
456
457 /* C routine to process image */
458 for (i = istart; i < length; i++) {
459 result = (int) *cursrc1 - (int) *cursrc2;
460 if (result < 0)
461 result = 0;
462 *curdst = (unsigned char) result;
463 /* Advance pointers */
464 cursrc1++;
465 cursrc2++;
466 curdst++;
467 }
468
469 return (0);
470}
471
482static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
483{
484#ifdef USE_MMX
485#if !defined(GCC__)
486 __asm
487 {
488 pusha
489 mov eax, Src1 /* load Src1 address into eax */
490 mov ebx, Src2 /* load Src2 address into ebx */
491 mov edi, Dest /* load Dest address into edi */
492 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
493 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
494 align 16 /* 16 byte alignment of the loop entry */
495L1013:
496 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
497 movq mm2, [ebx] /* load 8 bytes from Src2 into mm2 */
498 psubusb mm1, [ebx] /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
499 psubusb mm2, [eax] /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
500 por mm1, mm2 /* combine both mm2 and mm1 results */
501 movq [edi], mm1 /* store result in Dest */
502 add eax, 8 /* increase Src1, Src2 and Dest */
503 add ebx, 8 /* register pointers by 8 */
504 add edi, 8
505 dec ecx /* decrease loop counter */
506 jnz L1013 /* check loop termination, proceed if required */
507 emms /* exit MMX state */
508 popa
509 }
510#else
511 /* i386 and x86_64 */
512 __m64 *mSrc1 = (__m64*)Src1;
513 __m64 *mSrc2 = (__m64*)Src2;
514 __m64 *mDest = (__m64*)Dest;
515 int i;
516 for (i = 0; i < SrcLength/8; i++) {
517 __m64 mm1 = _m_psubusb(*mSrc2, *mSrc1); /* Src1-Src2 (sub 8 bytes with saturation) */
518 __m64 mm2 = _m_psubusb(*mSrc1, *mSrc2); /* Src2-Src1 (sub 8 bytes with saturation) */
519 *mDest = _m_por(mm1, mm2); /* combine both mm2 and mm1 results */
520 mSrc1++;
521 mSrc2++;
522 mDest++;
523 }
524 _m_empty(); /* clean MMX state */
525#endif
526 return (0);
527#else
528 return (-1);
529#endif
530}
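/* Note: the routine above relies on unsigned saturated subtraction. One of the two
   differences underflows to 0 and the other equals |Src1 - Src2|, so OR-ing them yields
   the absolute difference. For example, with bytes 10 and 4: 10 minus 4 saturates to 6,
   4 minus 10 saturates to 0, and 6 | 0 == 6. */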
531
542int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
543{
544 unsigned int i, istart;
545 unsigned char *cursrc1, *cursrc2, *curdst;
546 int result;
547
548 /* Validate input parameters */
549 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
550 return(-1);
551 if (length == 0)
552 return(0);
553
554 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
555 /* MMX routine */
556 SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length);
557
558 /* Check for unaligned bytes */
559 if ((length & 7) > 0) {
560 /* Setup to process unaligned bytes */
561 istart = length & 0xfffffff8;
562 cursrc1 = &Src1[istart];
563 cursrc2 = &Src2[istart];
564 curdst = &Dest[istart];
565 } else {
566 /* No unaligned bytes - we are done */
567 return (0);
568 }
569 } else {
570 /* Setup to process whole image */
571 istart = 0;
572 cursrc1 = Src1;
573 cursrc2 = Src2;
574 curdst = Dest;
575 }
576
577 /* C routine to process image */
578 for (i = istart; i < length; i++) {
579 result = abs((int) *cursrc1 - (int) *cursrc2);
580 *curdst = (unsigned char) result;
581 /* Advance pointers */
582 cursrc1++;
583 cursrc2++;
584 curdst++;
585 }
586
587 return (0);
588}
589
600static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
601{
602#ifdef USE_MMX
603#if !defined(GCC__)
604 __asm
605 {
606 pusha
607 mov eax, Src1 /* load Src1 address into eax */
608 mov ebx, Src2 /* load Src2 address into ebx */
609 mov edi, Dest /* load Dest address into edi */
610 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
611 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
612 pxor mm0, mm0 /* zero mm0 register */
613 align 16 /* 16 byte alignment of the loop entry */
614L1014:
615 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
616 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
617 movq mm2, mm1 /* copy mm1 into mm2 */
618 movq mm4, mm3 /* copy mm3 into mm4 */
619 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
620 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
621 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
622 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
623 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
624 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
625 /* Take abs value of the results (signed words) */
626 movq mm5, mm1 /* copy mm1 into mm5 */
627 movq mm6, mm2 /* copy mm2 into mm6 */
628 psraw mm5, 15 /* fill mm5 words with word sign bit */
629 psraw mm6, 15 /* fill mm6 words with word sign bit */
630 pxor mm1, mm5 /* take 1's complement of only neg. words */
631 pxor mm2, mm6 /* take 1's complement of only neg. words */
632 psubsw mm1, mm5 /* add 1 to only neg. words, W-(-1) or W-0 */
633 psubsw mm2, mm6 /* add 1 to only neg. words, W-(-1) or W-0 */
634 packuswb mm1, mm2 /* pack words back into bytes with saturation */
635 movq [edi], mm1 /* store result in Dest */
636 add eax, 8 /* increase Src1, Src2 and Dest */
637 add ebx, 8 /* register pointers by 8 */
638 add edi, 8
639 dec ecx /* decrease loop counter */
640 jnz L1014 /* check loop termination, proceed if required */
641 emms /* exit MMX state */
642 popa
643 }
644#else
645 /* i386 ASM with constraints: */
646 /* asm volatile ( */
647 /* "shr $3, %%ecx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */
648 /* "pxor %%mm0, %%mm0 \n\t" /\* zero mm0 register *\/ */
649 /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */
650 /* "1: movq (%%eax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */
651 /* "movq (%%ebx), %%mm3 \n\t" /\* load 8 bytes from Src2 into mm3 *\/ */
652 /* "movq %%mm1, %%mm2 \n\t" /\* copy mm1 into mm2 *\/ */
653 /* "movq %%mm3, %%mm4 \n\t" /\* copy mm3 into mm4 *\/ */
654 /* "punpcklbw %%mm0, %%mm1 \n\t" /\* unpack low bytes of Src1 into words *\/ */
655 /* "punpckhbw %%mm0, %%mm2 \n\t" /\* unpack high bytes of Src1 into words *\/ */
656 /* "punpcklbw %%mm0, %%mm3 \n\t" /\* unpack low bytes of Src2 into words *\/ */
657 /* "punpckhbw %%mm0, %%mm4 \n\t" /\* unpack high bytes of Src2 into words *\/ */
658 /* "pmullw %%mm3, %%mm1 \n\t" /\* mul low bytes of Src1 and Src2 *\/ */
659 /* "pmullw %%mm4, %%mm2 \n\t" /\* mul high bytes of Src1 and Src2 *\/ */
660 /* /\* Take abs value of the results (signed words) *\/ */
661 /* "movq %%mm1, %%mm5 \n\t" /\* copy mm1 into mm5 *\/ */
662 /* "movq %%mm2, %%mm6 \n\t" /\* copy mm2 into mm6 *\/ */
663 /* "psraw $15, %%mm5 \n\t" /\* fill mm5 words with word sign bit *\/ */
664 /* "psraw $15, %%mm6 \n\t" /\* fill mm6 words with word sign bit *\/ */
665 /* "pxor %%mm5, %%mm1 \n\t" /\* take 1's complement of only neg. words *\/ */
666 /* "pxor %%mm6, %%mm2 \n\t" /\* take 1's complement of only neg. words *\/ */
667 /* "psubsw %%mm5, %%mm1 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
668 /* "psubsw %%mm6, %%mm2 \n\t" /\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
669 /* "packuswb %%mm2, %%mm1 \n\t" /\* pack words back into bytes with saturation *\/ */
670 /* "movq %%mm1, (%%edi) \n\t" /\* store result in Dest *\/ */
671 /* "add $8, %%eax \n\t" /\* increase Src1, Src2 and Dest *\/ */
672 /* "add $8, %%ebx \n\t" /\* register pointers by 8 *\/ */
673 /* "add $8, %%edi \n\t" */
674 /* "dec %%ecx \n\t" /\* decrease loop counter *\/ */
675 /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */
676 /* "emms \n\t" /\* exit MMX state *\/ */
677 /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */
678 /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */
679 /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
680 /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */
681 /* : */
682 /* : "memory", /\* *Dest is modified *\/ */
683 /* "mm0","mm1","mm2","mm3","mm4","mm5","mm6" /\* registers modified *\/ */
684 /* ); */
685
686 /* i386 and x86_64 */
687 __m64 *mSrc1 = (__m64*)Src1;
688 __m64 *mSrc2 = (__m64*)Src2;
689 __m64 *mDest = (__m64*)Dest;
690 __m64 mm0 = _m_from_int(0); /* zero mm0 register */
691 int i;
692 for (i = 0; i < SrcLength/8; i++) {
693 __m64 mm1, mm2, mm3, mm4, mm5, mm6;
694 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
695 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
696 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
697 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
698 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
699 mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
700 mm5 = _m_psrawi(mm1, 15); /* fill mm5 words with word sign bit */
701 mm6 = _m_psrawi(mm2, 15); /* fill mm6 words with word sign bit */
702 mm1 = _m_pxor(mm1, mm5); /* take 1's complement of only neg. words */
703 mm2 = _m_pxor(mm2, mm6); /* take 1's complement of only neg. words */
704 mm1 = _m_psubsw(mm1, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
705 mm2 = _m_psubsw(mm2, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
706 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
707 mSrc1++;
708 mSrc2++;
709 mDest++;
710 }
711 _m_empty(); /* clean MMX state */
712#endif
713 return (0);
714#else
715 return (-1);
716#endif
717}
718
729int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
730{
731 unsigned int i, istart;
732 unsigned char *cursrc1, *cursrc2, *curdst;
733 int result;
734
735 /* Validate input parameters */
736 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
737 return(-1);
738 if (length == 0)
739 return(0);
740
741 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
742 /* MMX routine */
743 SDL_imageFilterMultMMX(Src1, Src2, Dest, length);
744
745 /* Check for unaligned bytes */
746 if ((length & 7) > 0) {
747 /* Setup to process unaligned bytes */
748 istart = length & 0xfffffff8;
749 cursrc1 = &Src1[istart];
750 cursrc2 = &Src2[istart];
751 curdst = &Dest[istart];
752 } else {
753 /* No unaligned bytes - we are done */
754 return (0);
755 }
756 } else {
757 /* Setup to process whole image */
758 istart = 0;
759 cursrc1 = Src1;
760 cursrc2 = Src2;
761 curdst = Dest;
762 }
763
764 /* C routine to process image */
765 for (i = istart; i < length; i++) {
766
767 /* NOTE: the MMX routine multiplies into signed 16-bit words, takes the absolute value and packs with unsigned saturation; for byte inputs that reduces to clamping the product at 255, so this C fallback should match it */
768
769 result = (int) *cursrc1 * (int) *cursrc2;
770 if (result > 255)
771 result = 255;
772 *curdst = (unsigned char) result;
773 /* Advance pointers */
774 cursrc1++;
775 cursrc2++;
776 curdst++;
777 }
778
779 return (0);
780}
781
792int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
793{
794#ifdef USE_MMX
795#if !defined(GCC__)
796 __asm
797 {
798 pusha
799 mov edx, Src1 /* load Src1 address into edx */
800 mov esi, Src2 /* load Src2 address into esi */
801 mov edi, Dest /* load Dest address into edi */
802 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
803 align 16 /* 16 byte alignment of the loop entry */
804L10141:
805 mov al, [edx] /* load a byte from Src1 */
806 mul [esi] /* mul with a byte from Src2 */
807 mov [edi], al /* move a byte result to Dest */
808 inc edx /* increment Src1, Src2, Dest */
809 inc esi /* pointer registers by one */
810 inc edi
811 dec ecx /* decrease loop counter */
812 jnz L10141 /* check loop termination, proceed if required */
813 popa
814 }
815#else
816 /* Note: ~5% gain on i386, less efficient than C on x86_64 */
817 /* Also depends on whether this function is static (?!) */
818 asm volatile (
819 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
820# if defined(i386)
821 "1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */
822 "mulb (%%esi) \n\t" /* mul with a byte from Src2 */
823 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
824 "inc %%edx \n\t" /* increment Src1, Src2, Dest */
825 "inc %%esi \n\t" /* pointer registers by one */
826 "inc %%edi \n\t"
827 "dec %%ecx \n\t" /* decrease loop counter */
828# elif defined(__x86_64__)
829 "1:mov (%%rdx), %%al \n\t" /* load a byte from Src1 */
830 "mulb (%%rsi) \n\t" /* mul with a byte from Src2 */
831 "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */
832 "inc %%rdx \n\t" /* increment Src1, Src2, Dest */
833 "inc %%rsi \n\t" /* pointer registers by one */
834 "inc %%rdi \n\t"
835 "dec %%rcx \n\t" /* decrease loop counter */
836# endif
837 "jnz 1b \n\t" /* check loop termination, proceed if required */
838 : "+d" (Src1), /* load Src1 address into edx */
839 "+S" (Src2), /* load Src2 address into esi */
840 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
841 "+D" (Dest) /* load Dest address into edi */
842 :
843 : "memory", "rax"
844 );
845#endif
846 return (0);
847#else
848 return (-1);
849#endif
850}
851
862int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
863{
864 unsigned int i, istart;
865 unsigned char *cursrc1, *cursrc2, *curdst;
866
867 /* Validate input parameters */
868 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
869 return(-1);
870 if (length == 0)
871 return(0);
872
873 if (SDL_imageFilterMMXdetect()) {
874 if (length > 0) {
875 /* ASM routine */
876 SDL_imageFilterMultNorASM(Src1, Src2, Dest, length);
877
878 /* Check for unaligned bytes */
879 if ((length & 7) > 0) {
880 /* Setup to process unaligned bytes */
881 istart = length & 0xfffffff8;
882 cursrc1 = &Src1[istart];
883 cursrc2 = &Src2[istart];
884 curdst = &Dest[istart];
885 } else {
886 /* No unaligned bytes - we are done */
887 return (0);
888 }
889 } else {
890 /* No bytes - we are done */
891 return (0);
892 }
893 } else {
894 /* Setup to process whole image */
895 istart = 0;
896 cursrc1 = Src1;
897 cursrc2 = Src2;
898 curdst = Dest;
899 }
900
901 /* C routine to process image */
902 for (i = istart; i < length; i++) {
903 *curdst = (int)*cursrc1 * (int)*cursrc2; // (int) for efficiency
904 /* Advance pointers */
905 cursrc1++;
906 cursrc2++;
907 curdst++;
908 }
909
910 return (0);
911}
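/* Note: unlike SDL_imageFilterMult(), this variant does not saturate; the stored value is
   the low byte of the product (product mod 256), e.g. 16 * 20 == 320 stores 64. */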
912
923static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
924{
925#ifdef USE_MMX
926#if !defined(GCC__)
927 __asm
928 {
929 pusha
930 mov eax, Src1 /* load Src1 address into eax */
931 mov ebx, Src2 /* load Src2 address into ebx */
932 mov edi, Dest /* load Dest address into edi */
933 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
934 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
935 pxor mm0, mm0 /* zero mm0 register */
936 align 16 /* 16 byte alignment of the loop entry */
937L1015:
938 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
939 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
940 movq mm2, mm1 /* copy mm1 into mm2 */
941 movq mm4, mm3 /* copy mm3 into mm4 */
942 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
943 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
944 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
945 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
946 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
947 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
948 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
949 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
950 packuswb mm1, mm2 /* pack words back into bytes with saturation */
951 movq [edi], mm1 /* store result in Dest */
952 add eax, 8 /* increase Src1, Src2 and Dest */
953 add ebx, 8 /* register pointers by 8 */
954 add edi, 8
955 dec ecx /* decrease loop counter */
956 jnz L1015 /* check loop termination, proceed if required */
957 emms /* exit MMX state */
958 popa
959 }
960#else
961 /* i386 and x86_64 */
962 __m64 *mSrc1 = (__m64*)Src1;
963 __m64 *mSrc2 = (__m64*)Src2;
964 __m64 *mDest = (__m64*)Dest;
965 __m64 mm0 = _m_from_int(0); /* zero mm0 register */
966 int i;
967 for (i = 0; i < SrcLength/8; i++) {
968 __m64 mm1, mm2, mm3, mm4, mm5, mm6;
969 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
970 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
971 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
972 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
973 mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */
974 mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */
975 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
976 mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
977 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
978 mSrc1++;
979 mSrc2++;
980 mDest++;
981 }
982 _m_empty(); /* clean MMX state */
983#endif
984 return (0);
985#else
986 return (-1);
987#endif
988}
989
1000int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1001{
1002 unsigned int i, istart;
1003 unsigned char *cursrc1, *cursrc2, *curdst;
1004 int result;
1005
1006 /* Validate input parameters */
1007 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1008 return(-1);
1009 if (length == 0)
1010 return(0);
1011
1012 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1013 /* MMX routine */
1014 SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length);
1015
1016 /* Check for unaligned bytes */
1017 if ((length & 7) > 0) {
1018 /* Setup to process unaligned bytes */
1019 istart = length & 0xfffffff8;
1020 cursrc1 = &Src1[istart];
1021 cursrc2 = &Src2[istart];
1022 curdst = &Dest[istart];
1023 } else {
1024 /* No unaligned bytes - we are done */
1025 return (0);
1026 }
1027 } else {
1028 /* Setup to process whole image */
1029 istart = 0;
1030 cursrc1 = Src1;
1031 cursrc2 = Src2;
1032 curdst = Dest;
1033 }
1034
1035 /* C routine to process image */
1036 for (i = istart; i < length; i++) {
1037 result = ((int) *cursrc1 / 2) * (int) *cursrc2;
1038 if (result > 255)
1039 result = 255;
1040 *curdst = (unsigned char) result;
1041 /* Advance pointers */
1042 cursrc1++;
1043 cursrc2++;
1044 curdst++;
1045 }
1046
1047 return (0);
1048}
1049
1060static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1061{
1062#ifdef USE_MMX
1063#if !defined(GCC__)
1064 __asm
1065 {
1066 pusha
1067 mov eax, Src1 /* load Src1 address into eax */
1068 mov ebx, Src2 /* load Src2 address into ebx */
1069 mov edi, Dest /* load Dest address into edi */
1070 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1071 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1072 pxor mm0, mm0 /* zero mm0 register */
1073 align 16 /* 16 byte alignment of the loop entry */
1074L1016:
1075 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1076 movq mm3, [ebx] /* load 8 bytes from Src2 into mm3 */
1077 movq mm2, mm1 /* copy mm1 into mm2 */
1078 movq mm4, mm3 /* copy mm3 into mm4 */
1079 punpcklbw mm1, mm0 /* unpack low bytes of Src1 into words */
1080 punpckhbw mm2, mm0 /* unpack high bytes of Src1 into words */
1081 punpcklbw mm3, mm0 /* unpack low bytes of Src2 into words */
1082 punpckhbw mm4, mm0 /* unpack high bytes of Src2 into words */
1083 psrlw mm1, 1 /* divide mm1 words by 2, Src1 low bytes */
1084 psrlw mm2, 1 /* divide mm2 words by 2, Src1 high bytes */
1085 psrlw mm3, 1 /* divide mm3 words by 2, Src2 low bytes */
1086 psrlw mm4, 1 /* divide mm4 words by 2, Src2 high bytes */
1087 pmullw mm1, mm3 /* mul low bytes of Src1 and Src2 */
1088 pmullw mm2, mm4 /* mul high bytes of Src1 and Src2 */
1089 packuswb mm1, mm2 /* pack words back into bytes with saturation */
1090 movq [edi], mm1 /* store result in Dest */
1091 add eax, 8 /* increase Src1, Src2 and Dest */
1092 add ebx, 8 /* register pointers by 8 */
1093 add edi, 8
1094 dec ecx /* decrease loop counter */
1095 jnz L1016 /* check loop termination, proceed if required */
1096 emms /* exit MMX state */
1097 popa
1098 }
1099#else
1100 /* i386 and x86_64 */
1101 __m64 *mSrc1 = (__m64*)Src1;
1102 __m64 *mSrc2 = (__m64*)Src2;
1103 __m64 *mDest = (__m64*)Dest;
1104 __m64 mm0 = _m_from_int(0); /* zero mm0 register */
1105 int i;
1106 for (i = 0; i < SrcLength/8; i++) {
1107 __m64 mm1, mm2, mm3, mm4, mm5, mm6;
1108 mm1 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
1109 mm2 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
1110 mm3 = _m_punpcklbw(*mSrc2, mm0); /* unpack low bytes of Src2 into words */
1111 mm4 = _m_punpckhbw(*mSrc2, mm0); /* unpack high bytes of Src2 into words */
1112 mm1 = _m_psrlwi(mm1, 1); /* divide mm1 words by 2, Src1 low bytes */
1113 mm2 = _m_psrlwi(mm2, 1); /* divide mm2 words by 2, Src1 high bytes */
1114 mm3 = _m_psrlwi(mm3, 1); /* divide mm3 words by 2, Src2 low bytes */
1115 mm4 = _m_psrlwi(mm4, 1); /* divide mm4 words by 2, Src2 high bytes */
1116 mm1 = _m_pmullw(mm1, mm3); /* mul low bytes of Src1 and Src2 */
1117 mm2 = _m_pmullw(mm2, mm4); /* mul high bytes of Src1 and Src2 */
1118 *mDest = _m_packuswb(mm1, mm2); /* pack words back into bytes with saturation */
1119 mSrc1++;
1120 mSrc2++;
1121 mDest++;
1122 }
1123 _m_empty(); /* clean MMX state */
1124#endif
1125 return (0);
1126#else
1127 return (-1);
1128#endif
1129}
1130
1141int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1142{
1143 unsigned int i, istart;
1144 unsigned char *cursrc1, *cursrc2, *curdst;
1145 int result;
1146
1147 /* Validate input parameters */
1148 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1149 return(-1);
1150 if (length == 0)
1151 return(0);
1152
1153 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1154 /* MMX routine */
1155 SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length);
1156
1157 /* Check for unaligned bytes */
1158 if ((length & 7) > 0) {
1159 /* Setup to process unaligned bytes */
1160 istart = length & 0xfffffff8;
1161 cursrc1 = &Src1[istart];
1162 cursrc2 = &Src2[istart];
1163 curdst = &Dest[istart];
1164 } else {
1165 /* No unaligned bytes - we are done */
1166 return (0);
1167 }
1168 } else {
1169 /* Setup to process whole image */
1170 istart = 0;
1171 cursrc1 = Src1;
1172 cursrc2 = Src2;
1173 curdst = Dest;
1174 }
1175
1176 /* C routine to process image */
1177 for (i = istart; i < length; i++) {
1178 result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
1179 if (result > 255)
1180 result = 255;
1181 *curdst = (unsigned char) result;
1182 /* Advance pointers */
1183 cursrc1++;
1184 cursrc2++;
1185 curdst++;
1186 }
1187
1188 return (0);
1189}
1190
1201static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1202{
1203#ifdef USE_MMX
1204#if !defined(GCC__)
1205 __asm
1206 {
1207 pusha
1208 mov eax, Src1 /* load Src1 address into eax */
1209 mov ebx, Src2 /* load Src2 address into ebx */
1210 mov edi, Dest /* load Dest address into edi */
1211 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1212 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1213 align 16 /* 16 byte alignment of the loop entry */
1214L1017:
1215 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1216 pand mm1, [ebx] /* mm1=Src1&Src2 */
1217 movq [edi], mm1 /* store result in Dest */
1218 add eax, 8 /* increase Src1, Src2 and Dest */
1219 add ebx, 8 /* register pointers by 8 */
1220 add edi, 8
1221 dec ecx /* decrease loop counter */
1222 jnz L1017 /* check loop termination, proceed if required */
1223 emms /* exit MMX state */
1224 popa
1225 }
1226#else
1227 /* x86_64 ASM with constraints: */
1228 /* asm volatile ( */
1229 /* "shr $3, %%rcx \n\t" /\* counter/8 (MMX loads 8 bytes at a time) *\/ */
1230 /* ".align 16 \n\t" /\* 16 byte alignment of the loop entry *\/ */
1231 /* "1: movq (%%rax), %%mm1 \n\t" /\* load 8 bytes from Src1 into mm1 *\/ */
1232 /* "pand (%%rbx), %%mm1 \n\t" /\* mm1=Src1&Src2 *\/ */
1233 /* "movq %%mm1, (%%rdi) \n\t" /\* store result in Dest *\/ */
1234 /* "add $8, %%rax \n\t" /\* increase Src1, Src2 and Dest *\/ */
1235 /* "add $8, %%rbx \n\t" /\* register pointers by 8 *\/ */
1236 /* "add $8, %%rdi \n\t" */
1237 /* "dec %%rcx \n\t" /\* decrease loop counter *\/ */
1238 /* "jnz 1b \n\t" /\* check loop termination, proceed if required *\/ */
1239 /* "emms \n\t" /\* exit MMX state *\/ */
1240 /* : "+a" (Src1), /\* load Src1 address into rax, modified by the loop *\/ */
1241 /* "+b" (Src2), /\* load Src2 address into rbx, modified by the loop *\/ */
1242 /* "+c" (SrcLength), /\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
1243 /* "+D" (Dest) /\* load Dest address into rdi, modified by the loop *\/ */
1244 /* : */
1245 /* : "memory", /\* *Dest is modified *\/ */
1246 /* "mm1" /\* register mm1 modified *\/ */
1247 /* ); */
1248
1249 /* i386 and x86_64 */
1250 __m64 *mSrc1 = (__m64*)Src1;
1251 __m64 *mSrc2 = (__m64*)Src2;
1252 __m64 *mDest = (__m64*)Dest;
1253 int i;
1254 for (i = 0; i < SrcLength/8; i++) {
1255 *mDest = _m_pand(*mSrc1, *mSrc2); /* Src1&Src2 */
1256 mSrc1++;
1257 mSrc2++;
1258 mDest++;
1259 }
1260 _m_empty(); /* clean MMX state */
1261#endif
1262 return (0);
1263#else
1264 return (-1);
1265#endif
1266}
1267
1278int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1279{
1280 unsigned int i, istart;
1281 unsigned char *cursrc1, *cursrc2, *curdst;
1282
1283 /* Validate input parameters */
1284 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1285 return(-1);
1286 if (length == 0)
1287 return(0);
1288
1289 if ((SDL_imageFilterMMXdetect()>0) && (length>7)) {
1290 /* if (length > 7) { */
1291 /* Call MMX routine */
1292
1293 SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);
1294
1295 /* Check for unaligned bytes */
1296 if ((length & 7) > 0) {
1297
1298 /* Setup to process unaligned bytes */
1299 istart = length & 0xfffffff8;
1300 cursrc1 = &Src1[istart];
1301 cursrc2 = &Src2[istart];
1302 curdst = &Dest[istart];
1303 } else {
1304 /* No unaligned bytes - we are done */
1305 return (0);
1306 }
1307 } else {
1308 /* Setup to process whole image */
1309 istart = 0;
1310 cursrc1 = Src1;
1311 cursrc2 = Src2;
1312 curdst = Dest;
1313 }
1314
1315 /* C routine to process image */
1316 for (i = istart; i < length; i++) {
1317 *curdst = (*cursrc1) & (*cursrc2);
1318 /* Advance pointers */
1319 cursrc1++;
1320 cursrc2++;
1321 curdst++;
1322 }
1323
1324 return (0);
1325}
1326
1337static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1338{
1339#ifdef USE_MMX
1340#if !defined(GCC__)
1341 __asm
1342 {
1343 pusha
1344 mov eax, Src1 /* load Src1 address into eax */
1345 mov ebx, Src2 /* load Src2 address into ebx */
1346 mov edi, Dest /* load Dest address into edi */
1347 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1348 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1349 align 16 /* 16 byte alignment of the loop entry */
1350L91017:
1351 movq mm1, [eax] /* load 8 bytes from Src1 into mm1 */
1352 por mm1, [ebx] /* mm1=Src1|Src2 */
1353 movq [edi], mm1 /* store result in Dest */
1354 add eax, 8 /* increase Src1, Src2 and Dest */
1355 add ebx, 8 /* register pointers by 8 */
1356 add edi, 8
1357 dec ecx /* decrease loop counter */
1358 jnz L91017 /* check loop termination, proceed if required */
1359 emms /* exit MMX state */
1360 popa
1361 }
1362#else
1363 /* i386 and x86_64 */
1364 __m64 *mSrc1 = (__m64*)Src1;
1365 __m64 *mSrc2 = (__m64*)Src2;
1366 __m64 *mDest = (__m64*)Dest;
1367 int i;
1368 for (i = 0; i < SrcLength/8; i++) {
1369 *mDest = _m_por(*mSrc1, *mSrc2); /* Src1|Src2 */
1370 mSrc1++;
1371 mSrc2++;
1372 mDest++;
1373 }
1374 _m_empty(); /* clean MMX state */
1375#endif
1376 return (0);
1377#else
1378 return (-1);
1379#endif
1380}
1381
1392int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1393{
1394 unsigned int i, istart;
1395 unsigned char *cursrc1, *cursrc2, *curdst;
1396
1397 /* Validate input parameters */
1398 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1399 return(-1);
1400 if (length == 0)
1401 return(0);
1402
1403 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1404
1405 /* MMX routine */
1406 SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
1407
1408 /* Check for unaligned bytes */
1409 if ((length & 7) > 0) {
1410 /* Setup to process unaligned bytes */
1411 istart = length & 0xfffffff8;
1412 cursrc1 = &Src1[istart];
1413 cursrc2 = &Src2[istart];
1414 curdst = &Dest[istart];
1415 } else {
1416 /* No unaligned bytes - we are done */
1417 return (0);
1418 }
1419 } else {
1420 /* Setup to process whole image */
1421 istart = 0;
1422 cursrc1 = Src1;
1423 cursrc2 = Src2;
1424 curdst = Dest;
1425 }
1426
1427 /* C routine to process image */
1428 for (i = istart; i < length; i++) {
1429 *curdst = *cursrc1 | *cursrc2;
1430 /* Advance pointers */
1431 cursrc1++;
1432 cursrc2++;
1433 curdst++;
1434 }
1435 return (0);
1436}
1437
1448static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
1449{
1450#ifdef USE_MMX
1451#if !defined(GCC__)
1452 __asm
1453 {
1454 pusha
1455 mov edx, Src1 /* load Src1 address into edx */
1456 mov esi, Src2 /* load Src2 address into esi */
1457 mov edi, Dest /* load Dest address into edi */
1458 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1459 align 16 /* 16 byte alignment of the loop entry */
1460L10191:
1461 mov bl, [esi] /* load a byte from Src2 */
1462 cmp bl, 0 /* check if it zero */
1463 jnz L10192
1464 mov byte ptr [edi], 255 /* division by zero = 255 !!! */
1465 jmp L10193
1466L10192:
1467 xor ah, ah /* prepare AX, zero AH register */
1468 mov al, [edx] /* load a byte from Src1 into AL */
1469 div bl /* divide AL by BL */
1470 mov [edi], al /* move a byte result to Dest */
1471L10193:
1472 inc edx /* increment Src1, Src2, Dest */
1473 inc esi /* pointer registers by one */
1474 inc edi
1475 dec ecx /* decrease loop counter */
1476 jnz L10191 /* check loop termination, proceed if required */
1477 popa
1478 }
1479#else
1480 /* Note: ~15% gain on i386, less efficient than C on x86_64 */
1481 /* Also depends on whether the function is static (?!) */
1482 /* Also depends on whether we work on malloc() or static char[] */
1483 asm volatile (
1484# if defined(i386)
1485 "pushl %%ebx \n\t" /* %ebx may be the PIC register. */
1486 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
1487 "1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */
1488 "cmp $0, %%bl \n\t" /* check if it zero */
1489 "jnz 2f \n\t"
1490 "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */
1491 "jmp 3f \n\t"
1492 "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
1493 "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */
1494 "div %%bl \n\t" /* divide AL by BL */
1495 "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
1496 "3: inc %%edx \n\t" /* increment Src1, Src2, Dest */
1497 "inc %%esi \n\t" /* pointer registers by one */
1498 "inc %%edi \n\t"
1499 "dec %%ecx \n\t" /* decrease loop counter */
1500 "jnz 1b \n\t" /* check loop termination, proceed if required */
1501 "popl %%ebx \n\t" /* restore %ebx */
1502 : "+d" (Src1), /* load Src1 address into edx */
1503 "+S" (Src2), /* load Src2 address into esi */
1504 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
1505 "+D" (Dest) /* load Dest address into edi */
1506 :
1507 : "memory", "rax"
1508# elif defined(__x86_64__)
1509 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
1510 "1: mov (%%rsi), %%bl \n\t" /* load a byte from Src2 */
1511 "cmp $0, %%bl \n\t" /* check if it zero */
1512 "jnz 2f \n\t"
1513 "movb $255, (%%rdi) \n\t" /* division by zero = 255 !!! */
1514 "jmp 3f \n\t"
1515 "2: xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
1516 "mov (%%rdx), %%al \n\t" /* load a byte from Src1 into AL */
1517 "div %%bl \n\t" /* divide AL by BL */
1518 "mov %%al, (%%rdi) \n\t" /* move a byte result to Dest */
1519 "3: inc %%rdx \n\t" /* increment Src1, Src2, Dest */
1520 "inc %%rsi \n\t" /* pointer registers by one */
1521 "inc %%rdi \n\t"
1522 "dec %%rcx \n\t" /* decrease loop counter */
1523 "jnz 1b \n\t" /* check loop termination, proceed if required */
1524 : "+d" (Src1), /* load Src1 address into edx */
1525 "+S" (Src2), /* load Src2 address into esi */
1526 "+c" (SrcLength), /* load loop counter (SIZE) into ecx */
1527 "+D" (Dest) /* load Dest address into edi */
1528 :
1529 : "memory", "rax", "rbx"
1530# endif
1531 );
1532#endif
1533 return (0);
1534#else
1535 return (-1);
1536#endif
1537}
1538
1549int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
1550{
1551 unsigned int i, istart;
1552 unsigned char *cursrc1, *cursrc2, *curdst;
1553
1554 /* Validate input parameters */
1555 if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
1556 return(-1);
1557 if (length == 0)
1558 return(0);
1559
1560 if (SDL_imageFilterMMXdetect()) {
1561 if (length > 0) {
1562 /* Call ASM routine */
1563 SDL_imageFilterDivASM(Src1, Src2, Dest, length);
1564
1565 /* Never unaligned bytes - we are done */
1566 return (0);
1567 } else {
1568 return (-1);
1569 }
1570 }
1571
1572 /* Setup to process whole image */
1573 istart = 0;
1574 cursrc1 = Src1;
1575 cursrc2 = Src2;
1576 curdst = Dest;
1577
1578 /* C routine to process image */
1579 /* for (i = istart; i < length; i++) { */
1580 /* if (*cursrc2 == 0) { */
1581 /* *curdst = 255; */
1582 /* } else { */
1583 /* result = (int) *cursrc1 / (int) *cursrc2; */
1584 /* *curdst = (unsigned char) result; */
1585 /* } */
1586 /* /\* Advance pointers *\/ */
1587 /* cursrc1++; */
1588 /* cursrc2++; */
1589 /* curdst++; */
1590 /* } */
1591 for (i = istart; i < length; i++) {
1592 if (*cursrc2 == 0) {
1593 *curdst = 255;
1594 } else {
1595 *curdst = (int)*cursrc1 / (int)*cursrc2; // (int) for efficiency
1596 }
1597 /* Advance pointers */
1598 cursrc1++;
1599 cursrc2++;
1600 curdst++;
1601 }
1602
1603 return (0);
1604}
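/* Note: division by zero is defined to produce 255 in both the ASM and the C path,
   e.g. Div(200, 0) stores 255 while Div(200, 3) stores 66 (integer division). */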
1605
1606/* ------------------------------------------------------------------------------------ */
1607
1617static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
1618{
1619#ifdef USE_MMX
1620#if !defined(GCC__)
1621 __asm
1622 {
1623 pusha
1624 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
1625 mov eax, Src1 /* load Src1 address into eax */
1626 mov edi, Dest /* load Dest address into edi */
1627 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1628 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1629 align 16 /* 16 byte alignment of the loop entry */
1630L91117:
1631 movq mm0, [eax] /* load 8 bytes from Src1 into mm1 */
1632 pxor mm0, mm1 /* negate mm0 by xoring with mm1 */
1633 movq [edi], mm0 /* store result in Dest */
1634 add eax, 8 /* increase Src1, Src2 and Dest */
1635 add edi, 8
1636 dec ecx /* decrease loop counter */
1637 jnz L91117 /* check loop termination, proceed if required */
1638 emms /* exit MMX state */
1639 popa
1640 }
1641#else
1642 /* i386 and x86_64 */
1643 __m64 *mSrc1 = (__m64*)Src1;
1644 __m64 *mDest = (__m64*)Dest;
1645 __m64 mm1 = _m_from_int(0);
1646 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */
1647 int i;
1648 for (i = 0; i < SrcLength/8; i++) {
1649 *mDest = _m_pxor(*mSrc1, mm1); /* negate mm0 by xoring with mm1 */
1650 mSrc1++;
1651 mDest++;
1652 }
1653 _m_empty(); /* clean MMX state */
1654
1655#endif
1656 return (0);
1657#else
1658 return (-1);
1659#endif
1660}
1661
1671int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
1672{
1673 unsigned int i, istart;
1674 unsigned char *cursrc1, *curdst;
1675
1676 /* Validate input parameters */
1677 if ((Src1 == NULL) || (Dest == NULL))
1678 return(-1);
1679 if (length == 0)
1680 return(0);
1681
1682 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1683 /* MMX routine */
1684 SDL_imageFilterBitNegationMMX(Src1, Dest, length);
1685
1686 /* Check for unaligned bytes */
1687 if ((length & 7) > 0) {
1688 /* Setup to process unaligned bytes */
1689 istart = length & 0xfffffff8;
1690 cursrc1 = &Src1[istart];
1691 curdst = &Dest[istart];
1692 } else {
1693 /* No unaligned bytes - we are done */
1694 return (0);
1695 }
1696 } else {
1697 /* Setup to process whole image */
1698 istart = 0;
1699 cursrc1 = Src1;
1700 curdst = Dest;
1701 }
1702
1703 /* C routine to process image */
1704 for (i = istart; i < length; i++) {
1705 *curdst = ~(*cursrc1);
1706 /* Advance pointers */
1707 cursrc1++;
1708 curdst++;
1709 }
1710
1711 return (0);
1712}
1713
1724static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
1725{
1726#ifdef USE_MMX
1727#if !defined(GCC__)
1728 __asm
1729 {
1730 pusha
1731 /* ** Duplicate C in 8 bytes of MM1 ** */
1732 mov al, C /* load C into AL */
1733 mov ah, al /* copy AL into AH */
1734 mov bx, ax /* copy AX into BX */
1735 shl eax, 16 /* shift 2 bytes of EAX left */
1736 mov ax, bx /* copy BX into AX */
1737 movd mm1, eax /* copy EAX into MM1 */
1738 movd mm2, eax /* copy EAX into MM2 */
1739 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
1740 mov eax, Src1 /* load Src1 address into eax */
1741 mov edi, Dest /* load Dest address into edi */
1742 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1743 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1744 align 16 /* 16 byte alignment of the loop entry */
1745L1021:
1746 movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
1747 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
1748 movq [edi], mm0 /* store result in Dest */
1749 add eax, 8 /* increase Dest register pointer by 8 */
1750 add edi, 8 /* increase Dest register pointer by 8 */
1751 dec ecx /* decrease loop counter */
1752 jnz L1021 /* check loop termination, proceed if required */
1753 emms /* exit MMX state */
1754 popa
1755 }
1756#else
1757 /* i386 and x86_64 */
1758 __m64 *mSrc1 = (__m64*)Src1;
1759 __m64 *mDest = (__m64*)Dest;
1760 /* Duplicate C in 8 bytes of MM1 */
1761 int i;
1762 memset(&i, C, 4);
1763 __m64 mm1 = _m_from_int(i);
1764 __m64 mm2 = _m_from_int(i);
1765 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
1766 //__m64 mm1 = _m_from_int64(lli); // x86_64 only
1767 for (i = 0; i < SrcLength/8; i++) {
1768 *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */
1769 mSrc1++;
1770 mDest++;
1771 }
1772 _m_empty(); /* clean MMX state */
1773#endif
1774 return (0);
1775#else
1776 return (-1);
1777#endif
1778}
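/* Note: the memset()/_m_punpckldq() sequence above replicates the byte constant across all
   eight lanes, e.g. with C == 0x20 the int becomes 0x20202020 and mm1 ends up as
   0x2020202020202020, so every source byte receives the same saturated offset. */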
1779
1791int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
1792{
1793 unsigned int i, istart;
1794 int iC;
1795 unsigned char *cursrc1, *curdest;
1796 int result;
1797
1798 /* Validate input parameters */
1799 if ((Src1 == NULL) || (Dest == NULL))
1800 return(-1);
1801 if (length == 0)
1802 return(0);
1803
1804 /* Special case: C==0 */
1805 if (C == 0) {
1806 memcpy(Dest, Src1, length);
1807 return (0);
1808 }
1809
1810 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1811
1812 /* MMX routine */
1813 SDL_imageFilterAddByteMMX(Src1, Dest, length, C);
1814
1815 /* Check for unaligned bytes */
1816 if ((length & 7) > 0) {
1817 /* Setup to process unaligned bytes */
1818 istart = length & 0xfffffff8;
1819 cursrc1 = &Src1[istart];
1820 curdest = &Dest[istart];
1821 } else {
1822 /* No unaligned bytes - we are done */
1823 return (0);
1824 }
1825 } else {
1826 /* Setup to process whole image */
1827 istart = 0;
1828 cursrc1 = Src1;
1829 curdest = Dest;
1830 }
1831
1832 /* C routine to process image */
1833 iC = (int) C;
1834 for (i = istart; i < length; i++) {
1835 result = (int) *cursrc1 + iC;
1836 if (result > 255)
1837 result = 255;
1838 *curdest = (unsigned char) result;
1839 /* Advance pointers */
1840 cursrc1++;
1841 curdest++;
1842 }
1843 return (0);
1844}
1845
1857static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
1858{
1859#ifdef USE_MMX
1860#if !defined(GCC__)
1861 __asm
1862 {
1863 pusha
1864 /* ** Duplicate (int)C in 8 bytes of MM1 ** */
1865 mov eax, C /* load C into EAX */
1866 movd mm1, eax /* copy EAX into MM1 */
1867 mov eax, D /* load D into EAX */
1868 movd mm2, eax /* copy EAX into MM2 */
1869 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
1870 mov eax, Src1 /* load Src1 address into eax */
1871 mov edi, Dest /* load Dest address into edi */
1872 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
1873 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
1874 align 16 /* 16 byte alignment of the loop entry */
1875L11023:
1876 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
1877 paddusb mm0, mm1 /* MM0=SrcDest+C (add 8 bytes with saturation) */
1878 movq [edi], mm0 /* store result in SrcDest */
1879 add eax, 8 /* increase Src1 register pointer by 8 */
1880 add edi, 8 /* increase Dest register pointer by 8 */
1881 dec ecx /* decrease loop counter */
1882 jnz L11023 /* check loop termination, proceed if required */
1883 emms /* exit MMX state */
1884 popa
1885 }
1886#else
1887 /* i386 and x86_64 */
1888 __m64 *mSrc1 = (__m64*)Src1;
1889 __m64 *mDest = (__m64*)Dest;
1890 /* Duplicate (int)C in 8 bytes of MM1 */
1891 __m64 mm1 = _m_from_int(C);
1892 __m64 mm2 = _m_from_int(C);
1893 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
1894 //__m64 mm1 = _m_from_int64(lli); // x86_64 only
1895 int i;
1896 for (i = 0; i < SrcLength/8; i++) {
1897 *mDest = _m_paddusb(*mSrc1, mm1); /* Src1+C (add 8 bytes with saturation) */
1898 mSrc1++;
1899 mDest++;
1900 }
1901 _m_empty(); /* clean MMX state */
1902#endif
1903 return (0);
1904#else
1905 return (-1);
1906#endif
1907}
1908
1919int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
1920{
1921 unsigned int i, j, istart, D;
1922 int iC[4];
1923 unsigned char *cursrc1;
1924 unsigned char *curdest;
1925 int result;
1926
1927 /* Validate input parameters */
1928 if ((Src1 == NULL) || (Dest == NULL))
1929 return(-1);
1930 if (length == 0)
1931 return(0);
1932
1933 /* Special case: C==0 */
1934 if (C == 0) {
1935 memcpy(Dest, Src1, length);
1936 return (0);
1937 }
1938
1939 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
1940
1941 /* MMX routine */
1942 D=SWAP_32(C);
1943 SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D);
1944
1945 /* Check for unaligned bytes */
1946 if ((length & 7) > 0) {
1947 /* Setup to process unaligned bytes */
1948 istart = length & 0xfffffff8;
1949 cursrc1 = &Src1[istart];
1950 curdest = &Dest[istart];
1951 } else {
1952 /* No unaligned bytes - we are done */
1953 return (0);
1954 }
1955 } else {
1956 /* Setup to process whole image */
1957 istart = 0;
1958 cursrc1 = Src1;
1959 curdest = Dest;
1960 }
1961
1962 /* C routine to process bytes */
1963 iC[3] = (int) ((C >> 24) & 0xff);
1964 iC[2] = (int) ((C >> 16) & 0xff);
1965 iC[1] = (int) ((C >> 8) & 0xff);
1966 iC[0] = (int) ((C >> 0) & 0xff);
1967 for (i = istart; i < length; i += 4) {
1968 for (j = 0; j < 4; j++) {
1969 if ((i+j)<length) {
1970 result = (int) *cursrc1 + iC[j];
1971 if (result > 255) result = 255;
1972 *curdest = (unsigned char) result;
1973 /* Advance pointers */
1974 cursrc1++;
1975 curdest++;
1976 }
1977 }
1978 }
1979 return (0);
1980}
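/* Note: the C fallback applies the 32-bit constant as a repeating 4-byte pattern: byte i
   receives ((C >> (8 * (i & 3))) & 0xff) added with saturation, e.g. C == 0x01020304 adds
   4, 3, 2, 1, 4, 3, 2, 1, ... across the buffer. */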
1981
1993static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
1994 unsigned char *Mask)
1995{
1996#ifdef USE_MMX
1997#if !defined(GCC__)
1998 __asm
1999 {
2000 pusha
2001 /* ** Duplicate C in 8 bytes of MM1 ** */
2002 mov al, C /* load C into AL */
2003 mov ah, al /* copy AL into AH */
2004 mov bx, ax /* copy AX into BX */
2005 shl eax, 16 /* shift 2 bytes of EAX left */
2006 mov ax, bx /* copy BX into AX */
2007 movd mm1, eax /* copy EAX into MM1 */
2008 movd mm2, eax /* copy EAX into MM2 */
2009 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2010 mov edx, Mask /* load Mask address into edx */
2011 movq mm0, [edx] /* load Mask into mm0 */
2012 mov eax, Src1 /* load Src1 address into eax */
2013 mov edi, Dest /* load Dest address into edi */
2014 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2015 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2016 align 16 /* 16 byte alignment of the loop entry */
2017L1022:
2018 movq mm2, [eax] /* load 8 bytes from Src1 into MM2 */
2019 psrlw mm2, 1 /* shift 4 WORDS of MM2 1 bit to the right */
2020 pand mm2, mm0 /* apply Mask to 8 BYTES of MM2 */
2021 paddusb mm2, mm1 /* MM2=SrcDest+C (add 8 bytes with saturation) */
2022 movq [edi], mm2 /* store result in Dest */
2023 add eax, 8 /* increase Src1 register pointer by 8 */
2024 add edi, 8 /* increase Dest register pointer by 8 */
2025 dec ecx /* decrease loop counter */
2026 jnz L1022 /* check loop termination, proceed if required */
2027 emms /* exit MMX state */
2028 popa
2029 }
2030#else
2031 /* i386 and x86_64 */
2032 __m64 *mSrc1 = (__m64*)Src1;
2033 __m64 *mDest = (__m64*)Dest;
2034 __m64 *mMask = (__m64*)Mask;
2035 /* Duplicate C in 8 bytes of MM1 */
2036 int i;
2037 memset(&i, C, 4);
2038 __m64 mm1 = _m_from_int(i);
2039 __m64 mm2 = _m_from_int(i);
2040 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2041 //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2042 for (i = 0; i < SrcLength/8; i++) {
2043 __m64 mm2 = _m_psrlwi(*mSrc1, 1); /* shift 4 WORDS of MM2 1 bit to the right */
2044 mm2 = _m_pand(mm2, *mMask); /* apply Mask to 8 BYTES of MM2 */
2045 /* byte 0x0f, 0xdb, 0xd0 */
2046 *mDest = _m_paddusb(mm1, mm2); /* Src1+C (add 8 bytes with saturation) */
2047 mSrc1++;
2048 mDest++;
2049 }
2050 _m_empty(); /* clean MMX state */
2051#endif
2052 return (0);
2053#else
2054 return (-1);
2055#endif
2056}
2057
2068int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2069{
2070 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
2071 unsigned int i, istart;
2072 int iC;
2073 unsigned char *cursrc1;
2074 unsigned char *curdest;
2075 int result;
2076
2077 /* Validate input parameters */
2078 if ((Src1 == NULL) || (Dest == NULL))
2079 return(-1);
2080 if (length == 0)
2081 return(0);
2082
2083 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2084
2085 /* MMX routine */
2086 SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
2087
2088 /* Check for unaligned bytes */
2089 if ((length & 7) > 0) {
2090 /* Setup to process unaligned bytes */
2091 istart = length & 0xfffffff8;
2092 cursrc1 = &Src1[istart];
2093 curdest = &Dest[istart];
2094 } else {
2095 /* No unaligned bytes - we are done */
2096 return (0);
2097 }
2098 } else {
2099 /* Setup to process whole image */
2100 istart = 0;
2101 cursrc1 = Src1;
2102 curdest = Dest;
2103 }
2104
2105 /* C routine to process image */
2106 iC = (int) C;
2107 for (i = istart; i < length; i++) {
2108 result = (int) (*cursrc1 / 2) + iC;
2109 if (result > 255)
2110 result = 255;
2111 *curdest = (unsigned char) result;
2112 /* Advance pointers */
2113 cursrc1++;
2114 curdest++;
2115 }
2116
2117 return (0);
2118}
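/*
 * Usage sketch (editorial illustration, not part of the library; the helper name is
 * hypothetical): Dest[i] = Src1[i]/2 + C with saturation at 255, e.g. to fade a
 * grayscale buffer towards a constant level.
 */
static void example_add_byte_to_half(void)
{
	unsigned char src[4] = { 0, 100, 200, 255 };
	unsigned char dst[4];

	SDL_imageFilterAddByteToHalf(src, dst, 4, 64);
	/* scalar path: dst = { 64, 114, 164, 191 } */
}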
2119
2130int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
2131{
2132#ifdef USE_MMX
2133#if !defined(GCC__)
2134 __asm
2135 {
2136 pusha
2137 /* ** Duplicate C in 8 bytes of MM1 ** */
2138 mov al, C /* load C into AL */
2139 mov ah, al /* copy AL into AH */
2140 mov bx, ax /* copy AX into BX */
2141 shl eax, 16 /* shift 2 bytes of EAX left */
2142 mov ax, bx /* copy BX into AX */
2143 movd mm1, eax /* copy EAX into MM1 */
2144 movd mm2, eax /* copy EAX into MM2 */
2145 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2146 mov eax, Src1 /* load Src1 address into eax */
2147 mov edi, Dest /* load Dest address into edi */
2148 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2149 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2150 align 16 /* 16 byte alignment of the loop entry */
2151L1023:
2152 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2153 psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
2154 movq [edi], mm0 /* store result in SrcDest */
2155 add eax, 8 /* increase Src1 register pointer by 8 */
2156 add edi, 8 /* increase Dest register pointer by 8 */
2157 dec ecx /* decrease loop counter */
2158 jnz L1023 /* check loop termination, proceed if required */
2159 emms /* exit MMX state */
2160 popa
2161 }
2162#else
2163 /* i386 and x86_64 */
2164 __m64 *mSrc1 = (__m64*)Src1;
2165 __m64 *mDest = (__m64*)Dest;
2166 /* Duplicate C in 8 bytes of MM1 */
2167 int i;
2168 memset(&i, C, 4);
2169 __m64 mm1 = _m_from_int(i);
2170 __m64 mm2 = _m_from_int(i);
2171 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2172 //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2173 for (i = 0; i < SrcLength/8; i++) {
2174 *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */
2175 mSrc1++;
2176 mDest++;
2177 }
2178 _m_empty(); /* clean MMX state */
2179#endif
2180 return (0);
2181#else
2182 return (-1);
2183#endif
2184}
2185
2196int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2197{
2198 unsigned int i, istart;
2199 int iC;
2200 unsigned char *cursrc1;
2201 unsigned char *curdest;
2202 int result;
2203
2204 /* Validate input parameters */
2205 if ((Src1 == NULL) || (Dest == NULL))
2206 return(-1);
2207 if (length == 0)
2208 return(0);
2209
2210 /* Special case: C==0 */
2211 if (C == 0) {
2212		memcpy(Dest, Src1, length);
2213 return (0);
2214 }
2215
2216 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2217
2218 /* MMX routine */
2219 SDL_imageFilterSubByteMMX(Src1, Dest, length, C);
2220
2221 /* Check for unaligned bytes */
2222 if ((length & 7) > 0) {
2223 /* Setup to process unaligned bytes */
2224 istart = length & 0xfffffff8;
2225 cursrc1 = &Src1[istart];
2226 curdest = &Dest[istart];
2227 } else {
2228 /* No unaligned bytes - we are done */
2229 return (0);
2230 }
2231 } else {
2232 /* Setup to process whole image */
2233 istart = 0;
2234 cursrc1 = Src1;
2235 curdest = Dest;
2236 }
2237
2238 /* C routine to process image */
2239 iC = (int) C;
2240 for (i = istart; i < length; i++) {
2241 result = (int) *cursrc1 - iC;
2242 if (result < 0)
2243 result = 0;
2244 *curdest = (unsigned char) result;
2245 /* Advance pointers */
2246 cursrc1++;
2247 curdest++;
2248 }
2249 return (0);
2250}
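/*
 * Usage sketch (editorial illustration, not part of the library; the helper name is
 * hypothetical): uniformly darken a buffer; results clamp at 0 instead of wrapping.
 */
static void example_sub_byte(void)
{
	unsigned char src[4] = { 10, 100, 32, 255 };
	unsigned char dst[4];

	SDL_imageFilterSubByte(src, dst, 4, 32);
	/* scalar path: dst = { 0, 68, 0, 223 } */
}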
2251
2263static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
2264{
2265#ifdef USE_MMX
2266#if !defined(GCC__)
2267 __asm
2268 {
2269 pusha
2270 /* ** Duplicate (int)C in 8 bytes of MM1 ** */
2271 mov eax, C /* load C into EAX */
2272 movd mm1, eax /* copy EAX into MM1 */
2273 mov eax, D /* load D into EAX */
2274 movd mm2, eax /* copy EAX into MM2 */
2275 punpckldq mm1, mm2 /* fill higher bytes of MM1 with C */
2276 mov eax, Src1 /* load Src1 address into eax */
2277 mov edi, Dest /* load Dest address into edi */
2278 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2279 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2280 align 16 /* 16 byte alignment of the loop entry */
2281L11024:
2282 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2283 psubusb mm0, mm1 /* MM0=SrcDest-C (sub 8 bytes with saturation) */
2284 movq [edi], mm0 /* store result in SrcDest */
2285 add eax, 8 /* increase Src1 register pointer by 8 */
2286 add edi, 8 /* increase Dest register pointer by 8 */
2287 dec ecx /* decrease loop counter */
2288 jnz L11024 /* check loop termination, proceed if required */
2289 emms /* exit MMX state */
2290 popa
2291 }
2292#else
2293 /* i386 and x86_64 */
2294 __m64 *mSrc1 = (__m64*)Src1;
2295 __m64 *mDest = (__m64*)Dest;
2296 /* Duplicate (int)C in 8 bytes of MM1 */
2297 __m64 mm1 = _m_from_int(C);
2298 __m64 mm2 = _m_from_int(C);
2299 mm1 = _m_punpckldq(mm1, mm2); /* fill higher bytes of MM1 with C */
2300 //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2301 int i;
2302 for (i = 0; i < SrcLength/8; i++) {
2303 *mDest = _m_psubusb(*mSrc1, mm1); /* Src1-C (sub 8 bytes with saturation) */
2304 mSrc1++;
2305 mDest++;
2306 }
2307 _m_empty(); /* clean MMX state */
2308#endif
2309 return (0);
2310#else
2311 return (-1);
2312#endif
2313}
2314
2325int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
2326{
2327 unsigned int i, j, istart, D;
2328 int iC[4];
2329 unsigned char *cursrc1;
2330 unsigned char *curdest;
2331 int result;
2332
2333 /* Validate input parameters */
2334 if ((Src1 == NULL) || (Dest == NULL))
2335 return(-1);
2336 if (length == 0)
2337 return(0);
2338
2339 /* Special case: C==0 */
2340 if (C == 0) {
2341		memcpy(Dest, Src1, length);
2342 return (0);
2343 }
2344
2345 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2346
2347 /* MMX routine */
2348 D=SWAP_32(C);
2349 SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);
2350
2351 /* Check for unaligned bytes */
2352 if ((length & 7) > 0) {
2353 /* Setup to process unaligned bytes */
2354 istart = length & 0xfffffff8;
2355 cursrc1 = &Src1[istart];
2356 curdest = &Dest[istart];
2357 } else {
2358 /* No unaligned bytes - we are done */
2359 return (0);
2360 }
2361 } else {
2362 /* Setup to process whole image */
2363 istart = 0;
2364 cursrc1 = Src1;
2365 curdest = Dest;
2366 }
2367
2368 /* C routine to process image */
2369 iC[3] = (int) ((C >> 24) & 0xff);
2370 iC[2] = (int) ((C >> 16) & 0xff);
2371 iC[1] = (int) ((C >> 8) & 0xff);
2372 iC[0] = (int) ((C >> 0) & 0xff);
2373 for (i = istart; i < length; i += 4) {
2374 for (j = 0; j < 4; j++) {
2375 if ((i+j)<length) {
2376 result = (int) *cursrc1 - iC[j];
2377 if (result < 0) result = 0;
2378 *curdest = (unsigned char) result;
2379 /* Advance pointers */
2380 cursrc1++;
2381 curdest++;
2382 }
2383 }
2384 }
2385 return (0);
2386}
2387
2399static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
2400 unsigned char *Mask)
2401{
2402#ifdef USE_MMX
2403#if !defined(GCC__)
2404 __asm
2405 {
2406 pusha
2407 mov edx, Mask /* load Mask address into edx */
2408 movq mm0, [edx] /* load Mask into mm0 */
2409 xor ecx, ecx /* zero ECX */
2410 mov cl, N /* load loop counter (N) into CL */
2411 movd mm3, ecx /* copy (N) into MM3 */
2412 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
2413L10240: /* ** Prepare proper bit-Mask in MM1 ** */
2414 psrlw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the right */
2415		pand mm1, mm0 	/* apply Mask to 8 BYTES of MM1 */
2416 /* byte 0x0f, 0xdb, 0xc8 */
2417 dec cl /* decrease loop counter */
2418 jnz L10240 /* check loop termination, proceed if required */
2419 /* ** Shift all bytes of the image ** */
2420 mov eax, Src1 /* load Src1 address into eax */
2421 mov edi, Dest /* load Dest address into edi */
2422 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2423 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2424 align 16 /* 16 byte alignment of the loop entry */
2425L10241:
2426 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2427 psrlw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the right */
2428		pand mm0, mm1 	/* apply proper bit-Mask to 8 BYTES of MM0 */
2429 /* byte 0x0f, 0xdb, 0xc1 */
2430 movq [edi], mm0 /* store result in SrcDest */
2431 add eax, 8 /* increase Src1 register pointer by 8 */
2432 add edi, 8 /* increase Dest register pointer by 8 */
2433 dec ecx /* decrease loop counter */
2434 jnz L10241 /* check loop termination, proceed if required */
2435 emms /* exit MMX state */
2436 popa
2437 }
2438#else
2439 /* i386 and x86_64 */
2440 __m64 *mSrc1 = (__m64*)Src1;
2441 __m64 *mDest = (__m64*)Dest;
2442 __m64 *mMask = (__m64*)Mask;
2443 __m64 mm1;
2444 int i;
2445 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */
2446 /* Prepare proper bit-Mask in MM1 */
2447 for (i = 0; i < N; i++) {
2448 mm1 = _m_psrlwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the right */
2449 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */
2450 }
2451 /* Shift all bytes of the image */
2452 for (i = 0; i < SrcLength/8; i++) {
2453 __m64 mm0 = _m_psrlwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the right */
2454 *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */
2455 mSrc1++;
2456 mDest++;
2457 }
2458 _m_empty(); /* clean MMX state */
2459#endif
2460 return (0);
2461#else
2462 return (-1);
2463#endif
2464}
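/*
 * Editorial note: MMX has no per-byte shift, so the routine above shifts 16-bit
 * words and clears the bits that crossed a byte boundary with a mask built from
 * 0x7F bytes.  The scalar sketch below (hypothetical helper, not part of the
 * library) checks the same identity on one packed 16-bit word, mirroring the
 * mask-building loop in the routine above.
 */
static int example_per_byte_shift_identity(void)
{
	unsigned short w = 0xABCD;	/* two packed bytes: 0xAB and 0xCD */
	unsigned short mask = 0xFFFF;
	unsigned char N = 3, i;

	for (i = 0; i < N; i++) {
		mask = (unsigned short) ((mask >> 1) & 0x7F7F);	/* build the per-byte mask */
	}
	w = (unsigned short) ((w >> N) & mask);	/* word shift + mask == independent per-byte shift */
	return ((w >> 8) == (0xAB >> 3)) && ((w & 0xFF) == (0xCD >> 3));	/* evaluates to 1 */
}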
2465
2476int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
2477{
2478 static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
2479 unsigned int i, istart;
2480 unsigned char *cursrc1;
2481 unsigned char *curdest;
2482
2483 /* Validate input parameters */
2484 if ((Src1 == NULL) || (Dest == NULL))
2485 return(-1);
2486 if (length == 0)
2487 return(0);
2488
2489 /* Check shift */
2490 if (N > 8) {
2491 return (-1);
2492 }
2493
2494 /* Special case: N==0 */
2495 if (N == 0) {
2496		memcpy(Dest, Src1, length);
2497 return (0);
2498 }
2499
2500 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2501
2502 /* MMX routine */
2503 SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);
2504
2505 /* Check for unaligned bytes */
2506 if ((length & 7) > 0) {
2507 /* Setup to process unaligned bytes */
2508 istart = length & 0xfffffff8;
2509 cursrc1 = &Src1[istart];
2510 curdest = &Dest[istart];
2511 } else {
2512 /* No unaligned bytes - we are done */
2513 return (0);
2514 }
2515 } else {
2516 /* Setup to process whole image */
2517 istart = 0;
2518 cursrc1 = Src1;
2519 curdest = Dest;
2520 }
2521
2522 /* C routine to process image */
2523 for (i = istart; i < length; i++) {
2524		*curdest = (unsigned char) (*cursrc1 >> N);
2525 /* Advance pointers */
2526 cursrc1++;
2527 curdest++;
2528 }
2529
2530 return (0);
2531}
2532
2543static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
2544{
2545#ifdef USE_MMX
2546#if !defined(GCC__)
2547 __asm
2548 {
2549 pusha
2550 mov eax, Src1 /* load Src1 address into eax */
2551 mov edi, Dest /* load Dest address into edi */
2552 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2553 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2554 align 16 /* 16 byte alignment of the loop entry */
2555L13023:
2556 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
2557		psrld mm0, N 	/* shift 2 DWORDS of MM0 (N) bits to the right */
2558 movq [edi], mm0 /* store result in SrcDest */
2559 add eax, 8 /* increase Src1 register pointer by 8 */
2560 add edi, 8 /* increase Dest register pointer by 8 */
2561 dec ecx /* decrease loop counter */
2562 jnz L13023 /* check loop termination, proceed if required */
2563 emms /* exit MMX state */
2564 popa
2565 }
2566#else
2567 /* i386 and x86_64 */
2568 __m64 *mSrc1 = (__m64*)Src1;
2569 __m64 *mDest = (__m64*)Dest;
2570 int i;
2571 for (i = 0; i < SrcLength/8; i++) {
2572		*mDest = _m_psrldi(*mSrc1, N);	/* shift 2 DWORDS of Src1 (N) bits to the right */
2573 mSrc1++;
2574 mDest++;
2575 }
2576 _m_empty(); /* clean MMX state */
2577#endif
2578 return (0);
2579#else
2580 return (-1);
2581#endif
2582}
2583
2594int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
2595{
2596 unsigned int i, istart;
2597 unsigned char *cursrc1, *curdest;
2598 unsigned int *icursrc1, *icurdest;
2599 unsigned int result;
2600
2601 /* Validate input parameters */
2602 if ((Src1 == NULL) || (Dest == NULL))
2603 return(-1);
2604 if (length == 0)
2605 return(0);
2606
2607 if (N > 32) {
2608 return (-1);
2609 }
2610
2611 /* Special case: N==0 */
2612 if (N == 0) {
2613		memcpy(Dest, Src1, length);
2614 return (0);
2615 }
2616
2617 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2618
2619 SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);
2620
2621 /* Check for unaligned bytes */
2622 if ((length & 7) > 0) {
2623 /* Setup to process unaligned bytes */
2624 istart = length & 0xfffffff8;
2625 cursrc1 = &Src1[istart];
2626 curdest = &Dest[istart];
2627 } else {
2628 /* No unaligned bytes - we are done */
2629 return (0);
2630 }
2631 } else {
2632 /* Setup to process whole image */
2633 istart = 0;
2634 cursrc1 = Src1;
2635 curdest = Dest;
2636 }
2637
2638 /* C routine to process image */
2639 icursrc1=(unsigned int *)cursrc1;
2640 icurdest=(unsigned int *)curdest;
2641 for (i = istart; i < length; i += 4) {
2642		if ((i+4)<=length) {
2643 result = ((unsigned int)*icursrc1 >> N);
2644 *icurdest = result;
2645 }
2646 /* Advance pointers */
2647 icursrc1++;
2648 icurdest++;
2649 }
2650
2651 return (0);
2652}
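/*
 * Usage sketch (editorial illustration, not part of the library; the helper name is
 * hypothetical): unlike SDL_imageFilterShiftRight, this variant shifts whole 32-bit
 * words, so bits move across byte boundaries inside each word.
 */
static void example_shift_right_uint(void)
{
	unsigned char src[4] = { 0x00, 0x01, 0x00, 0x00 };
	unsigned char dst[4];

	SDL_imageFilterShiftRightUint(src, dst, 4, 4);
	/* on a little-endian host src holds the word 0x00000100; 0x00000100 >> 4 == 0x00000010 */
}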
2653
2664static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
2665{
2666#ifdef USE_MMX
2667#if !defined(GCC__)
2668 __asm
2669 {
2670 pusha
2671 /* ** Duplicate C in 4 words of MM1 ** */
2672 mov al, C /* load C into AL */
2673 xor ah, ah /* zero AH */
2674 mov bx, ax /* copy AX into BX */
2675 shl eax, 16 /* shift 2 bytes of EAX left */
2676 mov ax, bx /* copy BX into AX */
2677 movd mm1, eax /* copy EAX into MM1 */
2678 movd mm2, eax /* copy EAX into MM2 */
2679 punpckldq mm1, mm2 /* fill higher words of MM1 with C */
2680 pxor mm0, mm0 /* zero MM0 register */
2681 mov eax, Src1 /* load Src1 address into eax */
2682 mov edi, Dest /* load Dest address into edi */
2683 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2684 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2685 cmp al, 128 /* if (C <= 128) execute more efficient code */
2686 jg L10251
2687 align 16 /* 16 byte alignment of the loop entry */
2688L10250:
2689 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2690 movq mm4, mm3 /* copy MM3 into MM4 */
2691 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2692 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2693 pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
2694 pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
2695 packuswb mm3, mm4 /* pack words back into bytes with saturation */
2696 movq [edi], mm3 /* store result in Dest */
2697 add eax, 8 /* increase Src1 register pointer by 8 */
2698 add edi, 8 /* increase Dest register pointer by 8 */
2699 dec ecx /* decrease loop counter */
2700 jnz L10250 /* check loop termination, proceed if required */
2701 jmp L10252
2702 align 16 /* 16 byte alignment of the loop entry */
2703L10251:
2704 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2705 movq mm4, mm3 /* copy MM3 into MM4 */
2706 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2707 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2708 pmullw mm3, mm1 /* mul low bytes of SrcDest and MM1 */
2709 pmullw mm4, mm1 /* mul high bytes of SrcDest and MM1 */
2710 /* ** Take abs value of the results (signed words) ** */
2711 movq mm5, mm3 /* copy mm3 into mm5 */
2712 movq mm6, mm4 /* copy mm4 into mm6 */
2713 psraw mm5, 15 /* fill mm5 words with word sign bit */
2714 psraw mm6, 15 /* fill mm6 words with word sign bit */
2715		pxor mm3, mm5 	/* take 1's complement of only neg words */
2716		pxor mm4, mm6 	/* take 1's complement of only neg words */
2717 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
2718 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
2719 packuswb mm3, mm4 /* pack words back into bytes with saturation */
2720 movq [edi], mm3 /* store result in Dest */
2721 add eax, 8 /* increase Src1 register pointer by 8 */
2722 add edi, 8 /* increase Dest register pointer by 8 */
2723 dec ecx /* decrease loop counter */
2724 jnz L10251 /* check loop termination, proceed if required */
2725L10252:
2726 emms /* exit MMX state */
2727 popa
2728 }
2729#else
2730 /* i386 and x86_64 */
2731 __m64 *mSrc1 = (__m64*)Src1;
2732 __m64 *mDest = (__m64*)Dest;
2733 __m64 mm0 = _m_from_int(0); /* zero mm0 register */
2734 /* Duplicate C in 4 words of MM1 */
2735 int i;
2736 i = C | C<<16;
2737 __m64 mm1 = _m_from_int(i);
2738 __m64 mm2 = _m_from_int(i);
2739 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */
2740 // long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48;
2741 //__m64 mm1 = _m_from_int64(lli); // x86_64 only
2742 if (C <= 128) { /* if (C <= 128) execute more efficient code */
2743 for (i = 0; i < SrcLength/8; i++) {
2744 __m64 mm3, mm4;
2745 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2746 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2747 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2748 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2749 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2750 mSrc1++;
2751 mDest++;
2752 }
2753 } else {
2754 for (i = 0; i < SrcLength/8; i++) {
2755 __m64 mm3, mm4, mm5, mm6;
2756 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2757 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2758 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2759 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2760 /* Take abs value of the results (signed words) */
2761 mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
2762 mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
2763		mm3 = _m_pxor(mm3, mm5);	/* take 1's complement of only neg. words */
2764		mm4 = _m_pxor(mm4, mm6);	/* take 1's complement of only neg. words */
2765 mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
2766 mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
2767 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2768 mSrc1++;
2769 mDest++;
2770 }
2771 }
2772 _m_empty(); /* clean MMX state */
2773#endif
2774 return (0);
2775#else
2776 return (-1);
2777#endif
2778}
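/*
 * Editorial note on the C <= 128 fast path above: pmullw yields signed 16-bit words,
 * and the largest fast-path product is 255 * 128 = 32640 <= 32767, so every product
 * is still a valid positive word and packuswb alone saturates it correctly.  For
 * C > 128 a product can exceed 32767 and wrap negative, hence the sign/abs fix-up in
 * the second loop.  A minimal check of that bound (hypothetical helper):
 */
static int example_mult_fast_path_bound(void)
{
	int max_product = 255 * 128;	/* 32640, the largest value the fast path can produce */
	return max_product <= 32767;	/* holds, so no sign correction is required */
}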
2779
2790int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
2791{
2792 unsigned int i, istart;
2793 int iC;
2794 unsigned char *cursrc1;
2795 unsigned char *curdest;
2796 int result;
2797
2798 /* Validate input parameters */
2799 if ((Src1 == NULL) || (Dest == NULL))
2800 return(-1);
2801 if (length == 0)
2802 return(0);
2803
2804 /* Special case: C==1 */
2805 if (C == 1) {
2806		memcpy(Dest, Src1, length);
2807 return (0);
2808 }
2809
2810 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2811
2812 SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);
2813
2814 /* Check for unaligned bytes */
2815 if ((length & 7) > 0) {
2816 /* Setup to process unaligned bytes */
2817 istart = length & 0xfffffff8;
2818 cursrc1 = &Src1[istart];
2819 curdest = &Dest[istart];
2820 } else {
2821 /* No unaligned bytes - we are done */
2822 return (0);
2823 }
2824 } else {
2825 /* Setup to process whole image */
2826 istart = 0;
2827 cursrc1 = Src1;
2828 curdest = Dest;
2829 }
2830
2831 /* C routine to process image */
2832 iC = (int) C;
2833 for (i = istart; i < length; i++) {
2834 result = (int) *cursrc1 * iC;
2835 if (result > 255)
2836 result = 255;
2837 *curdest = (unsigned char) result;
2838 /* Advance pointers */
2839 cursrc1++;
2840 curdest++;
2841 }
2842
2843 return (0);
2844}
2845
2857static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
2858 unsigned char C)
2859{
2860#ifdef USE_MMX
2861#if !defined(GCC__)
2862 __asm
2863 {
2864 pusha
2865 /* ** Duplicate C in 4 words of MM1 ** */
2866 mov al, C /* load C into AL */
2867 xor ah, ah /* zero AH */
2868 mov bx, ax /* copy AX into BX */
2869 shl eax, 16 /* shift 2 bytes of EAX left */
2870 mov ax, bx /* copy BX into AX */
2871 movd mm1, eax /* copy EAX into MM1 */
2872 movd mm2, eax /* copy EAX into MM2 */
2873 punpckldq mm1, mm2 /* fill higher words of MM1 with C */
2874 xor ecx, ecx /* zero ECX */
2875 mov cl, N /* load N into CL */
2876 movd mm7, ecx /* copy N into MM7 */
2877 pxor mm0, mm0 /* zero MM0 register */
2878 mov eax, Src1 /* load Src1 address into eax */
2879 mov edi, Dest /* load Dest address into edi */
2880 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
2881 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
2882 align 16 /* 16 byte alignment of the loop entry */
2883L1026:
2884 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
2885 movq mm4, mm3 /* copy MM3 into MM4 */
2886 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
2887 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
2888 psrlw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the right */
2889 psrlw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the right */
2890 pmullw mm3, mm1 /* mul low bytes of SrcDest by MM1 */
2891 pmullw mm4, mm1 /* mul high bytes of SrcDest by MM1 */
2892 packuswb mm3, mm4 /* pack words back into bytes with saturation */
2893 movq [edi], mm3 /* store result in Dest */
2894 add eax, 8 /* increase Src1 register pointer by 8 */
2895 add edi, 8 /* increase Dest register pointer by 8 */
2896 dec ecx /* decrease loop counter */
2897 jnz L1026 /* check loop termination, proceed if required */
2898 emms /* exit MMX state */
2899 popa
2900 }
2901#else
2902 /* i386 and x86_64 */
2903 __m64 *mSrc1 = (__m64*)Src1;
2904 __m64 *mDest = (__m64*)Dest;
2905 __m64 mm0 = _m_from_int(0); /* zero mm0 register */
2906 /* Duplicate C in 4 words of MM1 */
2907 int i;
2908 i = (C<<16)|C;
2909 __m64 mm1 = _m_from_int(i);
2910 __m64 mm2 = _m_from_int(i);
2911 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with C */
2912 for (i = 0; i < SrcLength/8; i++) {
2913 __m64 mm3, mm4, mm5, mm6;
2914 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
2915 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
2916 mm3 = _m_psrlwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the right */
2917 mm4 = _m_psrlwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the right */
2918 mm3 = _m_pmullw(mm3, mm1); /* mul low bytes of Src1 and MM1 */
2919 mm4 = _m_pmullw(mm4, mm1); /* mul high bytes of Src1 and MM1 */
2920 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
2921 mSrc1++;
2922 mDest++;
2923 }
2924 _m_empty(); /* clean MMX state */
2925#endif
2926 return (0);
2927#else
2928 return (-1);
2929#endif
2930}
2931
2943int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
2944 unsigned char C)
2945{
2946 unsigned int i, istart;
2947 int iC;
2948 unsigned char *cursrc1;
2949 unsigned char *curdest;
2950 int result;
2951
2952 /* Validate input parameters */
2953 if ((Src1 == NULL) || (Dest == NULL))
2954 return(-1);
2955 if (length == 0)
2956 return(0);
2957
2958 /* Check shift */
2959 if (N > 8) {
2960 return (-1);
2961 }
2962
2963 /* Special case: N==0 && C==1 */
2964 if ((N == 0) && (C == 1)) {
2965		memcpy(Dest, Src1, length);
2966 return (0);
2967 }
2968
2969 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
2970
2971 SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
2972
2973 /* Check for unaligned bytes */
2974 if ((length & 7) > 0) {
2975 /* Setup to process unaligned bytes */
2976 istart = length & 0xfffffff8;
2977 cursrc1 = &Src1[istart];
2978 curdest = &Dest[istart];
2979 } else {
2980 /* No unaligned bytes - we are done */
2981 return (0);
2982 }
2983 } else {
2984 /* Setup to process whole image */
2985 istart = 0;
2986 cursrc1 = Src1;
2987 curdest = Dest;
2988 }
2989
2990 /* C routine to process image */
2991 iC = (int) C;
2992 for (i = istart; i < length; i++) {
2993 result = (int) (*cursrc1 >> N) * iC;
2994 if (result > 255)
2995 result = 255;
2996 *curdest = (unsigned char) result;
2997 /* Advance pointers */
2998 cursrc1++;
2999 curdest++;
3000 }
3001
3002 return (0);
3003}
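/*
 * Usage sketch (editorial illustration, not part of the library; the helper name is
 * hypothetical): Dest[i] = (Src1[i] >> N) * C with clamping at 255, e.g. posterize
 * to 16 levels and rescale back onto 0..255.
 */
static void example_shift_right_and_mult(void)
{
	unsigned char src[4] = { 7, 64, 130, 255 };
	unsigned char dst[4];

	SDL_imageFilterShiftRightAndMultByByte(src, dst, 4, 4, 17);
	/* scalar path: dst = { 0, 68, 136, 255 }, since (x >> 4) * 17 maps 0..255 onto 0..255 */
}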
3004
3016static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
3017 unsigned char *Mask)
3018{
3019#ifdef USE_MMX
3020#if !defined(GCC__)
3021 __asm
3022 {
3023 pusha
3024 mov edx, Mask /* load Mask address into edx */
3025 movq mm0, [edx] /* load Mask into mm0 */
3026 xor ecx, ecx /* zero ECX */
3027 mov cl, N /* load loop counter (N) into CL */
3028 movd mm3, ecx /* copy (N) into MM3 */
3029 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3030L10270: /* ** Prepare proper bit-Mask in MM1 ** */
3031 psllw mm1, 1 /* shift 4 WORDS of MM1 1 bit to the left */
3032		pand mm1, mm0 	/* apply Mask to 8 BYTES of MM1 */
3033 /* byte 0x0f, 0xdb, 0xc8 */
3034 dec cl /* decrease loop counter */
3035 jnz L10270 /* check loop termination, proceed if required */
3036 /* ** Shift all bytes of the image ** */
3037 mov eax, Src1 /* load Src1 address into eax */
3038 mov edi, Dest /* load SrcDest address into edi */
3039 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3040 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3041 align 16 /* 16 byte alignment of the loop entry */
3042L10271:
3043 movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
3044 psllw mm0, mm3 /* shift 4 WORDS of MM0 (N) bits to the left */
3045		pand mm0, mm1 	/* apply proper bit-Mask to 8 BYTES of MM0 */
3046 /* byte 0x0f, 0xdb, 0xc1 */
3047 movq [edi], mm0 /* store result in Dest */
3048 add eax, 8 /* increase Src1 register pointer by 8 */
3049 add edi, 8 /* increase Dest register pointer by 8 */
3050 dec ecx /* decrease loop counter */
3051 jnz L10271 /* check loop termination, proceed if required */
3052 emms /* exit MMX state */
3053 popa
3054 }
3055#else
3056 /* i386 and x86_64 */
3057 __m64 *mSrc1 = (__m64*)Src1;
3058 __m64 *mDest = (__m64*)Dest;
3059 __m64 *mMask = (__m64*)Mask;
3060 __m64 mm1;
3061 int i;
3062 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */
3063 /* Prepare proper bit-Mask in MM1 */
3064 for (i = 0; i < N; i++) {
3065 mm1 = _m_psllwi(mm1, 1); /* shift 4 WORDS of MM1 1 bit to the left */
3066 mm1 = _m_pand(mm1, *mMask); /* apply Mask to 8 BYTES of MM1 */
3067 }
3068 /* ** Shift all bytes of the image ** */
3069 for (i = 0; i < SrcLength/8; i++) {
3070 __m64 mm0 = _m_psllwi(*mSrc1, N); /* shift 4 WORDS of MM0 (N) bits to the left */
3071 *mDest = _m_pand(mm0, mm1); /* apply proper bit-Mask to 8 BYTES of MM0 */
3072 mSrc1++;
3073 mDest++;
3074 }
3075 _m_empty(); /* clean MMX state */
3076#endif
3077 return (0);
3078#else
3079 return (-1);
3080#endif
3081}
3082
3093int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3094{
3095 static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
3096 unsigned int i, istart;
3097 unsigned char *cursrc1, *curdest;
3098 int result;
3099
3100 /* Validate input parameters */
3101 if ((Src1 == NULL) || (Dest == NULL))
3102 return(-1);
3103 if (length == 0)
3104 return(0);
3105
3106 if (N > 8) {
3107 return (-1);
3108 }
3109
3110 /* Special case: N==0 */
3111 if (N == 0) {
3112		memcpy(Dest, Src1, length);
3113 return (0);
3114 }
3115
3116 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3117
3118 SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);
3119
3120 /* Check for unaligned bytes */
3121 if ((length & 7) > 0) {
3122 /* Setup to process unaligned bytes */
3123 istart = length & 0xfffffff8;
3124 cursrc1 = &Src1[istart];
3125 curdest = &Dest[istart];
3126 } else {
3127 /* No unaligned bytes - we are done */
3128 return (0);
3129 }
3130 } else {
3131 /* Setup to process whole image */
3132 istart = 0;
3133 cursrc1 = Src1;
3134 curdest = Dest;
3135 }
3136
3137 /* C routine to process image */
3138 for (i = istart; i < length; i++) {
3139 result = ((int) *cursrc1 << N) & 0xff;
3140 *curdest = (unsigned char) result;
3141 /* Advance pointers */
3142 cursrc1++;
3143 curdest++;
3144 }
3145
3146 return (0);
3147}
3148
3159static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
3160{
3161#ifdef USE_MMX
3162#if !defined(GCC__)
3163 __asm
3164 {
3165 pusha
3166 mov eax, Src1 /* load Src1 address into eax */
3167 mov edi, Dest /* load Dest address into edi */
3168 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3169 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3170 align 16 /* 16 byte alignment of the loop entry */
3171L12023:
3172 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
3173		pslld mm0, N 	/* shift 2 DWORDS of MM0 (N) bits to the left */
3174 movq [edi], mm0 /* store result in SrcDest */
3175 add eax, 8 /* increase Src1 register pointer by 8 */
3176 add edi, 8 /* increase Dest register pointer by 8 */
3177 dec ecx /* decrease loop counter */
3178 jnz L12023 /* check loop termination, proceed if required */
3179 emms /* exit MMX state */
3180 popa
3181 }
3182#else
3183 /* i386 and x86_64 */
3184 __m64 *mSrc1 = (__m64*)Src1;
3185 __m64 *mDest = (__m64*)Dest;
3186 int i;
3187 for (i = 0; i < SrcLength/8; i++) {
3188		*mDest = _m_pslldi(*mSrc1, N);	/* shift 2 DWORDS of Src1 (N) bits to the left */
3189 mSrc1++;
3190 mDest++;
3191 }
3192 _m_empty(); /* clean MMX state */
3193#endif
3194 return (0);
3195#else
3196 return (-1);
3197#endif
3198}
3199
3210int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3211{
3212 unsigned int i, istart;
3213 unsigned char *cursrc1, *curdest;
3214 unsigned int *icursrc1, *icurdest;
3215 unsigned int result;
3216
3217 /* Validate input parameters */
3218 if ((Src1 == NULL) || (Dest == NULL))
3219 return(-1);
3220 if (length == 0)
3221 return(0);
3222
3223 if (N > 32) {
3224 return (-1);
3225 }
3226
3227 /* Special case: N==0 */
3228 if (N == 0) {
3229		memcpy(Dest, Src1, length);
3230 return (0);
3231 }
3232
3233 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3234
3235 SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);
3236
3237 /* Check for unaligned bytes */
3238 if ((length & 7) > 0) {
3239 /* Setup to process unaligned bytes */
3240 istart = length & 0xfffffff8;
3241 cursrc1 = &Src1[istart];
3242 curdest = &Dest[istart];
3243 } else {
3244 /* No unaligned bytes - we are done */
3245 return (0);
3246 }
3247 } else {
3248 /* Setup to process whole image */
3249 istart = 0;
3250 cursrc1 = Src1;
3251 curdest = Dest;
3252 }
3253
3254 /* C routine to process image */
3255 icursrc1=(unsigned int *)cursrc1;
3256 icurdest=(unsigned int *)curdest;
3257 for (i = istart; i < length; i += 4) {
3258		if ((i+4)<=length) {
3259 result = ((unsigned int)*icursrc1 << N);
3260 *icurdest = result;
3261 }
3262 /* Advance pointers */
3263 icursrc1++;
3264 icurdest++;
3265 }
3266
3267 return (0);
3268}
3269
3280static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
3281{
3282#ifdef USE_MMX
3283#if !defined(GCC__)
3284 __asm
3285 {
3286 pusha
3287 xor eax, eax /* zero EAX */
3288 mov al, N /* load N into AL */
3289 movd mm7, eax /* copy N into MM7 */
3290 pxor mm0, mm0 /* zero MM0 register */
3291 mov eax, Src1 /* load Src1 address into eax */
3292 mov edi, Dest /* load Dest address into edi */
3293 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3294 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3295 cmp al, 7 /* if (N <= 7) execute more efficient code */
3296 jg L10281
3297 align 16 /* 16 byte alignment of the loop entry */
3298L10280:
3299 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3300 movq mm4, mm3 /* copy MM3 into MM4 */
3301 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
3302 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
3303 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */
3304 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */
3305 packuswb mm3, mm4 /* pack words back into bytes with saturation */
3306 movq [edi], mm3 /* store result in Dest */
3307 add eax, 8 /* increase Src1 register pointer by 8 */
3308 add edi, 8 /* increase Dest register pointer by 8 */
3309 dec ecx /* decrease loop counter */
3310 jnz L10280 /* check loop termination, proceed if required */
3311 jmp L10282
3312 align 16 /* 16 byte alignment of the loop entry */
3313L10281:
3314 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3315 movq mm4, mm3 /* copy MM3 into MM4 */
3316 punpcklbw mm3, mm0 /* unpack low bytes of SrcDest into words */
3317 punpckhbw mm4, mm0 /* unpack high bytes of SrcDest into words */
3318 psllw mm3, mm7 /* shift 4 WORDS of MM3 (N) bits to the left */
3319 psllw mm4, mm7 /* shift 4 WORDS of MM4 (N) bits to the left */
3320 /* ** Take abs value of the signed words ** */
3321 movq mm5, mm3 /* copy mm3 into mm5 */
3322 movq mm6, mm4 /* copy mm4 into mm6 */
3323 psraw mm5, 15 /* fill mm5 words with word sign bit */
3324 psraw mm6, 15 /* fill mm6 words with word sign bit */
3325		pxor mm3, mm5 	/* take 1's complement of only neg words */
3326		pxor mm4, mm6 	/* take 1's complement of only neg words */
3327 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
3328 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
3329 packuswb mm3, mm4 /* pack words back into bytes with saturation */
3330 movq [edi], mm3 /* store result in Dest */
3331 add eax, 8 /* increase Src1 register pointer by 8 */
3332 add edi, 8 /* increase Dest register pointer by 8 */
3333 dec ecx /* decrease loop counter */
3334 jnz L10281 /* check loop termination, proceed if required */
3335L10282:
3336 emms /* exit MMX state */
3337 popa
3338 }
3339#else
3340 /* i386 and x86_64 */
3341 __m64 *mSrc1 = (__m64*)Src1;
3342 __m64 *mDest = (__m64*)Dest;
3343 __m64 mm0 = _m_from_int(0); /* zero mm0 register */
3344 int i;
3345 if (N <= 7) { /* if (N <= 7) execute more efficient code */
3346 for (i = 0; i < SrcLength/8; i++) {
3347 __m64 mm3, mm4;
3348 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
3349 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
3350 mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */
3351 mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */
3352 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3353 mSrc1++;
3354 mDest++;
3355 }
3356 } else {
3357 for (i = 0; i < SrcLength/8; i++) {
3358 __m64 mm3, mm4, mm5, mm6;
3359 mm3 = _m_punpcklbw(*mSrc1, mm0); /* unpack low bytes of Src1 into words */
3360 mm4 = _m_punpckhbw(*mSrc1, mm0); /* unpack high bytes of Src1 into words */
3361 mm3 = _m_psllwi(mm3, N); /* shift 4 WORDS of MM3 (N) bits to the left */
3362 mm4 = _m_psllwi(mm4, N); /* shift 4 WORDS of MM4 (N) bits to the left */
3363 /* Take abs value of the signed words */
3364 mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
3365 mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
3366		mm3 = _m_pxor(mm3, mm5);	/* take 1's complement of only neg. words */
3367		mm4 = _m_pxor(mm4, mm6);	/* take 1's complement of only neg. words */
3368 mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
3369 mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
3370 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3371 mSrc1++;
3372 mDest++;
3373 }
3374 }
3375 _m_empty(); /* clean MMX state */
3376#endif
3377 return (0);
3378#else
3379 return (-1);
3380#endif
3381}
3382
3393int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
3394{
3395 unsigned int i, istart;
3396 unsigned char *cursrc1, *curdest;
3397 int result;
3398
3399 /* Validate input parameters */
3400 if ((Src1 == NULL) || (Dest == NULL))
3401 return(-1);
3402 if (length == 0)
3403 return(0);
3404
3405 if (N > 8) {
3406 return (-1);
3407 }
3408
3409 /* Special case: N==0 */
3410 if (N == 0) {
3411		memcpy(Dest, Src1, length);
3412 return (0);
3413 }
3414
3415 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3416
3417 SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);
3418
3419 /* Check for unaligned bytes */
3420 if ((length & 7) > 0) {
3421 /* Setup to process unaligned bytes */
3422 istart = length & 0xfffffff8;
3423 cursrc1 = &Src1[istart];
3424 curdest = &Dest[istart];
3425 } else {
3426 /* No unaligned bytes - we are done */
3427 return (0);
3428 }
3429 } else {
3430 /* Setup to process whole image */
3431 istart = 0;
3432 cursrc1 = Src1;
3433 curdest = Dest;
3434 }
3435
3436 /* C routine to process image */
3437 for (i = istart; i < length; i++) {
3438 result = (int) *cursrc1 << N;
3439 if (result > 255)
3440 result = 255;
3441 *curdest = (unsigned char) result;
3442 /* Advance pointers */
3443 cursrc1++;
3444 curdest++;
3445 }
3446
3447 return (0);
3448}
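/*
 * Usage sketch (editorial illustration, not part of the library; the helper name is
 * hypothetical): SDL_imageFilterShiftLeft clamps at 255, whereas
 * SDL_imageFilterShiftLeftByte discards the bits shifted out of each byte.
 */
static void example_shift_left_variants(void)
{
	unsigned char src[4] = { 3, 40, 100, 200 };
	unsigned char dstA[4], dstB[4];

	SDL_imageFilterShiftLeft(src, dstA, 4, 2);	/* scalar path: dstA = { 12, 160, 255, 255 } */
	SDL_imageFilterShiftLeftByte(src, dstB, 4, 2);	/* scalar path: dstB = { 12, 160, 144, 32 } */
}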
3449
3460static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
3461{
3462#ifdef USE_MMX
3463#if !defined(GCC__)
3464 __asm
3465 {
3466 pusha
3467 /* ** Duplicate T in 8 bytes of MM3 ** */
3468 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3469 pcmpeqb mm2, mm2 /* generate all 1's in mm2 */
3470 mov al, T /* load T into AL */
3471 mov ah, al /* copy AL into AH */
3472 mov bx, ax /* copy AX into BX */
3473 shl eax, 16 /* shift 2 bytes of EAX left */
3474 mov ax, bx /* copy BX into AX */
3475 movd mm3, eax /* copy EAX into MM3 */
3476 movd mm4, eax /* copy EAX into MM4 */
3477 punpckldq mm3, mm4 /* fill higher bytes of MM3 with T */
3478 psubusb mm2, mm3 /* store 0xFF - T in MM2 */
3479 mov eax, Src1 /* load Src1 address into eax */
3480 mov edi, Dest /* load Dest address into edi */
3481 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3482 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3483 align 16 /* 16 byte alignment of the loop entry */
3484L1029:
3485 movq mm0, [eax] /* load 8 bytes from SrcDest into MM0 */
3486 paddusb mm0, mm2 /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */
3487 pcmpeqb mm0, mm1 /* binarize 255:0, comparing to 255 */
3488 movq [edi], mm0 /* store result in SrcDest */
3489 add eax, 8 /* increase Src1 register pointer by 8 */
3490 add edi, 8 /* increase Dest register pointer by 8 */
3491 dec ecx /* decrease loop counter */
3492 jnz L1029 /* check loop termination, proceed if required */
3493 emms /* exit MMX state */
3494 popa
3495 }
3496#else
3497 /* i386 and x86_64 */
3498 __m64 *mSrc1 = (__m64*)Src1;
3499 __m64 *mDest = (__m64*)Dest;
3500 /* Duplicate T in 8 bytes of MM3 */
3501 __m64 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */
3502	__m64 mm2 = _m_pcmpeqb(mm2, mm2);	/* generate all 1's in mm2 */
3503 int i;
3504 memset(&i, T, 4);
3505 __m64 mm3 = _m_from_int(i);
3506 __m64 mm4 = _m_from_int(i);
3507 mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with T */
3508 mm2 = _m_psubusb(mm2, mm3); /* store 0xFF - T in MM2 */
3509 //__m64 mm3 = _m_from_int64(lli); // x86_64 only
3510 for (i = 0; i < SrcLength/8; i++) {
3511 __m64 mm0 = _m_paddusb(*mSrc1, mm2); /* Src1+(0xFF-T) (add 8 bytes with saturation) */
3512 *mDest = _m_pcmpeqb(mm0, mm1); /* binarize 255:0, comparing to 255 */
3513 mSrc1++;
3514 mDest++;
3515 }
3516 _m_empty(); /* clean MMX state */
3517#endif
3518 return (0);
3519#else
3520 return (-1);
3521#endif
3522}
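/*
 * Editorial note: MMX has no unsigned byte compare, so the routine above adds
 * (0xFF - T) with saturation, which drives a byte to 255 exactly when it was >= T,
 * and then compares against all-ones.  Scalar sketch of the identity (hypothetical
 * helper, not part of the library):
 */
static int example_binarize_trick(void)
{
	unsigned char T = 100, s = 0;
	int ok = 1;

	for (;;) {
		int sum = s + (0xFF - T);
		unsigned char saturated = (unsigned char) (sum > 255 ? 255 : sum);
		ok &= ((saturated == 255) == (s >= T));	/* saturates to 255 exactly when s >= T */
		if (s == 255) break;
		s++;
	}
	return ok;	/* evaluates to 1 */
}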
3523
3534int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
3535{
3536 unsigned int i, istart;
3537 unsigned char *cursrc1;
3538 unsigned char *curdest;
3539
3540 /* Validate input parameters */
3541 if ((Src1 == NULL) || (Dest == NULL))
3542 return(-1);
3543 if (length == 0)
3544 return(0);
3545
3546 /* Special case: T==0 */
3547 if (T == 0) {
3548 memset(Dest, 255, length);
3549 return (0);
3550 }
3551
3552 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3553
3554 SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
3555
3556 /* Check for unaligned bytes */
3557 if ((length & 7) > 0) {
3558 /* Setup to process unaligned bytes */
3559 istart = length & 0xfffffff8;
3560 cursrc1 = &Src1[istart];
3561 curdest = &Dest[istart];
3562 } else {
3563 /* No unaligned bytes - we are done */
3564 return (0);
3565 }
3566 } else {
3567 /* Setup to process whole image */
3568 istart = 0;
3569 cursrc1 = Src1;
3570 curdest = Dest;
3571 }
3572
3573 /* C routine to process image */
3574 for (i = istart; i < length; i++) {
3575 *curdest = (unsigned char)(((unsigned char)*cursrc1 >= T) ? 255 : 0);
3576 /* Advance pointers */
3577 cursrc1++;
3578 curdest++;
3579 }
3580
3581 return (0);
3582}
3583
3595static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
3596 unsigned char Tmax)
3597{
3598#ifdef USE_MMX
3599#if !defined(GCC__)
3600 __asm
3601 {
3602 pusha
3603 pcmpeqb mm1, mm1 /* generate all 1's in mm1 */
3604 /* ** Duplicate Tmax in 8 bytes of MM3 ** */
3605 mov al, Tmax /* load Tmax into AL */
3606 mov ah, al /* copy AL into AH */
3607 mov bx, ax /* copy AX into BX */
3608 shl eax, 16 /* shift 2 bytes of EAX left */
3609 mov ax, bx /* copy BX into AX */
3610 movd mm3, eax /* copy EAX into MM3 */
3611 movd mm4, eax /* copy EAX into MM4 */
3612 punpckldq mm3, mm4 /* fill higher bytes of MM3 with Tmax */
3613 psubusb mm1, mm3 /* store 0xFF - Tmax in MM1 */
3614 /* ** Duplicate Tmin in 8 bytes of MM5 ** */
3615 mov al, Tmin /* load Tmin into AL */
3616 mov ah, al /* copy AL into AH */
3617 mov bx, ax /* copy AX into BX */
3618 shl eax, 16 /* shift 2 bytes of EAX left */
3619 mov ax, bx /* copy BX into AX */
3620 movd mm5, eax /* copy EAX into MM5 */
3621 movd mm4, eax /* copy EAX into MM4 */
3622 punpckldq mm5, mm4 /* fill higher bytes of MM5 with Tmin */
3623 movq mm7, mm5 /* copy MM5 into MM7 */
3624 paddusb mm7, mm1 /* store 0xFF - Tmax + Tmin in MM7 */
3625 mov eax, Src1 /* load Src1 address into eax */
3626 mov edi, Dest /* load Dest address into edi */
3627 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3628 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3629 align 16 /* 16 byte alignment of the loop entry */
3630L1030:
3631 movq mm0, [eax] /* load 8 bytes from Src1 into MM0 */
3632 paddusb mm0, mm1 /* MM0=SrcDest+(0xFF-Tmax) */
3633 psubusb mm0, mm7 /* MM0=MM0-(0xFF-Tmax+Tmin) */
3634 paddusb mm0, mm5 /* MM0=MM0+Tmin */
3635 movq [edi], mm0 /* store result in Dest */
3636 add eax, 8 /* increase Src1 register pointer by 8 */
3637 add edi, 8 /* increase Dest register pointer by 8 */
3638 dec ecx /* decrease loop counter */
3639 jnz L1030 /* check loop termination, proceed if required */
3640 emms /* exit MMX state */
3641 popa
3642 }
3643#else
3644 /* i386 and x86_64 */
3645 __m64 *mSrc1 = (__m64*)Src1;
3646 __m64 *mDest = (__m64*)Dest;
3647 __m64 mm1 = _m_pcmpeqb(mm1, mm1); /* generate all 1's in mm1 */
3648 int i;
3649 /* Duplicate Tmax in 8 bytes of MM3 */
3650 __m64 mm3, mm4;
3651 memset(&i, Tmax, 4);
3652 mm3 = _m_from_int(i);
3653 mm4 = _m_from_int(i);
3654 mm3 = _m_punpckldq(mm3, mm4); /* fill higher bytes of MM3 with Tmax */
3655 mm1 = _m_psubusb(mm1, mm3); /* store 0xFF - Tmax in MM1 */
3656 //__m64 mm3 = _m_from_int64(lli); // x86_64 only
3657	/* Duplicate Tmin in 8 bytes of MM5 */
3658 __m64 mm5, mm7;
3659 memset(&i, Tmin, 4);
3660 mm5 = _m_from_int(i);
3661 mm4 = _m_from_int(i);
3662 mm5 = _m_punpckldq(mm5, mm4); /* fill higher bytes of MM5 with Tmin */
3663 mm7 = _m_paddusb(mm5, mm1); /* store 0xFF - Tmax + Tmin in MM7 */
3664 for (i = 0; i < SrcLength/8; i++) {
3665 __m64 mm0;
3666 mm0 = _m_paddusb(*mSrc1, mm1); /* MM0=Src1+(0xFF-Tmax) */
3667 mm0 = _m_psubusb(mm0, mm7); /* MM0=MM0-(0xFF-Tmax+Tmin) */
3668 *mDest = _m_paddusb(mm0, mm5); /* MM0+Tmin */
3669 mSrc1++;
3670 mDest++;
3671 }
3672 _m_empty(); /* clean MMX state */
3673#endif
3674 return (0);
3675#else
3676 return (-1);
3677#endif
3678}
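/*
 * Editorial note: the three saturating steps above implement a clamp without any
 * compare instruction.  Scalar sketch of the same sequence (hypothetical helper,
 * not part of the library; assumes Tmin <= Tmax):
 */
static unsigned char example_clip_via_saturation(unsigned char x, unsigned char Tmin, unsigned char Tmax)
{
	int v;

	v = x + (255 - Tmax);
	if (v > 255) v = 255;		/* paddusb: everything above Tmax hits 255 */
	v -= (255 - Tmax + Tmin);
	if (v < 0) v = 0;		/* psubusb: everything below Tmin hits 0 */
	v += Tmin;
	if (v > 255) v = 255;		/* paddusb: shift the range back up by Tmin */
	return (unsigned char) v;	/* equals x clamped into [Tmin, Tmax] */
}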
3679
3691int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
3692 unsigned char Tmax)
3693{
3694 unsigned int i, istart;
3695 unsigned char *cursrc1;
3696 unsigned char *curdest;
3697
3698 /* Validate input parameters */
3699 if ((Src1 == NULL) || (Dest == NULL))
3700 return(-1);
3701 if (length == 0)
3702 return(0);
3703
3704	/* Special case: Tmin==0 && Tmax==255 */
3705	if ((Tmin == 0) && (Tmax == 255)) {
3706		memcpy(Dest, Src1, length);
3707 return (0);
3708 }
3709
3710 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3711
3712 SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);
3713
3714 /* Check for unaligned bytes */
3715 if ((length & 7) > 0) {
3716 /* Setup to process unaligned bytes */
3717 istart = length & 0xfffffff8;
3718 cursrc1 = &Src1[istart];
3719 curdest = &Dest[istart];
3720 } else {
3721 /* No unaligned bytes - we are done */
3722 return (0);
3723 }
3724 } else {
3725 /* Setup to process whole image */
3726 istart = 0;
3727 cursrc1 = Src1;
3728 curdest = Dest;
3729 }
3730
3731 /* C routine to process image */
3732 for (i = istart; i < length; i++) {
3733 if (*cursrc1 < Tmin) {
3734 *curdest = Tmin;
3735 } else if (*cursrc1 > Tmax) {
3736 *curdest = Tmax;
3737 } else {
3738 *curdest = *cursrc1;
3739 }
3740 /* Advance pointers */
3741 cursrc1++;
3742 curdest++;
3743 }
3744
3745 return (0);
3746}
3747
3761static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
3762 int Nmin, int Nmax)
3763{
3764#ifdef USE_MMX
3765#if !defined(GCC__)
3766 __asm
3767 {
3768 pusha
3769 mov ax, WORD PTR Nmax /* load Nmax in AX */
3770 mov bx, WORD PTR Cmax /* load Cmax in BX */
3771 sub ax, WORD PTR Nmin /* AX = Nmax - Nmin */
3772 sub bx, WORD PTR Cmin /* BX = Cmax - Cmin */
3773 jz L10311 /* check division by zero */
3774 xor dx, dx /* prepare for division, zero DX */
3775 div bx /* AX = AX/BX */
3776 jmp L10312
3777L10311:
3778 mov ax, 255 /* if div by zero, assume result max byte value */
3779L10312: /* ** Duplicate AX in 4 words of MM0 ** */
3780 mov bx, ax /* copy AX into BX */
3781 shl eax, 16 /* shift 2 bytes of EAX left */
3782 mov ax, bx /* copy BX into AX */
3783 movd mm0, eax /* copy EAX into MM0 */
3784 movd mm1, eax /* copy EAX into MM1 */
3785 punpckldq mm0, mm1 /* fill higher words of MM0 with AX */
3786 /* ** Duplicate Cmin in 4 words of MM1 ** */
3787 mov ax, WORD PTR Cmin /* load Cmin into AX */
3788 mov bx, ax /* copy AX into BX */
3789 shl eax, 16 /* shift 2 bytes of EAX left */
3790 mov ax, bx /* copy BX into AX */
3791 movd mm1, eax /* copy EAX into MM1 */
3792 movd mm2, eax /* copy EAX into MM2 */
3793 punpckldq mm1, mm2 /* fill higher words of MM1 with Cmin */
3794 /* ** Duplicate Nmin in 4 words of MM2 ** */
3795 mov ax, WORD PTR Nmin /* load Nmin into AX */
3796 mov bx, ax /* copy AX into BX */
3797 shl eax, 16 /* shift 2 bytes of EAX left */
3798 mov ax, bx /* copy BX into AX */
3799 movd mm2, eax /* copy EAX into MM2 */
3800 movd mm3, eax /* copy EAX into MM3 */
3801 punpckldq mm2, mm3 /* fill higher words of MM2 with Nmin */
3802 pxor mm7, mm7 /* zero MM7 register */
3803 mov eax, Src1 /* load Src1 address into eax */
3804 mov edi, Dest /* load Dest address into edi */
3805 mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
3806 shr ecx, 3 /* counter/8 (MMX loads 8 bytes at a time) */
3807 align 16 /* 16 byte alignment of the loop entry */
3808L1031:
3809 movq mm3, [eax] /* load 8 bytes from Src1 into MM3 */
3810 movq mm4, mm3 /* copy MM3 into MM4 */
3811 punpcklbw mm3, mm7 /* unpack low bytes of SrcDest into words */
3812 punpckhbw mm4, mm7 /* unpack high bytes of SrcDest into words */
3813 psubusb mm3, mm1 /* S-Cmin, low bytes */
3814 psubusb mm4, mm1 /* S-Cmin, high bytes */
3815 pmullw mm3, mm0 /* MM0*(S-Cmin), low bytes */
3816 pmullw mm4, mm0 /* MM0*(S-Cmin), high bytes */
3817 paddusb mm3, mm2 /* MM0*(S-Cmin)+Nmin, low bytes */
3818 paddusb mm4, mm2 /* MM0*(S-Cmin)+Nmin, high bytes */
3819 /* ** Take abs value of the signed words ** */
3820 movq mm5, mm3 /* copy mm3 into mm5 */
3821 movq mm6, mm4 /* copy mm4 into mm6 */
3822 psraw mm5, 15 /* fill mm5 words with word sign bit */
3823 psraw mm6, 15 /* fill mm6 words with word sign bit */
3824		pxor mm3, mm5 	/* take 1's complement of only neg words */
3825		pxor mm4, mm6 	/* take 1's complement of only neg words */
3826 psubsw mm3, mm5 /* add 1 to only neg words, W-(-1) or W-0 */
3827 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
3828 packuswb mm3, mm4 /* pack words back into bytes with saturation */
3829 movq [edi], mm3 /* store result in Dest */
3830 add eax, 8 /* increase Src1 register pointer by 8 */
3831 add edi, 8 /* increase Dest register pointer by 8 */
3832 dec ecx /* decrease loop counter */
3833 jnz L1031 /* check loop termination, proceed if required */
3834 emms /* exit MMX state */
3835 popa
3836 }
3837#else
3838 /* i386 and x86_64 */
3839 __m64 *mSrc1 = (__m64*)Src1;
3840 __m64 *mDest = (__m64*)Dest;
3841 __m64 mm0, mm1, mm2, mm3;
3842
3843 int i;
3844 /* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
3845 unsigned short a = Nmax - Nmin;
3846 unsigned short b = Cmax - Cmin;
3847 if (b == 0) {
3848 a = 255;
3849 } else {
3850 a /= b;
3851 }
3852 i = (a<<16)|a;
3853 mm0 = _m_from_int(i);
3854 mm1 = _m_from_int(i);
3855 mm0 = _m_punpckldq(mm0, mm1); /* fill higher words of MM0 with AX */
3856 /* Duplicate Cmin in 4 words of MM1 */
3857	i = ((Cmin & 0xffff)<<16)|(Cmin & 0xffff);
3858 mm1 = _m_from_int(i);
3859 mm2 = _m_from_int(i);
3860 mm1 = _m_punpckldq(mm1, mm2); /* fill higher words of MM1 with Cmin */
3861 /* Duplicate Nmin in 4 words of MM2 */
3862	i = ((Nmin & 0xffff)<<16)|(Nmin & 0xffff);
3863 mm2 = _m_from_int(i);
3864 mm3 = _m_from_int(i);
3865 mm2 = _m_punpckldq(mm2, mm3); /* fill higher words of MM2 with Nmin */
3866	__m64 mm7 = _m_from_int(0);	/* zero mm7 register */
3867 for (i = 0; i < SrcLength/8; i++) {
3868 __m64 mm3, mm4, mm5, mm6;
3869 mm3 = _m_punpcklbw(*mSrc1, mm7); /* unpack low bytes of Src1 into words */
3870 mm4 = _m_punpckhbw(*mSrc1, mm7); /* unpack high bytes of Src1 into words */
3871 mm3 = _m_psubusb(mm3, mm1); /* S-Cmin, low bytes */
3872 mm4 = _m_psubusb(mm4, mm1); /* S-Cmin, high bytes */
3873 mm3 = _m_pmullw(mm3, mm0); /* MM0*(S-Cmin), low bytes */
3874 mm4 = _m_pmullw(mm4, mm0); /* MM0*(S-Cmin), high bytes */
3875 mm3 = _m_paddusb(mm3, mm2); /* MM0*(S-Cmin)+Nmin, low bytes */
3876 mm4 = _m_paddusb(mm4, mm2); /* MM0*(S-Cmin)+Nmin, high bytes */
3877 /* Take abs value of the signed words */
3878 mm5 = _m_psrawi(mm3, 15); /* fill mm5 words with word sign bit */
3879 mm6 = _m_psrawi(mm4, 15); /* fill mm6 words with word sign bit */
3880		mm3 = _m_pxor(mm3, mm5);	/* take 1's complement of only neg. words */
3881		mm4 = _m_pxor(mm4, mm6);	/* take 1's complement of only neg. words */
3882 mm3 = _m_psubsw(mm3, mm5); /* add 1 to only neg. words, W-(-1) or W-0 */
3883 mm4 = _m_psubsw(mm4, mm6); /* add 1 to only neg. words, W-(-1) or W-0 */
3884 *mDest = _m_packuswb(mm3, mm4); /* pack words back into bytes with saturation */
3885 mSrc1++;
3886 mDest++;
3887 }
3888 _m_empty(); /* clean MMX state */
3889#endif
3890 return (0);
3891#else
3892 return (-1);
3893#endif
3894}
3895
3909int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
3910 int Nmax)
3911{
3912 unsigned int i, istart;
3913 unsigned char *cursrc;
3914 unsigned char *curdest;
3915 int dN, dC, factor;
3916 int result;
3917
3918 /* Validate input parameters */
3919 if ((Src == NULL) || (Dest == NULL))
3920 return(-1);
3921 if (length == 0)
3922 return(0);
3923
3924 if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
3925
3926 SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);
3927
3928 /* Check for unaligned bytes */
3929 if ((length & 7) > 0) {
3930 /* Setup to process unaligned bytes */
3931 istart = length & 0xfffffff8;
3932 cursrc = &Src[istart];
3933 curdest = &Dest[istart];
3934 } else {
3935 /* No unaligned bytes - we are done */
3936 return (0);
3937 }
3938 } else {
3939 /* Setup to process whole image */
3940 istart = 0;
3941 cursrc = Src;
3942 curdest = Dest;
3943 }
3944
3945 /* C routine to process image */
3946 dC = Cmax - Cmin;
3947 if (dC == 0)
3948 return (0);
3949 dN = Nmax - Nmin;
3950 factor = dN / dC;
3951 for (i = istart; i < length; i++) {
3952 result = factor * ((int) (*cursrc) - Cmin) + Nmin;
3953 if (result > 255)
3954 result = 255;
3955 *curdest = (unsigned char) result;
3956 /* Advance pointers */
3957 cursrc++;
3958 curdest++;
3959 }
3960
3961 return (0);
3962}
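/*
 * Usage sketch (editorial illustration, not part of the library; the helper name is
 * hypothetical): stretch values from [16, 79] onto [0, 255].  Note that the scale
 * factor (Nmax-Nmin)/(Cmax-Cmin) is computed in integer arithmetic, so it is truncated.
 */
static void example_normalize_linear(void)
{
	unsigned char src[4] = { 16, 32, 48, 79 };
	unsigned char dst[4];

	SDL_imageFilterNormalizeLinear(src, dst, 4, 16, 79, 0, 255);
	/* scalar path: factor = 255 / 63 = 4, so dst = { 0, 64, 128, 252 } */
}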
3963
3964/* ------------------------------------------------------------------------------------ */
3965
3980int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
3981 signed short *Kernel, unsigned char Divisor)
3982{
3983 /* Validate input parameters */
3984 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
3985 return(-1);
3986
3987 if ((columns < 3) || (rows < 3) || (Divisor == 0))
3988 return (-1);
3989
3990 if ((SDL_imageFilterMMXdetect())) {
3991//#ifdef USE_MMX
3992#if defined(USE_MMX) && defined(i386)
3993#if !defined(GCC__)
3994 __asm
3995 {
3996 pusha
3997 pxor mm0, mm0 /* zero MM0 */
3998 xor ebx, ebx /* zero EBX */
3999 mov bl, Divisor /* load Divisor into BL */
4000 mov edx, Kernel /* load Kernel address into EDX */
4001 movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */
4002 add edx, 8 /* second row |K0 K1 K2 0| */
4003 movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
4004 add edx, 8 /* third row |K6 K7 K8 0| */
4005 movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */
4006 /* ---, */
4007 mov eax, columns /* load columns into EAX */
4008 mov esi, Src /* ESI = Src row 0 address */
4009 mov edi, Dest /* load Dest address to EDI */
4010 add edi, eax /* EDI = EDI + columns */
4011 inc edi /* 1 byte offset from the left edge */
4012 mov edx, rows /* initialize ROWS counter */
4013 sub edx, 2 /* do not use first and last row */
4014 /* ---, */
4015L10320:
4016		mov ecx, eax 	/* initialize COLUMNS counter */
4017 sub ecx, 2 /* do not use first and last column */
4018 align 16 /* 16 byte alignment of the loop entry */
4019L10322:
4020 /* ---, */
4021 movq mm1, [esi] /* load 8 bytes of the image first row */
4022 add esi, eax /* move one row below */
4023 movq mm2, [esi] /* load 8 bytes of the image second row */
4024 add esi, eax /* move one row below */
4025 movq mm3, [esi] /* load 8 bytes of the image third row */
4026 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4027 punpcklbw mm2, mm0 /* unpack first 4 bytes into words */
4028 punpcklbw mm3, mm0 /* unpack first 4 bytes into words */
4029 pmullw mm1, mm5 /* multiply words first row image*Kernel */
4030 pmullw mm2, mm6 /* multiply words second row image*Kernel */
4031 pmullw mm3, mm7 /* multiply words third row image*Kernel */
4032 paddsw mm1, mm2 /* add 4 words of the first and second rows */
4033 paddsw mm1, mm3 /* add 4 words of the third row and result */
4034 movq mm2, mm1 /* copy MM1 into MM2 */
4035 psrlq mm1, 32 /* shift 2 left words to the right */
4036 paddsw mm1, mm2 /* add 2 left and 2 right result words */
4037 movq mm3, mm1 /* copy MM1 into MM3 */
4038 psrlq mm1, 16 /* shift 1 left word to the right */
4039 paddsw mm1, mm3 /* add 1 left and 1 right result words */
4040 /* --, */
4041 movd mm2, eax /* save EAX in MM2 */
4042 movd mm3, edx /* save EDX in MM3 */
4043 movd eax, mm1 /* copy MM1 into EAX */
4044 psraw mm1, 15 /* spread sign bit of the result */
4045 movd edx, mm1 /* fill EDX with a sign bit */
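 /* (DX:AX now holds the sign-extended 16-bit convolution sum, so the
    idiv below performs a signed 16-bit division by Divisor) */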
4046 idiv bx /* IDIV - VERY EXPENSIVE */
4047 movd mm1, eax /* move result of division into MM1 */
4048 packuswb mm1, mm0 /* pack division result with saturation */
4049 movd eax, mm1 /* copy saturated result into EAX */
4050 mov [edi], al /* copy a byte result into Dest */
4051 movd edx, mm3 /* restore saved EDX */
4052 movd eax, mm2 /* restore saved EAX */
4053 /* --, */
4054 sub esi, eax /* move two rows up */
4055 sub esi, eax /* */
4056 inc esi /* move Src pointer to the next pixel */
4057 inc edi /* move Dest pointer to the next pixel */
4058 /* ---, */
4059 dec ecx /* decrease loop counter COLUMNS */
4060 jnz L10322 /* check loop termination, proceed if required */
4061 add esi, 2 /* move to the next row in Src */
4062 add edi, 2 /* move to the next row in Dest */
4063 dec edx /* decrease loop counter ROWS */
4064 jnz L10320 /* check loop termination, proceed if required */
4065 /* ---, */
4066 emms /* exit MMX state */
4067 popa
4068 }
4069#else
4070 asm volatile
4071 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4072 "xor %%ebx, %%ebx \n\t" /* zero EBX */
4073 "mov %5, %%bl \n\t" /* load Divisor into BL */
4074 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4075 "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */
4076 "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */
4077 "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
4078 "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */
4079 "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */
4080 /* --- */
4081 "mov %3, %%eax \n\t" /* load columns into EAX */
4082 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
4083 "mov %0, %%edi \n\t" /* load Dest address to EDI */
4084 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
4085 "inc %%edi \n\t" /* 1 byte offset from the left edge */
4086 "mov %2, %%edx \n\t" /* initialize ROWS counter */
4087 "sub $2, %%edx \n\t" /* do not use first and last row */
4088 /* --- */
4089 ".L10320: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
4090 "sub $2, %%ecx \n\t" /* do not use first and last column */
4091 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4092 ".L10322: \n\t"
4093 /* --- */
4094 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
4095 "add %%eax, %%esi \n\t" /* move one row below */
4096 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */
4097 "add %%eax, %%esi \n\t" /* move one row below */
4098 "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */
4099 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4100 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */
4101 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */
4102 "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */
4103 "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */
4104 "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */
4105 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */
4106 "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */
4107 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4108 "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */
4109 "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */
4110 "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */
4111 "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */
4112 "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */
4113 /* -- */
4114 "movd %%eax, %%mm2 \n\t" /* save EAX in MM2 */
4115 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4116 "movd %%mm1, %%eax \n\t" /* copy MM1 into EAX */
4117 "psraw $15, %%mm1 \n\t" /* spread sign bit of the result */
4118 "movd %%mm1, %%edx \n\t" /* fill EDX with a sign bit */
4119 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4120 "movd %%eax, %%mm1 \n\t" /* move result of division into MM1 */
4121 "packuswb %%mm0, %%mm1 \n\t" /* pack division result with saturation */
4122 "movd %%mm1, %%eax \n\t" /* copy saturated result into EAX */
4123 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4124 "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4125 "movd %%mm2, %%eax \n\t" /* restore saved EAX */
4126 /* -- */
4127 "sub %%eax, %%esi \n\t" /* move two rows up */
4128 "sub %%eax, %%esi \n\t" /* */
4129 "inc %%esi \n\t" /* move Src pointer to the next pixel */
4130 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4131 /* --- */
4132 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4133 "jnz .L10322 \n\t" /* check loop termination, proceed if required */
4134 "add $2, %%esi \n\t" /* move to the next row in Src */
4135 "add $2, %%edi \n\t" /* move to the next row in Dest */
4136 "dec %%edx \n\t" /* decrease loop counter ROWS */
4137 "jnz .L10320 \n\t" /* check loop termination, proceed if required */
4138 /* --- */
4139 "emms \n\t" /* exit MMX state */
4140 "popa \n\t":"=m" (Dest) /* %0 */
4141 :"m"(Src), /* %1 */
4142 "m"(rows), /* %2 */
4143 "m"(columns), /* %3 */
4144 "m"(Kernel), /* %4 */
4145 "m"(Divisor) /* %5 */
4146 );
4147#endif
4148#endif
4149 return (0);
4150 } else {
4151 /* No non-MMX implementation yet */
4152 return (-1);
4153 }
4154}
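
/* Usage sketch (illustrative): a simple 3x3 box blur. As the register comments
   above indicate, each kernel row occupies one group of four 16-bit words with
   a trailing zero pad, so the kernel array holds 12 shorts. "image", "blurred",
   "w" and "h" are assumptions made for this example; the call returns -1 when
   MMX is unavailable and leaves the one-pixel border of Dest untouched.

       signed short boxKernel[12] = {
           1, 1, 1, 0,
           1, 1, 1, 0,
           1, 1, 1, 0
       };
       int ok = SDL_imageFilterConvolveKernel3x3Divide(image, blurred,
                                                       h, w, boxKernel, 9);
*/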
4155
4170int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4171 signed short *Kernel, unsigned char Divisor)
4172{
4173 /* Validate input parameters */
4174 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4175 return(-1);
4176
4177 if ((columns < 5) || (rows < 5) || (Divisor == 0))
4178 return (-1);
4179
4180 if ((SDL_imageFilterMMXdetect())) {
4181//#ifdef USE_MMX
4182#if defined(USE_MMX) && defined(i386)
4183#if !defined(GCC__)
4184 __asm
4185 {
4186 pusha
4187 pxor mm0, mm0 /* zero MM0 */
4188 xor ebx, ebx /* zero EBX */
4189 mov bl, Divisor /* load Divisor into BL */
4190 movd mm5, ebx /* copy Divisor into MM5 */
4191 mov edx, Kernel /* load Kernel address into EDX */
4192 mov esi, Src /* load Src address to ESI */
4193 mov edi, Dest /* load Dest address to EDI */
4194 add edi, 2 /* 2 column offset from the left edge */
4195 mov eax, columns /* load columns into EAX */
4196 shl eax, 1 /* EAX = columns * 2 */
4197 add edi, eax /* 2 row offset from the top edge */
4198 shr eax, 1 /* EAX = columns */
4199 mov ebx, rows /* initialize ROWS counter */
4200 sub ebx, 4 /* do not use first 2 and last 2 rows */
4201 /* ---, */
4202L10330:
4203 mov ecx, eax /* initialize COLUMNS counter */
4204 sub ecx, 4 /* do not use first 2 and last 2 columns */
4205 align 16 /* 16 byte alignment of the loop entry */
4206L10332:
4207 pxor mm7, mm7 /* zero MM7 (accumulator) */
4208 movd mm6, esi /* save ESI in MM6 */
4209 /* --- 1 */
4210 movq mm1, [esi] /* load 8 bytes of the Src */
4211 movq mm2, mm1 /* copy MM1 into MM2 */
4212 add esi, eax /* move Src pointer 1 row below */
4213 movq mm3, [edx] /* load 4 words of Kernel */
4214 add edx, 8 /* move pointer to other 4 words */
4215 movq mm4, [edx] /* load 4 words of Kernel */
4216 add edx, 8 /* move pointer to other 4 words */
4217 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4218 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4219 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4220 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4221 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4222 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4223 /* --- 2 */
4224 movq mm1, [esi] /* load 8 bytes of the Src */
4225 movq mm2, mm1 /* copy MM1 into MM2 */
4226 add esi, eax /* move Src pointer 1 row below */
4227 movq mm3, [edx] /* load 4 words of Kernel */
4228 add edx, 8 /* move pointer to other 4 words */
4229 movq mm4, [edx] /* load 4 words of Kernel */
4230 add edx, 8 /* move pointer to other 4 words */
4231 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4232 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4233 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4234 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4235 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4236 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4237 /* --- 3 */
4238 movq mm1, [esi] /* load 8 bytes of the Src */
4239 movq mm2, mm1 /* copy MM1 into MM2 */
4240 add esi, eax /* move Src pointer 1 row below */
4241 movq mm3, [edx] /* load 4 words of Kernel */
4242 add edx, 8 /* move pointer to other 4 words */
4243 movq mm4, [edx] /* load 4 words of Kernel */
4244 add edx, 8 /* move pointer to other 4 words */
4245 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4246 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4247 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4248 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4249 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4250 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4251 /* --- 4 */
4252 movq mm1, [esi] /* load 8 bytes of the Src */
4253 movq mm2, mm1 /* copy MM1 into MM2 */
4254 add esi, eax /* move Src pointer 1 row below */
4255 movq mm3, [edx] /* load 4 words of Kernel */
4256 add edx, 8 /* move pointer to other 4 words */
4257 movq mm4, [edx] /* load 4 words of Kernel */
4258 add edx, 8 /* move pointer to other 4 words */
4259 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4260 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4261 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4262 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4263 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4264 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4265 /* --- 5 */
4266 movq mm1, [esi] /* load 8 bytes of the Src */
4267 movq mm2, mm1 /* copy MM1 into MM2 */
4268 movq mm3, [edx] /* load 4 words of Kernel */
4269 add edx, 8 /* move pointer to other 4 words */
4270 movq mm4, [edx] /* load 4 words of Kernel */
4271 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4272 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4273 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4274 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4275 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4276 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4277 /* ---, */
4278 movq mm3, mm7 /* copy MM7 into MM3 */
4279 psrlq mm7, 32 /* shift 2 left words to the right */
4280 paddsw mm7, mm3 /* add 2 left and 2 right result words */
4281 movq mm2, mm7 /* copy MM7 into MM2 */
4282 psrlq mm7, 16 /* shift 1 left word to the right */
4283 paddsw mm7, mm2 /* add 1 left and 1 right result words */
4284 /* ---, */
4285 movd mm1, eax /* save EAX in MM1 */
4286 movd mm2, ebx /* save EBX in MM2 */
4287 movd mm3, edx /* save EDX in MM3 */
4288 movd eax, mm7 /* load summation result into EAX */
4289 psraw mm7, 15 /* spread sign bit of the result */
4290 movd ebx, mm5 /* load Divisor into EBX */
4291 movd edx, mm7 /* fill EDX with a sign bit */
4292 idiv bx /* IDIV - VERY EXPENSIVE */
4293 movd mm7, eax /* move result of division into MM7 */
4294 packuswb mm7, mm0 /* pack division result with saturation */
4295 movd eax, mm7 /* copy saturated result into EAX */
4296 mov [edi], al /* copy a byte result into Dest */
4297 movd edx, mm3 /* restore saved EDX */
4298 movd ebx, mm2 /* restore saved EBX */
4299 movd eax, mm1 /* restore saved EAX */
4300 /* --, */
4301 movd esi, mm6 /* move Src pointer to the top pixel */
4302 sub edx, 72 /* EDX = Kernel address */
4303 inc esi /* move Src pointer to the next pixel */
4304 inc edi /* move Dest pointer to the next pixel */
4305 /* ---, */
4306 dec ecx /* decrease loop counter COLUMNS */
4307 jnz L10332 /* check loop termination, proceed if required */
4308 add esi, 4 /* move to the next row in Src */
4309 add edi, 4 /* move to the next row in Dest */
4310 dec ebx /* decrease loop counter ROWS */
4311 jnz L10330 /* check loop termination, proceed if required */
4312 /* ---, */
4313 emms /* exit MMX state */
4314 popa
4315 }
4316#else
4317 asm volatile
4318 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4319 "xor %%ebx, %%ebx \n\t" /* zero EBX */
4320 "mov %5, %%bl \n\t" /* load Divisor into BL */
4321 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
4322 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4323 "mov %1, %%esi \n\t" /* load Src address to ESI */
4324 "mov %0, %%edi \n\t" /* load Dest address to EDI */
4325 "add $2, %%edi \n\t" /* 2 column offset from the left edge */
4326 "mov %3, %%eax \n\t" /* load columns into EAX */
4327 "shl $1, %%eax \n\t" /* EAX = columns * 2 */
4328 "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
4329 "shr $1, %%eax \n\t" /* EAX = columns */
4330 "mov %2, %%ebx \n\t" /* initialize ROWS counter */
4331 "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
4332 /* --- */
4333 ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
4334 "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
4335 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4336 ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
4337 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
4338 /* --- 1 */
4339 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4340 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4341 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4342 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4343 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4344 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4345 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4346 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4347 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4348 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4349 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4350 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4351 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4352 /* --- 2 */
4353 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4354 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4355 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4356 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4357 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4358 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4359 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4360 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4361 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4362 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4363 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4364 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4365 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4366 /* --- 3 */
4367 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4368 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4369 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4370 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4371 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4372 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4373 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4374 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4375 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4376 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4377 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4378 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4379 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4380 /* --- 4 */
4381 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4382 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4383 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4384 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4385 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4386 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4387 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4388 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4389 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4390 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4391 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4392 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4393 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4394 /* --- 5 */
4395 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4396 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4397 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4398 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4399 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4400 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4401 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4402 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4403 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4404 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4405 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4406 /* --- */
4407 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
4408 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
4409 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
4410 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
4411 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
4412 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
4413 /* --- */
4414 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
4415 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
4416 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4417 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
4418 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
4419 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
4420 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
4421 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4422 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
4423 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
4424 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
4425 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4426 "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4427 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
4428 "movd %%mm1, %%eax \n\t" /* restore saved EAX */
4429 /* -- */
4430 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
4431 "sub $72, %%edx \n\t" /* EDX = Kernel address */
4432 "inc %%esi \n\t" /* move Src pointer to the next pixel */
4433 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4434 /* --- */
4435 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4436 "jnz .L10332 \n\t" /* check loop termination, proceed if required */
4437 "add $4, %%esi \n\t" /* move to the next row in Src */
4438 "add $4, %%edi \n\t" /* move to the next row in Dest */
4439 "dec %%ebx \n\t" /* decrease loop counter ROWS */
4440 "jnz .L10330 \n\t" /* check loop termination, proceed if required */
4441 /* --- */
4442 "emms \n\t" /* exit MMX state */
4443 "popa \n\t":"=m" (Dest) /* %0 */
4444 :"m"(Src), /* %1 */
4445 "m"(rows), /* %2 */
4446 "m"(columns), /* %3 */
4447 "m"(Kernel), /* %4 */
4448 "m"(Divisor) /* %5 */
4449 );
4450#endif
4451#endif
4452 return (0);
4453 } else {
4454 /* No non-MMX implementation yet */
4455 return (-1);
4456 }
4457}
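
/* Kernel layout note (derived from the loop above, which reads two 4-word
   groups of Kernel per image row): a 5x5 kernel is laid out as 8 shorts per
   row - the 5 coefficients followed by 3 zero pads - for 40 shorts in total.
   An illustrative 5x5 averaging kernel (Divisor = 25):

       signed short avgKernel5x5[40] = {
           1, 1, 1, 1, 1, 0, 0, 0,
           1, 1, 1, 1, 1, 0, 0, 0,
           1, 1, 1, 1, 1, 0, 0, 0,
           1, 1, 1, 1, 1, 0, 0, 0,
           1, 1, 1, 1, 1, 0, 0, 0
       };
*/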
4458
4473int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4474 signed short *Kernel, unsigned char Divisor)
4475{
4476 /* Validate input parameters */
4477 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4478 return(-1);
4479
4480 if ((columns < 7) || (rows < 7) || (Divisor == 0))
4481 return (-1);
4482
4483 if ((SDL_imageFilterMMXdetect())) {
4484//#ifdef USE_MMX
4485#if defined(USE_MMX) && defined(i386)
4486#if !defined(GCC__)
4487 __asm
4488 {
4489 pusha
4490 pxor mm0, mm0 /* zero MM0 */
4491 xor ebx, ebx /* zero EBX */
4492 mov bl, Divisor /* load Divisor into BL */
4493 movd mm5, ebx /* copy Divisor into MM5 */
4494 mov edx, Kernel /* load Kernel address into EDX */
4495 mov esi, Src /* load Src address to ESI */
4496 mov edi, Dest /* load Dest address to EDI */
4497 add edi, 3 /* 3 column offset from the left edge */
4498 mov eax, columns /* load columns into EAX */
4499 add edi, eax /* 3 row offset from the top edge */
4500 add edi, eax
4501 add edi, eax
4502 mov ebx, rows /* initialize ROWS counter */
4503 sub ebx, 6 /* do not use first 3 and last 3 rows */
4504 /* ---, */
4505L10340:
4506 mov ecx, eax /* initialize COLUMNS counter */
4507 sub ecx, 6 /* do not use first 3 and last 3 columns */
4508 align 16 /* 16 byte alignment of the loop entry */
4509L10342:
4510 pxor mm7, mm7 /* zero MM7 (accumulator) */
4511 movd mm6, esi /* save ESI in MM6 */
4512 /* --- 1 */
4513 movq mm1, [esi] /* load 8 bytes of the Src */
4514 movq mm2, mm1 /* copy MM1 into MM2 */
4515 add esi, eax /* move Src pointer 1 row below */
4516 movq mm3, [edx] /* load 4 words of Kernel */
4517 add edx, 8 /* move pointer to other 4 words */
4518 movq mm4, [edx] /* load 4 words of Kernel */
4519 add edx, 8 /* move pointer to other 4 words */
4520 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4521 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4522 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4523 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4524 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4525 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4526 /* --- 2 */
4527 movq mm1, [esi] /* load 8 bytes of the Src */
4528 movq mm2, mm1 /* copy MM1 into MM2 */
4529 add esi, eax /* move Src pointer 1 row below */
4530 movq mm3, [edx] /* load 4 words of Kernel */
4531 add edx, 8 /* move pointer to other 4 words */
4532 movq mm4, [edx] /* load 4 words of Kernel */
4533 add edx, 8 /* move pointer to other 4 words */
4534 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4535 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4536 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4537 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4538 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4539 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4540 /* --- 3 */
4541 movq mm1, [esi] /* load 8 bytes of the Src */
4542 movq mm2, mm1 /* copy MM1 into MM2 */
4543 add esi, eax /* move Src pointer 1 row below */
4544 movq mm3, [edx] /* load 4 words of Kernel */
4545 add edx, 8 /* move pointer to other 4 words */
4546 movq mm4, [edx] /* load 4 words of Kernel */
4547 add edx, 8 /* move pointer to other 4 words */
4548 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4549 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4550 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4551 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4552 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4553 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4554 /* --- 4 */
4555 movq mm1, [esi] /* load 8 bytes of the Src */
4556 movq mm2, mm1 /* copy MM1 into MM2 */
4557 add esi, eax /* move Src pointer 1 row below */
4558 movq mm3, [edx] /* load 4 words of Kernel */
4559 add edx, 8 /* move pointer to other 4 words */
4560 movq mm4, [edx] /* load 4 words of Kernel */
4561 add edx, 8 /* move pointer to other 4 words */
4562 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4563 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4564 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4565 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4566 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4567 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4568 /* --- 5 */
4569 movq mm1, [esi] /* load 8 bytes of the Src */
4570 movq mm2, mm1 /* copy MM1 into MM2 */
4571 add esi, eax /* move Src pointer 1 row below */
4572 movq mm3, [edx] /* load 4 words of Kernel */
4573 add edx, 8 /* move pointer to other 4 words */
4574 movq mm4, [edx] /* load 4 words of Kernel */
4575 add edx, 8 /* move pointer to other 4 words */
4576 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4577 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4578 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4579 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4580 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4581 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4582 /* --- 6 */
4583 movq mm1, [esi] /* load 8 bytes of the Src */
4584 movq mm2, mm1 /* copy MM1 into MM2 */
4585 add esi, eax /* move Src pointer 1 row below */
4586 movq mm3, [edx] /* load 4 words of Kernel */
4587 add edx, 8 /* move pointer to other 4 words */
4588 movq mm4, [edx] /* load 4 words of Kernel */
4589 add edx, 8 /* move pointer to other 4 words */
4590 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4591 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4592 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4593 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4594 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4595 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4596 /* --- 7 */
4597 movq mm1, [esi] /* load 8 bytes of the Src */
4598 movq mm2, mm1 /* copy MM1 into MM2 */
4599 movq mm3, [edx] /* load 4 words of Kernel */
4600 add edx, 8 /* move pointer to other 4 words */
4601 movq mm4, [edx] /* load 4 words of Kernel */
4602 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4603 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4604 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
4605 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
4606 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4607 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4608 /* ---, */
4609 movq mm3, mm7 /* copy MM7 into MM3 */
4610 psrlq mm7, 32 /* shift 2 left words to the right */
4611 paddsw mm7, mm3 /* add 2 left and 2 right result words */
4612 movq mm2, mm7 /* copy MM7 into MM2 */
4613 psrlq mm7, 16 /* shift 1 left word to the right */
4614 paddsw mm7, mm2 /* add 1 left and 1 right result words */
4615 /* ---, */
4616 movd mm1, eax /* save EAX in MM1 */
4617 movd mm2, ebx /* save EBX in MM2 */
4618 movd mm3, edx /* save EDX in MM3 */
4619 movd eax, mm7 /* load summation result into EAX */
4620 psraw mm7, 15 /* spread sign bit of the result */
4621 movd ebx, mm5 /* load Divisor into EBX */
4622 movd edx, mm7 /* fill EDX with a sign bit */
4623 idiv bx /* IDIV - VERY EXPENSIVE */
4624 movd mm7, eax /* move result of division into MM7 */
4625 packuswb mm7, mm0 /* pack division result with saturation */
4626 movd eax, mm7 /* copy saturated result into EAX */
4627 mov [edi], al /* copy a byte result into Dest */
4628 movd edx, mm3 /* restore saved EDX */
4629 movd ebx, mm2 /* restore saved EBX */
4630 movd eax, mm1 /* restore saved EAX */
4631 /* --, */
4632 movd esi, mm6 /* move Src pointer to the top pixel */
4633 sub edx, 104 /* EDX = Kernel address */
4634 inc esi /* move Src pointer to the next pixel */
4635 inc edi /* move Dest pointer to the next pixel */
4636 /* ---, */
4637 dec ecx /* decrease loop counter COLUMNS */
4638 jnz L10342 /* check loop termination, proceed if required */
4639 add esi, 6 /* move to the next row in Src */
4640 add edi, 6 /* move to the next row in Dest */
4641 dec ebx /* decrease loop counter ROWS */
4642 jnz L10340 /* check loop termination, proceed if required */
4643 /* ---, */
4644 emms /* exit MMX state */
4645 popa
4646 }
4647#else
4648 asm volatile
4649 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
4650 "xor %%ebx, %%ebx \n\t" /* zero EBX */
4651 "mov %5, %%bl \n\t" /* load Divisor into BL */
4652 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
4653 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
4654 "mov %1, %%esi \n\t" /* load Src address to ESI */
4655 "mov %0, %%edi \n\t" /* load Dest address to EDI */
4656 "add $3, %%edi \n\t" /* 3 column offset from the left edge */
4657 "mov %3, %%eax \n\t" /* load columns into EAX */
4658 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
4659 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
4660 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
4661 /* --- */
4662 ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
4663 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
4664 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
4665 ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
4666 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
4667 /* --- 1 */
4668 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4669 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4670 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4671 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4672 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4673 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4674 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4675 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4676 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4677 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4678 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4679 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4680 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4681 /* --- 2 */
4682 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4683 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4684 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4685 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4686 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4687 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4688 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4689 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4690 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4691 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4692 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4693 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4694 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4695 /* --- 3 */
4696 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4697 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4698 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4699 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4700 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4701 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4702 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4703 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4704 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4705 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4706 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4707 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4708 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4709 /* --- 4 */
4710 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4711 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4712 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4713 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4714 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4715 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4716 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4717 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4718 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4719 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4720 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4721 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4722 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4723 /* --- 5 */
4724 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4725 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4726 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4727 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4728 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4729 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4730 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4731 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4732 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4733 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4734 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4735 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4736 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4737 /* --- 6 */
4738 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4739 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4740 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
4741 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4742 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4743 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4744 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4745 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4746 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4747 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4748 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4749 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4750 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4751 /* --- 7 */
4752 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
4753 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
4754 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
4755 "add $8, %%edx \n\t" /* move pointer to other 4 words */
4756 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
4757 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
4758 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
4759 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
4760 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
4761 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
4762 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
4763 /* --- */
4764 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
4765 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
4766 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
4767 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
4768 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
4769 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
4770 /* --- */
4771 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
4772 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
4773 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
4774 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
4775 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
4776 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
4777 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
4778 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
4779 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
4780 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
4781 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
4782 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
4783 "movd %%mm3, %%edx \n\t" /* restore saved EDX */
4784 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
4785 "movd %%mm1, %%eax \n\t" /* restore saved EAX */
4786 /* -- */
4787 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
4788 "sub $104, %%edx \n\t" /* EDX = Kernel address */
4789 "inc %%esi \n\t" /* move Src pointer to the next pixel */
4790 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
4791 /* --- */
4792 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
4793 "jnz .L10342 \n\t" /* check loop termination, proceed if required */
4794 "add $6, %%esi \n\t" /* move to the next row in Src */
4795 "add $6, %%edi \n\t" /* move to the next row in Dest */
4796 "dec %%ebx \n\t" /* decrease loop counter ROWS */
4797 "jnz .L10340 \n\t" /* check loop termination, proceed if required */
4798 /* --- */
4799 "emms \n\t" /* exit MMX state */
4800 "popa \n\t":"=m" (Dest) /* %0 */
4801 :"m"(Src), /* %1 */
4802 "m"(rows), /* %2 */
4803 "m"(columns), /* %3 */
4804 "m"(Kernel), /* %4 */
4805 "m"(Divisor) /* %5 */
4806 );
4807#endif
4808#endif
4809 return (0);
4810 } else {
4811 /* No non-MMX implementation yet */
4812 return (-1);
4813 }
4814}
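
/* Kernel layout note: as in the 5x5 case, the loop above consumes two 4-word
   groups of Kernel per image row, so a 7x7 kernel occupies 8 shorts per row
   (the 7 coefficients plus 1 zero pad), 56 shorts in total; a 3-pixel border
   of Dest is left untouched. */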
4815
4830int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
4831 signed short *Kernel, unsigned char Divisor)
4832{
4833 /* Validate input parameters */
4834 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
4835 return(-1);
4836
4837 if ((columns < 9) || (rows < 9) || (Divisor == 0))
4838 return (-1);
4839
4840 if ((SDL_imageFilterMMXdetect())) {
4841//#ifdef USE_MMX
4842#if defined(USE_MMX) && defined(i386)
4843#if !defined(GCC__)
4844 __asm
4845 {
4846 pusha
4847 pxor mm0, mm0 /* zero MM0 */
4848 xor ebx, ebx /* zero EBX */
4849 mov bl, Divisor /* load Divisor into BL */
4850 movd mm5, ebx /* copy Divisor into MM5 */
4851 mov edx, Kernel /* load Kernel address into EDX */
4852 mov esi, Src /* load Src address to ESI */
4853 mov edi, Dest /* load Dest address to EDI */
4854 add edi, 4 /* 4 column offset from the left edge */
4855 mov eax, columns /* load columns into EAX */
4856 add edi, eax /* 4 row offset from the top edge */
4857 add edi, eax
4858 add edi, eax
4859 add edi, eax
4860 mov ebx, rows /* initialize ROWS counter */
4861 sub ebx, 8 /* do not use first 4 and last 4 rows */
4862 /* ---, */
4863L10350:
4864 mov ecx, eax /* initialize COLUMNS counter */
4865 sub ecx, 8 /* do not use first 4 and last 4 columns */
4866 align 16 /* 16 byte alignment of the loop entry */
4867L10352:
4868 pxor mm7, mm7 /* zero MM7 (accumulator) */
4869 movd mm6, esi /* save ESI in MM6 */
4870 /* --- 1 */
4871 movq mm1, [esi] /* load 8 bytes of the Src */
4872 movq mm2, mm1 /* copy MM1 into MM2 */
4873 inc esi /* move pointer to the next 8 bytes of Src */
4874 movq mm3, [edx] /* load 4 words of Kernel */
4875 add edx, 8 /* move pointer to other 4 words */
4876 movq mm4, [edx] /* load 4 words of Kernel */
4877 add edx, 8 /* move pointer to other 4 words */
4878 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4879 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4880 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4881 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4882 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4883 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4884 movq mm1, [esi] /* load 8 bytes of the Src */
4885 dec esi
4886 add esi, eax /* move Src pointer 1 row below */
4887 movq mm3, [edx] /* load 4 words of Kernel */
4888 add edx, 8 /* move pointer to other 4 words */
4889 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4890 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4891 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4892 /* --- 2 */
4893 movq mm1, [esi] /* load 8 bytes of the Src */
4894 movq mm2, mm1 /* copy MM1 into MM2 */
4895 inc esi /* move pointer to the next 8 bytes of Src */
4896 movq mm3, [edx] /* load 4 words of Kernel */
4897 add edx, 8 /* move pointer to other 4 words */
4898 movq mm4, [edx] /* load 4 words of Kernel */
4899 add edx, 8 /* move pointer to other 4 words */
4900 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4901 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4902 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4903 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4904 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4905 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4906 movq mm1, [esi] /* load 8 bytes of the Src */
4907 dec esi
4908 add esi, eax /* move Src pointer 1 row below */
4909 movq mm3, [edx] /* load 4 words of Kernel */
4910 add edx, 8 /* move pointer to other 4 words */
4911 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4912 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4913 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4914 /* --- 3 */
4915 movq mm1, [esi] /* load 8 bytes of the Src */
4916 movq mm2, mm1 /* copy MM1 into MM2 */
4917 inc esi /* move pointer to the next 8 bytes of Src */
4918 movq mm3, [edx] /* load 4 words of Kernel */
4919 add edx, 8 /* move pointer to other 4 words */
4920 movq mm4, [edx] /* load 4 words of Kernel */
4921 add edx, 8 /* move pointer to other 4 words */
4922 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4923 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4924 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4925 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4926 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4927 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4928 movq mm1, [esi] /* load 8 bytes of the Src */
4929 dec esi
4930 add esi, eax /* move Src pointer 1 row below */
4931 movq mm3, [edx] /* load 4 words of Kernel */
4932 add edx, 8 /* move pointer to other 4 words */
4933 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4934 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4935 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4936 /* --- 4 */
4937 movq mm1, [esi] /* load 8 bytes of the Src */
4938 movq mm2, mm1 /* copy MM1 into MM2 */
4939 inc esi /* move pointer to the next 8 bytes of Src */
4940 movq mm3, [edx] /* load 4 words of Kernel */
4941 add edx, 8 /* move pointer to other 4 words */
4942 movq mm4, [edx] /* load 4 words of Kernel */
4943 add edx, 8 /* move pointer to other 4 words */
4944 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4945 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4946 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4947 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4948 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4949 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4950 movq mm1, [esi] /* load 8 bytes of the Src */
4951 dec esi
4952 add esi, eax /* move Src pointer 1 row below */
4953 movq mm3, [edx] /* load 4 words of Kernel */
4954 add edx, 8 /* move pointer to other 4 words */
4955 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4956 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4957 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4958 /* --- 5 */
4959 movq mm1, [esi] /* load 8 bytes of the Src */
4960 movq mm2, mm1 /* copy MM1 into MM2 */
4961 inc esi /* move pointer to the next 8 bytes of Src */
4962 movq mm3, [edx] /* load 4 words of Kernel */
4963 add edx, 8 /* move pointer to other 4 words */
4964 movq mm4, [edx] /* load 4 words of Kernel */
4965 add edx, 8 /* move pointer to other 4 words */
4966 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4967 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4968 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4969 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4970 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4971 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4972 movq mm1, [esi] /* load 8 bytes of the Src */
4973 dec esi
4974 add esi, eax /* move Src pointer 1 row below */
4975 movq mm3, [edx] /* load 4 words of Kernel */
4976 add edx, 8 /* move pointer to other 4 words */
4977 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4978 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4979 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4980 /* --- 6 */
4981 movq mm1, [esi] /* load 8 bytes of the Src */
4982 movq mm2, mm1 /* copy MM1 into MM2 */
4983 inc esi /* move pointer to the next 8 bytes of Src */
4984 movq mm3, [edx] /* load 4 words of Kernel */
4985 add edx, 8 /* move pointer to other 4 words */
4986 movq mm4, [edx] /* load 4 words of Kernel */
4987 add edx, 8 /* move pointer to other 4 words */
4988 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
4989 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
4990 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
4991 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
4992 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
4993 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
4994 movq mm1, [esi] /* load 8 bytes of the Src */
4995 dec esi
4996 add esi, eax /* move Src pointer 1 row below */
4997 movq mm3, [edx] /* load 4 words of Kernel */
4998 add edx, 8 /* move pointer to other 4 words */
4999 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5000 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5001 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5002 /* --- 7 */
5003 movq mm1, [esi] /* load 8 bytes of the Src */
5004 movq mm2, mm1 /* copy MM1 into MM2 */
5005 inc esi /* move pointer to the next 8 bytes of Src */
5006 movq mm3, [edx] /* load 4 words of Kernel */
5007 add edx, 8 /* move pointer to other 4 words */
5008 movq mm4, [edx] /* load 4 words of Kernel */
5009 add edx, 8 /* move pointer to other 4 words */
5010 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5011 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5012 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5013 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5014 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5015 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5016 movq mm1, [esi] /* load 8 bytes of the Src */
5017 dec esi
5018 add esi, eax /* move Src pointer 1 row below */
5019 movq mm3, [edx] /* load 4 words of Kernel */
5020 add edx, 8 /* move pointer to other 4 words */
5021 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5022 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5023 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5024 /* --- 8 */
5025 movq mm1, [esi] /* load 8 bytes of the Src */
5026 movq mm2, mm1 /* copy MM1 into MM2 */
5027 inc esi /* move pointer to the next 8 bytes of Src */
5028 movq mm3, [edx] /* load 4 words of Kernel */
5029 add edx, 8 /* move pointer to other 4 words */
5030 movq mm4, [edx] /* load 4 words of Kernel */
5031 add edx, 8 /* move pointer to other 4 words */
5032 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5033 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5034 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5035 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5036 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5037 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5038 movq mm1, [esi] /* load 8 bytes of the Src */
5039 dec esi
5040 add esi, eax /* move Src pointer 1 row below */
5041 movq mm3, [edx] /* load 4 words of Kernel */
5042 add edx, 8 /* move pointer to other 4 words */
5043 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5044 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5045 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5046 /* --- 9 */
5047 movq mm1, [esi] /* load 8 bytes of the Src */
5048 movq mm2, mm1 /* copy MM1 into MM2 */
5049 inc esi /* move pointer to the next 8 bytes of Src */
5050 movq mm3, [edx] /* load 4 words of Kernel */
5051 add edx, 8 /* move pointer to other 4 words */
5052 movq mm4, [edx] /* load 4 words of Kernel */
5053 add edx, 8 /* move pointer to other 4 words */
5054 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5055 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5056 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5057 pmullw mm2, mm4 /* mult. 4 high words of Src and Kernel */
5058 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5059 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5060 movq mm1, [esi] /* load 8 bytes of the Src */
5061 movq mm3, [edx] /* load 4 words of Kernel */
5062 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5063 pmullw mm1, mm3 /* mult. 4 low words of Src and Kernel */
5064 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5065 /* ---, */
5066 movq mm3, mm7 /* copy MM7 into MM3 */
5067 psrlq mm7, 32 /* shift 2 left words to the right */
5068 paddsw mm7, mm3 /* add 2 left and 2 right result words */
5069 movq mm2, mm7 /* copy MM7 into MM2 */
5070 psrlq mm7, 16 /* shift 1 left word to the right */
5071 paddsw mm7, mm2 /* add 1 left and 1 right result words */
5072 /* ---, */
5073 movd mm1, eax /* save EAX in MM1 */
5074 movd mm2, ebx /* save EBX in MM2 */
5075 movd mm3, edx /* save EDX in MM3 */
5076 movd eax, mm7 /* load summation result into EAX */
5077 psraw mm7, 15 /* spread sign bit of the result */
5078 movd ebx, mm5 /* load Divisor into EBX */
5079 movd edx, mm7 /* fill EDX with a sign bit */
5080 idiv bx /* IDIV - VERY EXPENSIVE */
5081 movd mm7, eax /* move result of division into MM7 */
5082 packuswb mm7, mm0 /* pack division result with saturation */
5083 movd eax, mm7 /* copy saturated result into EAX */
5084 mov [edi], al /* copy a byte result into Dest */
5085 movd edx, mm3 /* restore saved EDX */
5086 movd ebx, mm2 /* restore saved EBX */
5087 movd eax, mm1 /* restore saved EAX */
5088 /* --, */
5089 movd esi, mm6 /* move Src pointer to the top pixel */
5090 sub edx, 208 /* EDX = Kernel address */
5091 inc esi /* move Src pointer to the next pixel */
5092 inc edi /* move Dest pointer to the next pixel */
5093 /* ---, */
5094 dec ecx /* decrease loop counter COLUMNS */
5095 jnz L10352 /* check loop termination, proceed if required */
5096 add esi, 8 /* move to the next row in Src */
5097 add edi, 8 /* move to the next row in Dest */
5098 dec ebx /* decrease loop counter ROWS */
5099 jnz L10350 /* check loop termination, proceed if required */
5100 /* ---, */
5101 emms /* exit MMX state */
5102 popa
5103 }
5104#else
5105 asm volatile
5106 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5107 "xor %%ebx, %%ebx \n\t" /* zero EBX */
5108 "mov %5, %%bl \n\t" /* load Divisor into BL */
5109 "movd %%ebx, %%mm5 \n\t" /* copy Divisor into MM5 */
5110 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5111 "mov %1, %%esi \n\t" /* load Src address to ESI */
5112 "mov %0, %%edi \n\t" /* load Dest address to EDI */
5113 "add $4, %%edi \n\t" /* 4 column offset from the left edge */
5114 "mov %3, %%eax \n\t" /* load columns into EAX */
5115 "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
5116 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
5117 "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
5118 /* --- */
5119 ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
5120 "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
5121 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5122 ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
5123 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
5124 /* --- 1 */
5125 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5126 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5127 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5128 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5129 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5130 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5131 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5132 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5133 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5134 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5135 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5136 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5137 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5138 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5139 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5140 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5141 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5142 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5143 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5144 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5145 /* --- 2 */
5146 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5147 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5148 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5149 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5150 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5151 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5152 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5153 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5154 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5155 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5156 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5157 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5158 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5159 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5160 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5161 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5162 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5163 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5164 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5165 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5166 /* --- 3 */
5167 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5168 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5169 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5170 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5171 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5172 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5173 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5174 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5175 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5176 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5177 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5178 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5179 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5180 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5181 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5182 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5183 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5184 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5185 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5186 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5187 /* --- 4 */
5188 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5189 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5190 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5191 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5192 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5193 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5194 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5195 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5196 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5197 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5198 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5199 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5200 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5201 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5202 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5203 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5204 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5205 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5206 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5207 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5208 /* --- 5 */
5209 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5210 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5211 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5212 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5213 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5214 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5215 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5216 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5217 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5218 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5219 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5220 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5221 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5222 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5223 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5224 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5225 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5226 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5227 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5228 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5229 /* --- 6 */
5230 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5231 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5232 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5233 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5234 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5235 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5236 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5237 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5238 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5239 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5240 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5241 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5242 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5243 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5244 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5245 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5246 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5247 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5248 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5249 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5250 /* --- 7 */
5251 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5252 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5253 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5254 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5255 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5256 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5257 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5258 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5259 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5260 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5261 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5262 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5263 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5264 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5265 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5266 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5267 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5268 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5269 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5270 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5271 /* --- 8 */
5272 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5273 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5274 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5275 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5276 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5277 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5278 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5279 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5280 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5281 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5282 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5283 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5284 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5285 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5286 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5287 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5288 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5289 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5290 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5291 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5292 /* --- 9 */
5293 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5294 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5295 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
5296 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5297 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5298 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5299 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5300 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5301 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5302 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5303 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5304 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5305 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5306 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5307 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5308 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5309 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5310 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5311 /* --- */
5312 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
5313 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
5314 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
5315 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
5316 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
5317 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
5318 /* --- */
5319 "movd %%eax, %%mm1 \n\t" /* save EDX in MM1 */
5320 "movd %%ebx, %%mm2 \n\t" /* save EDX in MM2 */
5321 "movd %%edx, %%mm3 \n\t" /* save EDX in MM3 */
5322 "movd %%mm7, %%eax \n\t" /* load summation result into EAX */
5323 "psraw $15, %%mm7 \n\t" /* spread sign bit of the result */
5324 "movd %%mm5, %%ebx \n\t" /* load Divisor into EBX */
5325 "movd %%mm7, %%edx \n\t" /* fill EDX with a sign bit */
5326 "idivw %%bx \n\t" /* IDIV - VERY EXPENSIVE */
5327 "movd %%eax, %%mm7 \n\t" /* move result of division into MM7 */
5328 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
5329 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
5330 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
5331 "movd %%mm3, %%edx \n\t" /* restore saved EDX */
5332 "movd %%mm2, %%ebx \n\t" /* restore saved EBX */
5333 "movd %%mm1, %%eax \n\t" /* restore saved EAX */
5334 /* -- */
5335 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
5336 "sub $208, %%edx \n\t" /* EDX = Kernel address */
5337 "inc %%esi \n\t" /* move Src pointer to the next pixel */
5338 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5339 /* --- */
5340 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5341 "jnz .L10352 \n\t" /* check loop termination, proceed if required */
5342 "add $8, %%esi \n\t" /* move to the next row in Src */
5343 "add $8, %%edi \n\t" /* move to the next row in Dest */
5344 "dec %%ebx \n\t" /* decrease loop counter ROWS */
5345 "jnz .L10350 \n\t" /* check loop termination, proceed if required */
5346 /* --- */
5347 "emms \n\t" /* exit MMX state */
5348 "popa \n\t":"=m" (Dest) /* %0 */
5349 :"m"(Src), /* %1 */
5350 "m"(rows), /* %2 */
5351 "m"(columns), /* %3 */
5352 "m"(Kernel), /* %4 */
5353 "m"(Divisor) /* %5 */
5354 );
5355#endif
5356#endif
5357 return (0);
5358 } else {
5359 /* No non-MMX implementation yet */
5360 return (-1);
5361 }
5362}
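
The MMX block above accumulates source-byte x kernel-word products with saturating 16-bit adds, divides the folded sum by Divisor (the idivw step), and packs the quotient to an unsigned byte with saturation before storing it into the interior of Dest. Since the non-MMX branch simply returns -1, the following plain-C sketch (not part of SDL2_gfx; convolve_divide_sketch, ksize and the flat kernel indexing are illustrative assumptions) outlines the same per-pixel data flow; it ignores the padded kernel layout and saturating intermediate arithmetic of the assembly, so treat it as pseudocode rather than a bit-exact replacement.

static void convolve_divide_sketch(const unsigned char *Src, unsigned char *Dest,
                                   int rows, int columns,
                                   const signed short *Kernel, int ksize,
                                   signed short Divisor)
{
	int half = ksize / 2;
	int x, y, kx, ky;
	for (y = half; y < rows - half; y++) {
		for (x = half; x < columns - half; x++) {
			int sum = 0;
			for (ky = 0; ky < ksize; ky++) {
				for (kx = 0; kx < ksize; kx++) {
					sum += Src[(y - half + ky) * columns + (x - half + kx)] *
					       Kernel[ky * ksize + kx];
				}
			}
			sum /= Divisor;                /* mirrors the idivw step; Divisor assumed non-zero */
			if (sum < 0)   sum = 0;        /* mirrors the packuswb saturation */
			if (sum > 255) sum = 255;
			Dest[y * columns + x] = (unsigned char) sum;
		}
	}
}
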
5363
5378int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5379 signed short *Kernel, unsigned char NRightShift)
5380{
5381 /* Validate input parameters */
5382 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5383 return(-1);
5384
5385 if ((columns < 3) || (rows < 3) || (NRightShift > 7))
5386 return (-1);
5387
5388 if ((SDL_imageFilterMMXdetect())) {
5389//#ifdef USE_MMX
5390#if defined(USE_MMX) && defined(i386)
5391#if !defined(GCC__)
5392 __asm
5393 {
5394 pusha
5395 pxor mm0, mm0 /* zero MM0 */
5396 xor ebx, ebx /* zero EBX */
5397 mov bl, NRightShift /* load NRightShift into BL */
5398 movd mm4, ebx /* copy NRightShift into MM4 */
5399 mov edx, Kernel /* load Kernel address into EDX */
5400 movq mm5, [edx] /* MM5 = {0,K2,K1,K0} */
5401 add edx, 8 /* second row |K0 K1 K2 0| */
5402 movq mm6, [edx] /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
5403 add edx, 8 /* third row |K6 K7 K8 0| */
5404 movq mm7, [edx] /* MM7 = {0,K8,K7,K6} */
5405 /* ---, */
5406 mov eax, columns /* load columns into EAX */
5407 mov esi, Src /* ESI = Src row 0 address */
5408 mov edi, Dest /* load Dest address to EDI */
5409 add edi, eax /* EDI = EDI + columns */
5410 inc edi /* 1 byte offset from the left edge */
5411 mov edx, rows /* initialize ROWS counter */
5412 sub edx, 2 /* do not use first and last row */
5413 /* ---, */
5414L10360:
5415 mov ecx, eax /* initialize COLUMNS counter */
5416 sub ecx, 2 /* do not use first and last column */
5417 align 16 /* 16 byte alignment of the loop entry */
5418L10362:
5419 /* ---, */
5420 movq mm1, [esi] /* load 8 bytes of the image first row */
5421 add esi, eax /* move one row below */
5422 movq mm2, [esi] /* load 8 bytes of the image second row */
5423 add esi, eax /* move one row below */
5424 movq mm3, [esi] /* load 8 bytes of the image third row */
5425 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5426 punpcklbw mm2, mm0 /* unpack first 4 bytes into words */
5427 punpcklbw mm3, mm0 /* unpack first 4 bytes into words */
5428 psrlw mm1, mm4 /* shift right each pixel NshiftRight times */
5429 psrlw mm2, mm4 /* shift right each pixel NshiftRight times */
5430 psrlw mm3, mm4 /* shift right each pixel NshiftRight times */
5431 pmullw mm1, mm5 /* multiply words first row image*Kernel */
5432 pmullw mm2, mm6 /* multiply words second row image*Kernel */
5433 pmullw mm3, mm7 /* multiply words third row image*Kernel */
5434 paddsw mm1, mm2 /* add 4 words of the first and second rows */
5435 paddsw mm1, mm3 /* add 4 words of the third row and result */
5436 movq mm2, mm1 /* copy MM1 into MM2 */
5437 psrlq mm1, 32 /* shift 2 left words to the right */
5438 paddsw mm1, mm2 /* add 2 left and 2 right result words */
5439 movq mm3, mm1 /* copy MM1 into MM3 */
5440 psrlq mm1, 16 /* shift 1 left word to the right */
5441 paddsw mm1, mm3 /* add 1 left and 1 right result words */
5442 packuswb mm1, mm0 /* pack shift result with saturation */
5443 movd ebx, mm1 /* copy saturated result into EBX */
5444 mov [edi], bl /* copy a byte result into Dest */
5445 /* --, */
5446 sub esi, eax /* move two rows up */
5447 sub esi, eax
5448 inc esi /* move Src pointer to the next pixel */
5449 inc edi /* move Dest pointer to the next pixel */
5450 /* ---, */
5451 dec ecx /* decrease loop counter COLUMNS */
5452 jnz L10362 /* check loop termination, proceed if required */
5453 add esi, 2 /* move to the next row in Src */
5454 add edi, 2 /* move to the next row in Dest */
5455 dec edx /* decrease loop counter ROWS */
5456 jnz L10360 /* check loop termination, proceed if required */
5457 /* ---, */
5458 emms /* exit MMX state */
5459 popa
5460 }
5461#else
5462 asm volatile
5463 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5464 "xor %%ebx, %%ebx \n\t" /* zero EBX */
5465 "mov %5, %%bl \n\t" /* load NRightShift into BL */
5466 "movd %%ebx, %%mm4 \n\t" /* copy NRightShift into MM4 */
5467 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5468 "movq (%%edx), %%mm5 \n\t" /* MM5 = {0,K2,K1,K0} */
5469 "add $8, %%edx \n\t" /* second row |K0 K1 K2 0| */
5470 "movq (%%edx), %%mm6 \n\t" /* MM6 = {0,K5,K4,K3} K = |K3 K4 K5 0| */
5471 "add $8, %%edx \n\t" /* third row |K6 K7 K8 0| */
5472 "movq (%%edx), %%mm7 \n\t" /* MM7 = {0,K8,K7,K6} */
5473 /* --- */
5474 "mov %3, %%eax \n\t" /* load columns into EAX */
5475 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
5476 "mov %0, %%edi \n\t" /* load Dest address to EDI */
5477 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
5478 "inc %%edi \n\t" /* 1 byte offset from the left edge */
5479 "mov %2, %%edx \n\t" /* initialize ROWS counter */
5480 "sub $2, %%edx \n\t" /* do not use first and last row */
5481 /* --- */
5482 ".L10360: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
5483 "sub $2, %%ecx \n\t" /* do not use first and last column */
5484 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5485 ".L10362: \n\t"
5486 /* --- */
5487 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
5488 "add %%eax, %%esi \n\t" /* move one row below */
5489 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes of the image second row */
5490 "add %%eax, %%esi \n\t" /* move one row below */
5491 "movq (%%esi), %%mm3 \n\t" /* load 8 bytes of the image third row */
5492 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5493 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack first 4 bytes into words */
5494 "punpcklbw %%mm0, %%mm3 \n\t" /* unpack first 4 bytes into words */
5495 "psrlw %%mm4, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5496 "psrlw %%mm4, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5497 "psrlw %%mm4, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
5498 "pmullw %%mm5, %%mm1 \n\t" /* multiply words first row image*Kernel */
5499 "pmullw %%mm6, %%mm2 \n\t" /* multiply words second row image*Kernel */
5500 "pmullw %%mm7, %%mm3 \n\t" /* multiply words third row image*Kernel */
5501 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the first and second rows */
5502 "paddsw %%mm3, %%mm1 \n\t" /* add 4 words of the third row and result */
5503 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5504 "psrlq $32, %%mm1 \n\t" /* shift 2 left words to the right */
5505 "paddsw %%mm2, %%mm1 \n\t" /* add 2 left and 2 right result words */
5506 "movq %%mm1, %%mm3 \n\t" /* copy MM1 into MM3 */
5507 "psrlq $16, %%mm1 \n\t" /* shift 1 left word to the right */
5508 "paddsw %%mm3, %%mm1 \n\t" /* add 1 left and 1 right result words */
5509 "packuswb %%mm0, %%mm1 \n\t" /* pack shift result with saturation */
5510 "movd %%mm1, %%ebx \n\t" /* copy saturated result into EBX */
5511 "mov %%bl, (%%edi) \n\t" /* copy a byte result into Dest */
5512 /* -- */
5513 "sub %%eax, %%esi \n\t" /* move two rows up */
5514 "sub %%eax, %%esi \n\t" "inc %%esi \n\t" /* move Src pointer to the next pixel */
5515 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5516 /* --- */
5517 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5518 "jnz .L10362 \n\t" /* check loop termination, proceed if required */
5519 "add $2, %%esi \n\t" /* move to the next row in Src */
5520 "add $2, %%edi \n\t" /* move to the next row in Dest */
5521 "dec %%edx \n\t" /* decrease loop counter ROWS */
5522 "jnz .L10360 \n\t" /* check loop termination, proceed if required */
5523 /* --- */
5524 "emms \n\t" /* exit MMX state */
5525 "popa \n\t":"=m" (Dest) /* %0 */
5526 :"m"(Src), /* %1 */
5527 "m"(rows), /* %2 */
5528 "m"(columns), /* %3 */
5529 "m"(Kernel), /* %4 */
5530 "m"(NRightShift) /* %5 */
5531 );
5532#endif
5533#endif
5534 return (0);
5535 } else {
5536 /* No non-MMX implementation yet */
5537 return (-1);
5538 }
5539}
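
The register comments at the top of this routine (MM5 = {0,K2,K1,K0}, |K0 K1 K2 0|, |K3 K4 K5 0|, |K6 K7 K8 0|) suggest the 3x3 kernel is passed as three groups of four 16-bit words, each kernel row followed by a zero pad word. The sketch below (box_blur_3x3_sketch and its buffers are hypothetical caller code, not library API) builds a simple box kernel in that layout; note that NRightShift right-shifts every source byte before the multiply (see the psrlw comments above), so it acts as a power-of-two pre-scale of the input rather than an exact division of the final sum.

static int box_blur_3x3_sketch(unsigned char *src, unsigned char *dst,
                               int rows, int columns)
{
	signed short box3x3[12] = {
		1, 1, 1, 0,    /* row 0: K0 K1 K2, pad */
		1, 1, 1, 0,    /* row 1: K3 K4 K5, pad */
		1, 1, 1, 0     /* row 2: K6 K7 K8, pad */
	};
	/* returns 0 on success, -1 on bad parameters or when MMX is unavailable
	   (there is no C fallback yet) */
	return SDL_imageFilterConvolveKernel3x3ShiftRight(src, dst, rows, columns,
	                                                  box3x3, 3);
}
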
5540
5555int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5556 signed short *Kernel, unsigned char NRightShift)
5557{
5558 /* Validate input parameters */
5559 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5560 return(-1);
5561
5562 if ((columns < 5) || (rows < 5) || (NRightShift > 7))
5563 return (-1);
5564
5565 if ((SDL_imageFilterMMXdetect())) {
5566//#ifdef USE_MMX
5567#if defined(USE_MMX) && defined(i386)
5568#if !defined(GCC__)
5569 __asm
5570 {
5571 pusha
5572 pxor mm0, mm0 /* zero MM0 */
5573 xor ebx, ebx /* zero EBX */
5574 mov bl, NRightShift /* load NRightShift into BL */
5575 movd mm5, ebx /* copy NRightShift into MM5 */
5576 mov edx, Kernel /* load Kernel address into EDX */
5577 mov esi, Src /* load Src address to ESI */
5578 mov edi, Dest /* load Dest address to EDI */
5579 add edi, 2 /* 2 column offset from the left edge */
5580 mov eax, columns /* load columns into EAX */
5581 shl eax, 1 /* EAX = columns * 2 */
5582 add edi, eax /* 2 row offset from the top edge */
5583 shr eax, 1 /* EAX = columns */
5584 mov ebx, rows /* initialize ROWS counter */
5585 sub ebx, 4 /* do not use first 2 and last 2 rows */
5586 /* ---, */
5587L10370:
5588 mov ecx, eax /* initialize COLUMNS counter */
5589 sub ecx, 4 /* do not use first 2 and last 2 columns */
5590 align 16 /* 16 byte alignment of the loop entry */
5591L10372:
5592 pxor mm7, mm7 /* zero MM7 (accumulator) */
5593 movd mm6, esi /* save ESI in MM6 */
5594 /* --- 1 */
5595 movq mm1, [esi] /* load 8 bytes of the Src */
5596 movq mm2, mm1 /* copy MM1 into MM2 */
5597 add esi, eax /* move Src pointer 1 row below */
5598 movq mm3, [edx] /* load 4 words of Kernel */
5599 add edx, 8 /* move pointer to other 4 words */
5600 movq mm4, [edx] /* load 4 words of Kernel */
5601 add edx, 8 /* move pointer to other 4 words */
5602 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5603 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5604 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5605 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5606 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5607 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5608 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5609 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5610 /* --- 2 */
5611 movq mm1, [esi] /* load 8 bytes of the Src */
5612 movq mm2, mm1 /* copy MM1 into MM2 */
5613 add esi, eax /* move Src pointer 1 row below */
5614 movq mm3, [edx] /* load 4 words of Kernel */
5615 add edx, 8 /* move pointer to other 4 words */
5616 movq mm4, [edx] /* load 4 words of Kernel */
5617 add edx, 8 /* move pointer to other 4 words */
5618 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5619 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5620 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5621 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5622 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5623 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5624 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5625 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5626 /* --- 3 */
5627 movq mm1, [esi] /* load 8 bytes of the Src */
5628 movq mm2, mm1 /* copy MM1 into MM2 */
5629 add esi, eax /* move Src pointer 1 row below */
5630 movq mm3, [edx] /* load 4 words of Kernel */
5631 add edx, 8 /* move pointer to other 4 words */
5632 movq mm4, [edx] /* load 4 words of Kernel */
5633 add edx, 8 /* move pointer to other 4 words */
5634 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5635 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5636 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5637 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5638 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5639 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5640 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5641 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5642 /* --- 4 */
5643 movq mm1, [esi] /* load 8 bytes of the Src */
5644 movq mm2, mm1 /* copy MM1 into MM2 */
5645 add esi, eax /* move Src pointer 1 row below */
5646 movq mm3, [edx] /* load 4 words of Kernel */
5647 add edx, 8 /* move pointer to other 4 words */
5648 movq mm4, [edx] /* load 4 words of Kernel */
5649 add edx, 8 /* move pointer to other 4 words */
5650 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5651 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5652 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5653 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5654 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5655 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5656 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5657 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5658 /* --- 5 */
5659 movq mm1, [esi] /* load 8 bytes of the Src */
5660 movq mm2, mm1 /* copy MM1 into MM2 */
5661 movq mm3, [edx] /* load 4 words of Kernel */
5662 add edx, 8 /* move pointer to other 4 words */
5663 movq mm4, [edx] /* load 4 words of Kernel */
5664 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5665 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5666 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5667 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5668 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5669 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5670 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5671 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5672 /* ---, */
5673 movq mm3, mm7 /* copy MM7 into MM3 */
5674 psrlq mm7, 32 /* shift 2 left words to the right */
5675 paddsw mm7, mm3 /* add 2 left and 2 right result words */
5676 movq mm2, mm7 /* copy MM7 into MM2 */
5677 psrlq mm7, 16 /* shift 1 left word to the right */
5678 paddsw mm7, mm2 /* add 1 left and 1 right result words */
5679 movd mm1, eax /* save EAX in MM1 */
5680 packuswb mm7, mm0 /* pack shift result with saturation */
5681 movd eax, mm7 /* copy saturated result into EAX */
5682 mov [edi], al /* copy a byte result into Dest */
5683 movd eax, mm1 /* restore saved EAX */
5684 /* --, */
5685 movd esi, mm6 /* move Src pointer to the top pixel */
5686 sub edx, 72 /* EDX = Kernel address */
5687 inc esi /* move Src pointer to the next pixel */
5688 inc edi /* move Dest pointer to the next pixel */
5689 /* ---, */
5690 dec ecx /* decrease loop counter COLUMNS */
5691 jnz L10372 /* check loop termination, proceed if required */
5692 add esi, 4 /* move to the next row in Src */
5693 add edi, 4 /* move to the next row in Dest */
5694 dec ebx /* decrease loop counter ROWS */
5695 jnz L10370 /* check loop termination, proceed if required */
5696 /* ---, */
5697 emms /* exit MMX state */
5698 popa
5699 }
5700#else
5701 asm volatile
5702 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
5703 "xor %%ebx, %%ebx \n\t" /* zero EBX */
5704 "mov %5, %%bl \n\t" /* load NRightShift into BL */
5705 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
5706 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
5707 "mov %1, %%esi \n\t" /* load Src address to ESI */
5708 "mov %0, %%edi \n\t" /* load Dest address to EDI */
5709 "add $2, %%edi \n\t" /* 2 column offset from the left edge */
5710 "mov %3, %%eax \n\t" /* load columns into EAX */
5711 "shl $1, %%eax \n\t" /* EAX = columns * 2 */
5712 "add %%eax, %%edi \n\t" /* 2 row offset from the top edge */
5713 "shr $1, %%eax \n\t" /* EAX = columns */
5714 "mov %2, %%ebx \n\t" /* initialize ROWS counter */
5715 "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
5716 /* --- */
5717 ".L10370: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
5718 "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
5719 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
5720 ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
5721 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
5722 /* --- 1 */
5723 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5724 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5725 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5726 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5727 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5728 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5729 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5730 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5731 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5732 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5733 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5734 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5735 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5736 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5737 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5738 /* --- 2 */
5739 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5740 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5741 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5742 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5743 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5744 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5745 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5746 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5747 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5748 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5749 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5750 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5751 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5752 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5753 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5754 /* --- 3 */
5755 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5756 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5757 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5758 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5759 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5760 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5761 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5762 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5763 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5764 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5765 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5766 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5767 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5768 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5769 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5770 /* --- 4 */
5771 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5772 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5773 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
5774 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5775 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5776 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5777 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5778 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5779 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5780 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5781 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5782 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5783 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5784 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5785 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5786 /* --- 5 */
5787 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
5788 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
5789 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
5790 "add $8, %%edx \n\t" /* move pointer to other 4 words */
5791 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
5792 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
5793 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
5794 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
5795 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
5796 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
5797 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
5798 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
5799 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
5800 /* --- */
5801 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
5802 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
5803 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
5804 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
5805 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
5806 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
5807 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
5808 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
5809 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
5810 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
5811 "movd %%mm1, %%eax \n\t" /* restore saved EAX */
5812 /* -- */
5813 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
5814 "sub $72, %%edx \n\t" /* EDX = Kernel address */
5815 "inc %%esi \n\t" /* move Src pointer to the next pixel */
5816 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
5817 /* --- */
5818 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
5819 "jnz .L10372 \n\t" /* check loop termination, proceed if required */
5820 "add $4, %%esi \n\t" /* move to the next row in Src */
5821 "add $4, %%edi \n\t" /* move to the next row in Dest */
5822 "dec %%ebx \n\t" /* decrease loop counter ROWS */
5823 "jnz .L10370 \n\t" /* check loop termination, proceed if required */
5824 /* --- */
5825 "emms \n\t" /* exit MMX state */
5826 "popa \n\t":"=m" (Dest) /* %0 */
5827 :"m"(Src), /* %1 */
5828 "m"(rows), /* %2 */
5829 "m"(columns), /* %3 */
5830 "m"(Kernel), /* %4 */
5831 "m"(NRightShift) /* %5 */
5832 );
5833#endif
5834#endif
5835 return (0);
5836 } else {
5837 /* No non-MMX implementation yet */
5838 return (-1);
5839 }
5840}
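
For one output byte, the ShiftRight inner loops above all follow the same pattern: unpack source bytes to words, right-shift each by NRightShift, multiply by the kernel words, accumulate (with saturating 16-bit adds in the MMX code), fold the four partial sums, and pack to a byte with unsigned saturation. The plain-C sketch below (convolve_shift_pixel_sketch is illustrative only; it uses flat kernel indexing and non-saturating intermediate arithmetic, so it is not bit-exact) shows that data flow for a single interior pixel.

static unsigned char convolve_shift_pixel_sketch(const unsigned char *Src,
                                                 int columns, int x, int y,
                                                 const signed short *Kernel,
                                                 int ksize,
                                                 unsigned char NRightShift)
{
	int half = ksize / 2;
	int kx, ky, sum = 0;
	for (ky = 0; ky < ksize; ky++) {
		for (kx = 0; kx < ksize; kx++) {
			sum += (Src[(y - half + ky) * columns + (x - half + kx)] >> NRightShift) *
			       Kernel[ky * ksize + kx];
		}
	}
	if (sum < 0)   sum = 0;     /* clamp to 0..255 as packuswb does */
	if (sum > 255) sum = 255;
	return (unsigned char) sum;
}
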
5841
5856int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
5857 signed short *Kernel, unsigned char NRightShift)
5858{
5859 /* Validate input parameters */
5860 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
5861 return(-1);
5862
5863 if ((columns < 7) || (rows < 7) || (NRightShift > 7))
5864 return (-1);
5865
5866 if ((SDL_imageFilterMMXdetect())) {
5867//#ifdef USE_MMX
5868#if defined(USE_MMX) && defined(i386)
5869#if !defined(GCC__)
5870 __asm
5871 {
5872 pusha
5873 pxor mm0, mm0 /* zero MM0 */
5874 xor ebx, ebx /* zero EBX */
5875 mov bl, NRightShift /* load NRightShift into BL */
5876 movd mm5, ebx /* copy NRightShift into MM5 */
5877 mov edx, Kernel /* load Kernel address into EDX */
5878 mov esi, Src /* load Src address to ESI */
5879 mov edi, Dest /* load Dest address to EDI */
5880 add edi, 3 /* 3 column offset from the left edge */
5881 mov eax, columns /* load columns into EAX */
5882 add edi, eax /* 3 row offset from the top edge */
5883 add edi, eax
5884 add edi, eax
5885 mov ebx, rows /* initialize ROWS counter */
5886 sub ebx, 6 /* do not use first 3 and last 3 rows */
5887 /* ---, */
5888L10380:
5889 mov ecx, eax /* initialize COLUMNS counter */
5890 sub ecx, 6 /* do not use first 3 and last 3 columns */
5891 align 16 /* 16 byte alignment of the loop entry */
5892L10382:
5893 pxor mm7, mm7 /* zero MM7 (accumulator) */
5894 movd mm6, esi /* save ESI in MM6 */
5895 /* --- 1 */
5896 movq mm1, [esi] /* load 8 bytes of the Src */
5897 movq mm2, mm1 /* copy MM1 into MM2 */
5898 add esi, eax /* move Src pointer 1 row below */
5899 movq mm3, [edx] /* load 4 words of Kernel */
5900 add edx, 8 /* move pointer to other 4 words */
5901 movq mm4, [edx] /* load 4 words of Kernel */
5902 add edx, 8 /* move pointer to other 4 words */
5903 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5904 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5905 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5906 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5907 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5908 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5909 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5910 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5911 /* --- 2 */
5912 movq mm1, [esi] /* load 8 bytes of the Src */
5913 movq mm2, mm1 /* copy MM1 into MM2 */
5914 add esi, eax /* move Src pointer 1 row below */
5915 movq mm3, [edx] /* load 4 words of Kernel */
5916 add edx, 8 /* move pointer to other 4 words */
5917 movq mm4, [edx] /* load 4 words of Kernel */
5918 add edx, 8 /* move pointer to other 4 words */
5919 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5920 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5921 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5922 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5923 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5924 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5925 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5926 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5927 /* --- 3 */
5928 movq mm1, [esi] /* load 8 bytes of the Src */
5929 movq mm2, mm1 /* copy MM1 into MM2 */
5930 add esi, eax /* move Src pointer 1 row below */
5931 movq mm3, [edx] /* load 4 words of Kernel */
5932 add edx, 8 /* move pointer to other 4 words */
5933 movq mm4, [edx] /* load 4 words of Kernel */
5934 add edx, 8 /* move pointer to other 4 words */
5935 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5936 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5937 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5938 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5939 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5940 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5941 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5942 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5943 /* --- 4 */
5944 movq mm1, [esi] /* load 8 bytes of the Src */
5945 movq mm2, mm1 /* copy MM1 into MM2 */
5946 add esi, eax /* move Src pointer 1 row below */
5947 movq mm3, [edx] /* load 4 words of Kernel */
5948 add edx, 8 /* move pointer to other 4 words */
5949 movq mm4, [edx] /* load 4 words of Kernel */
5950 add edx, 8 /* move pointer to other 4 words */
5951 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5952 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5953 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5954 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5955 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5956 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5957 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5958 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5959 /* --- 5 */
5960 movq mm1, [esi] /* load 8 bytes of the Src */
5961 movq mm2, mm1 /* copy MM1 into MM2 */
5962 add esi, eax /* move Src pointer 1 row below */
5963 movq mm3, [edx] /* load 4 words of Kernel */
5964 add edx, 8 /* move pointer to other 4 words */
5965 movq mm4, [edx] /* load 4 words of Kernel */
5966 add edx, 8 /* move pointer to other 4 words */
5967 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5968 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5969 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5970 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5971 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5972 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5973 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5974 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5975 /* --- 6 */
5976 movq mm1, [esi] /* load 8 bytes of the Src */
5977 movq mm2, mm1 /* copy MM1 into MM2 */
5978 add esi, eax /* move Src pointer 1 row below */
5979 movq mm3, [edx] /* load 4 words of Kernel */
5980 add edx, 8 /* move pointer to other 4 words */
5981 movq mm4, [edx] /* load 4 words of Kernel */
5982 add edx, 8 /* move pointer to other 4 words */
5983 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5984 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5985 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
5986 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
5987 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
5988 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
5989 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
5990 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
5991 /* --- 7 */
5992 movq mm1, [esi] /* load 8 bytes of the Src */
5993 movq mm2, mm1 /* copy MM1 into MM2 */
5994 movq mm3, [edx] /* load 4 words of Kernel */
5995 add edx, 8 /* move pointer to other 4 words */
5996 movq mm4, [edx] /* load 4 words of Kernel */
5997 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
5998 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
5999 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6000 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6001 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6002 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6003 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6004 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6005 /* ---, */
6006 movq mm3, mm7 /* copy MM7 into MM3 */
6007 psrlq mm7, 32 /* shift 2 left words to the right */
6008 paddsw mm7, mm3 /* add 2 left and 2 right result words */
6009 movq mm2, mm7 /* copy MM7 into MM2 */
6010 psrlq mm7, 16 /* shift 1 left word to the right */
6011 paddsw mm7, mm2 /* add 1 left and 1 right result words */
6012 movd mm1, eax /* save EAX in MM1 */
6013 packuswb mm7, mm0 /* pack shift result with saturation */
6014 movd eax, mm7 /* copy saturated result into EAX */
6015 mov [edi], al /* copy a byte result into Dest */
6016 movd eax, mm1 /* restore saved EAX */
6017 /* --, */
6018 movd esi, mm6 /* move Src pointer to the top pixel */
6019 sub edx, 104 /* EDX = Kernel address */
6020 inc esi /* move Src pointer to the next pixel */
6021 inc edi /* move Dest pointer to the next pixel */
6022 /* ---, */
6023 dec ecx /* decrease loop counter COLUMNS */
6024 jnz L10382 /* check loop termination, proceed if required */
6025 add esi, 6 /* move to the next row in Src */
6026 add edi, 6 /* move to the next row in Dest */
6027 dec ebx /* decrease loop counter ROWS */
6028 jnz L10380 /* check loop termination, proceed if required */
6029 /* ---, */
6030 emms /* exit MMX state */
6031 popa
6032 }
6033#else
6034 asm volatile
6035 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6036 "xor %%ebx, %%ebx \n\t" /* zero EBX */
6037 "mov %5, %%bl \n\t" /* load NRightShift into BL */
6038 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
6039 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
6040 "mov %1, %%esi \n\t" /* load Src address to ESI */
6041 "mov %0, %%edi \n\t" /* load Dest address to EDI */
6042 "add $3, %%edi \n\t" /* 3 column offset from the left edge */
6043 "mov %3, %%eax \n\t" /* load columns into EAX */
6044 "add %%eax, %%edi \n\t" /* 3 row offset from the top edge */
6045 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
6046 "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
6047 /* --- */
6048 ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6049 "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
6050 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6051 ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
6052 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
6053 /* --- 1 */
6054 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6055 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6056 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6057 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6058 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6059 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6060 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6061 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6062 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6063 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6064 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6065 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6066 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6067 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6068 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6069 /* --- 2 */
6070 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6071 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6072 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6073 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6074 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6075 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6076 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6077 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6078 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6079 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6080 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6081 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6082 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6083 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6084 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6085 /* --- 3 */
6086 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6087 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6088 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6089 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6090 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6091 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6092 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6093 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6094 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6095 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6096 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6097 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6098 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6099 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6100 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6101 /* --- 4 */
6102 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6103 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6104 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6105 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6106 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6107 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6108 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6109 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6110 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6111 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6112 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6113 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6114 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6115 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6116 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6117 /* --- 5 */
6118 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6119 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6120 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6121 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6122 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6123 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6124 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6125 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6126 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6127 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6128 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6129 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6130 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6131 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6132 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6133 /* --- 6 */
6134 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6135 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6136 "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6137 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6138 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6139 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6140 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6141 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6142 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6143 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6144 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6145 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6146 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6147 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6148 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6149 /* --- 7 */
6150 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6151 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6152 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6153 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6154 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6155 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6156 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6157 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6158 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6159 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6160 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6161 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6162 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6163 /* --- */
6164 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6165 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6166 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
6167 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
6168 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
6169 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
6170 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
6171 "packuswb %%mm0, %%mm7 \n\t" /* pack division result with saturation */
6172 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
6173 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
6174 "movd %%mm1, %%eax \n\t" /* restore saved EAX */
6175 /* -- */
6176 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
6177 "sub $104, %%edx \n\t" /* EDX = Kernel address */
6178 "inc %%esi \n\t" /* move Src pointer to the next pixel */
6179 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
6180 /* --- */
6181 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
6182 "jnz .L10382 \n\t" /* check loop termination, proceed if required */
6183 "add $6, %%esi \n\t" /* move to the next row in Src */
6184 "add $6, %%edi \n\t" /* move to the next row in Dest */
6185 "dec %%ebx \n\t" /* decrease loop counter ROWS */
6186 "jnz .L10380 \n\t" /* check loop termination, proceed if required */
6187 /* --- */
6188 "emms \n\t" /* exit MMX state */
6189 "popa \n\t":"=m" (Dest) /* %0 */
6190 :"m"(Src), /* %1 */
6191 "m"(rows), /* %2 */
6192 "m"(columns), /* %3 */
6193 "m"(Kernel), /* %4 */
6194 "m"(NRightShift) /* %5 */
6195 );
6196#endif
6197#endif
6198 return (0);
6199 } else {
6200 /* No non-MMX implementation yet */
6201 return (-1);
6202 }
6203}
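
As the loop comments indicate ("do not use first 3 and last 3 rows/columns", and likewise 1, 2 or 4 for the other kernel sizes), these ShiftRight variants only write the interior of Dest and leave its border bytes untouched. One way a caller might obtain defined border pixels, sketched below with hypothetical names (convolve7x7_keep_borders_sketch, kernel7x7 laid out as the routine expects), is to seed Dest with the source image before filtering; memcpy comes from string.h, which this file already includes.

static int convolve7x7_keep_borders_sketch(const unsigned char *src, unsigned char *dst,
                                           int rows, int columns,
                                           signed short *kernel7x7,
                                           unsigned char shift)
{
	/* Seed Dest with the source so the untouched 3-pixel border keeps the
	   original values, then convolve the interior. */
	memcpy(dst, src, (size_t) rows * (size_t) columns);
	return SDL_imageFilterConvolveKernel7x7ShiftRight((unsigned char *) src, dst,
	                                                  rows, columns, kernel7x7, shift);
}
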
6204
6219int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
6220 signed short *Kernel, unsigned char NRightShift)
6221{
6222 /* Validate input parameters */
6223 if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
6224 return(-1);
6225
6226 if ((columns < 9) || (rows < 9) || (NRightShift > 7))
6227 return (-1);
6228
6229 if ((SDL_imageFilterMMXdetect())) {
6230//#ifdef USE_MMX
6231#if defined(USE_MMX) && defined(i386)
6232#if !defined(GCC__)
6233 __asm
6234 {
6235 pusha
6236 pxor mm0, mm0 /* zero MM0 */
6237 xor ebx, ebx /* zero EBX */
6238 mov bl, NRightShift /* load NRightShift into BL */
6239 movd mm5, ebx /* copy NRightShift into MM5 */
6240 mov edx, Kernel /* load Kernel address into EDX */
6241 mov esi, Src /* load Src address to ESI */
6242 mov edi, Dest /* load Dest address to EDI */
6243 add edi, 4 /* 4 column offset from the left edge */
6244 mov eax, columns /* load columns into EAX */
6245 add edi, eax /* 4 row offset from the top edge */
6246 add edi, eax
6247 add edi, eax
6248 add edi, eax
6249 mov ebx, rows /* initialize ROWS counter */
6250 sub ebx, 8 /* do not use first 4 and last 4 rows */
6251 /* ---, */
6252L10390:
6253 mov ecx, eax /* initialize COLUMNS counter */
6254 sub ecx, 8 /* do not use first 4 and last 4 columns */
6255 align 16 /* 16 byte alignment of the loop entry */
6256L10392:
6257 pxor mm7, mm7 /* zero MM7 (accumulator) */
6258 movd mm6, esi /* save ESI in MM6 */
6259 /* --- 1 */
6260 movq mm1, [esi] /* load 8 bytes of the Src */
6261 movq mm2, mm1 /* copy MM1 into MM2 */
6262 inc esi /* move pointer to the next 8 bytes of Src */
6263 movq mm3, [edx] /* load 4 words of Kernel */
6264 add edx, 8 /* move pointer to other 4 words */
6265 movq mm4, [edx] /* load 4 words of Kernel */
6266 add edx, 8 /* move pointer to other 4 words */
6267 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6268 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6269 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6270 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6271 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6272 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6273 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6274 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6275 movq mm1, [esi] /* load 8 bytes of the Src */
6276 dec esi
6277 add esi, eax /* move Src pointer 1 row below */
6278 movq mm3, [edx] /* load 4 words of Kernel */
6279 add edx, 8 /* move pointer to other 4 words */
6280 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6281 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6282 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6283 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6284 /* --- 2 */
6285 movq mm1, [esi] /* load 8 bytes of the Src */
6286 movq mm2, mm1 /* copy MM1 into MM2 */
6287 inc esi /* move pointer to the next 8 bytes of Src */
6288 movq mm3, [edx] /* load 4 words of Kernel */
6289 add edx, 8 /* move pointer to other 4 words */
6290 movq mm4, [edx] /* load 4 words of Kernel */
6291 add edx, 8 /* move pointer to other 4 words */
6292 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6293 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6294 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6295 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6296 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6297 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6298 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6299 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6300 movq mm1, [esi] /* load 8 bytes of the Src */
6301 dec esi
6302 add esi, eax /* move Src pointer 1 row below */
6303 movq mm3, [edx] /* load 4 words of Kernel */
6304 add edx, 8 /* move pointer to other 4 words */
6305 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6306 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6307 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6308 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6309 /* --- 3 */
6310 movq mm1, [esi] /* load 8 bytes of the Src */
6311 movq mm2, mm1 /* copy MM1 into MM2 */
6312 inc esi /* move pointer to the next 8 bytes of Src */
6313 movq mm3, [edx] /* load 4 words of Kernel */
6314 add edx, 8 /* move pointer to other 4 words */
6315 movq mm4, [edx] /* load 4 words of Kernel */
6316 add edx, 8 /* move pointer to other 4 words */
6317 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6318 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6319 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6320 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6321 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6322 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6323 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6324 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6325 movq mm1, [esi] /* load 8 bytes of the Src */
6326 dec esi
6327 add esi, eax /* move Src pointer 1 row below */
6328 movq mm3, [edx] /* load 4 words of Kernel */
6329 add edx, 8 /* move pointer to other 4 words */
6330 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6331 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6332 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6333 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6334 /* --- 4 */
6335 movq mm1, [esi] /* load 8 bytes of the Src */
6336 movq mm2, mm1 /* copy MM1 into MM2 */
6337 inc esi /* move pointer to the next 8 bytes of Src */
6338 movq mm3, [edx] /* load 4 words of Kernel */
6339 add edx, 8 /* move pointer to other 4 words */
6340 movq mm4, [edx] /* load 4 words of Kernel */
6341 add edx, 8 /* move pointer to other 4 words */
6342 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6343 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6344 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6345 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6346 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6347 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6348 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6349 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6350 movq mm1, [esi] /* load 8 bytes of the Src */
6351 dec esi
6352 add esi, eax /* move Src pointer 1 row below */
6353 movq mm3, [edx] /* load 4 words of Kernel */
6354 add edx, 8 /* move pointer to other 4 words */
6355 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6356 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6357 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6358 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6359 /* --- 5 */
6360 movq mm1, [esi] /* load 8 bytes of the Src */
6361 movq mm2, mm1 /* copy MM1 into MM2 */
6362 inc esi /* move pointer to the next 8 bytes of Src */
6363 movq mm3, [edx] /* load 4 words of Kernel */
6364 add edx, 8 /* move pointer to other 4 words */
6365 movq mm4, [edx] /* load 4 words of Kernel */
6366 add edx, 8 /* move pointer to other 4 words */
6367 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6368 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6369 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6370 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6371 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6372 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6373 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6374 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6375 movq mm1, [esi] /* load 8 bytes of the Src */
6376 dec esi
6377 add esi, eax /* move Src pointer 1 row below */
6378 movq mm3, [edx] /* load 4 words of Kernel */
6379 add edx, 8 /* move pointer to other 4 words */
6380 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6381 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6382 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6383 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6384 /* --- 6 */
6385 movq mm1, [esi] /* load 8 bytes of the Src */
6386 movq mm2, mm1 /* copy MM1 into MM2 */
6387 inc esi /* move pointer to the next 8 bytes of Src */
6388 movq mm3, [edx] /* load 4 words of Kernel */
6389 add edx, 8 /* move pointer to other 4 words */
6390 movq mm4, [edx] /* load 4 words of Kernel */
6391 add edx, 8 /* move pointer to other 4 words */
6392 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6393 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6394 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6395 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6396 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6397 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6398 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6399 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6400 movq mm1, [esi] /* load 8 bytes of the Src */
6401 dec esi
6402 add esi, eax /* move Src pointer 1 row below */
6403 movq mm3, [edx] /* load 4 words of Kernel */
6404 add edx, 8 /* move pointer to other 4 words */
6405 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6406 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6407 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6408 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6409 /* --- 7 */
6410 movq mm1, [esi] /* load 8 bytes of the Src */
6411 movq mm2, mm1 /* copy MM1 into MM2 */
6412 inc esi /* move pointer to the next 8 bytes of Src */
6413 movq mm3, [edx] /* load 4 words of Kernel */
6414 add edx, 8 /* move pointer to other 4 words */
6415 movq mm4, [edx] /* load 4 words of Kernel */
6416 add edx, 8 /* move pointer to other 4 words */
6417 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6418 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6419 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6420 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6421 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6422 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6423 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6424 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6425 movq mm1, [esi] /* load 8 bytes of the Src */
6426 dec esi
6427 add esi, eax /* move Src pointer 1 row below */
6428 movq mm3, [edx] /* load 4 words of Kernel */
6429 add edx, 8 /* move pointer to other 4 words */
6430 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6431 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6432 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6433 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6434 /* --- 8 */
6435 movq mm1, [esi] /* load 8 bytes of the Src */
6436 movq mm2, mm1 /* copy MM1 into MM2 */
6437 inc esi /* move pointer to the next 8 bytes of Src */
6438 movq mm3, [edx] /* load 4 words of Kernel */
6439 add edx, 8 /* move pointer to other 4 words */
6440 movq mm4, [edx] /* load 4 words of Kernel */
6441 add edx, 8 /* move pointer to other 4 words */
6442 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6443 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6444 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6445 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6446 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6447 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6448 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6449 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6450 movq mm1, [esi] /* load 8 bytes of the Src */
6451 dec esi
6452 add esi, eax /* move Src pointer 1 row below */
6453 movq mm3, [edx] /* load 4 words of Kernel */
6454 add edx, 8 /* move pointer to other 4 words */
6455 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6456 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6457 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6458 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6459 /* --- 9 */
6460 movq mm1, [esi] /* load 8 bytes of the Src */
6461 movq mm2, mm1 /* copy MM1 into MM2 */
6462 inc esi /* move pointer to the next 8 bytes of Src */
6463 movq mm3, [edx] /* load 4 words of Kernel */
6464 add edx, 8 /* move pointer to other 4 words */
6465 movq mm4, [edx] /* load 4 words of Kernel */
6466 add edx, 8 /* move pointer to other 4 words */
6467 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6468 punpckhbw mm2, mm0 /* unpack second 4 bytes into words */
6469 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6470 psrlw mm2, mm5 /* shift right each pixel NshiftRight times */
6471 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6472 pmullw mm2, mm4 /* mult 4 high words of Src and Kernel */
6473 paddsw mm1, mm2 /* add 4 words of the high and low bytes */
6474 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6475 movq mm1, [esi] /* load 8 bytes of the Src */
6476 movq mm3, [edx] /* load 4 words of Kernel */
6477 punpcklbw mm1, mm0 /* unpack first 4 bytes into words */
6478 psrlw mm1, mm5 /* shift right each pixel NshiftRight times */
6479 pmullw mm1, mm3 /* mult 4 low words of Src and Kernel */
6480 paddsw mm7, mm1 /* add MM1 to accumulator MM7 */
6481 /* ---, */
6482 movq mm3, mm7 /* copy MM7 into MM3 */
6483 psrlq mm7, 32 /* shift 2 left words to the right */
6484 paddsw mm7, mm3 /* add 2 left and 2 right result words */
6485 movq mm2, mm7 /* copy MM7 into MM2 */
6486 psrlq mm7, 16 /* shift 1 left word to the right */
6487 paddsw mm7, mm2 /* add 1 left and 1 right result words */
6488 movd mm1, eax /* save EAX in MM1 */
6489 packuswb mm7, mm0 /* pack result with saturation */
6490 movd eax, mm7 /* copy saturated result into EAX */
6491 mov [edi], al /* copy a byte result into Dest */
6492 movd eax, mm1 /* restore saved EAX */
6493 /* --, */
6494 movd esi, mm6 /* move Src pointer to the top pixel */
6495 sub edx, 208 /* EDX = Kernel address */
6496 inc esi /* move Src pointer to the next pixel */
6497 inc edi /* move Dest pointer to the next pixel */
6498 /* ---, */
6499 dec ecx /* decrease loop counter COLUMNS */
6500 jnz L10392 /* check loop termination, proceed if required */
6501 add esi, 8 /* move to the next row in Src */
6502 add edi, 8 /* move to the next row in Dest */
6503 dec ebx /* decrease loop counter ROWS */
6504 jnz L10390 /* check loop termination, proceed if required */
6505 /* ---, */
6506 emms /* exit MMX state */
6507 popa
6508 }
6509#else
6510 asm volatile
6511 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6512 "xor %%ebx, %%ebx \n\t" /* zero EBX */
6513 "mov %5, %%bl \n\t" /* load NRightShift into BL */
6514 "movd %%ebx, %%mm5 \n\t" /* copy NRightShift into MM5 */
6515 "mov %4, %%edx \n\t" /* load Kernel address into EDX */
6516 "mov %1, %%esi \n\t" /* load Src address to ESI */
6517 "mov %0, %%edi \n\t" /* load Dest address to EDI */
6518 "add $4, %%edi \n\t" /* 4 column offset from the left edge */
6519 "mov %3, %%eax \n\t" /* load columns into EAX */
6520 "add %%eax, %%edi \n\t" /* 4 row offset from the top edge */
6521 "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
6522 "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
6523 /* --- */
6524 ".L10390: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6525 "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
6526 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6527 ".L10392: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
6528 "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
6529 /* --- 1 */
6530 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6531 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6532 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6533 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6534 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6535 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6536 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6537 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6538 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6539 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6540 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6541 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6542 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6543 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6544 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6545 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6546 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6547 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6548 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6549 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6550 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6551 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6552 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6553 /* --- 2 */
6554 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6555 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6556 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6557 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6558 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6559 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6560 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6561 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6562 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6563 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6564 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6565 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6566 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6567 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6568 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6569 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6570 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6571 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6572 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6573 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6574 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6575 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6576 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6577 /* --- 3 */
6578 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6579 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6580 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6581 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6582 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6583 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6584 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6585 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6586 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6587 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6588 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6589 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6590 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6591 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6592 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6593 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6594 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6595 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6596 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6597 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6598 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6599 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6600 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6601 /* --- 4 */
6602 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6603 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6604 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6605 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6606 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6607 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6608 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6609 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6610 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6611 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6612 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6613 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6614 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6615 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6616 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6617 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6618 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6619 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6620 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6621 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6622 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6623 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6624 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6625 /* --- 5 */
6626 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6627 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6628 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6629 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6630 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6631 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6632 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6633 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6634 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6635 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6636 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6637 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6638 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6639 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6640 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6641 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6642 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6643 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6644 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6645 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6646 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6647 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6648 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6649 /* --- 6 */
6650 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6651 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6652 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6653 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6654 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6655 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6656 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6657 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6658 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6659 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6660 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6661 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6662 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6663 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6664 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6665 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6666 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6667 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6668 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6669 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6670 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6671 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6672 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6673 /* --- 7 */
6674 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6675 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6676 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6677 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6678 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6679 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6680 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6681 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6682 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6683 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6684 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6685 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6686 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6687 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6688 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6689 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6690 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6691 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6692 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6693 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6694 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6695 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6696 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6697 /* --- 8 */
6698 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6699 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6700 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6701 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6702 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6703 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6704 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6705 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6706 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6707 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6708 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6709 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6710 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6711 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6712 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6713 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6714 "dec %%esi \n\t" "add %%eax, %%esi \n\t" /* move Src pointer 1 row below */
6715 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6716 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6717 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6718 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6719 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6720 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6721 /* --- 9 */
6722 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6723 "movq %%mm1, %%mm2 \n\t" /* copy MM1 into MM2 */
6724 "inc %%esi \n\t" /* move pointer to the next 8 bytes of Src */
6725 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6726 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6727 "movq (%%edx), %%mm4 \n\t" /* load 4 words of Kernel */
6728 "add $8, %%edx \n\t" /* move pointer to other 4 words */
6729 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6730 "punpckhbw %%mm0, %%mm2 \n\t" /* unpack second 4 bytes into words */
6731 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6732 "psrlw %%mm5, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
6733 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6734 "pmullw %%mm4, %%mm2 \n\t" /* mult. 4 high words of Src and Kernel */
6735 "paddsw %%mm2, %%mm1 \n\t" /* add 4 words of the high and low bytes */
6736 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6737 "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
6738 "movq (%%edx), %%mm3 \n\t" /* load 4 words of Kernel */
6739 "punpcklbw %%mm0, %%mm1 \n\t" /* unpack first 4 bytes into words */
6740 "psrlw %%mm5, %%mm1 \n\t" /* shift right each pixel NshiftRight times */
6741 "pmullw %%mm3, %%mm1 \n\t" /* mult. 4 low words of Src and Kernel */
6742 "paddsw %%mm1, %%mm7 \n\t" /* add MM1 to accumulator MM7 */
6743 /* --- */
6744 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6745 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6746 "paddsw %%mm3, %%mm7 \n\t" /* add 2 left and 2 right result words */
6747 "movq %%mm7, %%mm2 \n\t" /* copy MM7 into MM2 */
6748 "psrlq $16, %%mm7 \n\t" /* shift 1 left word to the right */
6749 "paddsw %%mm2, %%mm7 \n\t" /* add 1 left and 1 right result words */
6750 "movd %%eax, %%mm1 \n\t" /* save EAX in MM1 */
6751 "packuswb %%mm0, %%mm7 \n\t" /* pack result with saturation */
6752 "movd %%mm7, %%eax \n\t" /* copy saturated result into EAX */
6753 "mov %%al, (%%edi) \n\t" /* copy a byte result into Dest */
6754 "movd %%mm1, %%eax \n\t" /* restore saved EAX */
6755 /* -- */
6756 "movd %%mm6, %%esi \n\t" /* move Src pointer to the top pixel */
6757 "sub $208, %%edx \n\t" /* EDX = Kernel address */
6758 "inc %%esi \n\t" /* move Src pointer to the next pixel */
6759 "inc %%edi \n\t" /* move Dest pointer to the next pixel */
6760 /* --- */
6761 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
6762 "jnz .L10392 \n\t" /* check loop termination, proceed if required */
6763 "add $8, %%esi \n\t" /* move to the next row in Src */
6764 "add $8, %%edi \n\t" /* move to the next row in Dest */
6765 "dec %%ebx \n\t" /* decrease loop counter ROWS */
6766 "jnz .L10390 \n\t" /* check loop termination, proceed if required */
6767 /* --- */
6768 "emms \n\t" /* exit MMX state */
6769 "popa \n\t":"=m" (Dest) /* %0 */
6770 :"m"(Src), /* %1 */
6771 "m"(rows), /* %2 */
6772 "m"(columns), /* %3 */
6773 "m"(Kernel), /* %4 */
6774 "m"(NRightShift) /* %5 */
6775 );
6776#endif
6777#endif
6778 return (0);
6779 } else {
6780 /* No non-MMX implementation yet */
6781 return (-1);
6782 }
6783}
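The tail of the per-pixel loop above folds the four 16-bit partial sums held in MM7 into a single word and packs it to an unsigned byte (the paddsw / psrlq / packuswb / movd sequence). As a reading aid only, here is a hypothetical GCC-intrinsics sketch of that reduction; the helper name reduce4_saturate_u8 is not part of the library, and a real caller would still need _mm_empty() before touching the FPU again.

#ifdef USE_MMX
#include <mmintrin.h>

/* Sum the four signed 16-bit lanes of 'acc' with saturation and return the
   total saturated to 0..255, mirroring the accumulator fold in the asm above. */
static unsigned char reduce4_saturate_u8(__m64 acc)
{
	__m64 t = _mm_adds_pi16(acc, _mm_srli_si64(acc, 32)); /* lane0+lane2, lane1+lane3 */
	t = _mm_adds_pi16(t, _mm_srli_si64(t, 16));           /* low lane now holds the total */
	t = _mm_packs_pu16(t, _mm_setzero_si64());            /* saturate signed words to unsigned bytes */
	return (unsigned char) _mm_cvtsi64_si32(t);           /* take the low byte */
}
#endif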
6784
6785/* ------------------------------------------------------------------------------------ */
6786
6799int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
6800{
6801 /* Validate input parameters */
6802 if ((Src == NULL) || (Dest == NULL))
6803 return(-1);
6804
6805 if ((columns < 8) || (rows < 3))
6806 return (-1);
6807
6808 if ((SDL_imageFilterMMXdetect())) {
6809//#ifdef USE_MMX
6810#if defined(USE_MMX) && defined(i386)
6811#if !defined(GCC__)
6812 __asm
6813 {
6814 pusha
6815 pxor mm0, mm0 /* zero MM0 */
6816 mov eax, columns /* load columns into EAX */
6817 /* ---, */
6818 mov esi, Src /* ESI = Src row 0 address */
6819 mov edi, Dest /* load Dest address to EDI */
6820 add edi, eax /* EDI = EDI + columns */
6821 inc edi /* 1 byte offset from the left edge */
6822 mov edx, rows /* initialize ROWS counter */
6823 sub edx, 2 /* do not use first and last rows */
6824 /* ---, */
6825L10400:
6826 mov ecx, eax /* initialize COLUMNS counter */
6827 shr ecx, 3 /* ECX/8 (MMX loads 8 bytes at a time) */
6828 mov ebx, esi /* save ESI in EBX */
6829 movd mm1, edi /* save EDI in MM1 */
6830 align 16 /* 16 byte alignment of the loop entry */
6831L10402:
6832 /* ---, */
6833 movq mm4, [esi] /* load 8 bytes from Src */
6834 movq mm5, mm4 /* save MM4 in MM5 */
6835 add esi, 2 /* move ESI pointer 2 bytes right */
6836 punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
6837 punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
6838 movq mm6, [esi] /* load 8 bytes from Src */
6839 movq mm7, mm6 /* save MM6 in MM7 */
6840 sub esi, 2 /* move ESI pointer back 2 bytes left */
6841 punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
6842 punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
6843 add esi, eax /* move to the next row of Src */
6844 movq mm2, [esi] /* load 8 bytes from Src */
6845 movq mm3, mm2 /* save MM2 in MM3 */
6846 add esi, 2 /* move ESI pointer 2 bytes right */
6847 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6848 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6849 paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6850 paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6851 paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6852 paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6853 movq mm2, [esi] /* load 8 bytes from Src */
6854 movq mm3, mm2 /* save MM2 in MM3 */
6855 sub esi, 2 /* move ESI pointer back 2 bytes left */
6856 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6857 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6858 paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6859 paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6860 paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6861 paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6862 add esi, eax /* move to the next row of Src */
6863 movq mm2, [esi] /* load 8 bytes from Src */
6864 movq mm3, mm2 /* save MM2 in MM3 */
6865 add esi, 2 /* move ESI pointer 2 bytes right */
6866 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6867 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6868 paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
6869 paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
6870 movq mm2, [esi] /* load 8 bytes from Src */
6871 movq mm3, mm2 /* save MM2 in MM3 */
6872 sub esi, 2 /* move ESI pointer back 2 bytes left */
6873 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
6874 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
6875 paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
6876 paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
6877 /* ---, */
6878 movq mm2, mm4 /* copy MM4 into MM2 */
6879 psrlq mm4, 32 /* shift 2 left words to the right */
6880 psubw mm4, mm2 /* MM4 = MM4 - MM2 */
6881 movq mm3, mm6 /* copy MM6 into MM3 */
6882 psrlq mm6, 32 /* shift 2 left words to the right */
6883 psubw mm6, mm3 /* MM6 = MM6 - MM3 */
6884 punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
6885 movq mm2, mm5 /* copy MM5 into MM2 */
6886 psrlq mm5, 32 /* shift 2 left words to the right */
6887 psubw mm5, mm2 /* MM5 = MM5 - MM2 */
6888 movq mm3, mm7 /* copy MM7 into MM3 */
6889 psrlq mm7, 32 /* shift 2 left words to the right */
6890 psubw mm7, mm3 /* MM7 = MM7 - MM3 */
6891 punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
6892 /* Take abs values of MM4 and MM5 */
6893 movq mm6, mm4 /* copy MM4 into MM6 */
6894 movq mm7, mm5 /* copy MM5 into MM7 */
6895 psraw mm6, 15 /* fill MM6 words with word sign bit */
6896 psraw mm7, 15 /* fill MM7 words with word sign bit */
6897 pxor mm4, mm6 /* take 1's complement of only neg words */
6898 pxor mm5, mm7 /* take 1's complement of only neg words */
6899 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
6900 psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
6901 packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
6902 movq [edi], mm4 /* store result in Dest */
6903 /* ---, */
6904 sub esi, eax /* move to the current top row in Src */
6905 sub esi, eax
6906 add esi, 8 /* move Src pointer to the next 8 pixels */
6907 add edi, 8 /* move Dest pointer to the next 8 pixels */
6908 /* ---, */
6909 dec ecx /* decrease loop counter COLUMNS */
6910 jnz L10402 /* check loop termination, proceed if required */
6911 mov esi, ebx /* restore leftmost current row Src address */
6912 movd edi, mm1 /* restore leftmost current row Dest address */
6913 add esi, eax /* move to the next row in Src */
6914 add edi, eax /* move to the next row in Dest */
6915 dec edx /* decrease loop counter ROWS */
6916 jnz L10400 /* check loop termination, proceed if required */
6917 /* ---, */
6918 emms /* exit MMX state */
6919 popa
6920 }
6921#else
6922 asm volatile
6923 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
6924 "mov %3, %%eax \n\t" /* load columns into EAX */
6925 /* --- */
6926 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
6927 "mov %0, %%edi \n\t" /* load Dest address to EDI */
6928 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
6929 "inc %%edi \n\t" /* 1 byte offset from the left edge */
6930 "mov %2, %%edx \n\t" /* initialize ROWS counter */
6931 "sub $2, %%edx \n\t" /* do not use first and last rows */
6932 /* --- */
6933 ".L10400: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
6934 "shr $3, %%ecx \n\t" /* ECX/8 (MMX loads 8 bytes at a time) */
6935 "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
6936 "movd %%edi, %%mm1 \n\t" /* save EDI in MM1 */
6937 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
6938 ".L10402: \n\t"
6939 /* --- */
6940 "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
6941 "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
6942 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6943 "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
6944 "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
6945 "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
6946 "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
6947 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6948 "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
6949 "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
6950 "add %%eax, %%esi \n\t" /* move to the next row of Src */
6951 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6952 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6953 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6954 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6955 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6956 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6957 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6958 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6959 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6960 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6961 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6962 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6963 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6964 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6965 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6966 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6967 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6968 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6969 "add %%eax, %%esi \n\t" /* move to the next row of Src */
6970 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6971 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6972 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
6973 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6974 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6975 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
6976 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
6977 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
6978 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
6979 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
6980 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
6981 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
6982 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
6983 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
6984 /* --- */
6985 "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
6986 "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
6987 "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
6988 "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
6989 "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
6990 "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
6991 "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
6992 "movq %%mm5, %%mm2 \n\t" /* copy MM5 into MM2 */
6993 "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
6994 "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
6995 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
6996 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
6997 "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
6998 "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
6999 /* Take abs values of MM4 and MM5 */
7000 "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
7001 "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
7002 "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
7003 "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
7004 "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */
7005 "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */
7006 "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7007 "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7008 "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
7009 "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
7010 /* --- */
7011 "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
7012 "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
7013 "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
7014 /* --- */
7015 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
7016 "jnz .L10402 \n\t" /* check loop termination, proceed if required */
7017 "mov %%ebx, %%esi \n\t" /* restore leftmost current row Src address */
7018 "movd %%mm1, %%edi \n\t" /* restore leftmost current row Dest address */
7019 "add %%eax, %%esi \n\t" /* move to the next row in Src */
7020 "add %%eax, %%edi \n\t" /* move to the next row in Dest */
7021 "dec %%edx \n\t" /* decrease loop counter ROWS */
7022 "jnz .L10400 \n\t" /* check loop termination, proceed if required */
7023 /* --- */
7024 "emms \n\t" /* exit MMX state */
7025 "popa \n\t":"=m" (Dest) /* %0 */
7026 :"m"(Src), /* %1 */
7027 "m"(rows), /* %2 */
7028 "m"(columns) /* %3 */
7029 );
7030#endif
7031#endif
7032 return (0);
7033 } else {
7034 /* No non-MMX implementation yet */
7035 return (-1);
7036 }
7037}
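For readers who only need the semantics of the MMX code above, the following is a plain-C sketch of the documented SobelX operation, Dij = saturation255(|Gx|) with the horizontal Sobel kernel [-1 0 +1; -2 0 +2; -1 0 +1]. It is an illustrative reference, not code from the library: the function name is hypothetical, and it simply processes interior pixels without the 8-pixel blocking of the MMX path.

static void sobel_x_reference(const unsigned char *Src, unsigned char *Dest,
                              int rows, int columns)
{
	int r, c;
	for (r = 1; r < rows - 1; r++) {
		for (c = 1; c < columns - 1; c++) {
			const unsigned char *p = Src + (size_t) r * columns + c;
			int gx = (p[-columns + 1] + 2 * p[1] + p[columns + 1])
			       - (p[-columns - 1] + 2 * p[-1] + p[columns - 1]);
			if (gx < 0)
				gx = -gx; /* the MMX path does this branch-free: m = gx >> 15; gx = (gx ^ m) - m */
			Dest[(size_t) r * columns + c] = (unsigned char) (gx > 255 ? 255 : gx);
		}
	}
}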
7038
7052int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
7053 unsigned char NRightShift)
7054{
7055 /* Validate input parameters */
7056 if ((Src == NULL) || (Dest == NULL))
7057 return(-1);
7058 if ((columns < 8) || (rows < 3) || (NRightShift > 7))
7059 return (-1);
7060
7061 if ((SDL_imageFilterMMXdetect())) {
7062//#ifdef USE_MMX
7063#if defined(USE_MMX) && defined(i386)
7064#if !defined(GCC__)
7065 __asm
7066 {
7067 pusha
7068 pxor mm0, mm0 /* zero MM0 */
7069 mov eax, columns /* load columns into EAX */
7070 xor ebx, ebx /* zero EBX */
7071 mov bl, NRightShift /* load NRightShift into BL */
7072 movd mm1, ebx /* copy NRightShift into MM1 */
7073 /* ---, */
7074 mov esi, Src /* ESI = Src row 0 address */
7075 mov edi, Dest /* load Dest address to EDI */
7076 add edi, eax /* EDI = EDI + columns */
7077 inc edi /* 1 byte offset from the left edge */
7078 /* initialize ROWS counter */
7079 sub rows, 2 /* do not use first and last rows */
7080 /* ---, */
7081L10410:
7082 mov ecx, eax /* initialize COLUMNS counter */
7083 shr ecx, 3 /* ECX/8 (MMX loads 8 bytes at a time) */
7084 mov ebx, esi /* save ESI in EBX */
7085 mov edx, edi /* save EDI in EDX */
7086 align 16 /* 16 byte alignment of the loop entry */
7087L10412:
7088 /* ---, */
7089 movq mm4, [esi] /* load 8 bytes from Src */
7090 movq mm5, mm4 /* save MM4 in MM5 */
7091 add esi, 2 /* move ESI pointer 2 bytes right */
7092 punpcklbw mm4, mm0 /* unpack 4 low bytes into words */
7093 punpckhbw mm5, mm0 /* unpack 4 high bytes into words */
7094 psrlw mm4, mm1 /* shift right each pixel NshiftRight times */
7095 psrlw mm5, mm1 /* shift right each pixel NshiftRight times */
7096 movq mm6, [esi] /* load 8 bytes from Src */
7097 movq mm7, mm6 /* save MM6 in MM7 */
7098 sub esi, 2 /* move ESI pointer back 2 bytes left */
7099 punpcklbw mm6, mm0 /* unpack 4 low bytes into words */
7100 punpckhbw mm7, mm0 /* unpack 4 high bytes into words */
7101 psrlw mm6, mm1 /* shift right each pixel NshiftRight times */
7102 psrlw mm7, mm1 /* shift right each pixel NshiftRight times */
7103 add esi, eax /* move to the next row of Src */
7104 movq mm2, [esi] /* load 8 bytes from Src */
7105 movq mm3, mm2 /* save MM2 in MM3 */
7106 add esi, 2 /* move ESI pointer 2 bytes right */
7107 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7108 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7109 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7110 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7111 paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7112 paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7113 paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7114 paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7115 movq mm2, [esi] /* load 8 bytes from Src */
7116 movq mm3, mm2 /* save MM2 in MM3 */
7117 sub esi, 2 /* move ESI pointer back 2 bytes left */
7118 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7119 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7120 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7121 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7122 paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7123 paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7124 paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7125 paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7126 add esi, eax /* move to the next row of Src */
7127 movq mm2, [esi] /* load 8 bytes from Src */
7128 movq mm3, mm2 /* save MM2 in MM3 */
7129 add esi, 2 /* move ESI pointer 2 bytes right */
7130 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7131 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7132 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7133 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7134 paddw mm4, mm2 /* add 4 low bytes to accumulator MM4 */
7135 paddw mm5, mm3 /* add 4 high bytes to accumulator MM5 */
7136 movq mm2, [esi] /* load 8 bytes from Src */
7137 movq mm3, mm2 /* save MM2 in MM3 */
7138 sub esi, 2 /* move ESI pointer back 2 bytes left */
7139 punpcklbw mm2, mm0 /* unpack 4 low bytes into words */
7140 punpckhbw mm3, mm0 /* unpack 4 high bytes into words */
7141 psrlw mm2, mm1 /* shift right each pixel NshiftRight times */
7142 psrlw mm3, mm1 /* shift right each pixel NshiftRight times */
7143 paddw mm6, mm2 /* add 4 low bytes to accumulator MM6 */
7144 paddw mm7, mm3 /* add 4 high bytes to accumulator MM7 */
7145 /* ---, */
7146 movq mm2, mm4 /* copy MM4 into MM2 */
7147 psrlq mm4, 32 /* shift 2 left words to the right */
7148 psubw mm4, mm2 /* MM4 = MM4 - MM2 */
7149 movq mm3, mm6 /* copy MM6 into MM3 */
7150 psrlq mm6, 32 /* shift 2 left words to the right */
7151 psubw mm6, mm3 /* MM6 = MM6 - MM3 */
7152 punpckldq mm4, mm6 /* combine 2 words of MM6 and 2 words of MM4 */
7153 movq mm2, mm5 /* copy MM5 into MM2 */
7154 psrlq mm5, 32 /* shift 2 left words to the right */
7155 psubw mm5, mm2 /* MM5 = MM5 - MM2 */
7156 movq mm3, mm7 /* copy MM7 into MM3 */
7157 psrlq mm7, 32 /* shift 2 left words to the right */
7158 psubw mm7, mm3 /* MM7 = MM7 - MM3 */
7159 punpckldq mm5, mm7 /* combine 2 words of MM7 and 2 words of MM5 */
7160 /* Take abs values of MM4 and MM5 */
7161 movq mm6, mm4 /* copy MM4 into MM6 */
7162 movq mm7, mm5 /* copy MM5 into MM7 */
7163 psraw mm6, 15 /* fill MM6 words with word sign bit */
7164 psraw mm7, 15 /* fill MM7 words with word sign bit */
7165 pxor mm4, mm6 /* take 1's complement of only neg words */
7166 pxor mm5, mm7 /* take 1's complement of only neg words */
7167 psubsw mm4, mm6 /* add 1 to only neg words, W-(-1) or W-0 */
7168 psubsw mm5, mm7 /* add 1 to only neg words, W-(-1) or W-0 */
7169 packuswb mm4, mm5 /* combine and pack/saturate MM5 and MM4 */
7170 movq [edi], mm4 /* store result in Dest */
7171 /* ---, */
7172 sub esi, eax /* move to the current top row in Src */
7173 sub esi, eax
7174 add esi, 8 /* move Src pointer to the next 8 pixels */
7175 add edi, 8 /* move Dest pointer to the next 8 pixels */
7176 /* ---, */
7177 dec ecx /* decrease loop counter COLUMNS */
7178 jnz L10412 /* check loop termination, proceed if required */
7179 mov esi, ebx /* restore leftmost current row Src address */
7180 mov edi, edx /* restore leftmost current row Dest address */
7181 add esi, eax /* move to the next row in Src */
7182 add edi, eax /* move to the next row in Dest */
7183 dec rows /* decrease loop counter ROWS */
7184 jnz L10410 /* check loop termination, proceed if required */
7185 /* ---, */
7186 emms /* exit MMX state */
7187 popa
7188 }
7189#else
7190 asm volatile
7191 ("pusha \n\t" "pxor %%mm0, %%mm0 \n\t" /* zero MM0 */
7192 "mov %3, %%eax \n\t" /* load columns into EAX */
7193 "xor %%ebx, %%ebx \n\t" /* zero EBX */
7194 "mov %4, %%bl \n\t" /* load NRightShift into BL */
7195 "movd %%ebx, %%mm1 \n\t" /* copy NRightShift into MM1 */
7196 /* --- */
7197 "mov %1, %%esi \n\t" /* ESI = Src row 0 address */
7198 "mov %0, %%edi \n\t" /* load Dest address to EDI */
7199 "add %%eax, %%edi \n\t" /* EDI = EDI + columns */
7200 "inc %%edi \n\t" /* 1 byte offset from the left edge */
7201 /* initialize ROWS counter */
7202 "subl $2, %2 \n\t" /* do not use first and last rows */
7203 /* --- */
7204 ".L10410: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
7205 "shr $3, %%ecx \n\t" /* ECX/8 (MMX loads 8 bytes at a time) */
7206 "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
7207 "mov %%edi, %%edx \n\t" /* save EDI in EDX */
7208 ".align 16 \n\t" /* 16 byte alignment of the loop entry */
7209 ".L10412: \n\t"
7210 /* --- */
7211 "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
7212 "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
7213 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7214 "punpcklbw %%mm0, %%mm4 \n\t" /* unpack 4 low bytes into words */
7215 "punpckhbw %%mm0, %%mm5 \n\t" /* unpack 4 high bytes into words */
7216 "psrlw %%mm1, %%mm4 \n\t" /* shift right each pixel NshiftRight times */
7217 "psrlw %%mm1, %%mm5 \n\t" /* shift right each pixel NshiftRight times */
7218 "movq (%%esi), %%mm6 \n\t" /* load 8 bytes from Src */
7219 "movq %%mm6, %%mm7 \n\t" /* save MM6 in MM7 */
7220 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7221 "punpcklbw %%mm0, %%mm6 \n\t" /* unpack 4 low bytes into words */
7222 "punpckhbw %%mm0, %%mm7 \n\t" /* unpack 4 high bytes into words */
7223 "psrlw %%mm1, %%mm6 \n\t" /* shift right each pixel NshiftRight times */
7224 "psrlw %%mm1, %%mm7 \n\t" /* shift right each pixel NshiftRight times */
7225 "add %%eax, %%esi \n\t" /* move to the next row of Src */
7226 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7227 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7228 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7229 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7230 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7231 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7232 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7233 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7234 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7235 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7236 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7237 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7238 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7239 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7240 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7241 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7242 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7243 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7244 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7245 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7246 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7247 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7248 "add %%eax, %%esi \n\t" /* move to the next row of Src */
7249 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7250 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7251 "add $2, %%esi \n\t" /* move ESI pointer 2 bytes right */
7252 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7253 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7254 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7255 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7256 "paddw %%mm2, %%mm4 \n\t" /* add 4 low bytes to accumulator MM4 */
7257 "paddw %%mm3, %%mm5 \n\t" /* add 4 high bytes to accumulator MM5 */
7258 "movq (%%esi), %%mm2 \n\t" /* load 8 bytes from Src */
7259 "movq %%mm2, %%mm3 \n\t" /* save MM2 in MM3 */
7260 "sub $2, %%esi \n\t" /* move ESI pointer back 2 bytes left */
7261 "punpcklbw %%mm0, %%mm2 \n\t" /* unpack 4 low bytes into words */
7262 "punpckhbw %%mm0, %%mm3 \n\t" /* unpack 4 high bytes into words */
7263 "psrlw %%mm1, %%mm2 \n\t" /* shift right each pixel NshiftRight times */
7264 "psrlw %%mm1, %%mm3 \n\t" /* shift right each pixel NshiftRight times */
7265 "paddw %%mm2, %%mm6 \n\t" /* add 4 low bytes to accumulator MM6 */
7266 "paddw %%mm3, %%mm7 \n\t" /* add 4 high bytes to accumulator MM7 */
7267 /* --- */
7268 "movq %%mm4, %%mm2 \n\t" /* copy MM4 into MM2 */
7269 "psrlq $32, %%mm4 \n\t" /* shift 2 left words to the right */
7270 "psubw %%mm2, %%mm4 \n\t" /* MM4 = MM4 - MM2 */
7271 "movq %%mm6, %%mm3 \n\t" /* copy MM6 into MM3 */
7272 "psrlq $32, %%mm6 \n\t" /* shift 2 left words to the right */
7273 "psubw %%mm3, %%mm6 \n\t" /* MM6 = MM6 - MM3 */
7274 "punpckldq %%mm6, %%mm4 \n\t" /* combine 2 words of MM6 and 2 words of MM4 */
7275 "movq %%mm5, %%mm2 \n\t" /* copy MM5 into MM2 */
7276 "psrlq $32, %%mm5 \n\t" /* shift 2 left words to the right */
7277 "psubw %%mm2, %%mm5 \n\t" /* MM5 = MM5 - MM2 */
7278 "movq %%mm7, %%mm3 \n\t" /* copy MM7 into MM3 */
7279 "psrlq $32, %%mm7 \n\t" /* shift 2 left words to the right */
7280 "psubw %%mm3, %%mm7 \n\t" /* MM7 = MM7 - MM3 */
7281 "punpckldq %%mm7, %%mm5 \n\t" /* combine 2 words of MM7 and 2 words of MM5 */
7282 /* Take abs values of MM4 and MM5 */
7283 "movq %%mm4, %%mm6 \n\t" /* copy MM4 into MM6 */
7284 "movq %%mm5, %%mm7 \n\t" /* copy MM5 into MM7 */
7285 "psraw $15, %%mm6 \n\t" /* fill MM6 words with word sign bit */
7286 "psraw $15, %%mm7 \n\t" /* fill MM7 words with word sign bit */
7287 "pxor %%mm6, %%mm4 \n\t" /* take 1's complement of only neg. words */
7288 "pxor %%mm7, %%mm5 \n\t" /* take 1's complement of only neg. words */
7289 "psubsw %%mm6, %%mm4 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7290 "psubsw %%mm7, %%mm5 \n\t" /* add 1 to only neg. words, W-(-1) or W-0 */
7291 "packuswb %%mm5, %%mm4 \n\t" /* combine and pack/saturate MM5 and MM4 */
7292 "movq %%mm4, (%%edi) \n\t" /* store result in Dest */
7293 /* --- */
7294 "sub %%eax, %%esi \n\t" /* move to the current top row in Src */
7295 "sub %%eax, %%esi \n\t" "add $8, %%esi \n\t" /* move Src pointer to the next 8 pixels */
7296 "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
7297 /* --- */
7298 "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
7299 "jnz .L10412 \n\t" /* check loop termination, proceed if required */
7300 "mov %%ebx, %%esi \n\t" /* restore leftmost current row Src address */
7301 "mov %%edx, %%edi \n\t" /* restore leftmost current row Dest address */
7302 "add %%eax, %%esi \n\t" /* move to the next row in Src */
7303 "add %%eax, %%edi \n\t" /* move to the next row in Dest */
7304 "decl %2 \n\t" /* decrease loop counter ROWS */
7305 "jnz .L10410 \n\t" /* check loop termination, proceed if required */
7306 /* --- */
7307 "emms \n\t" /* exit MMX state */
7308 "popa \n\t":"=m" (Dest) /* %0 */
7309 :"m"(Src), /* %1 */
7310 "m"(rows), /* %2 */
7311 "m"(columns), /* %3 */
7312 "m"(NRightShift) /* %4 */
7313 );
7314#endif
7315#endif
7316 return (0);
7317 } else {
7318 /* No non-MMX implementation yet */
7319 return (-1);
7320 }
7321}
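The ShiftRight variant above has the same structure as SobelX but pre-scales every source byte by NRightShift before it enters the weighted sum, which keeps the intermediate 16-bit accumulators small on bright images. A hypothetical scalar counterpart of the sketch given after SobelX (again illustrative only, not library code):

static void sobel_x_shiftright_reference(const unsigned char *Src, unsigned char *Dest,
                                         int rows, int columns, unsigned char NRightShift)
{
	int r, c;
	for (r = 1; r < rows - 1; r++) {
		for (c = 1; c < columns - 1; c++) {
			const unsigned char *p = Src + (size_t) r * columns + c;
			/* Every sample is right-shifted before the Sobel weighting. */
			int gx = ((p[-columns + 1] >> NRightShift) + 2 * (p[1] >> NRightShift) + (p[columns + 1] >> NRightShift))
			       - ((p[-columns - 1] >> NRightShift) + 2 * (p[-1] >> NRightShift) + (p[columns - 1] >> NRightShift));
			if (gx < 0)
				gx = -gx;
			Dest[(size_t) r * columns + c] = (unsigned char) (gx > 255 ? 255 : gx);
		}
	}
}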
7322
7326void SDL_imageFilterAlignStack(void)
7327{
7328#ifdef USE_MMX
7329#if !defined(GCC__)
7330 __asm
7331 { /* --- stack alignment --- */
7332 mov ebx, esp /* load ESP into EBX */
7333 sub ebx, 4 /* reserve space on stack for old value of ESP */
7334 and ebx, -32 /* align EBX along a 32 byte boundary */
7335 mov [ebx], esp /* save old value of ESP in stack, behind the boundary */
7336 mov esp, ebx /* align ESP along a 32 byte boundary */
7337 }
7338#else
7339 asm volatile
7340 ( /* --- stack alignment --- */
7341 "mov %%esp, %%ebx \n\t" /* load ESP into EBX */
7342 "sub $4, %%ebx \n\t" /* reserve space on stack for old value of ESP */
7343 "and $-32, %%ebx \n\t" /* align EBX along a 32 byte boundary */
7344 "mov %%esp, (%%ebx) \n\t" /* save old value of ESP in stack, behind the boundary */
7345 "mov %%ebx, %%esp \n\t" /* align ESP along a 32 byte boundary */
7346 ::);
7347#endif
7348#endif
7349}
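As a worked example of the arithmetic above: if ESP is 0x0012FF8C on entry, then EBX = (0x0012FF8C - 4) & ~31 = 0x0012FF80. The old ESP value 0x0012FF8C is stored at address 0x0012FF80, and ESP itself becomes 0x0012FF80, a 32-byte-aligned address; SDL_imageFilterRestoreStack later reads that saved value back from [ESP] and reinstates it.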
7350
7354void SDL_imageFilterRestoreStack(void)
7355{
7356#ifdef USE_MMX
7357#if !defined(GCC__)
7358 __asm
7359 { /* --- restoring old stack --- */
7360 mov ebx, [esp] /* load old value of ESP */
7361 mov esp, ebx /* restore old value of ESP */
7362 }
7363#else
7364 asm volatile
7365 ( /* --- restoring old stack --- */
7366 "mov (%%esp), %%ebx \n\t" /* load old value of ESP */
7367 "mov %%ebx, %%esp \n\t" /* restore old value of ESP */
7368 ::);
7369#endif
7370#endif
7371}
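SDL_imageFilterAlignStack and SDL_imageFilterRestoreStack are a matched pair: the first stashes ESP just behind a 32-byte boundary and aligns the stack pointer, the second pops that saved value back. A hedged usage sketch, presumably bracketing stack-sensitive MMX filter work; the buffer names, kernel, and the particular convolution call are placeholders for illustration only, not a documented recipe:

#include "SDL2_imageFilter.h"

/* Run a 3x3 box blur on an 8-bit image with the stack aligned around the call. */
static void blur_region(unsigned char *src, unsigned char *dst,
                        int rows, int columns, signed short *kernel)
{
	SDL_imageFilterAlignStack();   /* align ESP to a 32-byte boundary */
	SDL_imageFilterConvolveKernel3x3Divide(src, dst, rows, columns, kernel, 9);
	SDL_imageFilterRestoreStack(); /* put the original ESP back */
}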
int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
Filter using SobelX: Dij = saturation255( ... )
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using MultByByte: D = saturation255(S * C)
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N, unsigned char C)
Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, unsigned char NRightShift)
Filter using SobelXShiftRight: Dij = saturation255( ... )
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Div: D = S1 / S2.
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftLeftUint: D = ((uint)S << N)
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
void SDL_imageFilterRestoreStack(void)
Restore previously aligned stack.
int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
Internal ASM Filter using MultNor: D = S1 * S2.
void SDL_imageFilterMMXon()
Enable MMX check for filter functions and use MMX code if available.
int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... )
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using SubByte: D = saturation0(S - C)
int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Sub: D = saturation0(S1 - S2)
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftLeftByte: D = (S << N)
int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using AbsDiff: D = | S1 - S2 |.
int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using BitOr: D = S1 | S2.
void SDL_imageFilterMMXoff()
Disable MMX check for filter functions and force use of non-MMX C based code (see the usage sketch after this list).
int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultNor: D = S1 * S2.
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
Internal MMX Filter using SubByte: D = saturation0(S - C)
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
Filter using AddUint: D = saturation255(S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftRight: D = saturation0(S >> N)
int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... )
#define SWAP_32(x)
Swaps the byte order in a 32-bit integer (LSB becomes MSB, etc.).
int SDL_imageFilterMMXdetect(void)
MMX detection routine (with override flag).
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using MultDivby2: D = saturation255(S1/2 * S2)
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using AddByte: D = saturation255(S + C)
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using BitAnd: D = S1 & S2.
int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... )
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0.
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
Filter using ShiftLeft: D = saturation255(S << N)
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Add: D = saturation255(S1 + S2)
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin, int Nmax)
Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin, unsigned char Tmax)
Filter using ClipToRange: D = (S >= Tmin) && (S <= Tmax) ? S : (S < Tmin ? Tmin : Tmax).
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
Filter using AddByteToHalf: D = saturation255(S/2 + C)
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char NRightShift)
Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... )
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
Filter using BitNegation: D = ~S (bitwise complement).
int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... )
int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Mean: D = S1/2 + S2/2.
int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns, signed short *Kernel, unsigned char Divisor)
Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... )
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
Filter using Mult: D = saturation255(S1 * S2)
void SDL_imageFilterAlignStack(void)
Align stack to a 32-byte boundary.
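Usage sketch (an illustration, not part of the library documentation): the byte filters read from raw 8-bit source buffers and write the same number of bytes to Dest; the return convention assumed below is 0 on success and -1 on invalid arguments. SDL_imageFilterMMXoff() forces the portable C path, e.g. for comparing results against the MMX code.

#include <stdio.h>
#include "SDL2_imageFilter.h"

int main(void)
{
    unsigned char src1[8] = { 10,  20, 250, 255, 0, 100, 200, 30 };
    unsigned char src2[8] = {  5, 240,  10,   1, 0, 100, 100, 30 };
    unsigned char dest[8];

    /* D = saturation255(S1 + S2); 0 on success is an assumed convention. */
    if (SDL_imageFilterAdd(src1, src2, dest, 8) != 0) {
        fprintf(stderr, "SDL_imageFilterAdd failed\n");
        return 1;
    }
    printf("dest[3] = %u (255 + 1 saturates to 255)\n", (unsigned)dest[3]);

    /* Force the non-MMX C code path and run the same filter again. */
    SDL_imageFilterMMXoff();
    SDL_imageFilterAdd(src1, src2, dest, 8);
    return 0;
}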