00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include "pent_include.h"
00036 #include "Scale2xScaler.h"
00037 #include "Manips.h"
00038
00039
00040 #include <SDL_types.h>
00041 #include <SDL_cpuinfo.h>
00042
00043 namespace Pentagram {
00044
00045
00046
00047
00048 template<class uintX, class Manip, class uintS=uintX> class Scale2xScalerInternal
00049 {
00050 public:
00051
00063 static inline void scale2x_def(uintX* dst0, uintX* dst1, const uintS* src0, const uintS* src1, const uintS* src2, unsigned count)
00064 {
00065
00066 if (src0[0] != src2[0] && src1[0] != src1[1]) {
00067 dst0[0] = Manip::copy(src1[0] == src0[0] ? src0[0] : src1[0]);
00068 dst0[1] = Manip::copy(src1[1] == src0[0] ? src0[0] : src1[0]);
00069 dst1[0] = Manip::copy(src1[0] == src2[0] ? src2[0] : src1[0]);
00070 dst1[1] = Manip::copy(src1[1] == src2[0] ? src2[0] : src1[0]);
00071 } else {
00072 dst0[0] = Manip::copy(src1[0]);
00073 dst0[1] = Manip::copy(src1[0]);
00074 dst1[0] = Manip::copy(src1[0]);
00075 dst1[1] = Manip::copy(src1[0]);
00076 }
00077 ++src0;
00078 ++src1;
00079 ++src2;
00080 dst0 += 2;
00081 dst1 += 2;
00082
00083
00084 count -= 2;
00085 while (count) {
00086 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
00087 dst0[0] = Manip::copy(src1[-1] == src0[0] ? src0[0] : src1[0]);
00088 dst0[1] = Manip::copy(src1[1] == src0[0] ? src0[0] : src1[0]);
00089 dst1[0] = Manip::copy(src1[-1] == src2[0] ? src2[0] : src1[0]);
00090 dst1[1] = Manip::copy(src1[1] == src2[0] ? src2[0] : src1[0]);
00091 } else {
00092 dst0[0] = Manip::copy(src1[0]);
00093 dst0[1] = Manip::copy(src1[0]);
00094 dst1[0] = Manip::copy(src1[0]);
00095 dst1[1] = Manip::copy(src1[0]);
00096 }
00097
00098 ++src0;
00099 ++src1;
00100 ++src2;
00101 dst0 += 2;
00102 dst1 += 2;
00103 --count;
00104 }
00105
00106
00107 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
00108 dst0[0] = Manip::copy(src1[-1] == src0[0] ? src0[0] : src1[0]);
00109 dst0[1] = Manip::copy(src1[0] == src0[0] ? src0[0] : src1[0]);
00110 dst1[0] = Manip::copy(src1[-1] == src2[0] ? src2[0] : src1[0]);
00111 dst1[1] = Manip::copy(src1[0] == src2[0] ? src2[0] : src1[0]);
00112 } else {
00113 dst0[0] = Manip::copy(src1[0]);
00114 dst0[1] = Manip::copy(src1[0]);
00115 dst1[0] = Manip::copy(src1[0]);
00116 dst1[1] = Manip::copy(src1[0]);
00117 }
00118 }
00119
00120
00121 static bool Scale( Texture *tex, sint32 sx, sint32 sy, sint32 sw, sint32 sh,
00122 uint8* pixel, sint32 dw, sint32 dh, sint32 pitch, bool clamp_src)
00123 {
00124
00125 if (sh<3 && (clamp_src || tex->height<3)) return false;
00126
00127
00128 uintS *texel = reinterpret_cast<uintS*>(tex->buffer) + (sy * tex->width + sx);
00129 int tpitch = tex->width;
00130
00131 uintS *tex_end = texel + (sh-1)*tex->width;
00132
00133 bool clip_y = true;
00134 if (sh+sy < tex->height && clamp_src == false)
00135 {
00136 clip_y = false;
00137 tex_end = texel + sh*tex->width;
00138 }
00139
00140 if (sy == 0) {
00141 scale2x_def(reinterpret_cast<uintX*>(pixel),
00142 reinterpret_cast<uintX*>(pixel+pitch),
00143 texel, texel, texel+tpitch, sw);
00144 pixel += pitch*2;
00145 texel += tpitch;
00146 }
00147
00148
00149 if (texel != tex_end) do {
00150
00151 scale2x_def(reinterpret_cast<uintX*>(pixel),
00152 reinterpret_cast<uintX*>(pixel+pitch),
00153 texel-tpitch, texel, texel+tpitch, sw);
00154 pixel += pitch*2;
00155 texel += tpitch;
00156
00157 } while (texel != tex_end);
00158
00159 if (clip_y) {
00160 scale2x_def(reinterpret_cast<uintX*>(pixel),
00161 reinterpret_cast<uintX*>(pixel+pitch),
00162 texel-tpitch, texel, texel, sw);
00163 }
00164
00165 return true;
00166 }
00167
00168 };
00169
00170
00171
00172
00173 #if (defined(__GNUC__) && defined(__i386__)) || (defined(_MSC_VER) && defined(_M_IX86))
00174
00175 #ifdef _MSC_VER
00176 #pragma warning(disable:4799) // No EMMS at end of function
00177 #endif
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217 static inline void scale2x_16_mmx_border(uint16* dst, const uint16* src0, const uint16* src1, const uint16* src2, unsigned count)
00218 {
00219
00220 count -= 2*4;
00221
00222 #if defined(__GNUC__) && defined(__i386__)
00223 __asm__ __volatile__(
00224
00225
00226 "movq 0(%1), %%mm0\n"
00227 "movq 0(%1), %%mm7\n"
00228 "movq 8(%1), %%mm1\n"
00229 "psllq $48, %%mm0\n"
00230 "psllq $48, %%mm1\n"
00231 "psrlq $48, %%mm0\n"
00232 "movq %%mm7, %%mm2\n"
00233 "movq %%mm7, %%mm3\n"
00234 "psllq $16, %%mm2\n"
00235 "psrlq $16, %%mm3\n"
00236 "por %%mm2, %%mm0\n"
00237 "por %%mm3, %%mm1\n"
00238
00239
00240 "movq (%0), %%mm6\n"
00241
00242
00243
00244 "movq %%mm0, %%mm2\n"
00245 "movq %%mm1, %%mm4\n"
00246 "movq %%mm0, %%mm3\n"
00247 "movq %%mm1, %%mm5\n"
00248 "pcmpeqw %%mm6, %%mm2\n"
00249 "pcmpeqw %%mm6, %%mm4\n"
00250 "pcmpeqw (%2), %%mm3\n"
00251 "pcmpeqw (%2), %%mm5\n"
00252 "pandn %%mm2, %%mm3\n"
00253 "pandn %%mm4, %%mm5\n"
00254 "movq %%mm0, %%mm2\n"
00255 "movq %%mm1, %%mm4\n"
00256 "pcmpeqw %%mm1, %%mm2\n"
00257 "pcmpeqw %%mm0, %%mm4\n"
00258 "pandn %%mm3, %%mm2\n"
00259 "pandn %%mm5, %%mm4\n"
00260 "movq %%mm2, %%mm3\n"
00261 "movq %%mm4, %%mm5\n"
00262 "pand %%mm6, %%mm2\n"
00263 "pand %%mm6, %%mm4\n"
00264 "pandn %%mm7, %%mm3\n"
00265 "pandn %%mm7, %%mm5\n"
00266 "por %%mm3, %%mm2\n"
00267 "por %%mm5, %%mm4\n"
00268
00269
00270 "movq %%mm2, %%mm3\n"
00271 "punpcklwd %%mm4, %%mm2\n"
00272 "punpckhwd %%mm4, %%mm3\n"
00273 "movq %%mm2, (%3)\n"
00274 "movq %%mm3, 8(%3)\n"
00275
00276
00277 "addl $8, %0\n"
00278 "addl $8, %1\n"
00279 "addl $8, %2\n"
00280 "addl $16, %3\n"
00281
00282
00283 "shrl $2, %4\n"
00284 "jz 1f\n"
00285
00286 "0:\n"
00287
00288
00289 "movq -8(%1), %%mm0\n"
00290 "movq (%1), %%mm7\n"
00291 "movq 8(%1), %%mm1\n"
00292 "psrlq $48, %%mm0\n"
00293 "psllq $48, %%mm1\n"
00294 "movq %%mm7, %%mm2\n"
00295 "movq %%mm7, %%mm3\n"
00296 "psllq $16, %%mm2\n"
00297 "psrlq $16, %%mm3\n"
00298 "por %%mm2, %%mm0\n"
00299 "por %%mm3, %%mm1\n"
00300
00301
00302 "movq (%0), %%mm6\n"
00303
00304
00305
00306 "movq %%mm0, %%mm2\n"
00307 "movq %%mm1, %%mm4\n"
00308 "movq %%mm0, %%mm3\n"
00309 "movq %%mm1, %%mm5\n"
00310 "pcmpeqw %%mm6, %%mm2\n"
00311 "pcmpeqw %%mm6, %%mm4\n"
00312 "pcmpeqw (%2), %%mm3\n"
00313 "pcmpeqw (%2), %%mm5\n"
00314 "pandn %%mm2, %%mm3\n"
00315 "pandn %%mm4, %%mm5\n"
00316 "movq %%mm0, %%mm2\n"
00317 "movq %%mm1, %%mm4\n"
00318 "pcmpeqw %%mm1, %%mm2\n"
00319 "pcmpeqw %%mm0, %%mm4\n"
00320 "pandn %%mm3, %%mm2\n"
00321 "pandn %%mm5, %%mm4\n"
00322 "movq %%mm2, %%mm3\n"
00323 "movq %%mm4, %%mm5\n"
00324 "pand %%mm6, %%mm2\n"
00325 "pand %%mm6, %%mm4\n"
00326 "pandn %%mm7, %%mm3\n"
00327 "pandn %%mm7, %%mm5\n"
00328 "por %%mm3, %%mm2\n"
00329 "por %%mm5, %%mm4\n"
00330
00331
00332 "movq %%mm2, %%mm3\n"
00333 "punpcklwd %%mm4, %%mm2\n"
00334 "punpckhwd %%mm4, %%mm3\n"
00335 "movq %%mm2, (%3)\n"
00336 "movq %%mm3, 8(%3)\n"
00337
00338
00339 "addl $8, %0\n"
00340 "addl $8, %1\n"
00341 "addl $8, %2\n"
00342 "addl $16, %3\n"
00343
00344 "decl %4\n"
00345 "jnz 0b\n"
00346 "1:\n"
00347
00348
00349
00350 "movq (%1), %%mm1\n"
00351 "movq (%1), %%mm7\n"
00352 "movq -8(%1), %%mm0\n"
00353 "psrlq $48, %%mm1\n"
00354 "psrlq $48, %%mm0\n"
00355 "psllq $48, %%mm1\n"
00356 "movq %%mm7, %%mm2\n"
00357 "movq %%mm7, %%mm3\n"
00358 "psllq $16, %%mm2\n"
00359 "psrlq $16, %%mm3\n"
00360 "por %%mm2, %%mm0\n"
00361 "por %%mm3, %%mm1\n"
00362
00363
00364 "movq (%0), %%mm6\n"
00365
00366
00367
00368 "movq %%mm0, %%mm2\n"
00369 "movq %%mm1, %%mm4\n"
00370 "movq %%mm0, %%mm3\n"
00371 "movq %%mm1, %%mm5\n"
00372 "pcmpeqw %%mm6, %%mm2\n"
00373 "pcmpeqw %%mm6, %%mm4\n"
00374 "pcmpeqw (%2), %%mm3\n"
00375 "pcmpeqw (%2), %%mm5\n"
00376 "pandn %%mm2, %%mm3\n"
00377 "pandn %%mm4, %%mm5\n"
00378 "movq %%mm0, %%mm2\n"
00379 "movq %%mm1, %%mm4\n"
00380 "pcmpeqw %%mm1, %%mm2\n"
00381 "pcmpeqw %%mm0, %%mm4\n"
00382 "pandn %%mm3, %%mm2\n"
00383 "pandn %%mm5, %%mm4\n"
00384 "movq %%mm2, %%mm3\n"
00385 "movq %%mm4, %%mm5\n"
00386 "pand %%mm6, %%mm2\n"
00387 "pand %%mm6, %%mm4\n"
00388 "pandn %%mm7, %%mm3\n"
00389 "pandn %%mm7, %%mm5\n"
00390 "por %%mm3, %%mm2\n"
00391 "por %%mm5, %%mm4\n"
00392
00393
00394 "movq %%mm2, %%mm3\n"
00395 "punpcklwd %%mm4, %%mm2\n"
00396 "punpckhwd %%mm4, %%mm3\n"
00397 "movq %%mm2, (%3)\n"
00398 "movq %%mm3, 8(%3)\n"
00399
00400 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
00401 :
00402 : "cc"
00403 );
00404 #elif defined(_MSC_VER) && defined(_M_IX86)
00405 __asm {
00406 mov eax, src0;
00407 mov ebx, src1;
00408 mov ecx, src2;
00409 mov edx, dst;
00410 mov esi, count;
00411
00412
00413
00414 movq mm0, [ebx+0];
00415 movq mm7, [ebx+0];
00416 movq mm1, [ebx+8];
00417 psllq mm0, 48;
00418 psllq mm1, 48;
00419 psrlq mm0, 48;
00420 movq mm2, mm7;
00421 movq mm3, mm7;
00422 psllq mm2, 16;
00423 psrlq mm3, 16;
00424 por mm0, mm2;
00425 por mm1, mm3;
00426
00427
00428 movq mm6, [eax];
00429
00430
00431
00432 movq mm2, mm0;
00433 movq mm4, mm1;
00434 movq mm3, mm0;
00435 movq mm5, mm1;
00436 pcmpeqw mm2, mm6;
00437 pcmpeqw mm4, mm6;
00438 pcmpeqw mm3, [ecx];
00439 pcmpeqw mm5, [ecx];
00440 pandn mm3, mm2;
00441 pandn mm5, mm4;
00442 movq mm2, mm0;
00443 movq mm4, mm1;
00444 pcmpeqw mm2, mm1;
00445 pcmpeqw mm4, mm0;
00446 pandn mm2, mm3;
00447 pandn mm4, mm5;
00448 movq mm3, mm2;
00449 movq mm5, mm4;
00450 pand mm2, mm6;
00451 pand mm4, mm6;
00452 pandn mm3, mm7;
00453 pandn mm5, mm7;
00454 por mm2, mm3;
00455 por mm4, mm5;
00456
00457
00458 movq mm3, mm2;
00459 punpcklwd mm2, mm4;
00460 punpckhwd mm3, mm4;
00461 movq [edx], mm2;
00462 movq [edx+8], mm3;
00463
00464
00465 add eax, 8;
00466 add ebx, 8;
00467 add ecx, 8;
00468 add edx, 16;
00469
00470
00471 shr esi, 2;
00472 jz label1;
00473 align 4;
00474 label0:
00475
00476
00477 movq mm0, [ebx-8];
00478 movq mm7, [ebx];
00479 movq mm1, [ebx+8];
00480 psrlq mm0, 48;
00481 psllq mm1, 48;
00482 movq mm2, mm7;
00483 movq mm3, mm7;
00484 psllq mm2, 16;
00485 psrlq mm3, 16;
00486 por mm0, mm2;
00487 por mm1, mm3;
00488
00489
00490 movq mm6, [eax];
00491
00492
00493
00494 movq mm2, mm0;
00495 movq mm4, mm1;
00496 movq mm3, mm0;
00497 movq mm5, mm1;
00498 pcmpeqw mm2, mm6;
00499 pcmpeqw mm4, mm6;
00500 pcmpeqw mm3, [ecx];
00501 pcmpeqw mm5, [ecx];
00502 pandn mm3, mm2;
00503 pandn mm5, mm4;
00504 movq mm2, mm0;
00505 movq mm4, mm1;
00506 pcmpeqw mm2, mm1;
00507 pcmpeqw mm4, mm0;
00508 pandn mm2, mm3;
00509 pandn mm4, mm5;
00510 movq mm3, mm2;
00511 movq mm5, mm4;
00512 pand mm2, mm6;
00513 pand mm4, mm6;
00514 pandn mm3, mm7;
00515 pandn mm5, mm7;
00516 por mm2, mm3;
00517 por mm4, mm5;
00518
00519
00520 movq mm3, mm2;
00521 punpcklwd mm2, mm4;
00522 punpckhwd mm3, mm4;
00523 movq [edx], mm2;
00524 movq [edx+8], mm3;
00525
00526
00527 add eax, 8;
00528 add ebx, 8;
00529 add ecx, 8;
00530 add edx, 16;
00531
00532 dec esi;
00533 jnz label0;
00534 label1:
00535
00536
00537
00538 movq mm1, [ebx];
00539 movq mm7, [ebx];
00540 movq mm0, [ebx-8];
00541 psrlq mm1, 48;
00542 psrlq mm0, 48;
00543 psllq mm1, 48;
00544 movq mm2, mm7;
00545 movq mm3, mm7;
00546 psllq mm2, 16;
00547 psrlq mm3, 16;
00548 por mm0, mm2;
00549 por mm1, mm3;
00550
00551
00552 movq mm6, [eax];
00553
00554
00555
00556 movq mm2, mm0;
00557 movq mm4, mm1;
00558 movq mm3, mm0;
00559 movq mm5, mm1;
00560 pcmpeqw mm2, mm6;
00561 pcmpeqw mm4, mm6;
00562 pcmpeqw mm3, [ecx];
00563 pcmpeqw mm5, [ecx];
00564 pandn mm3, mm2;
00565 pandn mm5, mm4;
00566 movq mm2, mm0;
00567 movq mm4, mm1;
00568 pcmpeqw mm2, mm1;
00569 pcmpeqw mm4, mm0;
00570 pandn mm2, mm3;
00571 pandn mm4, mm5;
00572 movq mm3, mm2;
00573 movq mm5, mm4;
00574 pand mm2, mm6;
00575 pand mm4, mm6;
00576 pandn mm3, mm7;
00577 pandn mm5, mm7;
00578 por mm2, mm3;
00579 por mm4, mm5;
00580
00581
00582 movq mm3, mm2;
00583 punpcklwd mm2, mm4;
00584 punpckhwd mm3, mm4;
00585 movq [edx], mm2;
00586 movq [edx+8], mm3;
00587 }
00588 #endif
00589 }
00590
00591 static inline void scale2x_32_mmx_border(uint32* dst, const uint32* src0, const uint32* src1, const uint32* src2, unsigned count)
00592 {
00593
00594 count -= 2*2;
00595
00596 #if defined(__GNUC__) && defined(__i386__)
00597 __asm__ __volatile__(
00598
00599
00600 "movq 0(%1), %%mm0\n"
00601 "movq 0(%1), %%mm7\n"
00602 "movq 8(%1), %%mm1\n"
00603 "psllq $32, %%mm0\n"
00604 "psllq $32, %%mm1\n"
00605 "psrlq $32, %%mm0\n"
00606 "movq %%mm7, %%mm2\n"
00607 "movq %%mm7, %%mm3\n"
00608 "psllq $32, %%mm2\n"
00609 "psrlq $32, %%mm3\n"
00610 "por %%mm2, %%mm0\n"
00611 "por %%mm3, %%mm1\n"
00612
00613
00614 "movq (%0), %%mm6\n"
00615
00616
00617
00618 "movq %%mm0, %%mm2\n"
00619 "movq %%mm1, %%mm4\n"
00620 "movq %%mm0, %%mm3\n"
00621 "movq %%mm1, %%mm5\n"
00622 "pcmpeqd %%mm6, %%mm2\n"
00623 "pcmpeqd %%mm6, %%mm4\n"
00624 "pcmpeqd (%2), %%mm3\n"
00625 "pcmpeqd (%2), %%mm5\n"
00626 "pandn %%mm2, %%mm3\n"
00627 "pandn %%mm4, %%mm5\n"
00628 "movq %%mm0, %%mm2\n"
00629 "movq %%mm1, %%mm4\n"
00630 "pcmpeqd %%mm1, %%mm2\n"
00631 "pcmpeqd %%mm0, %%mm4\n"
00632 "pandn %%mm3, %%mm2\n"
00633 "pandn %%mm5, %%mm4\n"
00634 "movq %%mm2, %%mm3\n"
00635 "movq %%mm4, %%mm5\n"
00636 "pand %%mm6, %%mm2\n"
00637 "pand %%mm6, %%mm4\n"
00638 "pandn %%mm7, %%mm3\n"
00639 "pandn %%mm7, %%mm5\n"
00640 "por %%mm3, %%mm2\n"
00641 "por %%mm5, %%mm4\n"
00642
00643
00644 "movq %%mm2, %%mm3\n"
00645 "punpckldq %%mm4, %%mm2\n"
00646 "punpckhdq %%mm4, %%mm3\n"
00647 "movq %%mm2, (%3)\n"
00648 "movq %%mm3, 8(%3)\n"
00649
00650
00651 "addl $8, %0\n"
00652 "addl $8, %1\n"
00653 "addl $8, %2\n"
00654 "addl $16, %3\n"
00655
00656
00657 "shrl $1, %4\n"
00658 "jz 1f\n"
00659
00660 "0:\n"
00661
00662
00663 "movq -8(%1), %%mm0\n"
00664 "movq (%1), %%mm7\n"
00665 "movq 8(%1), %%mm1\n"
00666 "psrlq $32, %%mm0\n"
00667 "psllq $32, %%mm1\n"
00668 "movq %%mm7, %%mm2\n"
00669 "movq %%mm7, %%mm3\n"
00670 "psllq $32, %%mm2\n"
00671 "psrlq $32, %%mm3\n"
00672 "por %%mm2, %%mm0\n"
00673 "por %%mm3, %%mm1\n"
00674
00675
00676 "movq (%0), %%mm6\n"
00677
00678
00679
00680 "movq %%mm0, %%mm2\n"
00681 "movq %%mm1, %%mm4\n"
00682 "movq %%mm0, %%mm3\n"
00683 "movq %%mm1, %%mm5\n"
00684 "pcmpeqd %%mm6, %%mm2\n"
00685 "pcmpeqd %%mm6, %%mm4\n"
00686 "pcmpeqd (%2), %%mm3\n"
00687 "pcmpeqd (%2), %%mm5\n"
00688 "pandn %%mm2, %%mm3\n"
00689 "pandn %%mm4, %%mm5\n"
00690 "movq %%mm0, %%mm2\n"
00691 "movq %%mm1, %%mm4\n"
00692 "pcmpeqd %%mm1, %%mm2\n"
00693 "pcmpeqd %%mm0, %%mm4\n"
00694 "pandn %%mm3, %%mm2\n"
00695 "pandn %%mm5, %%mm4\n"
00696 "movq %%mm2, %%mm3\n"
00697 "movq %%mm4, %%mm5\n"
00698 "pand %%mm6, %%mm2\n"
00699 "pand %%mm6, %%mm4\n"
00700 "pandn %%mm7, %%mm3\n"
00701 "pandn %%mm7, %%mm5\n"
00702 "por %%mm3, %%mm2\n"
00703 "por %%mm5, %%mm4\n"
00704
00705
00706 "movq %%mm2, %%mm3\n"
00707 "punpckldq %%mm4, %%mm2\n"
00708 "punpckhdq %%mm4, %%mm3\n"
00709 "movq %%mm2, (%3)\n"
00710 "movq %%mm3, 8(%3)\n"
00711
00712
00713 "addl $8, %0\n"
00714 "addl $8, %1\n"
00715 "addl $8, %2\n"
00716 "addl $16, %3\n"
00717
00718 "decl %4\n"
00719 "jnz 0b\n"
00720 "1:\n"
00721
00722
00723
00724 "movq (%1), %%mm1\n"
00725 "movq (%1), %%mm7\n"
00726 "movq -8(%1), %%mm0\n"
00727 "psrlq $32, %%mm1\n"
00728 "psrlq $32, %%mm0\n"
00729 "psllq $32, %%mm1\n"
00730 "movq %%mm7, %%mm2\n"
00731 "movq %%mm7, %%mm3\n"
00732 "psllq $32, %%mm2\n"
00733 "psrlq $32, %%mm3\n"
00734 "por %%mm2, %%mm0\n"
00735 "por %%mm3, %%mm1\n"
00736
00737
00738 "movq (%0), %%mm6\n"
00739
00740
00741
00742 "movq %%mm0, %%mm2\n"
00743 "movq %%mm1, %%mm4\n"
00744 "movq %%mm0, %%mm3\n"
00745 "movq %%mm1, %%mm5\n"
00746 "pcmpeqd %%mm6, %%mm2\n"
00747 "pcmpeqd %%mm6, %%mm4\n"
00748 "pcmpeqd (%2), %%mm3\n"
00749 "pcmpeqd (%2), %%mm5\n"
00750 "pandn %%mm2, %%mm3\n"
00751 "pandn %%mm4, %%mm5\n"
00752 "movq %%mm0, %%mm2\n"
00753 "movq %%mm1, %%mm4\n"
00754 "pcmpeqd %%mm1, %%mm2\n"
00755 "pcmpeqd %%mm0, %%mm4\n"
00756 "pandn %%mm3, %%mm2\n"
00757 "pandn %%mm5, %%mm4\n"
00758 "movq %%mm2, %%mm3\n"
00759 "movq %%mm4, %%mm5\n"
00760 "pand %%mm6, %%mm2\n"
00761 "pand %%mm6, %%mm4\n"
00762 "pandn %%mm7, %%mm3\n"
00763 "pandn %%mm7, %%mm5\n"
00764 "por %%mm3, %%mm2\n"
00765 "por %%mm5, %%mm4\n"
00766
00767
00768 "movq %%mm2, %%mm3\n"
00769 "punpckldq %%mm4, %%mm2\n"
00770 "punpckhdq %%mm4, %%mm3\n"
00771 "movq %%mm2, (%3)\n"
00772 "movq %%mm3, 8(%3)\n"
00773
00774 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
00775 :
00776 : "cc"
00777 );
00778 #elif defined(_MSC_VER) && defined(_M_IX86)
00779 __asm {
00780 mov eax, src0;
00781 mov ebx, src1;
00782 mov ecx, src2;
00783 mov edx, dst;
00784 mov esi, count;
00785
00786
00787
00788 movq mm0, [ebx+0];
00789 movq mm7, [ebx+0];
00790 movq mm1, [ebx+8];
00791 psllq mm0, 32;
00792 psllq mm1, 32;
00793 psrlq mm0, 32;
00794 movq mm2, mm7;
00795 movq mm3, mm7;
00796 psllq mm2, 32;
00797 psrlq mm3, 32;
00798 por mm0, mm2;
00799 por mm1, mm3;
00800
00801
00802 movq mm6, [eax];
00803
00804
00805
00806 movq mm2, mm0;
00807 movq mm4, mm1;
00808 movq mm3, mm0;
00809 movq mm5, mm1;
00810 pcmpeqd mm2, mm6;
00811 pcmpeqd mm4, mm6;
00812 pcmpeqd mm3, [ecx];
00813 pcmpeqd mm5, [ecx];
00814 pandn mm3, mm2;
00815 pandn mm5, mm4;
00816 movq mm2, mm0;
00817 movq mm4, mm1;
00818 pcmpeqd mm2, mm1;
00819 pcmpeqd mm4, mm0;
00820 pandn mm2, mm3;
00821 pandn mm4, mm5;
00822 movq mm3, mm2;
00823 movq mm5, mm4;
00824 pand mm2, mm6;
00825 pand mm4, mm6;
00826 pandn mm3, mm7;
00827 pandn mm5, mm7;
00828 por mm2, mm3;
00829 por mm4, mm5;
00830
00831
00832 movq mm3, mm2;
00833 punpckldq mm2, mm4;
00834 punpckhdq mm3, mm4;
00835 movq [edx], mm2;
00836 movq [edx+8], mm3;
00837
00838
00839 add eax, 8;
00840 add ebx, 8;
00841 add ecx, 8;
00842 add edx, 16;
00843
00844
00845 shr esi, 1;
00846 jz label1;
00847
00848 label0:
00849
00850
00851 movq mm0, [ebx-8];
00852 movq mm7, [ebx];
00853 movq mm1, [ebx+8];
00854 psrlq mm0, 32;
00855 psllq mm1, 32;
00856 movq mm2, mm7;
00857 movq mm3, mm7;
00858 psllq mm2, 32;
00859 psrlq mm3, 32;
00860 por mm0, mm2;
00861 por mm1, mm3;
00862
00863
00864 movq mm6, [eax];
00865
00866
00867
00868 movq mm2, mm0;
00869 movq mm4, mm1;
00870 movq mm3, mm0;
00871 movq mm5, mm1;
00872 pcmpeqd mm2, mm6;
00873 pcmpeqd mm4, mm6;
00874 pcmpeqd mm3, [ecx];
00875 pcmpeqd mm5, [ecx];
00876 pandn mm3, mm2;
00877 pandn mm5, mm4;
00878 movq mm2, mm0;
00879 movq mm4, mm1;
00880 pcmpeqd mm2, mm1;
00881 pcmpeqd mm4, mm0;
00882 pandn mm2, mm3;
00883 pandn mm4, mm5;
00884 movq mm3, mm2;
00885 movq mm5, mm4;
00886 pand mm2, mm6;
00887 pand mm4, mm6;
00888 pandn mm3, mm7;
00889 pandn mm5, mm7;
00890 por mm2, mm3;
00891 por mm4, mm5;
00892
00893
00894 movq mm3, mm2;
00895 punpckldq mm2, mm4;
00896 punpckhdq mm3, mm4;
00897 movq [edx], mm2;
00898 movq [edx+8], mm3;
00899
00900
00901 add eax, 8;
00902 add ebx, 8;
00903 add ecx, 8;
00904 add edx, 16;
00905
00906 dec esi;
00907 jnz label0
00908 label1:
00909
00910
00911
00912 movq mm1, [ebx];
00913 movq mm7, [ebx];
00914 movq mm0, [ebx-8];
00915 psrlq mm1, 32;
00916 psrlq mm0, 32;
00917 psllq mm1, 32;
00918 movq mm2, mm7;
00919 movq mm3, mm7;
00920 psllq mm2, 32;
00921 psrlq mm3, 32;
00922 por mm0, mm2;
00923 por mm1, mm3;
00924
00925
00926 movq mm6, [eax];
00927
00928
00929
00930 movq mm2, mm0;
00931 movq mm4, mm1;
00932 movq mm3, mm0;
00933 movq mm5, mm1;
00934 pcmpeqd mm2, mm6;
00935 pcmpeqd mm4, mm6;
00936 pcmpeqd mm3, [ecx];
00937 pcmpeqd mm5, [ecx];
00938 pandn mm3, mm2;
00939 pandn mm5, mm4;
00940 movq mm2, mm0;
00941 movq mm4, mm1;
00942 pcmpeqd mm2, mm1;
00943 pcmpeqd mm4, mm0;
00944 pandn mm2, mm3;
00945 pandn mm4, mm5;
00946 movq mm3, mm2;
00947 movq mm5, mm4;
00948 pand mm2, mm6;
00949 pand mm4, mm6;
00950 pandn mm3, mm7;
00951 pandn mm5, mm7;
00952 por mm2, mm3;
00953 por mm4, mm5;
00954
00955
00956 movq mm3, mm2;
00957 punpckldq mm2, mm4;
00958 punpckhdq mm3, mm4;
00959 movq [edx], mm2;
00960 movq [edx+8], mm3;
00961 };
00962 #endif
00963 }
00964
/**
 * Issue the EMMS instruction to leave MMX state.
 *
 * Called once after a batch of MMX row scalers has finished. Compiles to an
 * empty function when neither supported compiler/target combination matches.
 */
static inline void scale2x_mmx_emms(void)
{
#if defined(_MSC_VER) && defined(_M_IX86)
	__asm { emms }
#elif defined(__GNUC__) && defined(__i386__)
	__asm__ __volatile__ ("emms");
#endif
}
00980
00992 static void scale2x_16_mmx(uint16* dst0, uint16* dst1, const uint16* src0, const uint16* src1, const uint16* src2, unsigned count)
00993 {
00994 if (count % 4 != 0 || count < 8) {
00995 Scale2xScalerInternal<uint16, Manip_Nat2Nat_16>::scale2x_def(dst0, dst1, src0, src1, src2, count);
00996 } else {
00997 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
00998 scale2x_16_mmx_border(dst1, src2, src1, src0, count);
00999 }
01000 }
01001
01013 static void scale2x_32_mmx(uint32* dst0, uint32* dst1, const uint32* src0, const uint32* src1, const uint32* src2, unsigned count)
01014 {
01015 if (count % 2 != 0 || count < 4) {
01016 Scale2xScalerInternal<uint32, Manip_Nat2Nat_32>::scale2x_def(dst0, dst1, src0, src1, src2, count);
01017 } else {
01018 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
01019 scale2x_32_mmx_border(dst1, src2, src1, src0, count);
01020 }
01021 }
01022
01023
01024
01025
01026
01027 static bool Scale2x_16MMX( Texture *tex , sint32 sx, sint32 sy, sint32 sw, sint32 sh,
01028 uint8* pixel, sint32 dw, sint32 dh, sint32 pitch, bool clamp_src)
01029 {
01030
01031 if (sh<3 && (clamp_src || tex->height<3)) return false;
01032
01033
01034 uint16 *texel = reinterpret_cast<uint16*>(tex->buffer) + (sy * tex->width + sx);
01035 int tpitch = tex->width;
01036
01037 uint16 *tex_end = texel + (sh-1)*tex->width;
01038
01039 bool clip_y = true;
01040 if (sh+sy < tex->height && clamp_src == false)
01041 {
01042 clip_y = false;
01043 tex_end = texel + sh*tex->width;
01044 }
01045
01046 if (sy == 0) {
01047 scale2x_16_mmx(reinterpret_cast<uint16*>(pixel),
01048 reinterpret_cast<uint16*>(pixel+pitch),
01049 texel, texel, texel+tpitch, sw);
01050 pixel += pitch*2;
01051 texel += tpitch;
01052 }
01053
01054
01055 if (texel != tex_end) do {
01056
01057 scale2x_16_mmx(reinterpret_cast<uint16*>(pixel),
01058 reinterpret_cast<uint16*>(pixel+pitch),
01059 texel-tpitch, texel, texel+tpitch, sw);
01060 pixel += pitch*2;
01061 texel += tpitch;
01062
01063 } while (texel != tex_end);
01064
01065 if (clip_y) {
01066 scale2x_16_mmx(reinterpret_cast<uint16*>(pixel),
01067 reinterpret_cast<uint16*>(pixel+pitch),
01068 texel-tpitch, texel, texel, sw);
01069 }
01070
01071 scale2x_mmx_emms();
01072 return true;
01073 }
01074
01075 static bool Scale2x_32MMX( Texture *tex , sint32 sx, sint32 sy, sint32 sw, sint32 sh,
01076 uint8* pixel, sint32 dw, sint32 dh, sint32 pitch, bool clamp_src)
01077 {
01078
01079 if (sh<3 && (clamp_src || tex->height<3)) return false;
01080
01081
01082 uint32 *texel = reinterpret_cast<uint32*>(tex->buffer) + (sy * tex->width + sx);
01083 int tpitch = tex->width;
01084
01085 uint32 *tex_end = texel + (sh-1)*tex->width;
01086
01087 bool clip_y = true;
01088 if (sh+sy < tex->height && clamp_src == false)
01089 {
01090 clip_y = false;
01091 tex_end = texel + sh*tex->width;
01092 }
01093
01094 if (sy == 0) {
01095 scale2x_32_mmx(reinterpret_cast<uint32*>(pixel),
01096 reinterpret_cast<uint32*>(pixel+pitch),
01097 texel, texel, texel+tpitch, sw);
01098 pixel += pitch*2;
01099 texel += tpitch;
01100 }
01101
01102
01103 if (texel != tex_end) do {
01104
01105 scale2x_32_mmx(reinterpret_cast<uint32*>(pixel),
01106 reinterpret_cast<uint32*>(pixel+pitch),
01107 texel-tpitch, texel, texel+tpitch, sw);
01108 pixel += pitch*2;
01109 texel += tpitch;
01110
01111 } while (texel != tex_end);
01112
01113 if (clip_y) {
01114 scale2x_32_mmx(reinterpret_cast<uint32*>(pixel),
01115 reinterpret_cast<uint32*>(pixel+pitch),
01116 texel-tpitch, texel, texel, sw);
01117 }
01118
01119 scale2x_mmx_emms();
01120 return true;
01121 }
01122 #endif
01123
01124 Scale2xScaler::Scale2xScaler() : Scaler()
01125 {
01126 Scale16Nat = Scale2xScalerInternal<uint16, Manip_Nat2Nat_16, uint16>::Scale;
01127 Scale16Sta = Scale2xScalerInternal<uint16, Manip_Sta2Nat_16, uint32>::Scale;
01128
01129 Scale32Nat = Scale2xScalerInternal<uint32, Manip_Nat2Nat_32, uint32>::Scale;
01130 Scale32Sta = Scale2xScalerInternal<uint32, Manip_Sta2Nat_32, uint32>::Scale;
01131 Scale32_A888 = Scale2xScalerInternal<uint32, Manip_Nat2Nat_32, uint32>::Scale;
01132 Scale32_888A = Scale2xScalerInternal<uint32, Manip_Nat2Nat_32, uint32>::Scale;
01133
01134 #if (defined(__GNUC__) && defined(__i386__)) || (defined(_MSC_VER) && defined(_M_IX86))
01135 if (SDL_HasMMX()) {
01136 Scale16Nat = Scale2x_16MMX;
01137
01138 Scale32Nat = Scale2x_32MMX;
01139 Scale32_A888 = Scale2x_32MMX;
01140 Scale32_888A = Scale2x_32MMX;
01141 }
01142 #endif
01143 }
01144
01145 const uint32 Scale2xScaler::ScaleBits() const { return 1<<2; }
01146 const bool Scale2xScaler::ScaleArbitrary() const { return false; }
01147
01148 const char *Scale2xScaler::ScalerName() const { return "scale2x"; }
01149 const char *Scale2xScaler::ScalerDesc() const { return "AdvMame Scale2x Scaler"; }
01150 const char *Scale2xScaler::ScalerCopyright() const { return "Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni"; }
01151
01152 const Scale2xScaler scale2x_scaler;
01153
01154 };