Scale2xScaler.cpp

Go to the documentation of this file.
00001 /*
00002  * This file is part of the Scale2x project.
00003  *
00004  * Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00019  */
00020 
00021 /*
00022  * This file contains a C and MMX implementation of the Scale2x effect.
00023  *
00024  * You can find a high-level description of the effect at:
00025  *
00026  * http://scale2x.sourceforge.net/
00027  *
00028  * Alternatively at the previous license terms, you are allowed to use this
00029  * code in your program with these conditions:
00030  * - the program is not used in commercial activities.
00031  * - the whole source code of the program is released with the binary.
00032  * - derivative works of the program are allowed.
00033  */
00034 
00035 #include "pent_include.h"
00036 #include "Scale2xScaler.h"
00037 #include "Manips.h"
00038 
00039 // SDL_cpuinfo.h needs SDL_types.h, but fails to include it
00040 #include <SDL_types.h>
00041 #include <SDL_cpuinfo.h>
00042 
00043 namespace Pentagram {
00044 
00045 /***************************************************************************/
00046 /* Scale2x C implementation */
00047 
00048 template<class uintX, class Manip, class uintS=uintX> class Scale2xScalerInternal
00049 {
00050 public:
00051 
00063 static inline void scale2x_def(uintX* dst0, uintX* dst1, const uintS* src0, const uintS* src1, const uintS* src2, unsigned count)
00064 {
00065         /* first pixel */
00066         if (src0[0] != src2[0] && src1[0] != src1[1]) {
00067                 dst0[0] = Manip::copy(src1[0] == src0[0] ? src0[0] : src1[0]);
00068                 dst0[1] = Manip::copy(src1[1] == src0[0] ? src0[0] : src1[0]);
00069                 dst1[0] = Manip::copy(src1[0] == src2[0] ? src2[0] : src1[0]);
00070                 dst1[1] = Manip::copy(src1[1] == src2[0] ? src2[0] : src1[0]);
00071         } else {
00072                 dst0[0] = Manip::copy(src1[0]);
00073                 dst0[1] = Manip::copy(src1[0]);
00074                 dst1[0] = Manip::copy(src1[0]);
00075                 dst1[1] = Manip::copy(src1[0]);
00076         }
00077         ++src0;
00078         ++src1;
00079         ++src2;
00080         dst0 += 2;
00081         dst1 += 2;
00082 
00083         /* central pixels */
00084         count -= 2;
00085         while (count) {
00086                 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
00087                         dst0[0] = Manip::copy(src1[-1] == src0[0] ? src0[0] : src1[0]);
00088                         dst0[1] = Manip::copy(src1[1] == src0[0] ? src0[0] : src1[0]);
00089                         dst1[0] = Manip::copy(src1[-1] == src2[0] ? src2[0] : src1[0]);
00090                         dst1[1] = Manip::copy(src1[1] == src2[0] ? src2[0] : src1[0]);
00091                 } else {
00092                         dst0[0] = Manip::copy(src1[0]);
00093                         dst0[1] = Manip::copy(src1[0]);
00094                         dst1[0] = Manip::copy(src1[0]);
00095                         dst1[1] = Manip::copy(src1[0]);
00096                 }
00097 
00098                 ++src0;
00099                 ++src1;
00100                 ++src2;
00101                 dst0 += 2;
00102                 dst1 += 2;
00103                 --count;
00104         }
00105 
00106         /* last pixel */
00107         if (src0[0] != src2[0] && src1[-1] != src1[0]) {
00108                 dst0[0] = Manip::copy(src1[-1] == src0[0] ? src0[0] : src1[0]);
00109                 dst0[1] = Manip::copy(src1[0] == src0[0] ? src0[0] : src1[0]);
00110                 dst1[0] = Manip::copy(src1[-1] == src2[0] ? src2[0] : src1[0]);
00111                 dst1[1] = Manip::copy(src1[0] == src2[0] ? src2[0] : src1[0]);
00112         } else {
00113                 dst0[0] = Manip::copy(src1[0]);
00114                 dst0[1] = Manip::copy(src1[0]);
00115                 dst1[0] = Manip::copy(src1[0]);
00116                 dst1[1] = Manip::copy(src1[0]);
00117         }
00118 }
00119 
00120 
00121 static bool Scale( Texture *tex, sint32 sx, sint32 sy, sint32 sw, sint32 sh, 
00122                                         uint8* pixel, sint32 dw, sint32 dh, sint32 pitch, bool clamp_src)
00123 {
00124         // Must be at least 3 high
00125         if (sh<3 && (clamp_src || tex->height<3)) return false;
00126 
00127         // Source buffer pointers
00128         uintS *texel = reinterpret_cast<uintS*>(tex->buffer) + (sy * tex->width + sx);
00129         int tpitch = tex->width;
00130 //      uintS *tline_end = texel + sw;
00131         uintS *tex_end = texel + (sh-1)*tex->width;
00132 
00133         bool clip_y = true;
00134         if (sh+sy < tex->height && clamp_src == false)
00135         {
00136                 clip_y = false;
00137                 tex_end = texel + sh*tex->width;
00138         }
00139 
00140         if (sy == 0) {
00141                 scale2x_def(reinterpret_cast<uintX*>(pixel), 
00142                                         reinterpret_cast<uintX*>(pixel+pitch), 
00143                                         texel, texel, texel+tpitch, sw);
00144                 pixel += pitch*2;
00145                 texel += tpitch;
00146         }
00147 
00148         // Src Loop Y
00149         if (texel != tex_end) do {
00150 
00151                 scale2x_def(reinterpret_cast<uintX*>(pixel), 
00152                                         reinterpret_cast<uintX*>(pixel+pitch), 
00153                                         texel-tpitch, texel, texel+tpitch, sw);
00154                 pixel += pitch*2;
00155                 texel += tpitch;
00156 
00157         } while (texel != tex_end);
00158 
00159         if (clip_y) {
00160                 scale2x_def(reinterpret_cast<uintX*>(pixel), 
00161                                         reinterpret_cast<uintX*>(pixel+pitch), 
00162                                         texel-tpitch, texel, texel, sw);
00163         }
00164 
00165         return true;
00166 }
00167 
00168 };
00169 
00170 /***************************************************************************/
00171 /* Scale2x MMX implementation */
00172 
00173 #if (defined(__GNUC__) && defined(__i386__)) || (defined(_MSC_VER) && defined(_M_IX86))
00174 
00175 #ifdef _MSC_VER
00176 #pragma warning(disable:4799)   // No EMMS at end of function
00177 #endif
00178 
00179 /*
00180  * Apply the Scale2x effect at a single row.
00181  * This function must be called only by the other scale2x functions.
00182  *
00183  * Considering the pixel map :
00184  *
00185  *      ABC (src0)
00186  *      DEF (src1)
00187  *      GHI (src2)
00188  *
00189  * this functions compute 2 new pixels in substitution of the source pixel E
00190  * like this map :
00191  *
00192  *      ab (dst)
00193  *
00194  * with these variables :
00195  *
00196  *      &current -> E
00197  *      &current_left -> D
00198  *      &current_right -> F
00199  *      &current_upper -> B
00200  *      &current_lower -> H
00201  *
00202  *      %0 -> current_upper
00203  *      %1 -> current
00204  *      %2 -> current_lower
00205  *      %3 -> dst
00206  *      %4 -> counter
00207  *
00208  *      %mm0 -> *current_left
00209  *      %mm1 -> *current_next
00210  *      %mm2 -> tmp0
00211  *      %mm3 -> tmp1
00212  *      %mm4 -> tmp2
00213  *      %mm5 -> tmp3
00214  *      %mm6 -> *current_upper
00215  *      %mm7 -> *current
00216  */
00217 static inline void scale2x_16_mmx_border(uint16* dst, const uint16* src0, const uint16* src1, const uint16* src2, unsigned count)
00218 {
00219         /* always do the first and last run */
00220         count -= 2*4;
00221 
00222 #if defined(__GNUC__) && defined(__i386__)
00223         __asm__ __volatile__(
00224 /* first run */
00225                 /* set the current, current_pre, current_next registers */
00226                 "movq 0(%1), %%mm0\n"
00227                 "movq 0(%1), %%mm7\n"
00228                 "movq 8(%1), %%mm1\n"
00229                 "psllq $48, %%mm0\n"
00230                 "psllq $48, %%mm1\n"
00231                 "psrlq $48, %%mm0\n"
00232                 "movq %%mm7, %%mm2\n"
00233                 "movq %%mm7, %%mm3\n"
00234                 "psllq $16, %%mm2\n"
00235                 "psrlq $16, %%mm3\n"
00236                 "por %%mm2, %%mm0\n"
00237                 "por %%mm3, %%mm1\n"
00238 
00239                 /* current_upper */
00240                 "movq (%0), %%mm6\n"
00241 
00242                 /* compute the upper-left pixel for dst on %%mm2 */
00243                 /* compute the upper-right pixel for dst on %%mm4 */
00244                 "movq %%mm0, %%mm2\n"
00245                 "movq %%mm1, %%mm4\n"
00246                 "movq %%mm0, %%mm3\n"
00247                 "movq %%mm1, %%mm5\n"
00248                 "pcmpeqw %%mm6, %%mm2\n"
00249                 "pcmpeqw %%mm6, %%mm4\n"
00250                 "pcmpeqw (%2), %%mm3\n"
00251                 "pcmpeqw (%2), %%mm5\n"
00252                 "pandn %%mm2, %%mm3\n"
00253                 "pandn %%mm4, %%mm5\n"
00254                 "movq %%mm0, %%mm2\n"
00255                 "movq %%mm1, %%mm4\n"
00256                 "pcmpeqw %%mm1, %%mm2\n"
00257                 "pcmpeqw %%mm0, %%mm4\n"
00258                 "pandn %%mm3, %%mm2\n"
00259                 "pandn %%mm5, %%mm4\n"
00260                 "movq %%mm2, %%mm3\n"
00261                 "movq %%mm4, %%mm5\n"
00262                 "pand %%mm6, %%mm2\n"
00263                 "pand %%mm6, %%mm4\n"
00264                 "pandn %%mm7, %%mm3\n"
00265                 "pandn %%mm7, %%mm5\n"
00266                 "por %%mm3, %%mm2\n"
00267                 "por %%mm5, %%mm4\n"
00268 
00269                 /* set *dst */
00270                 "movq %%mm2, %%mm3\n"
00271                 "punpcklwd %%mm4, %%mm2\n"
00272                 "punpckhwd %%mm4, %%mm3\n"
00273                 "movq %%mm2, (%3)\n"
00274                 "movq %%mm3, 8(%3)\n"
00275 
00276                 /* next */
00277                 "addl $8, %0\n"
00278                 "addl $8, %1\n"
00279                 "addl $8, %2\n"
00280                 "addl $16, %3\n"
00281 
00282 /* central runs */
00283                 "shrl $2, %4\n"
00284                 "jz 1f\n"
00285 
00286                 "0:\n"
00287 
00288                 /* set the current, current_pre, current_next registers */
00289                 "movq -8(%1), %%mm0\n"
00290                 "movq (%1), %%mm7\n"
00291                 "movq 8(%1), %%mm1\n"
00292                 "psrlq $48, %%mm0\n"
00293                 "psllq $48, %%mm1\n"
00294                 "movq %%mm7, %%mm2\n"
00295                 "movq %%mm7, %%mm3\n"
00296                 "psllq $16, %%mm2\n"
00297                 "psrlq $16, %%mm3\n"
00298                 "por %%mm2, %%mm0\n"
00299                 "por %%mm3, %%mm1\n"
00300 
00301                 /* current_upper */
00302                 "movq (%0), %%mm6\n"
00303 
00304                 /* compute the upper-left pixel for dst on %%mm2 */
00305                 /* compute the upper-right pixel for dst on %%mm4 */
00306                 "movq %%mm0, %%mm2\n"
00307                 "movq %%mm1, %%mm4\n"
00308                 "movq %%mm0, %%mm3\n"
00309                 "movq %%mm1, %%mm5\n"
00310                 "pcmpeqw %%mm6, %%mm2\n"
00311                 "pcmpeqw %%mm6, %%mm4\n"
00312                 "pcmpeqw (%2), %%mm3\n"
00313                 "pcmpeqw (%2), %%mm5\n"
00314                 "pandn %%mm2, %%mm3\n"
00315                 "pandn %%mm4, %%mm5\n"
00316                 "movq %%mm0, %%mm2\n"
00317                 "movq %%mm1, %%mm4\n"
00318                 "pcmpeqw %%mm1, %%mm2\n"
00319                 "pcmpeqw %%mm0, %%mm4\n"
00320                 "pandn %%mm3, %%mm2\n"
00321                 "pandn %%mm5, %%mm4\n"
00322                 "movq %%mm2, %%mm3\n"
00323                 "movq %%mm4, %%mm5\n"
00324                 "pand %%mm6, %%mm2\n"
00325                 "pand %%mm6, %%mm4\n"
00326                 "pandn %%mm7, %%mm3\n"
00327                 "pandn %%mm7, %%mm5\n"
00328                 "por %%mm3, %%mm2\n"
00329                 "por %%mm5, %%mm4\n"
00330 
00331                 /* set *dst */
00332                 "movq %%mm2, %%mm3\n"
00333                 "punpcklwd %%mm4, %%mm2\n"
00334                 "punpckhwd %%mm4, %%mm3\n"
00335                 "movq %%mm2, (%3)\n"
00336                 "movq %%mm3, 8(%3)\n"
00337 
00338                 /* next */
00339                 "addl $8, %0\n"
00340                 "addl $8, %1\n"
00341                 "addl $8, %2\n"
00342                 "addl $16, %3\n"
00343 
00344                 "decl %4\n"
00345                 "jnz 0b\n"
00346                 "1:\n"
00347 
00348 /* final run */
00349                 /* set the current, current_pre, current_next registers */
00350                 "movq (%1), %%mm1\n"
00351                 "movq (%1), %%mm7\n"
00352                 "movq -8(%1), %%mm0\n"
00353                 "psrlq $48, %%mm1\n"
00354                 "psrlq $48, %%mm0\n"
00355                 "psllq $48, %%mm1\n"
00356                 "movq %%mm7, %%mm2\n"
00357                 "movq %%mm7, %%mm3\n"
00358                 "psllq $16, %%mm2\n"
00359                 "psrlq $16, %%mm3\n"
00360                 "por %%mm2, %%mm0\n"
00361                 "por %%mm3, %%mm1\n"
00362 
00363                 /* current_upper */
00364                 "movq (%0), %%mm6\n"
00365 
00366                 /* compute the upper-left pixel for dst on %%mm2 */
00367                 /* compute the upper-right pixel for dst on %%mm4 */
00368                 "movq %%mm0, %%mm2\n"
00369                 "movq %%mm1, %%mm4\n"
00370                 "movq %%mm0, %%mm3\n"
00371                 "movq %%mm1, %%mm5\n"
00372                 "pcmpeqw %%mm6, %%mm2\n"
00373                 "pcmpeqw %%mm6, %%mm4\n"
00374                 "pcmpeqw (%2), %%mm3\n"
00375                 "pcmpeqw (%2), %%mm5\n"
00376                 "pandn %%mm2, %%mm3\n"
00377                 "pandn %%mm4, %%mm5\n"
00378                 "movq %%mm0, %%mm2\n"
00379                 "movq %%mm1, %%mm4\n"
00380                 "pcmpeqw %%mm1, %%mm2\n"
00381                 "pcmpeqw %%mm0, %%mm4\n"
00382                 "pandn %%mm3, %%mm2\n"
00383                 "pandn %%mm5, %%mm4\n"
00384                 "movq %%mm2, %%mm3\n"
00385                 "movq %%mm4, %%mm5\n"
00386                 "pand %%mm6, %%mm2\n"
00387                 "pand %%mm6, %%mm4\n"
00388                 "pandn %%mm7, %%mm3\n"
00389                 "pandn %%mm7, %%mm5\n"
00390                 "por %%mm3, %%mm2\n"
00391                 "por %%mm5, %%mm4\n"
00392 
00393                 /* set *dst */
00394                 "movq %%mm2, %%mm3\n"
00395                 "punpcklwd %%mm4, %%mm2\n"
00396                 "punpckhwd %%mm4, %%mm3\n"
00397                 "movq %%mm2, (%3)\n"
00398                 "movq %%mm3, 8(%3)\n"
00399 
00400                 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
00401                 :
00402                 : "cc"
00403         );
00404 #elif defined(_MSC_VER) && defined(_M_IX86)
00405         __asm {
00406                 mov eax, src0;
00407                 mov ebx, src1;
00408             mov ecx, src2;
00409                 mov edx, dst;
00410                 mov esi, count;
00411 
00412 /* first run */
00413                 /* set the current, current_pre, current_next registers */
00414                 movq mm0, [ebx+0];
00415                 movq mm7, [ebx+0];
00416                 movq mm1, [ebx+8];
00417                 psllq mm0, 48;
00418                 psllq mm1, 48;
00419                 psrlq mm0, 48;
00420                 movq mm2, mm7;
00421                 movq mm3, mm7;
00422                 psllq mm2, 16;
00423                 psrlq mm3, 16;
00424                 por mm0, mm2;
00425                 por mm1, mm3;
00426 
00427                 /* current_upper */
00428                 movq mm6, [eax];
00429 
00430                 /* compute the upper-left pixel for dst on mm2 */
00431                 /* compute the upper-right pixel for dst on mm4 */
00432                 movq mm2, mm0;
00433                 movq mm4, mm1;
00434                 movq mm3, mm0;
00435                 movq mm5, mm1;
00436                 pcmpeqw mm2, mm6;
00437                 pcmpeqw mm4, mm6;
00438                 pcmpeqw mm3, [ecx];
00439                 pcmpeqw mm5, [ecx];
00440                 pandn mm3, mm2;
00441                 pandn mm5, mm4;
00442                 movq mm2, mm0;
00443                 movq mm4, mm1;
00444                 pcmpeqw mm2, mm1;
00445                 pcmpeqw mm4, mm0;
00446                 pandn mm2, mm3;
00447                 pandn mm4, mm5;
00448                 movq mm3, mm2;
00449                 movq mm5, mm4;
00450                 pand mm2, mm6;
00451                 pand mm4, mm6;
00452                 pandn mm3, mm7;
00453                 pandn mm5, mm7;
00454                 por mm2, mm3;
00455                 por mm4, mm5;
00456 
00457                 /* set *dst */
00458                 movq mm3, mm2;
00459                 punpcklwd mm2, mm4;
00460                 punpckhwd mm3, mm4;
00461                 movq [edx], mm2;
00462                 movq [edx+8], mm3;
00463 
00464                 /* next */
00465                 add eax, 8;
00466                 add ebx, 8;
00467                 add ecx, 8;
00468                 add edx, 16;
00469 
00470 /* central runs */
00471                 shr esi, 2;
00472                 jz label1;
00473             align 4;
00474 label0:
00475 
00476                 /* set the current, current_pre, current_next registers */
00477                 movq mm0, [ebx-8];
00478                 movq mm7, [ebx];
00479                 movq mm1, [ebx+8];
00480                 psrlq mm0, 48;
00481                 psllq mm1, 48;
00482                 movq mm2, mm7;
00483                 movq mm3, mm7;
00484                 psllq mm2, 16;
00485                 psrlq mm3, 16;
00486                 por mm0, mm2;
00487                 por mm1, mm3;
00488 
00489                 /* current_upper */
00490                 movq mm6, [eax];
00491 
00492                 /* compute the upper-left pixel for dst on mm2 */
00493                 /* compute the upper-right pixel for dst on mm4 */
00494                 movq mm2, mm0;
00495                 movq mm4, mm1;
00496                 movq mm3, mm0;
00497                 movq mm5, mm1;
00498                 pcmpeqw mm2, mm6;
00499                 pcmpeqw mm4, mm6;
00500                 pcmpeqw mm3, [ecx];
00501                 pcmpeqw mm5, [ecx];
00502                 pandn mm3, mm2;
00503                 pandn mm5, mm4;
00504                 movq mm2, mm0;
00505                 movq mm4, mm1;
00506                 pcmpeqw mm2, mm1;
00507                 pcmpeqw mm4, mm0;
00508                 pandn mm2, mm3;
00509                 pandn mm4, mm5;
00510                 movq mm3, mm2;
00511                 movq mm5, mm4;
00512                 pand mm2, mm6;
00513                 pand mm4, mm6;
00514                 pandn mm3, mm7;
00515                 pandn mm5, mm7;
00516                 por mm2, mm3;
00517                 por mm4, mm5;
00518 
00519                 /* set *dst */
00520                 movq mm3, mm2;
00521                 punpcklwd mm2, mm4;
00522                 punpckhwd mm3, mm4;
00523                 movq [edx], mm2;
00524                 movq [edx+8], mm3;
00525 
00526                 /* next */
00527                 add eax, 8;
00528                 add ebx, 8;
00529                 add ecx, 8;
00530                 add edx, 16;
00531 
00532                 dec esi;
00533                 jnz label0;
00534 label1:
00535 
00536 /* final run */
00537                 /* set the current, current_pre, current_next registers */
00538                 movq mm1, [ebx];
00539                 movq mm7, [ebx];
00540                 movq mm0, [ebx-8];
00541                 psrlq mm1, 48;
00542                 psrlq mm0, 48;
00543                 psllq mm1, 48;
00544                 movq mm2, mm7;
00545                 movq mm3, mm7;
00546                 psllq mm2, 16;
00547                 psrlq mm3, 16;
00548                 por mm0, mm2;
00549                 por mm1, mm3;
00550 
00551                 /* current_upper */
00552                 movq mm6, [eax];
00553 
00554                 /* compute the upper-left pixel for dst on mm2 */
00555                 /* compute the upper-right pixel for dst on mm4 */
00556                 movq mm2, mm0;
00557                 movq mm4, mm1;
00558                 movq mm3, mm0;
00559                 movq mm5, mm1;
00560                 pcmpeqw mm2, mm6;
00561                 pcmpeqw mm4, mm6;
00562                 pcmpeqw mm3, [ecx];
00563                 pcmpeqw mm5, [ecx];
00564                 pandn mm3, mm2;
00565                 pandn mm5, mm4;
00566                 movq mm2, mm0;
00567                 movq mm4, mm1;
00568                 pcmpeqw mm2, mm1;
00569                 pcmpeqw mm4, mm0;
00570                 pandn mm2, mm3;
00571                 pandn mm4, mm5;
00572                 movq mm3, mm2;
00573                 movq mm5, mm4;
00574                 pand mm2, mm6;
00575                 pand mm4, mm6;
00576                 pandn mm3, mm7;
00577                 pandn mm5, mm7;
00578                 por mm2, mm3;
00579                 por mm4, mm5;
00580 
00581                 /* set *dst */
00582                 movq mm3, mm2;
00583                 punpcklwd mm2, mm4;
00584                 punpckhwd mm3, mm4;
00585                 movq [edx], mm2;
00586                 movq [edx+8], mm3;
00587         }
00588 #endif
00589 }
00590 
00591 static inline void scale2x_32_mmx_border(uint32* dst, const uint32* src0, const uint32* src1, const uint32* src2, unsigned count)
00592 {
00593         /* always do the first and last run */
00594         count -= 2*2;
00595 
00596 #if defined(__GNUC__) && defined(__i386__)
00597         __asm__ __volatile__(
00598 /* first run */
00599                 /* set the current, current_pre, current_next registers */
00600                 "movq 0(%1), %%mm0\n"
00601                 "movq 0(%1), %%mm7\n"
00602                 "movq 8(%1), %%mm1\n"
00603                 "psllq $32, %%mm0\n"
00604                 "psllq $32, %%mm1\n"
00605                 "psrlq $32, %%mm0\n"
00606                 "movq %%mm7, %%mm2\n"
00607                 "movq %%mm7, %%mm3\n"
00608                 "psllq $32, %%mm2\n"
00609                 "psrlq $32, %%mm3\n"
00610                 "por %%mm2, %%mm0\n"
00611                 "por %%mm3, %%mm1\n"
00612 
00613                 /* current_upper */
00614                 "movq (%0), %%mm6\n"
00615 
00616                 /* compute the upper-left pixel for dst on %%mm2 */
00617                 /* compute the upper-right pixel for dst on %%mm4 */
00618                 "movq %%mm0, %%mm2\n"
00619                 "movq %%mm1, %%mm4\n"
00620                 "movq %%mm0, %%mm3\n"
00621                 "movq %%mm1, %%mm5\n"
00622                 "pcmpeqd %%mm6, %%mm2\n"
00623                 "pcmpeqd %%mm6, %%mm4\n"
00624                 "pcmpeqd (%2), %%mm3\n"
00625                 "pcmpeqd (%2), %%mm5\n"
00626                 "pandn %%mm2, %%mm3\n"
00627                 "pandn %%mm4, %%mm5\n"
00628                 "movq %%mm0, %%mm2\n"
00629                 "movq %%mm1, %%mm4\n"
00630                 "pcmpeqd %%mm1, %%mm2\n"
00631                 "pcmpeqd %%mm0, %%mm4\n"
00632                 "pandn %%mm3, %%mm2\n"
00633                 "pandn %%mm5, %%mm4\n"
00634                 "movq %%mm2, %%mm3\n"
00635                 "movq %%mm4, %%mm5\n"
00636                 "pand %%mm6, %%mm2\n"
00637                 "pand %%mm6, %%mm4\n"
00638                 "pandn %%mm7, %%mm3\n"
00639                 "pandn %%mm7, %%mm5\n"
00640                 "por %%mm3, %%mm2\n"
00641                 "por %%mm5, %%mm4\n"
00642 
00643                 /* set *dst */
00644                 "movq %%mm2, %%mm3\n"
00645                 "punpckldq %%mm4, %%mm2\n"
00646                 "punpckhdq %%mm4, %%mm3\n"
00647                 "movq %%mm2, (%3)\n"
00648                 "movq %%mm3, 8(%3)\n"
00649 
00650                 /* next */
00651                 "addl $8, %0\n"
00652                 "addl $8, %1\n"
00653                 "addl $8, %2\n"
00654                 "addl $16, %3\n"
00655 
00656 /* central runs */
00657                 "shrl $1, %4\n"
00658                 "jz 1f\n"
00659 
00660                 "0:\n"
00661 
00662                 /* set the current, current_pre, current_next registers */
00663                 "movq -8(%1), %%mm0\n"
00664                 "movq (%1), %%mm7\n"
00665                 "movq 8(%1), %%mm1\n"
00666                 "psrlq $32, %%mm0\n"
00667                 "psllq $32, %%mm1\n"
00668                 "movq %%mm7, %%mm2\n"
00669                 "movq %%mm7, %%mm3\n"
00670                 "psllq $32, %%mm2\n"
00671                 "psrlq $32, %%mm3\n"
00672                 "por %%mm2, %%mm0\n"
00673                 "por %%mm3, %%mm1\n"
00674 
00675                 /* current_upper */
00676                 "movq (%0), %%mm6\n"
00677 
00678                 /* compute the upper-left pixel for dst on %%mm2 */
00679                 /* compute the upper-right pixel for dst on %%mm4 */
00680                 "movq %%mm0, %%mm2\n"
00681                 "movq %%mm1, %%mm4\n"
00682                 "movq %%mm0, %%mm3\n"
00683                 "movq %%mm1, %%mm5\n"
00684                 "pcmpeqd %%mm6, %%mm2\n"
00685                 "pcmpeqd %%mm6, %%mm4\n"
00686                 "pcmpeqd (%2), %%mm3\n"
00687                 "pcmpeqd (%2), %%mm5\n"
00688                 "pandn %%mm2, %%mm3\n"
00689                 "pandn %%mm4, %%mm5\n"
00690                 "movq %%mm0, %%mm2\n"
00691                 "movq %%mm1, %%mm4\n"
00692                 "pcmpeqd %%mm1, %%mm2\n"
00693                 "pcmpeqd %%mm0, %%mm4\n"
00694                 "pandn %%mm3, %%mm2\n"
00695                 "pandn %%mm5, %%mm4\n"
00696                 "movq %%mm2, %%mm3\n"
00697                 "movq %%mm4, %%mm5\n"
00698                 "pand %%mm6, %%mm2\n"
00699                 "pand %%mm6, %%mm4\n"
00700                 "pandn %%mm7, %%mm3\n"
00701                 "pandn %%mm7, %%mm5\n"
00702                 "por %%mm3, %%mm2\n"
00703                 "por %%mm5, %%mm4\n"
00704 
00705                 /* set *dst */
00706                 "movq %%mm2, %%mm3\n"
00707                 "punpckldq %%mm4, %%mm2\n"
00708                 "punpckhdq %%mm4, %%mm3\n"
00709                 "movq %%mm2, (%3)\n"
00710                 "movq %%mm3, 8(%3)\n"
00711 
00712                 /* next */
00713                 "addl $8, %0\n"
00714                 "addl $8, %1\n"
00715                 "addl $8, %2\n"
00716                 "addl $16, %3\n"
00717 
00718                 "decl %4\n"
00719                 "jnz 0b\n"
00720                 "1:\n"
00721 
00722 /* final run */
00723                 /* set the current, current_pre, current_next registers */
00724                 "movq (%1), %%mm1\n"
00725                 "movq (%1), %%mm7\n"
00726                 "movq -8(%1), %%mm0\n"
00727                 "psrlq $32, %%mm1\n"
00728                 "psrlq $32, %%mm0\n"
00729                 "psllq $32, %%mm1\n"
00730                 "movq %%mm7, %%mm2\n"
00731                 "movq %%mm7, %%mm3\n"
00732                 "psllq $32, %%mm2\n"
00733                 "psrlq $32, %%mm3\n"
00734                 "por %%mm2, %%mm0\n"
00735                 "por %%mm3, %%mm1\n"
00736 
00737                 /* current_upper */
00738                 "movq (%0), %%mm6\n"
00739 
00740                 /* compute the upper-left pixel for dst on %%mm2 */
00741                 /* compute the upper-right pixel for dst on %%mm4 */
00742                 "movq %%mm0, %%mm2\n"
00743                 "movq %%mm1, %%mm4\n"
00744                 "movq %%mm0, %%mm3\n"
00745                 "movq %%mm1, %%mm5\n"
00746                 "pcmpeqd %%mm6, %%mm2\n"
00747                 "pcmpeqd %%mm6, %%mm4\n"
00748                 "pcmpeqd (%2), %%mm3\n"
00749                 "pcmpeqd (%2), %%mm5\n"
00750                 "pandn %%mm2, %%mm3\n"
00751                 "pandn %%mm4, %%mm5\n"
00752                 "movq %%mm0, %%mm2\n"
00753                 "movq %%mm1, %%mm4\n"
00754                 "pcmpeqd %%mm1, %%mm2\n"
00755                 "pcmpeqd %%mm0, %%mm4\n"
00756                 "pandn %%mm3, %%mm2\n"
00757                 "pandn %%mm5, %%mm4\n"
00758                 "movq %%mm2, %%mm3\n"
00759                 "movq %%mm4, %%mm5\n"
00760                 "pand %%mm6, %%mm2\n"
00761                 "pand %%mm6, %%mm4\n"
00762                 "pandn %%mm7, %%mm3\n"
00763                 "pandn %%mm7, %%mm5\n"
00764                 "por %%mm3, %%mm2\n"
00765                 "por %%mm5, %%mm4\n"
00766 
00767                 /* set *dst */
00768                 "movq %%mm2, %%mm3\n"
00769                 "punpckldq %%mm4, %%mm2\n"
00770                 "punpckhdq %%mm4, %%mm3\n"
00771                 "movq %%mm2, (%3)\n"
00772                 "movq %%mm3, 8(%3)\n"
00773 
00774                 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
00775                 :
00776                 : "cc"
00777         );
00778 #elif defined(_MSC_VER) && defined(_M_IX86)
00779         __asm {
00780                 mov eax, src0;
00781                 mov ebx, src1;
00782             mov ecx, src2;
00783                 mov edx, dst;
00784                 mov esi, count;
00785 
00786 /* first run */
00787                 /* set the current, current_pre, current_next registers */
00788                 movq mm0, [ebx+0];
00789                 movq mm7, [ebx+0];
00790                 movq mm1, [ebx+8];
00791                 psllq mm0, 32;
00792                 psllq mm1, 32;
00793                 psrlq mm0, 32;
00794                 movq mm2, mm7;
00795                 movq mm3, mm7;
00796                 psllq mm2, 32;
00797                 psrlq mm3, 32;
00798                 por mm0, mm2;
00799                 por mm1, mm3;
00800 
00801                 /* current_upper */
00802                 movq mm6, [eax];
00803 
00804                 /* compute the upper-left pixel for dst on mm2 */
00805                 /* compute the upper-right pixel for dst on mm4 */
00806                 movq mm2, mm0;
00807                 movq mm4, mm1;
00808                 movq mm3, mm0;
00809                 movq mm5, mm1;
00810                 pcmpeqd mm2, mm6;
00811                 pcmpeqd mm4, mm6;
00812                 pcmpeqd mm3, [ecx];
00813                 pcmpeqd mm5, [ecx];
00814                 pandn mm3, mm2;
00815                 pandn mm5, mm4;
00816                 movq mm2, mm0;
00817                 movq mm4, mm1;
00818                 pcmpeqd mm2, mm1;
00819                 pcmpeqd mm4, mm0;
00820                 pandn mm2, mm3;
00821                 pandn mm4, mm5;
00822                 movq mm3, mm2;
00823                 movq mm5, mm4;
00824                 pand mm2, mm6;
00825                 pand mm4, mm6;
00826                 pandn mm3, mm7;
00827                 pandn mm5, mm7;
00828                 por mm2, mm3;
00829                 por mm4, mm5;
00830 
00831                 /* set *dst */
00832                 movq mm3, mm2;
00833                 punpckldq mm2, mm4;
00834                 punpckhdq mm3, mm4;
00835                 movq [edx], mm2;
00836                 movq [edx+8], mm3;
00837 
00838                 /* next */
00839                 add eax, 8;
00840                 add ebx, 8;
00841                 add ecx, 8;
00842                 add edx, 16;
00843 
00844 /* central runs */
00845                 shr esi, 1;
00846                 jz label1;
00847 
00848 label0:
00849 
00850                 /* set the current, current_pre, current_next registers */
00851                 movq mm0, [ebx-8];
00852                 movq mm7, [ebx];
00853                 movq mm1, [ebx+8];
00854                 psrlq mm0, 32;
00855                 psllq mm1, 32;
00856                 movq mm2, mm7;
00857                 movq mm3, mm7;
00858                 psllq mm2, 32;
00859                 psrlq mm3, 32;
00860                 por mm0, mm2;
00861                 por mm1, mm3;
00862 
00863                 /* current_upper */
00864                 movq mm6, [eax];
00865 
00866                 /* compute the upper-left pixel for dst on mm2 */
00867                 /* compute the upper-right pixel for dst on mm4 */
00868                 movq mm2, mm0;
00869                 movq mm4, mm1;
00870                 movq mm3, mm0;
00871                 movq mm5, mm1;
00872                 pcmpeqd mm2, mm6;
00873                 pcmpeqd mm4, mm6;
00874                 pcmpeqd mm3, [ecx];
00875                 pcmpeqd mm5, [ecx];
00876                 pandn mm3, mm2;
00877                 pandn mm5, mm4;
00878                 movq mm2, mm0;
00879                 movq mm4, mm1;
00880                 pcmpeqd mm2, mm1;
00881                 pcmpeqd mm4, mm0;
00882                 pandn mm2, mm3;
00883                 pandn mm4, mm5;
00884                 movq mm3, mm2;
00885                 movq mm5, mm4;
00886                 pand mm2, mm6;
00887                 pand mm4, mm6;
00888                 pandn mm3, mm7;
00889                 pandn mm5, mm7;
00890                 por mm2, mm3;
00891                 por mm4, mm5;
00892 
00893                 /* set *dst */
00894                 movq mm3, mm2;
00895                 punpckldq mm2, mm4;
00896                 punpckhdq mm3, mm4;
00897                 movq [edx], mm2;
00898                 movq [edx+8], mm3;
00899 
00900                 /* next */
00901                 add eax, 8;
00902                 add ebx, 8;
00903                 add ecx, 8;
00904                 add edx, 16;
00905 
00906                 dec esi;
00907                 jnz label0
00908 label1:
00909 
00910 /* final run */
00911                 /* set the current, current_pre, current_next registers */
00912                 movq mm1, [ebx];
00913                 movq mm7, [ebx];
00914                 movq mm0, [ebx-8];
00915                 psrlq mm1, 32;
00916                 psrlq mm0, 32;
00917                 psllq mm1, 32;
00918                 movq mm2, mm7;
00919                 movq mm3, mm7;
00920                 psllq mm2, 32;
00921                 psrlq mm3, 32;
00922                 por mm0, mm2;
00923                 por mm1, mm3;
00924 
00925                 /* current_upper */
00926                 movq mm6, [eax];
00927 
00928                 /* compute the upper-left pixel for dst on mm2 */
00929                 /* compute the upper-right pixel for dst on mm4 */
00930                 movq mm2, mm0;
00931                 movq mm4, mm1;
00932                 movq mm3, mm0;
00933                 movq mm5, mm1;
00934                 pcmpeqd mm2, mm6;
00935                 pcmpeqd mm4, mm6;
00936                 pcmpeqd mm3, [ecx];
00937                 pcmpeqd mm5, [ecx];
00938                 pandn mm3, mm2;
00939                 pandn mm5, mm4;
00940                 movq mm2, mm0;
00941                 movq mm4, mm1;
00942                 pcmpeqd mm2, mm1;
00943                 pcmpeqd mm4, mm0;
00944                 pandn mm2, mm3;
00945                 pandn mm4, mm5;
00946                 movq mm3, mm2;
00947                 movq mm5, mm4;
00948                 pand mm2, mm6;
00949                 pand mm4, mm6;
00950                 pandn mm3, mm7;
00951                 pandn mm5, mm7;
00952                 por mm2, mm3;
00953                 por mm4, mm5;
00954 
00955                 /* set *dst */
00956                 movq mm3, mm2;
00957                 punpckldq mm2, mm4;
00958                 punpckhdq mm3, mm4;
00959                 movq [edx], mm2;
00960                 movq [edx+8], mm3;
00961         };
00962 #endif
00963 }
00964 
/**
 * Execute the x86 `emms` instruction to end a run of MMX code.
 *
 * Both Scale2x_16MMX and Scale2x_32MMX call this once, after all rows have
 * been processed and before returning.
 *
 * Only two toolchains are covered: GCC-style inline asm on 32-bit x86 and
 * MSVC inline asm on 32-bit x86. There is no #else branch, so on any other
 * compiler or target this function has an empty body.
 */
00970 static inline void scale2x_mmx_emms(void)
00971 {
00972 #if defined(__GNUC__) && defined(__i386__)
        /* __volatile__ keeps the compiler from dropping the operand-less asm */
00973         __asm__ __volatile__ (
00974                 "emms"
00975         );
00976 #elif defined(_MSC_VER) && defined(_M_IX86)
00977         __asm emms;
00978 #endif
00979 }
00980 
/**
 * Produce the two Scale2x output rows for one row of 16-bit pixels.
 *
 * dst0, dst1       - the two destination rows.
 * src0, src1, src2 - source rows; callers in this file pass the row above,
 *                    the current row and the row below.
 * count            - row width as supplied by the caller.
 *
 * The MMX routine is used only when count is at least 8 and a multiple of 4.
 * Any other width goes through the generic C implementation
 * (Scale2xScalerInternal<...>::scale2x_def).
 * In the MMX path dst1 is generated by calling the same border routine with
 * the upper and lower source rows swapped.
 */
00992 static void scale2x_16_mmx(uint16* dst0, uint16* dst1, const uint16* src0, const uint16* src1, const uint16* src2, unsigned count)
00993 {
00994         if (count >= 8 && count % 4 == 0) {
00995                 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
00996                 scale2x_16_mmx_border(dst1, src2, src1, src0, count);
00997         } else {
00998                 Scale2xScalerInternal<uint16, Manip_Nat2Nat_16>::scale2x_def(dst0, dst1, src0, src1, src2, count);
00999         }
01000 }
01001 
/**
 * Produce the two Scale2x output rows for one row of 32-bit pixels.
 *
 * dst0, dst1       - the two destination rows.
 * src0, src1, src2 - source rows; callers in this file pass the row above,
 *                    the current row and the row below.
 * count            - row width as supplied by the caller.
 *
 * The MMX routine is used only when count is at least 4 and even.
 * Any other width goes through the generic C implementation
 * (Scale2xScalerInternal<...>::scale2x_def).
 * In the MMX path dst1 is generated by calling the same border routine with
 * the upper and lower source rows swapped.
 */
01013 static void scale2x_32_mmx(uint32* dst0, uint32* dst1, const uint32* src0, const uint32* src1, const uint32* src2, unsigned count)
01014 {
01015         if (count >= 4 && count % 2 == 0) {
01016                 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
01017                 scale2x_32_mmx_border(dst1, src2, src1, src0, count);
01018         } else {
01019                 Scale2xScalerInternal<uint32, Manip_Nat2Nat_32>::scale2x_def(dst0, dst1, src0, src1, src2, count);
01020         }
01021 }
01022 
01023 //
01024 // Pentagram Scale2x Implementation
01025 //
01026 
/**
 * Scale2x a rectangle of a 16-bit texture into a destination buffer, using
 * the MMX row routine where the row width allows it.
 *
 * tex        - source texture; buffer, width and height are read.
 * sx, sy     - top-left corner of the source rectangle, in texels.
 * sw, sh     - width and height of the source rectangle, in texels.
 * pixel      - destination; two output rows are written per source row.
 * dw, dh     - not referenced by this function.
 * pitch      - distance between destination rows, in bytes (added to a uint8*).
 * clamp_src  - when set, the bottom row is always scaled using itself as the
 *              row below instead of reading the next texture row.
 *
 * Returns false without writing anything when sh < 3 and either clamp_src is
 * set or the texture has fewer than 3 rows; returns true otherwise.
 *
 * NOTE(review): the row above is only clamped when sy == 0; for sy > 0 it is
 * read from the texture even if clamp_src is set -- confirm this is intended.
 */
01027 static bool Scale2x_16MMX( Texture *tex , sint32 sx, sint32 sy, sint32 sw, sint32 sh,
01028                                         uint8* pixel, sint32 dw, sint32 dh, sint32 pitch, bool clamp_src)
01029 {
01030         // Fewer than 3 rows is only accepted when unclamped and the texture itself has 3 or more
01031         if (sh<3 && (clamp_src || tex->height<3)) return false;
01032
01033         // Current source row, starting at texel (sx, sy); one texture row is tex->width texels
01034         uint16 *src_row = reinterpret_cast<uint16*>(tex->buffer) + (sy * tex->width + sx);
01035         int src_stride = tex->width;
01036         // By default stop one row early: the last row is emitted by the clamped call at the end
01037         uint16 *src_stop = src_row + (sh-1)*tex->width;
01038
01039         bool clamp_bottom = true;
01040         if (sh+sy < tex->height && !clamp_src)
01041         {
01042                 clamp_bottom = false;
01043                 src_stop = src_row + sh*tex->width;
01044         }
01045
01046         if (sy == 0) {
01047                 uint16 *out0 = reinterpret_cast<uint16*>(pixel);
01048                 uint16 *out1 = reinterpret_cast<uint16*>(pixel+pitch);
01049                 scale2x_16_mmx(out0, out1, src_row, src_row, src_row+src_stride, sw);
01050                 pixel += pitch*2;
01051                 src_row += src_stride;
01052         }
01053
01054         // Rows that have a real neighbour both above and below
01055         while (src_row != src_stop) {
01056
01057                 uint16 *out0 = reinterpret_cast<uint16*>(pixel);
01058                 uint16 *out1 = reinterpret_cast<uint16*>(pixel+pitch);
01059                 scale2x_16_mmx(out0, out1, src_row-src_stride, src_row, src_row+src_stride, sw);
01060                 pixel += pitch*2;
01061                 src_row += src_stride;
01062
01063         }
01064
01065         if (clamp_bottom) {
01066                 uint16 *out0 = reinterpret_cast<uint16*>(pixel);
01067                 uint16 *out1 = reinterpret_cast<uint16*>(pixel+pitch);
01068                 scale2x_16_mmx(out0, out1, src_row-src_stride, src_row, src_row, sw);
01069         }
01070
01071         scale2x_mmx_emms();
01072         return true;
01073 }
01074 
/**
 * Scale2x a rectangle of a 32-bit texture into a destination buffer, using
 * the MMX row routine where the row width allows it.
 *
 * tex        - source texture; buffer, width and height are read.
 * sx, sy     - top-left corner of the source rectangle, in texels.
 * sw, sh     - width and height of the source rectangle, in texels.
 * pixel      - destination; two output rows are written per source row.
 * dw, dh     - not referenced by this function.
 * pitch      - distance between destination rows, in bytes (added to a uint8*).
 * clamp_src  - when set, the bottom row is always scaled using itself as the
 *              row below instead of reading the next texture row.
 *
 * Returns false without writing anything when sh < 3 and either clamp_src is
 * set or the texture has fewer than 3 rows; returns true otherwise.
 *
 * NOTE(review): the row above is only clamped when sy == 0; for sy > 0 it is
 * read from the texture even if clamp_src is set -- confirm this is intended.
 */
01075 static bool Scale2x_32MMX( Texture *tex , sint32 sx, sint32 sy, sint32 sw, sint32 sh,
01076                                         uint8* pixel, sint32 dw, sint32 dh, sint32 pitch, bool clamp_src)
01077 {
01078         // Fewer than 3 rows is only accepted when unclamped and the texture itself has 3 or more
01079         if (sh<3 && (clamp_src || tex->height<3)) return false;
01080
01081         // Current source row, starting at texel (sx, sy); one texture row is tex->width texels
01082         uint32 *src_row = reinterpret_cast<uint32*>(tex->buffer) + (sy * tex->width + sx);
01083         int src_stride = tex->width;
01084         // By default stop one row early: the last row is emitted by the clamped call at the end
01085         uint32 *src_stop = src_row + (sh-1)*tex->width;
01086
01087         bool clamp_bottom = true;
01088         if (sh+sy < tex->height && !clamp_src)
01089         {
01090                 clamp_bottom = false;
01091                 src_stop = src_row + sh*tex->width;
01092         }
01093
01094         if (sy == 0) {
01095                 uint32 *out0 = reinterpret_cast<uint32*>(pixel);
01096                 uint32 *out1 = reinterpret_cast<uint32*>(pixel+pitch);
01097                 scale2x_32_mmx(out0, out1, src_row, src_row, src_row+src_stride, sw);
01098                 pixel += pitch*2;
01099                 src_row += src_stride;
01100         }
01101
01102         // Rows that have a real neighbour both above and below
01103         while (src_row != src_stop) {
01104
01105                 uint32 *out0 = reinterpret_cast<uint32*>(pixel);
01106                 uint32 *out1 = reinterpret_cast<uint32*>(pixel+pitch);
01107                 scale2x_32_mmx(out0, out1, src_row-src_stride, src_row, src_row+src_stride, sw);
01108                 pixel += pitch*2;
01109                 src_row += src_stride;
01110
01111         }
01112
01113         if (clamp_bottom) {
01114                 uint32 *out0 = reinterpret_cast<uint32*>(pixel);
01115                 uint32 *out1 = reinterpret_cast<uint32*>(pixel+pitch);
01116                 scale2x_32_mmx(out0, out1, src_row-src_stride, src_row, src_row, sw);
01117         }
01118
01119         scale2x_mmx_emms();
01120         return true;
01121 }
01122 #endif
01123 
/**
 * Fill in the per-format scale function slots.
 *
 * Every slot first receives the portable C implementation
 * (Scale2xScalerInternal<...>::Scale). On 32-bit x86 builds with GCC or MSVC,
 * if SDL_HasMMX() reports MMX support, the native-format slots (Scale16Nat,
 * Scale32Nat, Scale32_A888, Scale32_888A) are then redirected to the MMX
 * versions. Scale16Sta and Scale32Sta always keep the C implementation.
 */
01124 Scale2xScaler::Scale2xScaler() : Scaler()
01125 {
01126         Scale16Sta = Scale2xScalerInternal<uint16, Manip_Sta2Nat_16, uint32>::Scale;
01127         Scale32Sta = Scale2xScalerInternal<uint32, Manip_Sta2Nat_32, uint32>::Scale;
01128
01129         Scale16Nat = Scale2xScalerInternal<uint16, Manip_Nat2Nat_16, uint16>::Scale;
01130         Scale32Nat = Scale2xScalerInternal<uint32, Manip_Nat2Nat_32, uint32>::Scale;
01131         Scale32_A888 = Scale2xScalerInternal<uint32, Manip_Nat2Nat_32, uint32>::Scale;
01132         Scale32_888A = Scale2xScalerInternal<uint32, Manip_Nat2Nat_32, uint32>::Scale;
01133
01134 #if (defined(__GNUC__) && defined(__i386__)) || (defined(_MSC_VER) && defined(_M_IX86))
01135         if (SDL_HasMMX()) {
01136                 Scale32_888A = Scale2x_32MMX;
01137                 Scale32_A888 = Scale2x_32MMX;
01138                 Scale32Nat = Scale2x_32MMX;
01139
01140                 Scale16Nat = Scale2x_16MMX;
01141         }
01142 #endif
01143 }
01144 
// Returns a mask with only bit 2 set (value 4).
// NOTE(review): presumably bit N advertises support for an Nx scale factor -- confirm against the Scaler base class.
01145 const uint32 Scale2xScaler::ScaleBits() const { return 1<<2; }
// Always returns false.
// NOTE(review): presumably signals that arbitrary scale factors are unsupported -- confirm against the Scaler base class.
01146 const bool Scale2xScaler::ScaleArbitrary() const { return false; }
01147 
// Returns the fixed identifier string "scale2x" for this scaler.
01148 const char *Scale2xScaler::ScalerName() const { return "scale2x"; }
// Returns the fixed description string for this scaler.
01149 const char *Scale2xScaler::ScalerDesc() const { return "AdvMame Scale2x Scaler"; }
// Returns the fixed copyright string crediting the Scale2x author.
01150 const char *Scale2xScaler::ScalerCopyright() const { return "Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni"; }
01151 
// Namespace-scope scaler instance. Its constructor, including the SDL_HasMMX()
// probe on 32-bit x86 builds, runs during static initialisation.
01152 const Scale2xScaler scale2x_scaler;
01153 
01154 };

Generated on Fri Jul 27 22:27:33 2007 for pentagram by  doxygen 1.4.7