/*
 * file   : mmx_zoom.s
 * author : JC Hoelt <jeko@free.fr>
 *
 * history
 *   07/01/2001 : Changing FEMMS to EMMS : slower... but run on intel machines
 *   03/01/2001 : WIDTH and HEIGHT are now variable
 *   28/12/2000 : adding comments to the code, suppress some useless lines
 *   27/12/2000 : reducing memory access... improving performance by 20%
 *                coefficients are now on 1 byte
 *   22/12/2000 : Changing data structure
 *   16/12/2000 : AT&T version
 *   14/12/2000 : unrolling loop
 *   12/12/2000 : 64 bits memory access
 *
 * Syntax: GNU as, AT&T operand order (src, dst), 32-bit x86 with MMX.
 */


.data

/* Format string for the debug printf that is disabled further down. */
chaine:
        .string "pos = %d\n\0"
        .long 0x0

/* 64-bit zero, loaded into mm7 and used as the zero operand of the unpacks. */
thezero:
        .long 0x00000000
        .long 0x00000000

.text

.globl mmx_zoom                         /* name of the function called by the C program */
/* .extern coeffs                          the transformation buffer */
.extern expix1,expix2                   /* the source and destination buffers */
.extern mmx_zoom_size, zoom_width       /* size of the buffers */

.extern brutS,brutD,buffratio,precalCoef,prevX,prevY

#define PERTEMASK 15
/* a / sqrtperte is done as a >> PERTEDEC */
#define PERTEDEC 4

/*---------------------------------------------------------------------
 * mmx_zoom
 *
 * Called from C. Takes no stack arguments: every input is read from the
 * external symbols declared above.
 *
 * For each index i, counting down from mmx_zoom_size-1 to 1:
 *   px     = brutS[i].px + (((brutD[i].px - brutS[i].px) * buffratio) >> 16)
 *   py     = brutS[i].py + (((brutD[i].py - brutS[i].py) * buffratio) >> 16)
 *   offset = (px & 15) * 64 + (py & 15) * 4      (offset into precalCoef)
 *   pos    = (px >> 4) + (py >> 4) * zoom_width
 *            or 0 when py >= (prevY-1)>>4 or px >= (prevX-1)>>4
 *            (both compares are signed)
 * followed by an MMX weighted sum of four pixels by four 1-byte
 * coefficients, shifted right by 8 and packed back to bytes.
 *
 * Registers inside the loop:
 *   ecx        = current index i
 *   esi        = px, then pos
 *   edi        = py
 *   eax,ebx,edx = scratch
 *   mm7        = 0
 * Locals:
 *   -4(%ebp)   = (prevX - 1) >> 4
 *   -8(%ebp)   = (prevY - 1) >> 4
 *
 * Clobbers: eax, ecx, edx, flags, mm0-mm7 (emms is executed before ret).
 * Preserves: ebx, esi, edi, ebp (callee-saved under the i386 C calling
 * convention; the original code overwrote ebx/esi/edi without saving them).
 *
 * NOTE(review): four memory accesses are commented out in this file
 * (the coefficient load into mm6, both pixel loads into mm0/mm1 and the
 * final store to expix2). They are kept disabled here. As written, the
 * MMX section therefore works on whatever mm0/mm1/mm6 already contain
 * and nothing is stored to the destination buffer.
 *
 * NOTE(review): the loop exits as soon as the index reaches 0, so index 0
 * is never processed, and a mmx_zoom_size <= 1 does not terminate the loop
 * normally. Confirm whether callers guarantee mmx_zoom_size >= 2.
 *-------------------------------------------------------------------*/
.align 16
mmx_zoom:

        pushl   %ebp
        movl    %esp, %ebp
        subl    $12, %esp               /* room for the locals (two dwords are used) */

        /* Save the callee-saved registers used as scratch below. */
        pushl   %ebx
        pushl   %esi
        pushl   %edi

        movl    prevX, %eax
        decl    %eax
        sarl    $4, %eax
        movl    %eax, -4(%ebp)          /* -4(%ebp) = (prevX - 1) >> 4 */

        movl    prevY, %eax
        decl    %eax
        sarl    $4, %eax
        movl    %eax, -8(%ebp)          /* -8(%ebp) = (prevY - 1) >> 4 */

        /* initialise mm7 to zero */
        movq    (thezero), %mm7

        movl    mmx_zoom_size, %ecx
        decl    %ecx                    /* ecx = index of the last entry */

.Lpixel_loop:
        /* eax -> brutS[ecx]; entries are 8 bytes: px at +0, py at +4 */
        movl    brutS, %eax
        leal    (%eax,%ecx,8), %eax

        movl    (%eax), %edx            /* edx = brutS[ecx].px */
        movl    4(%eax), %eax           /* eax = brutS[ecx].py */

        /* ebx -> brutD[ecx] */
        movl    brutD, %ebx
        leal    (%ebx,%ecx,8), %ebx

        /* px = S.px + (((D.px - S.px) * buffratio) >> 16) */
        movl    (%ebx), %esi
        subl    %edx, %esi
        imull   buffratio, %esi
        sarl    $16, %esi
        addl    %edx, %esi              /* esi = px */

        /* eax still holds brutS.py and ebx still points at brutD[ecx] */
        /* py = S.py + (((D.py - S.py) * buffratio) >> 16) */
        movl    4(%ebx), %edi
        subl    %eax, %edi
        imull   buffratio, %edi
        sarl    $16, %edi
        addl    %eax, %edi              /* edi = py */

        movl    %esi, %eax
        andl    $15, %eax               /* eax = coefh (low 4 bits of px) */
        movl    %edi, %ebx
        andl    $15, %ebx               /* ebx = coefv (low 4 bits of py) */

        /* eax = coefh * 64 + coefv * 4, ebx = address of precalCoef */
        leal    0(,%ebx,4), %ebx
        sall    $6, %eax
        addl    %ebx, %eax
        movl    $precalCoef, %ebx
        /* disabled in the original:  movd (%eax,%ebx),%mm6   -- mm6 = coeffs */

        /* pos = 0 when py or px reaches its limit (signed compares) */
        cmpl    -8(%ebp), %edi
        jge     .Lout_of_range
        cmpl    -4(%ebp), %esi
        jge     .Lout_of_range

        /* pos = (px >> 4) + (py >> 4) * zoom_width */
        sarl    $4, %esi
        sarl    $4, %edi
        imull   zoom_width, %edi
        leal    (%esi,%edi), %esi
        jmp     .Lpos_ready

.Lout_of_range:
        xorl    %esi, %esi              /* pos = 0 */
.Lpos_ready:

        /* after this computation: esi = pos, mm6 is meant to hold the coeffs */
        /* disabled debug trace:
         *      pushl %esi
         *      pushl $chaine
         *      call printf
         *      addl $8,%esp
         */

        movl    expix1, %eax

        /* fetch the first two pixels into mm0 and mm1 */
        /* disabled in the original:  movq (%eax,%esi,4), %mm0   -- b1-g1-r1-a1-b2-g2-r2-a2 */
        movq    %mm0, %mm1              /* b1-g1-r1-a1-b2-g2-r2-a2 */

        /* unpack the first pixel */
        punpcklbw %mm7, %mm0            /* 00-b2-00-g2-00-r2-00-a2 */

        movq    %mm6, %mm5              /* ??-??-??-??-c4-c3-c2-c1 */
        /* unpack the second pixel */
        punpckhbw %mm7, %mm1            /* 00-b1-00-g1-00-r1-00-a1 */

        /* extract the coefficients... */
        punpcklbw %mm5, %mm6            /* c4-c4-c3-c3-c2-c2-c1-c1 */
        movq    %mm6, %mm4              /* c4-c4-c3-c3-c2-c2-c1-c1 */
        movq    %mm6, %mm5              /* c4-c4-c3-c3-c2-c2-c1-c1 */

        punpcklbw %mm5, %mm6            /* c2-c2-c2-c2-c1-c1-c1-c1 */
        punpckhbw %mm5, %mm4            /* c4-c4-c4-c4-c3-c3-c3-c3 */

        movq    %mm6, %mm3              /* c2-c2-c2-c2-c1-c1-c1-c1 */
        punpcklbw %mm7, %mm6            /* 00-c1-00-c1-00-c1-00-c1 */
        punpckhbw %mm7, %mm3            /* 00-c2-00-c2-00-c2-00-c2 */

        /* multiply the pixels by the coefficients */
        pmullw  %mm6, %mm0              /* c1*b2-c1*g2-c1*r2-c1*a2 */
        pmullw  %mm3, %mm1              /* c2*b1-c2*g1-c2*r1-c2*a1 */
        paddw   %mm1, %mm0

        /* ...extract the last two coefficients */
        movq    %mm4, %mm5              /* c4-c4-c4-c4-c3-c3-c3-c3 */
        punpcklbw %mm7, %mm4            /* 00-c3-00-c3-00-c3-00-c3 */
        punpckhbw %mm7, %mm5            /* 00-c4-00-c4-00-c4-00-c4 */

        /* add the line length to esi */
        /* NOTE(review): this adds prevX, while pos above was built with
         * zoom_width -- confirm the two are the same row stride. */
        addl    prevX, %esi

        /* fetch the last two pixels */
        /* disabled in the original:  movq (%eax,%esi,4), %mm1 */
        movq    %mm1, %mm2

        /* unpack the pixels */
        punpcklbw %mm7, %mm1
        punpckhbw %mm7, %mm2

        /* multiply by the coefficients */
        pmullw  %mm4, %mm1
        pmullw  %mm5, %mm2

        /* add the products to the running total */
        paddw   %mm1, %mm0
        paddw   %mm2, %mm0

        /* divide by 256 (original note: 256 = 16+16+16+16), then repack the final pixel */
        psrlw   $8, %mm0
        packuswb %mm7, %mm0

        /* store the result */
        movl    expix2, %eax
        /* disabled in the original:  movd %mm0,(%eax,%ecx,4) */

        /* move to the next entry; decl sets ZF, so the loop ends when ecx reaches 0 */
        decl    %ecx
        jnz     .Lpixel_loop

        emms

        /* Restore the callee-saved registers in reverse push order. */
        popl    %edi
        popl    %esi
        popl    %ebx

        movl    %ebp, %esp
        popl    %ebp

        ret                             /* The End */