;;;
;;; void ablend16_ppd(BYTE *write, BYTE *src, BYTE *dst, int a,
;;;                   int w, int h, int pitchw, int pitchs, int pitchd)
;;;
;;; Alpha-blends a w x h rectangle of 16-bit pixels (5-bit red, 6-bit green,
;;; 5-bit blue, red in the top bits) from src over dst and stores the result
;;; through write, four pixels per MMX iteration.
;;;
;;;   write  : output pixels (pointer)
;;;   src    : source pixels (pointer)
;;;   dst    : destination pixels (pointer)
;;;   a      : four packed 8-bit alpha values (passed by value); byte n is
;;;            applied to pixel n of every group of four pixels
;;;   w      : width in pixels
;;;   h      : height in rows
;;;   pitchw : byte distance between rows of write
;;;   pitchs : byte distance between rows of src
;;;   pitchd : byte distance between rows of dst
;;;
;;; Per channel (alpha is reduced to 6 bits for green, 5 bits for red/blue):
;;;   green   : i = a*src + (63-a)*dst ; out = ((i+32) + ((i+32)>>6)) >> 6
;;;   red/blue: i = a*src + (31-a)*dst ; out = ((i+16) + ((i+16)>>5)) >> 5
;;;
;;; Calling convention: 32-bit, all arguments on the stack.  ebx, ecx, edx,
;;; esi, edi and ebp are saved and restored.  eax is left holding `a`.
;;; mm0-mm5 and mm7 are clobbered; EMMS is executed before returning.
;;; The stack slots of write/src/dst are overwritten with per-row pointers.
;;;
;;; Returns immediately (nothing read or written) when w <= 0 or h <= 0.
;;;
;;; Caveats that follow from the code below:
;;;   * Pixels are always loaded 8 bytes at a time.  The 1..3 pixel tail
;;;     loads a full quadword from src and dst, and the main loop fetches
;;;     the next source quadword before it knows whether any pixels remain,
;;;     so up to 8 bytes past the end of a row may be READ (never written).
;;;   * A full group of four pixels whose alpha dword is 0 is skipped
;;;     without storing anything through write.
;;;     NOTE(review): that is only the blended result if write already
;;;     holds dst (e.g. write == dst) -- confirm against callers.
;;;   * The constants maskshiftg, sixones, maskb16, maskr16, maskg16,
;;;     fivetwelve and sixteen are defined outside this routine; the
;;;     comments assume they hold what their names say (green mask >> 1,
;;;     63, blue/red/green channel masks, 512 and 16 in each word).
;;;

ablend16_ppd:
        push    ebp
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi

%assign _P 4*6                          ; bytes pushed above the return address
%define write   [esp + _P + 4]
%define src     [esp + _P + 8]
%define dst     [esp + _P + 12]
%define alpha   [esp + _P + 16]
%define width   [esp + _P + 20]
%define height  [esp + _P + 24]
%define pitchw  [esp + _P + 28]
%define pitchs  [esp + _P + 32]
%define pitchd  [esp + _P + 36]

        mov     ebp, write              ; ebp = write cursor
        mov     esi, src                ; esi = src cursor
        mov     edi, dst                ; edi = dst cursor
        mov     eax, alpha              ; eax = a3 a2 a1 a0 (constant for the call)
        mov     ebx, width              ; ebx = pixels left in the current row
        mov     ecx, height             ; ecx = rows left

        ; An empty or negative rectangle must not enter the loops: the row
        ; counter is only tested after a decrement and the pixel counter
        ; only by masking, so neither would terminate sensibly.
        test    ebx, ebx
        jle     near .done
        test    ecx, ecx
        jle     near .done

        align 16
.primeloop:
        ; (Re)load the per-group state: mm1 = alpha as four words,
        ; mm4 = the four source pixels at esi.
        movd    mm1, eax                ; mm1 = 00 00 00 00 a3 a2 a1 a0
        pxor    mm2, mm2                ; mm2 = 0

        movq    mm4, [esi]              ; g1: mm4 = src3 src2 src1 src0
        punpcklbw mm1, mm2              ; mm1 = 00a3 00a2 00a1 00a0

        align 16
.loopqword:
        ; Invariant here: mm1 = unpacked alpha, mm4 = source quadword at esi.
        test    ebx, 0xfffffffc         ; fewer than 4 pixels left in the row?
        jz      near .checkback         ; yes: handle the 0..3 pixel tail
        cmp     eax, 0xffffffff         ; all four alphas fully opaque?
        je      near .copyback          ; yes: output is the source pixels
        test    eax, eax                ; all four alphas zero?
        jz      near .leavefront        ; yes: skip this group, store nothing

        ; ---- blend four pixels (green / blue / red interleaved) ----------
        movq    mm5, [edi]              ; g2: mm5 = dst3 dst2 dst1 dst0
        psrlw   mm1, 2                  ; mm1 = a >> 2 (6-bit alpha for green)

        movq    mm7, [maskshiftg]       ; g3: mm7 = green mask shifted right by 1
        psrlw   mm4, 1                  ; g3a: src >> 1 so the green product fits in 16 bits

        movq    mm0, mm1                ; mm0 = 6-bit alpha (green weight)
        psrlw   mm5, 1                  ; g3a: dst >> 1, same reason

        psrlw   mm1, 1                  ; mm1 = a >> 3 (5-bit alpha for red/blue)
        pand    mm4, mm7                ; g5: mm4 = sg3 sg2 sg1 sg0

        movq    mm2, [sixones]          ; g4: mm2 = 63
        pand    mm5, mm7                ; g7: mm5 = dg3 dg2 dg1 dg0

        movq    mm3, [esi]              ; b1: mm3 = src3 src2 src1 src0
        psubsb  mm2, mm0                ; g6: mm2 = 63 - a

        movq    mm7, [maskb16]          ; b2: mm7 = blue mask
        pmullw  mm4, mm0                ; g8: mm4 = sg * a

        movq    mm0, [edi]              ; b3: mm0 = dst3 dst2 dst1 dst0
        pmullw  mm5, mm2                ; g9: mm5 = dg * (63 - a)

        movq    mm2, mm7                ; b4: mm2 = blue mask, reused as the value 31
        pand    mm3, mm7                ; b4: mm3 = sb3 sb2 sb1 sb0

        pmullw  mm3, mm1                ; b6: mm3 = sb * a
        pand    mm0, mm7                ; b5: mm0 = db3 db2 db1 db0

        movq    mm7, [esi]              ; r1: mm7 = src3 src2 src1 src0
        paddw   mm4, mm5                ; g10: mm4 = sg*a + dg*(63-a)

        pand    mm7, [maskr16]          ; r2: mm7 = sr3 sr2 sr1 sr0 (still in the top bits)
        psubsb  mm2, mm1                ; b5a: mm2 = 31 - a

        paddw   mm4, [fivetwelve]       ; g11: green += 512 (rounding term)
        pmullw  mm0, mm2                ; b7: mm0 = db * (31 - a)

        movq    mm5, mm4                ; g12: mm5 = green sum
        psrlw   mm7, 11                 ; r4: source red down to bit 0

        psrlw   mm4, 6                  ; g13: mm4 = green sum >> 6

        paddw   mm4, mm5                ; g14: mm4 = sum + (sum >> 6)

        paddw   mm0, mm3                ; b8: mm0 = sb*a + db*(31-a)

        movq    mm5, [edi]              ; r3: mm5 = dst3 dst2 dst1 dst0

        paddw   mm0, [sixteen]          ; b9: blue += 16 (rounding term)

        pand    mm5, [maskr16]          ; r5: mm5 = dr3 dr2 dr1 dr0 (still in the top bits)
        psrlw   mm4, 5                  ; g15: green result moved to its field position

        movq    mm3, mm0                ; b10: mm3 = blue sum
        psrlw   mm0, 5                  ; b11: mm0 = blue sum >> 5

        psrlw   mm5, 11                 ; r6: destination red down to bit 0
        paddw   mm0, mm3                ; b12: mm0 = sum + (sum >> 5)

        psrlw   mm0, 5                  ; b13: mm0 = 000b 000b 000b 000b
        pmullw  mm7, mm1                ; mm7 = sr * a

        pand    mm4, [maskg16]          ; g16: mm4 = 00g0 00g0 00g0 00g0
        pmullw  mm5, mm2                ; r7: mm5 = dr * (31 - a)

        por     mm0, mm4                ; mm0 = 00gb 00gb 00gb 00gb

        add     esi, 8                  ; advance src by 4 pixels
        add     edi, 8                  ; advance dst by 4 pixels
        add     ebp, 8                  ; advance write by 4 pixels (stored below at ebp-8)

        movd    mm1, eax                ; start rebuilding the alpha words for the next group
        paddw   mm5, mm7                ; r8: mm5 = sr*a + dr*(31-a)

        paddw   mm5, [sixteen]          ; r9: red += 16 (rounding term)
        pxor    mm2, mm2                ; mm2 = 0

        movq    mm7, mm5                ; r10: mm7 = red sum
        psrlw   mm5, 5                  ; r11: mm5 = red sum >> 5

        movq    mm4, [esi]              ; fetch the next source quadword (may be past the row end)
        paddw   mm5, mm7                ; r12: mm5 = sum + (sum >> 5)

        punpcklbw mm1, mm2              ; mm1 = 00a3 00a2 00a1 00a0

        psrlw   mm5, 5                  ; r13: mm5 = 000r 000r 000r 000r

        psllw   mm5, 11                 ; r14: red back up to bits 11..15

        por     mm0, mm5                ; mm0 = rgb rgb rgb rgb

        sub     ebx, 4                  ; four pixels done

        movq    [ebp-8], mm0            ; store the four blended pixels
        jmp     .loopqword

.copyback:
        movq    [ebp], mm4              ; opaque group: write = source pixels
        ; fall through to advance the cursors

.leavefront:
        add     ebp, 8                  ; advance write by 4 pixels
        add     edi, 8                  ; advance dst by 4 pixels
        add     esi, 8                  ; advance src by 4 pixels
        sub     ebx, 4                  ; four pixels done
        jmp     .primeloop              ; reload mm1 / mm4 for the next group

.checkback:
        ; ebx is 0..3 here.  The tail always takes the full blend; the
        ; opaque / zero-alpha shortcuts above are not applied to it.
        test    ebx, ebx                ; no pixels left in this row?
        jz      near .nextline

        ; ---- blend one more quadword; only the low 1..3 pixels are stored --
        movq    mm5, [edi]              ; g2: mm5 = dst3 dst2 dst1 dst0
        psrlw   mm1, 2                  ; mm1 = a >> 2 (6-bit alpha for green)

        movq    mm7, [maskshiftg]       ; g3: mm7 = green mask shifted right by 1
        psrlw   mm4, 1                  ; g3a: src >> 1 so the green product fits in 16 bits

        movq    mm0, mm1                ; mm0 = 6-bit alpha (green weight)
        psrlw   mm5, 1                  ; g3a: dst >> 1, same reason

        psrlw   mm1, 1                  ; mm1 = a >> 3 (5-bit alpha for red/blue)
        pand    mm4, mm7                ; g5: mm4 = sg3 sg2 sg1 sg0

        movq    mm2, [sixones]          ; g4: mm2 = 63
        pand    mm5, mm7                ; g7: mm5 = dg3 dg2 dg1 dg0

        movq    mm3, [esi]              ; b1: mm3 = src3 src2 src1 src0
        psubsb  mm2, mm0                ; g6: mm2 = 63 - a

        movq    mm7, [maskb16]          ; b2: mm7 = blue mask
        pmullw  mm4, mm0                ; g8: mm4 = sg * a

        movq    mm0, [edi]              ; b3: mm0 = dst3 dst2 dst1 dst0
        pmullw  mm5, mm2                ; g9: mm5 = dg * (63 - a)

        movq    mm2, mm7                ; b4: mm2 = blue mask, reused as the value 31
        pand    mm3, mm7                ; b4: mm3 = sb3 sb2 sb1 sb0

        pmullw  mm3, mm1                ; b6: mm3 = sb * a
        pand    mm0, mm7                ; b5: mm0 = db3 db2 db1 db0

        movq    mm7, [esi]              ; r1: mm7 = src3 src2 src1 src0
        paddw   mm4, mm5                ; g10: mm4 = sg*a + dg*(63-a)

        pand    mm7, [maskr16]          ; r2: mm7 = sr3 sr2 sr1 sr0 (still in the top bits)
        psubsb  mm2, mm1                ; b5a: mm2 = 31 - a

        paddw   mm4, [fivetwelve]       ; g11: green += 512 (rounding term)
        pmullw  mm0, mm2                ; b7: mm0 = db * (31 - a)

        movq    mm5, mm4                ; g12: mm5 = green sum
        psrlw   mm7, 11                 ; r4: source red down to bit 0

        psrlw   mm4, 6                  ; g13: mm4 = green sum >> 6

        paddw   mm4, mm5                ; g14: mm4 = sum + (sum >> 6)

        paddw   mm0, mm3                ; b8: mm0 = sb*a + db*(31-a)

        movq    mm5, [edi]              ; r3: mm5 = dst3 dst2 dst1 dst0

        paddw   mm0, [sixteen]          ; b9: blue += 16 (rounding term)

        pand    mm5, [maskr16]          ; r5: mm5 = dr3 dr2 dr1 dr0 (still in the top bits)
        psrlw   mm4, 5                  ; g15: green result moved to its field position

        movq    mm3, mm0                ; b10: mm3 = blue sum
        psrlw   mm0, 5                  ; b11: mm0 = blue sum >> 5

        psrlw   mm5, 11                 ; r6: destination red down to bit 0
        paddw   mm0, mm3                ; b12: mm0 = sum + (sum >> 5)

        psrlw   mm0, 5                  ; b13: mm0 = 000b 000b 000b 000b
        pmullw  mm7, mm1                ; mm7 = sr * a

        pand    mm4, [maskg16]          ; g16: mm4 = 00g0 00g0 00g0 00g0
        pmullw  mm5, mm2                ; r7: mm5 = dr * (31 - a)

        por     mm0, mm4                ; mm0 = 00gb 00gb 00gb 00gb

        paddw   mm5, mm7                ; r8: mm5 = sr*a + dr*(31-a)

        paddw   mm5, [sixteen]          ; r9: red += 16 (rounding term)

        movq    mm7, mm5                ; r10: mm7 = red sum
        psrlw   mm5, 5                  ; r11: mm5 = red sum >> 5

        paddw   mm5, mm7                ; r12: mm5 = sum + (sum >> 5)

        psrlw   mm5, 5                  ; r13: mm5 = 000r 000r 000r 000r

        psllw   mm5, 11                 ; r14: red back up to bits 11..15

        por     mm0, mm5                ; mm0 = rgb rgb rgb rgb
        test    ebx, 2                  ; two or three pixels left?

        jz      .oneendpixel            ; no: exactly one pixel to store
        movd    [ebp], mm0              ; store pixels 0 and 1
        psrlq   mm0, 32                 ; bring pixel 2 down to the low word

        add     edi, 4
        add     ebp, 4                  ; write cursor past the two stored pixels
        sub     ebx, 2
        jz      .nextline               ; it was exactly two pixels: row finished

.oneendpixel:
        movd    edx, mm0                ; edx low word = the last blended pixel

        mov     [ebp], dx               ; store a single 16-bit pixel

.nextline:
        dec     ecx                     ; one row finished
        jz      .done

        ; Step the saved row pointers by their pitches and keep the new
        ; row starts in the argument slots for the following row.
        mov     esi, src
        mov     edi, dst
        mov     ebp, write

        add     esi, pitchs
        add     edi, pitchd
        add     ebp, pitchw

        mov     ebx, width              ; pixel counter for the new row
        mov     src, esi
        mov     dst, edi
        mov     write, ebp

        jmp     .primeloop

.done:
        emms                            ; leave MMX state so x87 code can run
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        ret