;;;
;;; void ablend16_ppd(BYTE *write, BYTE *src, BYTE *dst, int a, int w, int h, int pitchw, int pitchs, int pitchd)
;;;
;;; Alpha-blends a w x h rectangle of 16-bit pixels (red in the top five
;;; bits, then green, then blue) from src over dst and stores the result
;;; in write.
;;;
;;;  write:	output pixels      (pointer)
;;;  src:	source pixels      (pointer)
;;;  dst:	destination pixels (pointer)
;;;  a:		alpha, passed by value: four packed 8-bit alphas, byte 0 for
;;;		pixel 0 of every group of four pixels ... byte 3 for pixel 3.
;;;		Each alpha is cut down to 6 bits for green and 5 bits for
;;;		red/blue before it is used.
;;;  w:		width in pixels
;;;  h:		height in rows
;;;  pitchw:	byte offset from one row of write to the next
;;;  pitchs:	byte offset from one row of src to the next
;;;  pitchd:	byte offset from one row of dst to the next
;;;
;;; Per channel, with n = 63 for green and 31 for red/blue:
;;;	i   = a*src + (n-a)*dst + (n+1)/2
;;;	out = (i + (i >> bits)) >> bits
;;;
;;; ABI:     32-bit, all arguments on the stack, caller removes them.
;;; Clobbers: eax, flags, mm0-mm5 and mm7.  Every other integer register
;;;          is saved and restored.  emms is executed before returning.
;;; Fast paths (groups of four pixels only):
;;;	a == 0xffffffff  -> the four src pixels are copied to write
;;;	a == 0           -> the four dst pixels are copied to write
;;; Nothing is read or written when w <= 0 or h <= 0.
;;;
;;; NOTE: src and dst are always fetched a whole qword at a time, and the
;;; next src qword is fetched ahead of its use, so up to 8 bytes beyond the
;;; last pixel of a row are read (never written).
;;;
;;; The constants maskshiftg, maskb16, maskg16, maskr16, sixones, sixteen
;;; and fivetwelve are defined outside this routine.

ablend16_ppd:
	push	ebp
	push	ebx
	push	ecx
	push	edx
	push	esi
	push	edi

; _P = bytes pushed above; the arguments sit beyond them and the
; return address.
%assign _P 4*6
%define write [esp + _P + 4]
%define src   [esp + _P + 8]
%define dst   [esp + _P + 12]
%define alpha [esp + _P + 16]
%define width [esp + _P + 20]
%define height [esp + _P + 24]
%define pitchw [esp + _P + 28]
%define pitchs [esp + _P + 32]
%define pitchd [esp + _P + 36]

; Register roles for the whole routine:
;   ebp = write cursor     esi = src cursor      edi = dst cursor
;   eax = packed alpha     ebx = pixels left in the current row
;   ecx = rows left        edx = scratch (last odd pixel only)
	mov	ebp, write
	mov	esi, src
	mov	edi, dst
	mov	eax, alpha
	mov	ebx, width
	mov	ecx, height

	; An empty or negative rectangle would otherwise make the row or
	; pixel counter wrap around; leave without touching any buffer.
	test	ecx, ecx
	jle	near .done
	test	ebx, ebx
	jle	near .done

	align	16
.primeloop:
	; Prime the pipeline: expanded alpha in mm1, first src qword in mm4.
	movd	mm1, eax	; mm1 = 00 00 00 00 a3 a2 a1 a0
	pxor	mm2, mm2	; mm2 = 0, used to widen the alpha bytes

	movq	mm4, [esi]	; g1: mm4 = src3 src2 src1 src0
	punpcklbw mm1, mm2	; mm1 = 00a3 00a2 00a1 00a0

	align	16
.loopqword:
	; On entry: mm1 = alpha as four words, mm4 = current src qword.
	test	ebx, 0xfffffffc	; fewer than four pixels left in the row?
	jz	near .checkback	; yes: finish the row in the tail code
	cmp	eax, 0xffffffff	; every alpha byte at its maximum?
	je	near .copyback	; yes: the result is simply the source
	test	eax, eax	; every alpha byte zero?
	jz	near .leavefront	; yes: the result is simply the destination

; green
;	i = a*src + (63-a)*dst
;	i = ((i+32) + ((i+32) >> 6)) >> 6
; red/blue
;	i = a*src + (31-a)*dst
;	i = ((i+16) + ((i+16) >> 5)) >> 5
;
; The green (g), blue (b) and red (r) computations are interleaved; the
; tag on each line tells which channel it belongs to.

	movq	mm5, [edi]	; g2: mm5 = dst3 dst2 dst1 dst0
	psrlw	mm1, 2		; mm1 = alpha >> 2 (6-bit alpha for green)

	movq	mm7, [maskshiftg]; g3: mask for green after the 1-bit shift below
	psrlw	mm4, 1		; g3a: src down one bit so the product fits in a word

	movq	mm0, mm1	; mm0 = 6-bit alpha
	psrlw	mm5, 1		; g3a: dst down one bit for the same reason

	psrlw	mm1, 1		; mm1 = 5-bit alpha for red and blue
	pand	mm4, mm7	; g5: mm4 = sg3 sg2 sg1 sg0

	movq	mm2, [sixones]	; g4: mm2 = 63 in every word
	pand	mm5, mm7	; g7: mm5 = dg3 dg2 dg1 dg0

	movq	mm3, [esi]	; b1: mm3 = src3 src2 src1 src0
	psubsb	mm2, mm0	; g6: mm2 = 63 - alpha

	movq	mm7, [maskb16]	; b2: mm7 = blue mask
	pmullw	mm4, mm0	; g8: mm4 = sg * alpha

	movq	mm0, [edi]	; b3: mm0 = dst3 dst2 dst1 dst0
	pmullw	mm5, mm2	; g9: mm5 = dg * (63 - alpha)

	movq	mm2, mm7	; b4: the blue mask doubles as 31 in every word
	pand	mm3, mm7	; b4: mm3 = sb3 sb2 sb1 sb0

	pmullw	mm3, mm1	; b6: mm3 = sb * alpha
	pand	mm0, mm7	; b5: mm0 = db3 db2 db1 db0

	movq	mm7, [esi]	; r1: mm7 = src3 src2 src1 src0
	paddw	mm4, mm5	; g10: mm4 = sg*alpha + dg*(63 - alpha)

	pand	mm7, [maskr16]	; r2: mm7 = source red, still in the top bits
	psubsb	mm2, mm1	; b5a: mm2 = 31 - alpha

	paddw	mm4, [fivetwelve]; g11: add the rounding term (32 at green's position)
	pmullw	mm0, mm2	; b7: mm0 = db * (31 - alpha)

	movq	mm5, mm4	; g12: keep a copy of the green sum
	psrlw	mm7, 11		; r4: source red down to bit 0

	psrlw	mm4, 6		; g13: mm4 = sum >> 6

	paddw	mm4, mm5	; g14: mm4 = sum + (sum >> 6)

	paddw	mm0, mm3	; b8: mm0 = sb*alpha + db*(31 - alpha)

	movq	mm5, [edi]	; r3: mm5 = dst3 dst2 dst1 dst0

	paddw	mm0, [sixteen]	; b9: add the rounding term

	pand	mm5, [maskr16]	; r5: mm5 = destination red, still in the top bits
	psrlw	mm4, 5		; g15: green back at its position in the pixel

	movq	mm3, mm0	; b10: keep a copy of the blue sum
	psrlw	mm0, 5		; b11: mm0 = sum >> 5

	psrlw	mm5, 11		; r6: destination red down to bit 0
	paddw	mm0, mm3	; b12: mm0 = sum + (sum >> 5)

	psrlw	mm0, 5		; b13: mm0 = 000b 000b 000b 000b
	pmullw	mm7, mm1	; mm7 = sr * alpha

	pand	mm4, [maskg16]	; g16: mm4 = 00g0 00g0 00g0 00g0
	pmullw	mm5, mm2	; r7: mm5 = dr * (31 - alpha)

	por	mm0, mm4	; mm0 = 00gb 00gb 00gb 00gb

	add	esi, 8		; all three cursors move on by four pixels
	add	edi, 8
	add	ebp, 8

	movd	mm1, eax	; start rebuilding the alpha words for the next group
	paddw	mm5, mm7	; r8: mm5 = sr*alpha + dr*(31 - alpha)

	paddw	mm5, [sixteen]	; r9: add the rounding term
	pxor	mm2, mm2	; mm2 = 0 for the unpack below

	movq	mm7, mm5	; r10: keep a copy of the red sum
	psrlw	mm5, 5		; r11: mm5 = sum >> 5

	movq	mm4, [esi]	; fetch the next src qword ahead of its use
	paddw	mm5, mm7	; r12: mm5 = sum + (sum >> 5)

	punpcklbw mm1, mm2	; mm1 = 00a3 00a2 00a1 00a0

	psrlw	mm5, 5		; r13: mm5 = 000r 000r 000r 000r

	psllw	mm5, 11		; r14: red back up to the top bits (<< 11)

	por	mm0, mm5	; mm0 = four finished pixels

	sub	ebx, 4		; four pixels done

	movq	[ebp-8], mm0	; store them (ebp has already been advanced)
	jmp	.loopqword

.copyback:
	movq	[ebp], mm4	; full alpha: write = src for these four pixels
	jmp	.advance

.leavefront:
	; Zero alpha: the blend result is dst itself.  It still has to be
	; stored because write may be a different buffer from dst.
	movq	mm5, [edi]
	movq	[ebp], mm5

.advance:
	add	ebp, 8		; step all cursors over the four pixels
	add	edi, 8
	add	esi, 8
	sub	ebx, 4
	jmp	.primeloop	; reload alpha words and the next src qword

.checkback:
	; Zero to three pixels remain in this row.
	test	ebx, ebx
	jz	near .nextline

	; Blend one more full qword exactly as above (no fast paths here),
	; then store only the pixels that really belong to the row.
	movq	mm5, [edi]	; g2: mm5 = dst3 dst2 dst1 dst0
	psrlw	mm1, 2		; mm1 = alpha >> 2 (6-bit alpha for green)

	movq	mm7, [maskshiftg]; g3: mask for green after the 1-bit shift below
	psrlw	mm4, 1		; g3a: src down one bit so the product fits in a word

	movq	mm0, mm1	; mm0 = 6-bit alpha
	psrlw	mm5, 1		; g3a: dst down one bit for the same reason

	psrlw	mm1, 1		; mm1 = 5-bit alpha for red and blue
	pand	mm4, mm7	; g5: mm4 = sg3 sg2 sg1 sg0

	movq	mm2, [sixones]	; g4: mm2 = 63 in every word
	pand	mm5, mm7	; g7: mm5 = dg3 dg2 dg1 dg0

	movq	mm3, [esi]	; b1: mm3 = src3 src2 src1 src0
	psubsb	mm2, mm0	; g6: mm2 = 63 - alpha

	movq	mm7, [maskb16]	; b2: mm7 = blue mask
	pmullw	mm4, mm0	; g8: mm4 = sg * alpha

	movq	mm0, [edi]	; b3: mm0 = dst3 dst2 dst1 dst0
	pmullw	mm5, mm2	; g9: mm5 = dg * (63 - alpha)

	movq	mm2, mm7	; b4: the blue mask doubles as 31 in every word
	pand	mm3, mm7	; b4: mm3 = sb3 sb2 sb1 sb0

	pmullw	mm3, mm1	; b6: mm3 = sb * alpha
	pand	mm0, mm7	; b5: mm0 = db3 db2 db1 db0

	movq	mm7, [esi]	; r1: mm7 = src3 src2 src1 src0
	paddw	mm4, mm5	; g10: mm4 = sg*alpha + dg*(63 - alpha)

	pand	mm7, [maskr16]	; r2: mm7 = source red, still in the top bits
	psubsb	mm2, mm1	; b5a: mm2 = 31 - alpha

	paddw	mm4, [fivetwelve]; g11: add the rounding term (32 at green's position)
	pmullw	mm0, mm2	; b7: mm0 = db * (31 - alpha)

	movq	mm5, mm4	; g12: keep a copy of the green sum
	psrlw	mm7, 11		; r4: source red down to bit 0

	psrlw	mm4, 6		; g13: mm4 = sum >> 6

	paddw	mm4, mm5	; g14: mm4 = sum + (sum >> 6)

	paddw	mm0, mm3	; b8: mm0 = sb*alpha + db*(31 - alpha)

	movq	mm5, [edi]	; r3: mm5 = dst3 dst2 dst1 dst0

	paddw	mm0, [sixteen]	; b9: add the rounding term

	pand	mm5, [maskr16]	; r5: mm5 = destination red, still in the top bits
	psrlw	mm4, 5		; g15: green back at its position in the pixel

	movq	mm3, mm0	; b10: keep a copy of the blue sum
	psrlw	mm0, 5		; b11: mm0 = sum >> 5

	psrlw	mm5, 11		; r6: destination red down to bit 0
	paddw	mm0, mm3	; b12: mm0 = sum + (sum >> 5)

	psrlw	mm0, 5		; b13: mm0 = 000b 000b 000b 000b
	pmullw	mm7, mm1	; mm7 = sr * alpha

	pand	mm4, [maskg16]	; g16: mm4 = 00g0 00g0 00g0 00g0
	pmullw	mm5, mm2	; r7: mm5 = dr * (31 - alpha)

	por	mm0, mm4	; mm0 = 00gb 00gb 00gb 00gb

	paddw	mm5, mm7	; r8: mm5 = sr*alpha + dr*(31 - alpha)

	paddw	mm5, [sixteen]	; r9: add the rounding term

	movq	mm7, mm5	; r10: keep a copy of the red sum
	psrlw	mm5, 5		; r11: mm5 = sum >> 5

	paddw	mm5, mm7	; r12: mm5 = sum + (sum >> 5)

	psrlw	mm5, 5		; r13: mm5 = 000r 000r 000r 000r

	psllw	mm5, 11		; r14: red back up to the top bits (<< 11)

	por	mm0, mm5	; mm0 = four blended pixels; only 1-3 are wanted
	test	ebx, 2		; two or three pixels left?

	jz	.oneendpixel	; no: exactly one
	movd	[ebp], mm0	; store pixels 0 and 1
	psrlq	mm0, 32		; bring pixel 2 down to the low word

	add	ebp, 4		; write cursor past the two stored pixels
	sub	ebx, 2
	jz	.nextline	; there were exactly two

.oneendpixel:
	movd	edx, mm0	; low word of edx = the last pixel

	mov	[ebp], dx	; store just that one pixel

.nextline:
	dec	ecx		; one more row finished
	jz	.done

	; Step the saved row pointers by their pitches and write them back
	; to the argument slots, which double as per-row state.
	mov	esi, src
	mov	edi, dst
	mov	ebp, write

	add	esi, pitchs
	add	edi, pitchd
	add	ebp, pitchw

	mov	ebx, width	; full row of pixels to do again
	mov	src, esi
	mov	dst, edi
	mov	write, ebp

	jmp	.primeloop
.done:
	emms			; leave the FPU usable for the caller
	pop	edi
	pop	esi
	pop	edx
	pop	ecx
	pop	ebx
	pop	ebp
	ret
311