1;SuperScale by ElSemi
2;
3;	Based on the original idea of 2xScale of AdvanceMAME.
;		AdvanceMAME code is GPLed and I didn't get permission to use it, so this code is
5;		my own implementation of that effect.
6;	similar to Kawaks' KScale effect
7;
;It expands the central pixel of a 3x3 matrix to 2x2 pixels, resulting in a 2x scale in
;both axes.
10;
11;A B C		E0 E1
12;D E F	->	E2 E3
13;G H I
14;
15;
16;Original algorithm (AdvanceMAME)
17;
18;E0 = (D == B && B != F && D != H) ? D : E;
19;E1 = (B == F && B != D && F != H) ? F : E;
20;E2 = (D == H && D != B && H != F) ? D : E;
21;E3 = (H == F && D != H && B != F) ? F : E;
22;
23;
24;(I'll only put the first 2 equations as the 3rd and 4th
25;are symmetrical ABC<->GHI)
26;
27;Reduce variable usage
28;
29;D==B => E0=D -> E0=B
30;	D!=H -> B!=H
31;
32;B==F => E1=F -> E1=B
33;	F!=H -> B!=H
34;
35;Group and reorder
36;
37;E0=(B==D && B!=F && B!=H)?B:E;
38;E1=(B!=D && B==F && B!=H)?B:E;
39;
;As you can see, there are only 3 "real" conditions (B==D, B==F, and B!=H). The other
;2 can be obtained by inverting the first 2 conditions (B!=D, B!=F), which makes the code
;suitable for pcmpeqw, pand and pandn.
43
; Assembler setup: NASM syntax, 32-bit x86 code using MMX instructions.
        BITS 32

; The section is requested with 32-byte alignment because ALIGN32 (below)
; only pads relative to the section start ($$). The attribute is written in
; the documented "align=N" form, without spaces around '='.
        SECTION .text align=32

; Each routine is exported with and without a leading underscore,
; presumably so the object links under either C symbol-naming convention.
        GLOBAL _superscale_line
        GLOBAL _superscale_line_75
        GLOBAL superscale_line
        GLOBAL superscale_line_75

; ALIGN32: emit NOPs until the current offset from the section start is a
; multiple of 32 bytes. Takes no arguments.
%macro ALIGN32 0
        times ($$-$) & 31 nop
%endmacro
54
;-----------------------------------------------------------------------
; superscale_line / _superscale_line
;
; Expands one line of 16-bit pixels to double width. For every source
; pixel E two output pixels E0,E1 are written (one row of the 2x2 block):
;       E0 = (B==D && B!=F && B!=H) ? B : E
;       E1 = (B==F && B!=D && B!=H) ? B : E
; B and H are the pixels in E's column taken from the first and third
; line arguments; D and F are E's left and right neighbours.
;
; Stack arguments (read relative to EBP after the standard frame setup):
;       [ebp+08h]  pointer to the line holding A B C
;       [ebp+0Ch]  pointer to the line holding D E F (the line being scaled)
;       [ebp+10h]  pointer to the line holding G H I
;       [ebp+14h]  destination pointer; 16 bytes are written per group
;       [ebp+18h]  source pixel count; count/4 groups of 4 pixels are
;                  processed, a remainder of 1..3 pixels is not processed
;                  and a count below 4 writes nothing
;
; Memory access: every group reads 8 bytes at [line-2] and at [line+2] of
; the D E F line, i.e. one pixel before and one pixel after the group.
;
; Registers: EAX, EBX, ECX, ESI, EDI and EBP are restored on exit.
; EFLAGS and mm0-mm5 are clobbered. No EMMS is executed here, so the
; caller has to issue EMMS before any x87 floating-point code runs.
;-----------------------------------------------------------------------
_superscale_line:
superscale_line:
        push    ebp
        mov     ebp, esp
        push    eax
        push    esi
        push    ebx
        push    edi
        push    ecx

        mov     eax, [ebp+08h]          ; eax -> A B C line
        mov     esi, [ebp+0Ch]          ; esi -> D E F line (current)
        mov     ebx, [ebp+10h]          ; ebx -> G H I line
        mov     edi, [ebp+14h]          ; edi -> destination
        mov     ecx, [ebp+18h]          ; ecx = source pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      near .done              ; zero groups: a 0 counter would wrap to 2^32 iterations
        ALIGN32
.pixel_loop:
        ; Lane notation: four consecutive pixels in memory order. The
        ; comments follow the third lane, where the current pixel is E.
        movq    mm0, [eax]              ; mm0 = x A B C
        movq    mm2, [ebx]              ; mm2 = x G H I
        movq    mm3, [esi-2]            ; mm3 = x x D E  (D lines up with B)
        movq    mm1, [esi]              ; mm1 = x D E F
        movq    mm4, [esi+2]            ; mm4 = D E F x  (F lines up with B)

        ; The three basic comparisons, all against B.
        pcmpeqw mm2, mm0                ; mm2 = (B==H)
        pcmpeqw mm3, mm0                ; mm3 = (B==D)
        pcmpeqw mm4, mm0                ; mm4 = (B==F)
        movq    mm5, mm2                ; second copy of (B==H), one per output pixel

        ; Build the per-output masks (pandn dst,src computes ~dst & src).
        pandn   mm2, mm3                ; mm2 = (B!=H) & (B==D)
        pandn   mm5, mm4                ; mm5 = (B!=H) & (B==F)
        pandn   mm4, mm2                ; mm4 = (B!=F) & (B!=H) & (B==D) -> E0 mask
        pandn   mm3, mm5                ; mm3 = (B!=D) & (B!=H) & (B==F) -> E1 mask

        ; Select B where a mask is set and E everywhere else.
        movq    mm2, mm4                ; keep copies: pand overwrites the masks
        movq    mm5, mm3
        pand    mm4, mm0                ; B in the lanes where the E0 mask is set
        pand    mm3, mm0                ; B in the lanes where the E1 mask is set
        pandn   mm2, mm1                ; E in the remaining E0 lanes
        pandn   mm5, mm1                ; E in the remaining E1 lanes
        por     mm4, mm2                ; mm4 = four E0 pixels
        por     mm3, mm5                ; mm3 = four E1 pixels

        ; Interleave to E0,E1,E0,E1,... and store the 8 output pixels.
        movq    mm0, mm4
        punpcklwd mm4, mm3              ; output pairs for source pixels 0 and 1
        punpckhwd mm0, mm3              ; output pairs for source pixels 2 and 3
        movq    [edi], mm4
        movq    [edi+8], mm0

        add     eax, 8                  ; advance the three source lines by 4 pixels
        add     esi, 8
        add     ebx, 8
        add     edi, 16                 ; advance the destination by 8 pixels
        dec     ecx
        jnz     near .pixel_loop

.done:
        pop     ecx
        pop     edi
        pop     ebx
        pop     esi
        pop     eax
        pop     ebp
        ret
124
ALIGN32

;-----------------------------------------------------------------------
; superscale_line_75 / _superscale_line_75
;
; Same horizontal expansion as superscale_line, with every output pixel
; additionally darkened to 75% using shifts and a mask. For every source
; pixel E two output pixels are produced from
;       E0 = (B==D && B!=F && B!=H) ? B : E
;       E1 = (B==F && B!=D && B!=H) ? B : E
; and each result p is stored as
;       h = (p >> 1) & mask             ; 50%
;       out = h + ((h >> 1) & mask)     ; 50% + 25%
;
; Stack arguments (read relative to EBP after the standard frame setup):
;       [ebp+08h]  pointer to the line holding A B C
;       [ebp+0Ch]  pointer to the line holding D E F (the line being scaled)
;       [ebp+10h]  pointer to the line holding G H I
;       [ebp+14h]  destination pointer; 16 bytes are written per group
;       [ebp+18h]  source pixel count; count/4 groups of 4 pixels are
;                  processed, a remainder of 1..3 pixels is not processed
;                  and a count below 4 writes nothing
;       [ebp+1Ch]  pointer to an 8-byte mask that is ANDed with the pixels
;                  after every 1-bit right shift. Presumably it clears the
;                  bits shifted across colour-channel boundaries, so its
;                  value depends on the pixel format - confirm with caller.
;
; Memory access: every group reads 8 bytes at [line-2] and at [line+2] of
; the D E F line, i.e. one pixel before and one pixel after the group.
; The mask is read even when no group is processed.
;
; Registers: EAX, EBX, ECX, ESI, EDI and EBP are restored on exit.
; EFLAGS, mm0-mm5 and mm7 are clobbered. No EMMS is executed here, so the
; caller has to issue EMMS before any x87 floating-point code runs.
;-----------------------------------------------------------------------
_superscale_line_75:
superscale_line_75:
        push    ebp
        mov     ebp, esp
        push    eax
        push    esi
        push    ebx
        push    edi
        push    ecx

        mov     eax, [ebp+08h]          ; eax -> A B C line
        mov     esi, [ebp+0Ch]          ; esi -> D E F line (current)
        mov     ebx, [ebp+10h]          ; ebx -> G H I line
        mov     edi, [ebp+14h]          ; edi -> destination
        mov     ecx, [ebp+1Ch]          ; ecx -> 8-byte shift mask
        movq    mm7, [ecx]              ; mm7 = mask, constant for the whole line
        mov     ecx, [ebp+18h]          ; ecx = source pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      near .done              ; zero groups: a 0 counter would wrap to 2^32 iterations
        ALIGN32
.pixel_loop:
        ; Lane notation: four consecutive pixels in memory order. The
        ; comments follow the third lane, where the current pixel is E.
        movq    mm0, [eax]              ; mm0 = x A B C
        movq    mm2, [ebx]              ; mm2 = x G H I
        movq    mm3, [esi-2]            ; mm3 = x x D E  (D lines up with B)
        movq    mm1, [esi]              ; mm1 = x D E F
        movq    mm4, [esi+2]            ; mm4 = D E F x  (F lines up with B)

        ; The three basic comparisons, all against B.
        pcmpeqw mm2, mm0                ; mm2 = (B==H)
        pcmpeqw mm3, mm0                ; mm3 = (B==D)
        pcmpeqw mm4, mm0                ; mm4 = (B==F)
        movq    mm5, mm2                ; second copy of (B==H), one per output pixel

        ; Build the per-output masks (pandn dst,src computes ~dst & src).
        pandn   mm2, mm3                ; mm2 = (B!=H) & (B==D)
        pandn   mm5, mm4                ; mm5 = (B!=H) & (B==F)
        pandn   mm4, mm2                ; mm4 = (B!=F) & (B!=H) & (B==D) -> E0 mask
        pandn   mm3, mm5                ; mm3 = (B!=D) & (B!=H) & (B==F) -> E1 mask

        ; Select B where a mask is set and E everywhere else.
        movq    mm2, mm4                ; keep copies: pand overwrites the masks
        movq    mm5, mm3
        pand    mm4, mm0                ; B in the lanes where the E0 mask is set
        pand    mm3, mm0                ; B in the lanes where the E1 mask is set
        pandn   mm2, mm1                ; E in the remaining E0 lanes
        pandn   mm5, mm1                ; E in the remaining E1 lanes
        por     mm4, mm2                ; mm4 = four E0 pixels
        por     mm3, mm5                ; mm3 = four E1 pixels

        ; Interleave to E0,E1,E0,E1,... (8 output pixels in mm4 | mm0).
        movq    mm0, mm4
        punpcklwd mm4, mm3              ; output pairs for source pixels 0 and 1
        punpckhwd mm0, mm3              ; output pairs for source pixels 2 and 3

        ; Darken to 75%: half plus quarter, masking after each shift.
        psrlw   mm4, 1
        psrlw   mm0, 1
        pand    mm4, mm7                ; mm4 = 50% of output pixels 0-3
        pand    mm0, mm7                ; mm0 = 50% of output pixels 4-7
        movq    mm2, mm4
        movq    mm1, mm0
        psrlw   mm2, 1
        psrlw   mm1, 1
        pand    mm2, mm7                ; mm2 = 25% of output pixels 0-3
        pand    mm1, mm7                ; mm1 = 25% of output pixels 4-7
        paddw   mm4, mm2                ; 50% + 25%
        paddw   mm0, mm1
        movq    [edi], mm4
        movq    [edi+8], mm0

        add     eax, 8                  ; advance the three source lines by 4 pixels
        add     esi, 8
        add     ebx, 8
        add     edi, 16                 ; advance the destination by 8 pixels
        dec     ecx
        jnz     near .pixel_loop

.done:
        pop     ecx
        pop     edi
        pop     ebx
        pop     esi
        pop     eax
        pop     ebp
        ret
212