// ****************************************************************************
// *
// *  XVID MPEG-4 VIDEO CODEC
// *  - IA64 halfpel refinement -
// *
// *  Copyright(C) 2002 Johannes Singler, Daniel Winkler
// *
// *  This program is free software; you can redistribute it and/or modify it
// *  under the terms of the GNU General Public License as published by
// *  the Free Software Foundation; either version 2 of the License, or
// *  (at your option) any later version.
// *
// *  This program is distributed in the hope that it will be useful,
// *  but WITHOUT ANY WARRANTY; without even the implied warranty of
// *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// *  GNU General Public License for more details.
// *
// *  You should have received a copy of the GNU General Public License
// *  along with this program; if not, write to the Free Software
// *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
// *
// * $Id: halfpel8_refine_ia64.s,v 1.4 2009-02-19 17:07:29 Isibaar Exp $
// *
// ***************************************************************************/
//
// ****************************************************************************
// *
// *  halfpel8_refine_ia64.s, IA-64 halfpel refinement
// *
// *  This version was implemented during an IA-64 practical training at
// *  the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
// *
// ****************************************************************************

//   ------------------------------------------------------------------------------
//   * Programmed by
//   * Johannes Singler (email@jsingler.de), Daniel Winkler (infostudent@uni.de)
//   *
//   * Programmed for the IA64 laboratory held at University Karlsruhe 2002
//   * http://www.info.uni-karlsruhe.de/~rubino/ia64p/
//   *
//   ------------------------------------------------------------------------------
//   *
//   * This is the optimized assembler version of Halfpel8_Refine. This function
//   * is worth optimizing for the IA-64 architecture because of its huge
//   * register set: we can hold all necessary data in general-purpose registers
//   * and reuse it.
//   *
//   * Our approach uses:
//   *   - The Itanium instruction psad1, which computes a row's sum of
//   *     absolute differences in hardware (see the sketch below)
//   *   - Explicit alignment handling to avoid unaligned-memory faults
//   *   - Massive loop unrolling
//   *
//   ------------------------------------------------------------------------------
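//   *
//   * As a reference for psad1, here is a minimal C sketch (our restatement
//   * of the instruction's semantics, not part of the original source): each
//   * 64-bit operand is treated as eight packed unsigned bytes, and the
//   * absolute byte differences are summed.
//   *
//   *   #include <stdint.h>
//   *   #include <stdlib.h>
//   *
//   *   /* psad1 r = a, b -- SAD of eight packed unsigned bytes */
//   *   static uint64_t psad1(uint64_t a, uint64_t b)
//   *   {
//   *       uint64_t sum = 0;
//   *       for (int i = 0; i < 8; i++) {
//   *           int da = (int)((a >> (8 * i)) & 0xff);
//   *           int db = (int)((b >> (8 * i)) & 0xff);
//   *           sum += (uint64_t)abs(da - db);
//   *       }
//   *       return sum;
//   *   }
//   *
//   ------------------------------------------------------------------------------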
//   *
//   *    -------	Half-pixel steps around the center (*) and corresponding
//   *    |0|1|0|       register set parts.
//   *    -------
//   *    |2|*|2|
//   *    -------
//   *    |0|1|0|
//   *    -------
//   *
//   ------------------------------------------------------------------------------
//   * calc_delta is split into three parts which are included from
//   *
//   * calc_delta_1.s
//   * calc_delta_2.s
//   * calc_delta_3.s
//   *
//   ------------------------------------------------------------------------------
//   * We assume    min_dx <= currX <= max_dx     &&     min_dy <= currY <= max_dy

.sdata
	.align 4
	.type	 lambda_vec8#,@object
	.size	 lambda_vec8#,128
lambda_vec8:
	data4	0
	data4	1
	data4	1
	data4	1
	data4	1
	data4	2
	data4	2
	data4	2
	data4	2
	data4	3
	data4	3
	data4	3
	data4	4
	data4	4
	data4	4
	data4	5
	data4	5
	data4	6
	data4	7
	data4	7
	data4	8
	data4	9
	data4	10
	data4	11
	data4	13
	data4	14
	data4	16
	data4	18
	data4	21
	data4	25
	data4	30
	data4	36


	.type	 mvtab#,@object
	.size	 mvtab#,132
mvtab:
	data4	1
	data4	2
	data4	3
	data4	4
	data4	6
	data4	7
	data4	7
	data4	7
	data4	9
	data4	9
	data4	9
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	10
	data4	11
	data4	11
	data4	11
	data4	11
	data4	11
	data4	11
	data4	12
	data4	12
.text
	.align 16
	.global Halfpel8_Refine_ia64#
	.proc Halfpel8_Refine_ia64#

Halfpel8_Refine_ia64:

	pfs = r14
	prsave = r15

	// Save important registers

	alloc pfs = ar.pfs, 18, 74, 4, 96
	mov prsave = pr

	// Naming registers for better readability

	pRef = in0
	pRefH = in1
	pRefV = in2
	pRefHV = in3
	cura = in4
	x = in5
	y = in6
	currMV = in7
	iMinSAD = in8
	dx = in9
	dy = in10
	min_dx = in11
	max_dx = in12
	min_dy = in13
	max_dy = in14
	iFcode = in15
	iQuant = in16
	iEdgedWidth = in17
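
	// The C-level signature this routine implements, as we infer it from the
	// register naming above and the stack loads below (a hedged
	// reconstruction, not copied from a header):
	//
	//   int32_t Halfpel8_Refine_ia64(
	//       const uint8_t *pRef, const uint8_t *pRefH,
	//       const uint8_t *pRefV, const uint8_t *pRefHV,
	//       const uint8_t *cur, uint32_t x, uint32_t y,
	//       int32_t *currMV,          /* {x, y} vector, updated in place */
	//       int32_t iMinSAD, int32_t dx, int32_t dy,
	//       int32_t min_dx, int32_t max_dx, int32_t min_dy, int32_t max_dy,
	//       int32_t iFcode, int32_t iQuant, uint32_t iEdgedWidth);
	//
	// The first eight arguments arrive in in0..in7; the remaining ones are
	// read from the memory stack below.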

	iSAD = r17
	backupX = r18
	backupY = r19
	currX = r20
	currY = r21
	currYAddress = r22
	bitX0 = r23
	bitY0 = r24
	dxd2 = r25
	dyd2 = r26
	offset = r27
	block = r28
	nob02 = r29
	nob1 = r30
	nob64m02 = r31
	nob64m1 = r127
	const7 = r126
	nob56m02 = r125
	oldX = r124
	oldY = r123

	.rotr	inregisters[18], refaa[3], refab[3], cur[8], ref0a[9], ref0b[9], ref1a[9], mpr[9], ref2a[8], ref2b[8], component[2], sc[2], tabaddress[2]

	fx = f8
	fy = f9
	fblock = f10
	fiEdgedWidth = f11
	fdxd2 = f12
	fdyd2 = f13
	foffset = f14
	fydiEdgedWidth = f15
	fQuant = f32
	fmv = f33

	n = p16
	h = p17
	v = p18
	hv = p19
	l = p20
	r = p21
	t = p22
	b = p23
	lt = p24
	lb = p25
	rt = p26
	rb = p27
	fb = p28
	non0_0 = p30
	non0_1 = p31
	non0_2 = p32
	non0_3 = p33
	neg_0 = p34
	neg_1 = p35
	neg_2 = p36
	neg_3 = p37
	cg32_0 = p29
	cg32_1 = p38

	// Initialize input variables

	add sp = 16, sp
	;;
	ld4 iMinSAD = [sp], 8
	;;
	sxt4 iMinSAD = iMinSAD


	ld4 dx = [sp], 8
	;;
	sxt4 dx = dx

	ld4 dy = [sp], 8
	;;
	sxt4 dy = dy

	ld4 min_dx = [sp], 8
	;;
	sxt4 min_dx = min_dx

	ld4 max_dx = [sp], 8
	;;
	sxt4 max_dx = max_dx

	ld4 min_dy = [sp], 8
	;;
	sxt4 min_dy = min_dy

	ld4 max_dy = [sp], 8
	;;
	sxt4 max_dy = max_dy

	ld4 iFcode = [sp], 8
	;;
	sxt4 iFcode = iFcode

	ld4 iQuant = [sp], 8

	add tabaddress[0] = @gprel(lambda_vec8#), gp
	;;
	shladd tabaddress[0] = iQuant, 2, tabaddress[0]
	;;
	ld4 iQuant = [tabaddress[0]]
	;;
	sxt4 iQuant = iQuant
	;;
	add iFcode = -1, iFcode		// only the decremented value is used from here on
	shl iQuant = iQuant, 1
	;;
	setf.sig fQuant = iQuant

	ld4 iEdgedWidth = [sp]
	add sp = -88, sp
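
	// In C terms, the lookup above amounts to (a sketch; lambda_vec8 is the
	// table defined in .sdata above):
	//
	//   int32_t quant_weight = 2 * lambda_vec8[iQuant];  /* becomes fQuant */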




	// Initialize local variables


	ld4 currX = [currMV]
	add currYAddress = 4, currMV
	;;
	sxt4 currX = currX
	ld4 currY = [currYAddress]
	;;
	sxt4 currY = currY
	;;
	// Calculate references

	cmp.gt l, p0 = currX, min_dx
	cmp.lt r, p0 = currX, max_dx
	cmp.gt t, p0 = currY, min_dy
	cmp.lt b, p0 = currY, max_dy
	add backupX = -1, currX			// move to the upper-left corner of the 3x3 search square
	add backupY = -1, currY

	;;
(b)	cmp.gt.unc lb, p0 = currX, min_dx
(t)	cmp.lt.unc rt, p0 = currX, max_dx
(l)	cmp.gt.unc lt, p0 = currY, min_dy
(r)	cmp.lt.unc rb, p0 = currY, max_dy

	and bitX0 = 1, backupX
	and bitY0 = 1, backupY
	;;
	cmp.eq n, p0 = 0, bitX0
	cmp.eq h, p0 = 1, bitX0
	cmp.eq v, p0 = 0, bitX0
	cmp.eq hv, p0 = 1, bitX0
	;;
	cmp.eq.and n, p0 = 0, bitY0
	cmp.eq.and h, p0 = 0, bitY0
	cmp.eq.and v, p0 = 1, bitY0
	cmp.eq.and hv, p0 = 1, bitY0
	;;

	.pred.rel "mutex", p16, p17, p18, p19	// n, h, v, hv
(n)	mov refaa[0] = pRef
(h)	mov refaa[0] = pRefH
(v)	mov refaa[0] = pRefV
(hv)	mov refaa[0] = pRefHV

(n)	mov refaa[1] = pRefH
(h)	mov refaa[1] = pRef
(v)	mov refaa[1] = pRefHV
(hv)	mov refaa[1] = pRefV

(n)	mov refaa[2] = pRefV
(h)	mov refaa[2] = pRefHV
(v)	mov refaa[2] = pRef
(hv)	mov refaa[2] = pRefH
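
	// A C sketch of the plane selection above: the halfpel parity bits of
	// the upper-left corner pick which interpolated reference plane serves
	// each register-set part (0, 1, 2 as in the diagram in the header).
	//
	//   const uint8_t *ref[3];
	//   int bx = backupX & 1, by = backupY & 1;
	//   if (!bx && !by) { ref[0] = pRef;   ref[1] = pRefH;  ref[2] = pRefV;  }
	//   if ( bx && !by) { ref[0] = pRefH;  ref[1] = pRef;   ref[2] = pRefHV; }
	//   if (!bx &&  by) { ref[0] = pRefV;  ref[1] = pRefHV; ref[2] = pRef;   }
	//   if ( bx &&  by) { ref[0] = pRefHV; ref[1] = pRefV;  ref[2] = pRefH;  }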


	// Calculate offset (integer multiplication on IA-64 sucks: there is no
	// integer multiply instruction, so we go through the FP unit with xma)

	mov block = 8

	shr dxd2 = backupX, 1
	shr dyd2 = backupY, 1

	setf.sig fx = x
	setf.sig fy = y
	;;
	setf.sig fblock = block
	setf.sig fiEdgedWidth = iEdgedWidth
	;;
	setf.sig fdxd2 = dxd2
	setf.sig fdyd2 = dyd2
	;;
	xma.l foffset = fx, fblock, fdxd2
	xma.l fydiEdgedWidth = fy, fblock, fdyd2
	;;
	xma.l foffset = fydiEdgedWidth, fiEdgedWidth, foffset
	;;
	getf.sig offset = foffset
	;;
	add refaa[0] = refaa[0], offset
	add refaa[1] = refaa[1], offset
	add refaa[2] = refaa[2], offset
	;;
(h)	add refaa[1] = 1, refaa[1]
(hv)	add refaa[1] = 1, refaa[1]
(v)	add refaa[2] = iEdgedWidth, refaa[2]
(hv)	add refaa[2] = iEdgedWidth, refaa[2]
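
	// Equivalently, in C (a sketch of the address arithmetic above):
	//
	//   offset = (8 * y + (backupY >> 1)) * iEdgedWidth + 8 * x + (backupX >> 1);
	//
	// The predicated adds then displace refaa[1] by one column (h/hv) and
	// refaa[2] by one row (v/hv) to line up the interpolated planes.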

	// Load respecting misalignment of refx...

	mov const7 = 7
	;;
	dep.z nob02 = refaa[0], 3, 3
	dep.z nob1 = refaa[1], 3, 3
	;;
	andcm refaa[0] = refaa[0], const7	// set last 3 bits = 0
	andcm refaa[1] = refaa[1], const7
	andcm refaa[2] = refaa[2], const7
	;;
	add refab[0] = 8, refaa[0]
	add refab[1] = 8, refaa[1]
	add refab[2] = 8, refaa[2]
	;;
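
	// The realignment scheme, as a C sketch (our restatement; little-endian
	// byte order, nob = (addr & 7) * 8 as computed by dep.z above):
	//
	//   uint64_t a = *(const uint64_t *)(addr & ~7);        /* aligned low  */
	//   uint64_t b = *(const uint64_t *)((addr & ~7) + 8);  /* aligned high */
	//   uint64_t v = (a >> nob) | (b << (64 - nob));        /* bytes at addr */
	//
	// (For nob == 0 a C shift by 64 would be undefined; the hardware shl
	// yields 0 for counts >= 64, which is what this code relies on.)
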
	ld8 cur[0] = [cura], iEdgedWidth
	ld8 ref0a[0] = [refaa[0]], iEdgedWidth
	sub nob64m02 = 64, nob02		// 64 - nob

	ld8 ref0b[0] = [refab[0]], iEdgedWidth
	ld8 ref1a[0] = [refaa[1]], iEdgedWidth
	sub nob56m02 = 56, nob02		// 56 - nob

	ld8 mpr[0] = [refab[1]], iEdgedWidth
	ld8 ref2a[0] = [refaa[2]], iEdgedWidth
	sub nob64m1 = 64, nob1

	ld8 ref2b[0] = [refab[2]], iEdgedWidth
	;;
	ld8 cur[1] = [cura], iEdgedWidth
	ld8 ref0a[1] = [refaa[0]], iEdgedWidth
	ld8 ref0b[1] = [refab[0]], iEdgedWidth
	ld8 ref1a[1] = [refaa[1]], iEdgedWidth
	ld8 mpr[1] = [refab[1]], iEdgedWidth
	ld8 ref2a[1] = [refaa[2]], iEdgedWidth
	ld8 ref2b[1] = [refab[2]], iEdgedWidth
	;;
	ld8 cur[2] = [cura], iEdgedWidth
	ld8 ref0a[2] = [refaa[0]], iEdgedWidth
	ld8 ref0b[2] = [refab[0]], iEdgedWidth
	ld8 ref1a[2] = [refaa[1]], iEdgedWidth
	ld8 mpr[2] = [refab[1]], iEdgedWidth
	ld8 ref2a[2] = [refaa[2]], iEdgedWidth
	ld8 ref2b[2] = [refab[2]], iEdgedWidth
	;;
	ld8 cur[3] = [cura], iEdgedWidth
	ld8 ref0a[3] = [refaa[0]], iEdgedWidth
	ld8 ref0b[3] = [refab[0]], iEdgedWidth
	ld8 ref1a[3] = [refaa[1]], iEdgedWidth
	ld8 mpr[3] = [refab[1]], iEdgedWidth
	ld8 ref2a[3] = [refaa[2]], iEdgedWidth
	ld8 ref2b[3] = [refab[2]], iEdgedWidth
	;;
	ld8 cur[4] = [cura], iEdgedWidth
	ld8 ref0a[4] = [refaa[0]], iEdgedWidth
	ld8 ref0b[4] = [refab[0]], iEdgedWidth
	ld8 ref1a[4] = [refaa[1]], iEdgedWidth
	ld8 mpr[4] = [refab[1]], iEdgedWidth
	ld8 ref2a[4] = [refaa[2]], iEdgedWidth
	ld8 ref2b[4] = [refab[2]], iEdgedWidth
	;;
	ld8 cur[5] = [cura], iEdgedWidth
	ld8 ref0a[5] = [refaa[0]], iEdgedWidth
	ld8 ref0b[5] = [refab[0]], iEdgedWidth
	ld8 ref1a[5] = [refaa[1]], iEdgedWidth
	ld8 mpr[5] = [refab[1]], iEdgedWidth
	ld8 ref2a[5] = [refaa[2]], iEdgedWidth
	ld8 ref2b[5] = [refab[2]], iEdgedWidth
	;;
	ld8 cur[6] = [cura], iEdgedWidth
	ld8 ref0a[6] = [refaa[0]], iEdgedWidth
	ld8 ref0b[6] = [refab[0]], iEdgedWidth
	ld8 ref1a[6] = [refaa[1]], iEdgedWidth
	ld8 mpr[6] = [refab[1]], iEdgedWidth
	ld8 ref2a[6] = [refaa[2]], iEdgedWidth
	ld8 ref2b[6] = [refab[2]], iEdgedWidth
	;;
	ld8 cur[7] = [cura]
	ld8 ref0a[7] = [refaa[0]], iEdgedWidth
	ld8 ref0b[7] = [refab[0]], iEdgedWidth
	ld8 ref1a[7] = [refaa[1]], iEdgedWidth
	ld8 mpr[7] = [refab[1]], iEdgedWidth
	ld8 ref2a[7] = [refaa[2]]
	ld8 ref2b[7] = [refab[2]]
	;;
	ld8 ref0a[8] = [refaa[0]]
	ld8 ref0b[8] = [refab[0]]
	ld8 ref1a[8] = [refaa[1]]
	ld8 mpr[8] = [refab[1]]
	;;


	// Align ref1

	shr.u ref1a[0] = ref1a[0], nob1
	shr.u ref1a[1] = ref1a[1], nob1
	shr.u ref1a[2] = ref1a[2], nob1
	shr.u ref1a[3] = ref1a[3], nob1
	shr.u ref1a[4] = ref1a[4], nob1
	shr.u ref1a[5] = ref1a[5], nob1
	shr.u ref1a[6] = ref1a[6], nob1
	shr.u ref1a[7] = ref1a[7], nob1
	shr.u ref1a[8] = ref1a[8], nob1

	shl mpr[0] = mpr[0], nob64m1
	shl mpr[1] = mpr[1], nob64m1
	shl mpr[2] = mpr[2], nob64m1
	shl mpr[3] = mpr[3], nob64m1
	shl mpr[4] = mpr[4], nob64m1
	shl mpr[5] = mpr[5], nob64m1
	shl mpr[6] = mpr[6], nob64m1
	shl mpr[7] = mpr[7], nob64m1
	shl mpr[8] = mpr[8], nob64m1
	;;
.explicit
{.mii
	or ref1a[0] = ref1a[0], mpr[0]
	shr.u ref0a[0] = ref0a[0], nob02
	shr.u ref0a[1] = ref0a[1], nob02
}
{.mmi
	or ref1a[1] = ref1a[1], mpr[1]
	or ref1a[2] = ref1a[2], mpr[2]
	shr.u ref0a[2] = ref0a[2], nob02
}
{.mii
	or ref1a[3] = ref1a[3], mpr[3]
	shr.u ref0a[3] = ref0a[3], nob02
	shr.u ref0a[4] = ref0a[4], nob02
}
{.mmi
	or ref1a[4] = ref1a[4], mpr[4]
	or ref1a[5] = ref1a[5], mpr[5]
	shr.u ref0a[5] = ref0a[5], nob02
}
{.mii
	or ref1a[6] = ref1a[6], mpr[6]
	shr.u ref0a[6] = ref0a[6], nob02
	shr.u ref0a[7] = ref0a[7], nob02
}
{.mii
	or ref1a[7] = ref1a[7], mpr[7]
	or ref1a[8] = ref1a[8], mpr[8]
	shr.u ref0a[8] = ref0a[8], nob02
}
.default
	// ref1a[] now contains center position values
	// mpr[] not used any more

	// Align ref0 left

	;;
	shl mpr[0] = ref0b[0], nob56m02
	shl mpr[1] = ref0b[1], nob56m02
	shl mpr[2] = ref0b[2], nob56m02
	shl mpr[3] = ref0b[3], nob56m02
	shl mpr[4] = ref0b[4], nob56m02
	shl mpr[5] = ref0b[5], nob56m02
	shl mpr[6] = ref0b[6], nob56m02
	shl mpr[7] = ref0b[7], nob56m02
	shl mpr[8] = ref0b[8], nob56m02

	shl ref0b[0] = ref0b[0], nob64m02
	shl ref0b[1] = ref0b[1], nob64m02
	shl ref0b[2] = ref0b[2], nob64m02
	shl ref0b[3] = ref0b[3], nob64m02
	shl ref0b[4] = ref0b[4], nob64m02
	shl ref0b[5] = ref0b[5], nob64m02
	shl ref0b[6] = ref0b[6], nob64m02
	shl ref0b[7] = ref0b[7], nob64m02
	shl ref0b[8] = ref0b[8], nob64m02
	;;
	or ref0a[0] = ref0a[0], ref0b[0]
	or ref0a[1] = ref0a[1], ref0b[1]
	or ref0a[2] = ref0a[2], ref0b[2]
	or ref0a[3] = ref0a[3], ref0b[3]
	or ref0a[4] = ref0a[4], ref0b[4]
	or ref0a[5] = ref0a[5], ref0b[5]
	or ref0a[6] = ref0a[6], ref0b[6]
	or ref0a[7] = ref0a[7], ref0b[7]
	or ref0a[8] = ref0a[8], ref0b[8]
	;;

	// ref0a[] now contains left position values
	// mpr[] contains intermediate result for right position values (former ref0b << (56 - nob02))
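	//
	// The right-position row is the left-position row advanced by one byte
	// in memory. As a C sketch (little-endian view, b = the aligned high
	// word, nob as above):
	//
	//   right = (left >> 8) | (b << (56 - nob));
	//
	// which is why only one extra shift per row is needed below.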

	// Align ref0 right

	// Shift one byte further to the right (as seen in big-endian order)
	shr.u ref0b[0] = ref0a[0], 8
	shr.u ref0b[1] = ref0a[1], 8
	shr.u ref0b[2] = ref0a[2], 8
	shr.u ref0b[3] = ref0a[3], 8
	shr.u ref0b[4] = ref0a[4], 8
	shr.u ref0b[5] = ref0a[5], 8
	shr.u ref0b[6] = ref0a[6], 8
	shr.u ref0b[7] = ref0a[7], 8
	shr.u ref0b[8] = ref0a[8], 8
	;;
.explicit
{.mii
	or  ref0b[0] = ref0b[0], mpr[0]
	shr.u ref2a[0] = ref2a[0], nob02
	shr.u ref2a[1] = ref2a[1], nob02
}
{.mmi
	or  ref0b[1] = ref0b[1], mpr[1]
	or  ref0b[2] = ref0b[2], mpr[2]
	shr.u ref2a[2] = ref2a[2], nob02
}
{.mii
	or  ref0b[3] = ref0b[3], mpr[3]
	shr.u ref2a[3] = ref2a[3], nob02
	shr.u ref2a[4] = ref2a[4], nob02
}
{.mmi
	or  ref0b[4] = ref0b[4], mpr[4]
	or  ref0b[5] = ref0b[5], mpr[5]
	shr.u ref2a[5] = ref2a[5], nob02
}
{.mii
	or  ref0b[6] = ref0b[6], mpr[6]
	shr.u ref2a[6] = ref2a[6], nob02
	shr.u ref2a[7] = ref2a[7], nob02
}
.default
	or  ref0b[7] = ref0b[7], mpr[7]
	or  ref0b[8] = ref0b[8], mpr[8]

	// ref0b[] now contains right position values
	// mpr[] not needed any more


	// Align ref2 left

	;;
	shl mpr[0] = ref2b[0], nob56m02
	shl mpr[1] = ref2b[1], nob56m02
	shl mpr[2] = ref2b[2], nob56m02
	shl mpr[3] = ref2b[3], nob56m02
	shl mpr[4] = ref2b[4], nob56m02
	shl mpr[5] = ref2b[5], nob56m02
	shl mpr[6] = ref2b[6], nob56m02
	shl mpr[7] = ref2b[7], nob56m02

	shl ref2b[0] = ref2b[0], nob64m02
	shl ref2b[1] = ref2b[1], nob64m02
	shl ref2b[2] = ref2b[2], nob64m02
	shl ref2b[3] = ref2b[3], nob64m02
	shl ref2b[4] = ref2b[4], nob64m02
	shl ref2b[5] = ref2b[5], nob64m02
	shl ref2b[6] = ref2b[6], nob64m02
	shl ref2b[7] = ref2b[7], nob64m02
	;;
	or ref2a[0] = ref2a[0], ref2b[0]
	or ref2a[1] = ref2a[1], ref2b[1]
	or ref2a[2] = ref2a[2], ref2b[2]
	or ref2a[3] = ref2a[3], ref2b[3]
	or ref2a[4] = ref2a[4], ref2b[4]
	or ref2a[5] = ref2a[5], ref2b[5]
	or ref2a[6] = ref2a[6], ref2b[6]
	or ref2a[7] = ref2a[7], ref2b[7]
	;;

	// ref2a[] now contains left position values
	// mpr[] contains intermediate result for right position values (former ref2b << (56 - nob02))

	// Align ref2 right

	// Shift one byte further to the right (as seen in big-endian order)
	shr.u ref2b[0] = ref2a[0], 8
	shr.u ref2b[1] = ref2a[1], 8
	shr.u ref2b[2] = ref2a[2], 8
	shr.u ref2b[3] = ref2a[3], 8
	shr.u ref2b[4] = ref2a[4], 8
	shr.u ref2b[5] = ref2a[5], 8
	shr.u ref2b[6] = ref2a[6], 8
	shr.u ref2b[7] = ref2a[7], 8
	;;
	or  ref2b[0] = ref2b[0], mpr[0]
	or  ref2b[1] = ref2b[1], mpr[1]
	or  ref2b[2] = ref2b[2], mpr[2]
	or  ref2b[3] = ref2b[3], mpr[3]
	or  ref2b[4] = ref2b[4], mpr[4]
	or  ref2b[5] = ref2b[5], mpr[5]
	or  ref2b[6] = ref2b[6], mpr[6]
	or  ref2b[7] = ref2b[7], mpr[7]


	// ref2b[] now contains right position values
	// mpr[] not needed any more



	// Let's SAD

	// Left top corner

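	// Each of the eight candidate positions now reduces to, in C terms
	// (psad1 as sketched in the header; the calc_delta_*.s includes add the
	// weighted motion-vector cost):
	//
	//   uint64_t sad = 0;
	//   for (int row = 0; row < 8; row++)
	//       sad += psad1(cur[row], ref[row]);   /* 8x8 block SAD */
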

	sub dx = backupX, dx
	psad1 mpr[0] = cur[0], ref0a[0]
	psad1 mpr[1] = cur[1], ref0a[1]

	sub dy = backupY, dy
	psad1 mpr[2] = cur[2], ref0a[2]
	psad1 mpr[3] = cur[3], ref0a[3]
	psad1 mpr[4] = cur[4], ref0a[4]
	psad1 mpr[5] = cur[5], ref0a[5]
	psad1 mpr[6] = cur[6], ref0a[6]
	psad1 mpr[7] = cur[7], ref0a[7]
	;;
.include "../../src/motion/ia64_asm/calc_delta_1.s"

	// Top edge

	psad1 mpr[0] = cur[0], ref1a[0]
	psad1 mpr[1] = cur[1], ref1a[1]
	psad1 mpr[2] = cur[2], ref1a[2]
	psad1 mpr[3] = cur[3], ref1a[3]
	psad1 mpr[4] = cur[4], ref1a[4]

	add dx = 1, dx
	psad1 mpr[5] = cur[5], ref1a[5]
	psad1 mpr[6] = cur[6], ref1a[6]

	psad1 mpr[7] = cur[7], ref1a[7]
	;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(lt)	cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

	// Right top corner


	psad1 mpr[0] = cur[0], ref0b[0]
	psad1 mpr[1] = cur[1], ref0b[1]
	psad1 mpr[2] = cur[2], ref0b[2]
	psad1 mpr[3] = cur[3], ref0b[3]
	psad1 mpr[4] = cur[4], ref0b[4]

	add backupX = 1, backupX
	psad1 mpr[5] = cur[5], ref0b[5]
	psad1 mpr[6] = cur[6], ref0b[6]

	add dx = 1, dx
	psad1 mpr[7] = cur[7], ref0b[7]
	;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(t)	cmp.lt.unc fb, p0 = iSAD, iMinSAD
	;;

	// Left edge

(fb)	mov iMinSAD = iSAD
	psad1 mpr[0] = cur[0], ref2a[0]

(fb)	mov currX = backupX
	psad1 mpr[1] = cur[1], ref2a[1]
	psad1 mpr[2] = cur[2], ref2a[2]

(fb)	mov currY = backupY
	psad1 mpr[3] = cur[3], ref2a[3]
	psad1 mpr[4] = cur[4], ref2a[4]

	add backupX = 1, backupX
	psad1 mpr[5] = cur[5], ref2a[5]
	psad1 mpr[6] = cur[6], ref2a[6]

	psad1 mpr[7] = cur[7], ref2a[7]

	add dx = -2, dx
	add dy = 1, dy
	;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(rt)	cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

	// Right edge


	psad1 mpr[0] = cur[0], ref2b[0]
	psad1 mpr[1] = cur[1], ref2b[1]
	psad1 mpr[2] = cur[2], ref2b[2]
	psad1 mpr[3] = cur[3], ref2b[3]
	psad1 mpr[4] = cur[4], ref2b[4]

	add backupX = -2, backupX
	psad1 mpr[5] = cur[5], ref2b[5]
	psad1 mpr[6] = cur[6], ref2b[6]

	add backupY = 1, backupY
	add dx = 2, dx
	psad1 mpr[7] = cur[7], ref2b[7]
	;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(l)	cmp.lt.unc fb, p0 = iSAD, iMinSAD
	;;

	// Left bottom corner

(fb)	mov iMinSAD = iSAD
	psad1 mpr[0] = cur[0], ref0a[1]

(fb)	mov currX = backupX
	psad1 mpr[1] = cur[1], ref0a[2]
	psad1 mpr[2] = cur[2], ref0a[3]

(fb)	mov currY = backupY
	psad1 mpr[3] = cur[3], ref0a[4]
	psad1 mpr[4] = cur[4], ref0a[5]

	add backupX = 2, backupX
	psad1 mpr[5] = cur[5], ref0a[6]
	psad1 mpr[6] = cur[6], ref0a[7]

	psad1 mpr[7] = cur[7], ref0a[8]

	add dx = -2, dx
	add dy = 1, dy
	;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(r)	cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

	// Bottom edge

	psad1 mpr[0] = cur[0], ref1a[1]
	psad1 mpr[1] = cur[1], ref1a[2]
	psad1 mpr[2] = cur[2], ref1a[3]
	psad1 mpr[3] = cur[3], ref1a[4]
	psad1 mpr[4] = cur[4], ref1a[5]

	add backupX = -2, backupX
	psad1 mpr[5] = cur[5], ref1a[6]
	psad1 mpr[6] = cur[6], ref1a[7]

	add backupY = 1, backupY
	add dx = 1, dx
	psad1 mpr[7] = cur[7], ref1a[8]
	;;

.include "../../src/motion/ia64_asm/calc_delta_1.s"
(lb)	cmp.lt.unc fb, p0 = iSAD, iMinSAD
	;;
	// Right bottom corner


(fb)	mov iMinSAD = iSAD
	psad1 mpr[0] = cur[0], ref0b[1]

(fb)	mov currX = backupX
	psad1 mpr[1] = cur[1], ref0b[2]
	psad1 mpr[2] = cur[2], ref0b[3]

(fb)	mov currY = backupY
	psad1 mpr[3] = cur[3], ref0b[4]
	psad1 mpr[4] = cur[4], ref0b[5]

	add backupX = 1, backupX
	psad1 mpr[5] = cur[5], ref0b[6]
	psad1 mpr[6] = cur[6], ref0b[7]

	add dx = 1, dx
	psad1 mpr[7] = cur[7], ref0b[8]
	;;

.include "../../src/motion/ia64_asm/calc_delta_2.s"
(b)	cmp.lt.unc fb, p0 = mpr[8], iMinSAD
.include "../../src/motion/ia64_asm/calc_delta_3.s"

(rb)	getf.sig ret0 = fmv
	add backupX = 1, backupX
	;;
(rb)	add iSAD = iSAD, ret0
	;;
(rb)	cmp.lt.unc fb, p0 = iSAD, iMinSAD
	;;
(fb)	mov iMinSAD = iSAD
(fb)	mov currX = backupX
(fb)	mov currY = backupY
	;;

	// Write back result

	st4 [currMV] = currX
	st4 [currYAddress] = currY
	mov ret0 = iMinSAD

	// Restore important registers

	;;
	mov pr = prsave, -1
	mov ar.pfs = pfs
	br.ret.sptk.many b0

	.endp Halfpel8_Refine_ia64#
