1// ****************************************************************************
2// *
3// *  XVID MPEG-4 VIDEO CODEC
4// *  - IA64 8bit<->16bit transfer -
5// *
6// *  Copyright(C) 2002 Sebastian Felis, Max Stengel
7// *
8// *  This program is free software; you can redistribute it and/or modify it
9// *  under the terms of the GNU General Public License as published by
10// *  the Free Software Foundation; either version 2 of the License, or
11// *  (at your option) any later version.
12// *
13// *  This program is distributed in the hope that it will be useful,
14// *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15// *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16// *  GNU General Public License for more details.
17// *
18// *  You should have received a copy of the GNU General Public License
19// *  along with this program; if not, write to the Free Software
20// *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21// *
22// * $Id: mem_transfer_ia64.s,v 1.6 2009-02-19 17:07:29 Isibaar Exp $
23// *
24// ***************************************************************************/
25//
26// ****************************************************************************
27// *
28// *  mem_transfer_ia64.s, IA-64 8bit<->16bit transfer
29// *
30// *  This version was implemented during an IA-64 practical training at
31// *  the University of Karlsruhe (http://i44w3.info.uni-karlsruhe.de/)
32// *
33// ****************************************************************************
34
35///////////////////////////////////////////////////////////////////////////////
36//
// mem_transfer.c optimized for IA-64 by Sebastian Felis and Max Stengel,
// University of Karlsruhe, Germany, 03.06.2002, during the laboratory
// "IA-64 Video Codec Assembler Praktikum" at IPD Goos.
40
41///// History /////////////////////////////////////////////////////////////////
42//
43// - 16.07.2002: several minor changes for ecc-conformity
44// - 03.06.2002: initial version
45//
46
47///////////////////////////////////////////////////////////////////////////////
48//
49// Annotations:
50// ===========
51//
// - All functions work on 8x8 matrices. While the C functions treat each
//   element separately, the functions in this assembler code treat a whole
//   line simultaneously, so one loop is saved.
//   The remaining loop is realized by using software pipelining with rotating
//   registers.
// - Register renaming is used for better readability.
// - To load 8 bytes of misaligned data, two 8-byte blocks are loaded, both
//   parts are shifted and joined together with an "OR" instruction.
// - The first parameter is passed in GR 32, the next in GR 33, and so on. They
//   must be saved, as these GRs are used for register rotation.
// - Some of the original German comments used during development are left
//   in the code. They shouldn't bother anyone.
64//
// Anmerkungen:
// ============
//
// - Alle Funktionen arbeiten mit 8x8-Matrizen. Während die Funktionen im C-Code
//   jedes Element einzeln bearbeiten, bearbeiten die Funktionen dieses Assembler-
//   Codes eine Zeile gleichzeitig. Dadurch kann eine Schleife eingespart werden.
//   Die verbleibende Schleife wird unter Benutzung von Softwarepipelining mit
//   rotierenden Registern realisiert.
// - Umbenennung der Register zwecks besserer Lesbarkeit wird verwendet.
// - Um 8 Bytes falsch ausgerichtete Daten zu laden, werden zwei 8-Byte-Blöcke
//   geladen, beide Teile mit "shift"-Operationen zurechtgerückt und mit einem
//   logischen Oder zusammenkopiert.
// - Die Parameter werden in den Registern ab GR 32 übergeben. Sie müssen ge-
//   sichert werden, da die Register für die Register-Rotation benötigt werden.
// - Einige der ursprünglichen, deutschen Kommentare aus der Entwicklungsphase
//   sind im Code verblieben. Sie sollten niemanden stören.
81//
82///////////////////////////////////////////////////////////////////////////////
83
84
//	*** Latencies, counted in pipeline stages, for the software pipelines ***
//	The procedures below combine these symbols to size their rotating
//	register arrays (.rotr), their stage predicates (.rotp) and ar.ec.

	// memory operations
	LL  = 3 // ld8
	SL  = 3 // st8

	// shift / logical operations
	SHL = 1 // shr.u, shl
	OL  = 1 // or

	// parallel (multimedia) operations
	PL  = 1 // pack
	UL  = 1 // unpack
	PAL = 1 // parallel add
	PSL = 1 // parallel subtract
	PAVGL = 1 // parallel average
96
97	.text
98
99
100///////////////////////////////////////////////////////////////////////////////
101//
102// transfer8x8_copy_ia64
103//
// SRC may be misaligned. To align it, two aligned 8-byte words are loaded,
// shifted and joined, and the aligned result is stored at the destination.
106//
107///////////////////////////////////////////////////////////////////////////////
108
109	.align 16
110	.global transfer8x8_copy_ia64#
111	.proc transfer8x8_copy_ia64#
112
113transfer8x8_copy_ia64:
114	.prologue
115
116//	*** register renaming ***
117	zero = r0
118
119	oldLC = r2
120	oldPR = r3
121
122	src_1 = r14 // left aligned address of src
123	src_2 = r15 // right aligned address of src
124	dst = r16  // destination address
125	stride = r17
126
127	offset = r18 // shift right offset
128	aoffset = r19 // shift left offset
129
130//	*** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
131	.save ar.lc, oldLC
132	mov oldLC = ar.lc
133	mov oldPR = pr
134
135	.body
136
137//	*** Allocating new stackframe, initialize LC, Epilogue-Counter and PR ***
138	alloc r9 = ar.pfs, 3, 29, 0, 32
139
140//	*** Saving Parameters ***
141	mov dst = r32
142	mov stride = r34
143
144//	*** Misalingment-Treatment ***
145	and src_1 = -8, r33 // Computing adress of first aligned block containing src-values
146	dep offset = r33, zero, 3, 3 // Extracting offset for shr from src-adress
147	;;
148	sub aoffset = 64, offset // Computing counterpart of offset ("anti-offset"), used for shl
149	add src_2 = 8, src_1 // Computing adress of second aligned block containing src-values
150
151//	*** init loop: set loop counter, epilog counter, predicates ***
152	mov ar.lc = 7
153	mov ar.ec = LL + SHL + OL + 1
154	mov pr.rot = 1 << 16
155	;;
156
157//	*** define register arrays and predicate array for software pipeline ***
158	// src_v1 = source value 1, shd_r = shifted right, shd_l = shifted left
159	.rotr src_v1[LL+1], src_v2[LL+1], shd_r[SHL+1], shd_l[SHL+1], value[OL+1]
160	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], st_stage[1]
161
162
163//	Software pipelined loop:
164//	Stage 1: Load two 2 bytes from SRC_1, SRC_2 into SRC_v1 and SRC_v2
165//	Stage 2: Shift both values of source to SHD_R and SHD_L
166//	Stage 3: Join both parts together with OR
167//	Stage 4: Store aligned date to destination and add stride to destination address
168
169
170.Loop_8x8copy:
171	{.mii
172		(ld_stage[0]) ld8 src_v1[0] = [src_1], stride
173		(sh_stage[0]) shr.u shd_r[0] = src_v1[LL], offset
174	}
175	{.mii
176		(ld_stage[0]) ld8 src_v2[0] = [src_2], stride
177		(sh_stage[0]) shl shd_l[0] = src_v2[LL], aoffset
178		(or_stage[0]) or value[0] = shd_l[SHL], shd_r[SHL]
179	}
180	{.mib
181		(st_stage[0]) st8 [dst] = value[OL]
182		(st_stage[0]) add dst = dst, stride
183		br.ctop.sptk.few .Loop_8x8copy
184		;;
185	}
186
187//	*** Restore old LC and PRs ***
188	mov ar.lc = oldLC
189	mov pr = oldPR, -1
190
191	br.ret.sptk.many b0
192
193	.endp transfer8x8_copy_ia64#
194
195
196
197
198///////////////////////////////////////////////////////////////////////////////
199//
200// transfer_8to16copy_ia64
201//
202// SRC is aligned. To convert 8 bit unsigned values to 16 bit signed values,
203// UNPACK is used. So 8 bytes are loaded from source, unpacked to two
204// 4 x 16 bit values and stored to the destination. Destination is a continuous
205// array of 64 x 16 bit signed data. To store the next line, only 16 must be
206// added to the destination address.
207///////////////////////////////////////////////////////////////////////////////
208
209	.align 16
210	.global transfer_8to16copy_ia64#
211	.proc transfer_8to16copy_ia64#
212
213
214transfer_8to16copy_ia64:
215	.prologue
216
217//	*** register renaming ***
218	oldLC = r2
219	oldPR = r3
220
221	zero = r0 // damit ist die Zahl "zero" = 0 gemeint
222
223	dst_1 = r14 // destination address for first 4 x 16 bit values
224	dst_2 = r15 // destination address for second 4 x 16 bit values
225	src = r16
226	stride = r17
227
228//	*** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
229	.save ar.lc, oldLC
230	mov oldLC = ar.lc
231	mov oldPR = pr
232
233
234	.body
235
236//	*** Allocating new stackframe, define rotating registers ***
237	alloc r9 = ar.pfs, 4, 92, 0, 96
238
239//	*** Saving Paramters ***
240	mov dst_1 = r32 // fist 4 x 16 bit values
241	add dst_2 = 8, r32 // second 4 x 16 bit values
242	mov src = r33
243	mov stride = r34
244
245//	*** init loop: set loop counter, epilog counter, predicates ***
246	mov ar.lc = 7
247	mov ar.ec = LL + UL + 1
248	mov pr.rot = 1 << 16
249	;;
250
251//	*** define register arrays and predicate array for software pipeline ***
252	// src_v = source value, dst_v1 = destination value 1
253	.rotr src_v[LL+1], dst_v1[UL+1], dst_v2[UL+1]
254	.rotp ld_stage[LL], upack_stage[UL], st_stage[1]
255
256
257//	Software pipelined loop:
258//	Stage 1: Load value of SRC
259//	Stage 2: Unpack the SRC_V to two 4 x 16 bit signed data
260//	Stage 3: Store both 8 byte of 16 bit data
261
262
263.Loop_8to16copy:
264	{.mii
265		(ld_stage[0]) ld8 src_v[0] = [src], stride
266		(upack_stage[0]) unpack1.l dst_v1[0] = zero, src_v[LL]
267		(upack_stage[0]) unpack1.h dst_v2[0] = zero, src_v[LL]
268	}
269	{.mmb
270		(st_stage[0]) st8 [dst_1] = dst_v1[UL], 16
271		(st_stage[0]) st8 [dst_2] = dst_v2[UL], 16
272		br.ctop.sptk.few .Loop_8to16copy
273		;;
274	}
275
276//	*** Restore old LC and PRs ***
277	mov ar.lc = oldLC
278	mov pr = oldPR, -1
279
280	br.ret.sptk.many b0
281	.endp transfer_8to16copy_ia64#
282
283
284
285
286///////////////////////////////////////////////////////////////////////////////
287//
288// transfer_16to8copy_ia64
289//
290// src is a 64 x 16 bit signed continuous array. To convert the 16 bit
291// values to 8 bit unsigned data, PACK is used. So two 8-bytes-words of
292// 4 x 16 bit signed data are loaded, packed together and stored a 8-byte-word
293// of 8 x 8 unsigned data to the destination.
294///////////////////////////////////////////////////////////////////////////////
295
296	.align 16
297	.global transfer_16to8copy_ia64#
298	.proc transfer_16to8copy_ia64#
299transfer_16to8copy_ia64:
300	.prologue
301
302//	*** register renaming ***
303	dst = r14
304	src_1 = r15
305	src_2 = r17
306	stride = r16
307
308//	*** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
309	.save ar.lc, oldLC
310	mov oldLC = ar.lc
311	mov oldPR = pr
312
313
314	.body
315
316//	*** Allocating new stackframe, define rotating registers ***
317	alloc r9 = ar.pfs, 4, 92, 0, 96
318
319//	*** Saving Paramters ***
320	mov dst = r32
321	mov src_1 = r33
322	add src_2 = 8, r33
323	mov stride = r34
324
325//	*** init loop: set loop counter, epilog counter, predicates ***
326	mov ar.lc = 7
327	mov ar.ec = LL + PL + 1
328	mov pr.rot = 1 << 16
329	;;
330
331//	*** define register arrays and predicate array for software pipeline ***
332	// src_v1 = source value 1, dst_v = destination value
333	.rotr src_v1[LL+1], src_v2[LL+1], dst_v[PL+1]
334	.rotp ld_stage[LL], pack_stage[PL], st_stage[1]
335
336
337//	Software pipelined loop:
338//	Stage 1: Load two 8-byte-words of 4 x 16 bit signed source data
339//	Stage 2: Pack them together to one 8 byte 8 x 8 bit unsigned data
340//	Stage 3: Store the 8 byte to the destination address and add stride to
341//	         destination address (to get the next 8 byte line of destination)
342
343
344.Loop_16to8copy:
345	{.mmi
346		(ld_stage[0]) ld8 src_v1[0] = [src_1], 16
347		(ld_stage[0]) ld8 src_v2[0] = [src_2], 16
348		(pack_stage[0]) pack2.uss dst_v[0] = src_v1[LL], src_v2[LL]
349	}
350	{.mib
351		(st_stage[0]) st8 [dst] = dst_v[PL]
352		(st_stage[0]) add dst = dst, stride
353		br.ctop.sptk.few .Loop_16to8copy
354		;;
355	}
356
357//	*** Restore old LC and PRs ***
358	mov ar.lc = oldLC
359	mov pr = oldPR, -1
360
361	br.ret.sptk.many b0
362	.endp transfer_16to8copy_ia64#
363
364
365
366///////////////////////////////////////////////////////////////////////////////
367//
368// transfer_16to8add_ia64
369//
// The 8-bit values of dst are "unpacked" into two 8-byte blocks containing
// 16-bit values. These are "parallel-added" to the values of src. The result
// is converted into 8-bit values using "PACK" and stored at the address of dst.
// We assume that there is no misalignment.
374//
375///////////////////////////////////////////////////////////////////////////////
376
377	.align 16
378	.global transfer_16to8add_ia64#
379	.proc transfer_16to8add_ia64#
380
381transfer_16to8add_ia64:
382	.prologue
383
384//	*** register renaming ***
385	dst = r14
386	src = r15
387	stride = r16
388
389	_src = r17
390
391//	*** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
392	.save ar.lc, r2
393	mov oldLC = ar.lc
394	mov oldPR = pr
395
396
397	.body
398
399//	*** Allocating new stackframe, initialize LC, Epilogue-Counter and PR ***
400	alloc r9 = ar.pfs, 4, 92, 0, 96
401
402//	*** Saving Paramters ***
403	mov dst = r32
404	mov src = r33
405	mov stride = r34
406	add _src = 8, r33
407
408//	*** init loop: set loop counter, epilog counter, predicates ***
409	mov ar.lc = 7
410	mov ar.ec = LL + UL + PAL + PL + 1
411	mov pr.rot = 1 << 16
412	;;
413
414//	*** define register arrays and predicate array for software pipeline ***
415	.rotr _dst[LL+UL+PAL+PL+1], dst8[PL+1], pixel_1[PAL+1], pixel_2[PAL+1], w_dst16_1[UL+1], w_src_1[LL+UL+1], w_dst16_2[UL+1], w_src_2[LL+UL+1], w_dst8[LL+1]
416	.rotp s1_p[LL], s2_p[UL], s3_p[PAL], s4_p[PL], s5_p[1]
417
418
419//	Software pipelined loop:
420//	s1_p: The values of src and dst are loaded
421//	s2_p: The dst-values are converted to 16-bit-values
422//	s3_p: The values of src and dst are added
423// 	s4_p: The Results are packed into 8-bit-values
424//	s5_p: The 8-bit-values are stored at the dst-adresses
425
426
427.Loop_16to8add:
428	{.mii
429		(s1_p[0]) ld8 w_src_1[0] = [src], 16 // l�d die 1. H�lfte der j. Zeile von src (i = 0..3)
430		(s1_p[0]) mov _dst[0] = dst // erh�ht die Adresse von dst um stride
431		(s3_p[0]) padd2.sss pixel_1[0] = w_dst16_1[UL], w_src_1[LL+UL] // parallele Addition von scr und dst
432	}
433	{.mii
434		(s1_p[0]) ld8 w_dst8[0] = [dst], stride // l�d die j. Zeile von dst
435		(s2_p[0]) unpack1.l w_dst16_1[0] = r0, w_dst8[LL]; // dst wird f�r i = 0..3 in 16-Bit umgewandelt
436		(s2_p[0]) unpack1.h w_dst16_2[0] = r0, w_dst8[LL]; // dst wird f�r i = 4..7 in 16-Bit umgewandelt
437	}
438	{.mii
439		(s1_p[0]) ld8 w_src_2[0] = [_src], 16 // l�d die 2. H�lfte der j. Zeile von src (i = 4..7)
440		(s3_p[0]) padd2.sss pixel_2[0] = w_dst16_2[UL], w_src_2[LL+UL] // parallele Addition von scr und dst
441		(s4_p[0]) pack2.uss dst8[0] = pixel_1[PAL], pixel_2[PAL] // wandelt die Summen (pixel) in 8-Bit Werte um. Die �berpr�fung der Wertebereiche erfolgt automatisch
442	}
443	{.mmb
444		(s5_p[0]) st8 [_dst[LL+UL+PAL+PL]] = dst8[PL] // speichert dst ab
445		(s1_p[0]) nop.m 0
446		br.ctop.sptk.few .Loop_16to8add
447		;;
448	}
449
450//	*** Restore old LC and PRs ***
451	mov ar.lc = oldLC
452	mov pr = oldPR, -1
453
454	br.ret.sptk.many b0
455	.endp transfer_16to8add_ia64#
456
457
458
459///////////////////////////////////////////////////////////////////////////////
460//
461// transfer_8to16sub_ia64
462//
// The 8-bit values of ref and cur are loaded and converted to 16-bit. The
// difference of cur and ref is stored at the dct addresses, and the (aligned)
// ref values are stored into the cur array.
//
// The data addressed by 'ref' must be assumed to be misaligned in memory.
// The other data are assumed to be aligned (at least I hope so).
469//
470///////////////////////////////////////////////////////////////////////////////
471
472	.align 16
473	.global transfer_8to16sub_ia64#
474	.proc transfer_8to16sub_ia64#
475
476
477transfer_8to16sub_ia64:
478	.prologue
479
480//	*** register renaming ***
481	oldLC = r2
482	oldPR = r3
483
484	zero = r0 // damit ist die Zahl "zero" = 0 gemeint
485
486	//Die folgenden Register erhalten die gleichen Namen, wie die Variablen in der C-Vorlage
487	dct = r14
488	cur = r15
489	ref = r34 // muss nicht extra gesichert werden, deswegen bleibt das �bergabeRegister in dieser Liste
490	stride = r16
491
492	offset = r17 // Offset der falsch ausgerichteten Daten zum zurechtr�cken
493	aoffset = r18 // Gegenst�ck zum Offset,
494	ref_a1 = r19 // Adresse des ersten 64-Bit Blocks von ref
495	ref_a2 = r20 // Adresse des zweiten 64-Bit Blocks von ref
496
497	_dct = r21 // Register f�r die Zieladressen des 2. dct-Blocks
498
499//	*** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
500	.save ar.lc, r2
501	mov oldLC = ar.lc
502	mov oldPR = pr
503
504
505	.body
506
507//	*** Allocating new stackframe, define rotating registers ***
508	alloc r9 = ar.pfs, 4, 92, 0, 96
509
510//	*** Saving Paramters ***
511	mov dct = r32
512	mov cur = r33
513	// mov ref = r34: ref is unaligned, get aligned ref below...
514	mov stride = r35
515
516	and ref_a1 = -8, ref // Die Adresse des ersten 64-Bit Blocks, in dem ref liegt, wird berechnet (entspricht mod 8)
517	dep offset = ref, zero, 3, 3
518	;;
519	add ref_a2 = 8, ref_a1
520	sub aoffset = 64, offset // Gegenst�ck zum Offset wird berechnet
521	add _dct = 8, dct // Die Adresse f�r den 2. dct-Block wird berechnet, um 8 Byte (= 64 Bit) h�her als beim 1. Block
522
523//	*** init loop: set loop counter, epilog counter, predicates ***
524	mov ar.lc = 7
525	mov ar.ec = LL + SHL + OL + UL + PSL + 1
526	mov pr.rot = 1 << 16
527	;;
528
529//	*** define register arrays and predicate array for software pipeline ***
530	.rotr  c[LL+1], ref_v1[LL+1], ref_v2[LL+1], c16_1[SHL+OL+UL+1], c16_2[SHL+OL+UL+1], ref_shdr[SHL+1], ref_shdl[SHL+1], r[OL+1], r16_1[UL+1], r16_2[UL+1],  dct_1[PSL+1], dct_2[PSL+1], _cur[LL+SHL+OL+UL+1]
531	.rotp s1_p[LL], s2_p[SHL], s3_p[OL], s4_p[UL], s5_p[PSL], s6_p[1]
532
533
534//	Software pipelined loop:
535//	s1_p: The values of ref and cur ale loaded, a copy of cur is made.
536//	s2_p: cur is converted to 16-bit and thehe misaligned values of ref are
537// 	      shifted...
538//	s3_p: ... and copied together.
539//	s4_p: This ref-value is converted to 16-bit. The values of cur are stored
540//	      at the ref-adresses.
541//	s5_p: the ref- abd cur-values are substracted...
542//	s6_p: ...and the result is stored at the dct-adresses.
543
544
545loop_8to16sub:
546	{.mii
547		(s1_p[0]) ld8 ref_v1[0] = [ref_a1], stride // l�d den 1. 64-Bit-Block, der einen Teil der ref-Daten enth�lt
548		(s1_p[0]) mov _cur[0] = cur // cur wird f�r sp�tere Verwendung gesichert
549		(s2_p[0]) shr.u ref_shdr[0] = ref_v1[LL], offset // Die rechte H�lfte wird zurechtger�ckt
550	}
551	{.mii
552		(s1_p[0]) ld8 ref_v2[0] = [ref_a2], stride // l�d den 2. 64-Bit-Block
553		(s2_p[0]) shl ref_shdl[0] = ref_v2[LL], aoffset // Die linke H�lfte wird zurechtger�ckt
554		(s3_p[0]) or r[0] = ref_shdr[SHL], ref_shdl[SHL] // Die zurechtger�ckten Daten werden in r zusammenkopiert
555	}
556	{.mii
557		(s1_p[0]) ld8 c[0] = [cur], stride //l�d die j. Zeile von cur komplett
558		(s2_p[0]) unpack1.l c16_1[0] = zero, c[LL]; // c wird f�r i = 0..3 in 16-Bit umgewandelt
559		(s2_p[0]) unpack1.h c16_2[0] = zero, c[LL]; // c wird f�r i = 4..7 in 16-Bit umgewandelt
560	}
561	{.mii
562		(s4_p[0]) st8 [_cur[LL+SHL+OL]] = r[OL] // cur wird auf den Wert von r gesetzt
563		//Umwandeln der 8-Bit r und c -Werte in 16-bit Werte
564		(s4_p[0]) unpack1.l r16_1[0] = zero, r[OL]; // r wird f�r i = 0..3 in 16-Bit umgewandelt
565		(s4_p[0]) unpack1.h r16_2[0] = zero, r[OL]; // r wird f�r i = 4..7 in 16-Bit umgewandelt
566	}
567	{.mii
568		(s5_p[0]) psub2.sss dct_1[0] = c16_1[SHL+OL+UL], r16_1[UL] // Subtraktion der 1. H�fte der j. Zeile
569		(s5_p[0]) psub2.sss dct_2[0] = c16_2[SHL+OL+UL], r16_2[UL] // Subtraktion der 2. H�lfte
570	}
571	{.mmb
572		(s6_p[0]) st8 [dct] = dct_1[PSL], 16 // speichert den 1. 64-Bit-Block an der vorgesehenen Adresse, erh�hen der Adresse um 16 Byte f�r den n�chsten Wert
573		(s6_p[0]) st8 [_dct] = dct_2[PSL], 16 // speichert den 2. 64-Bit-Block an der vorgesehenen Adresse, erh�hen der Adresse um 16 Byte f�r den n�chsten Wert
574		br.ctop.sptk.few loop_8to16sub // Und hopp
575		;;
576	}
577
578//	*** Restore old LC and PRs ***
579	mov ar.lc = oldLC
580	mov pr = oldPR, -1
581
582	br.ret.sptk.many b0
583	.endp transfer_8to16sub_ia64#
584
585
586
587
588
589///////////////////////////////////////////////////////////////////////////////
590//
591// transfer_8to16sub2_ia64
592//
// At the time this function was written, it was not yet in use.
// We assume that the values of ref1/2 are misaligned.
//
// The values of ref1/2 and cur are loaded; the ref values need misalignment
// treatment. The values are converted to 16-bit using unpack. The average of
// ref1 and ref2 is computed with pavg and subtracted from cur. The results are
// stored at the dct addresses.
// pavg1.raz is used to get the same results as the C function.
601//
602///////////////////////////////////////////////////////////////////////////////
603
604	.text
605	.align 16
606	.global transfer_8to16sub2_ia64#
607	.proc transfer_8to16sub2_ia64#
608
609transfer_8to16sub2_ia64:
610	.prologue
611
612//	*** register renaming ***
613	//	We've tried to keep the C-Code names as often as possible, at least as
614	//	part of register-names
615	oldLC = r2
616	oldPR = r3
617
618	zero = r0
619
620	dct_al = r14 // dct: adress of left block in one line
621	dct_ar = r15 // dct: adress of right block in one line
622	cur = r16
623	ref1_al = r17 // ref1: aligned adress of lower part
624	ref1_ah = r18 // ref1: aligned adress of higher part
625	ref2_al = r19 // ref2: aligned adress of lower part
626	ref2_ah = r20 // ref2: aligned adress of higher part
627	stride = r21
628
629	offset_1 = r22
630	offset_2 = r23
631	aoffset_1 = r24
632	aoffset_2 = r25
633
634//	*** Saving old Loop-Counter (LC) and Predicate Registers (PR) ***
635	.save ar.lc, r2
636	mov oldLC = ar.lc
637	mov oldPR = pr
638
639
640	.body
641
642//	*** Saving Paramters ***
643//	*** (as inputregisters r32 + are needed for register-rotation) ***
644	mov dct_ar = r32
645	add dct_al = 8, r32
646	mov cur = r33
647
648	and ref1_al = -8, r34
649	and ref2_al = -8, r35	// ref2 aligned adrress of lower part
650
651	mov stride = r36
652
653//	***	Calculations for Misaligment-Handling ***
654	dep offset_1 = r34, zero, 3, 3
655	dep offset_2 = r35, zero, 3, 3
656	;;
657	add ref1_ah = 8, ref1_al
658	add ref2_ah = 8, ref2_al
659	sub aoffset_1 = 64, offset_1
660	sub aoffset_2 = 64, offset_2
661	;;
662
663//	*** Allocating new stackframe, define rotating registers ***
664	alloc r9 = ar.pfs, 5, 91, 0, 96
665
666//	*** init loop: set loop counter, epilog counter, predicates ***
667	mov ar.lc = 7
668	mov ar.ec = LL + SHL + OL + PAVGL + UL +PSL + 1
669	mov pr.rot = 1 << 16
670	;;
671
672//	*** define register arrays and predicate array for software pipeline ***
673	.rotr ref1_vl[LL+1], ref1_vh[LL+1], ref2_vl[LL+1], ref2_vh[LL+1], c[LL+SHL+OL+PAVGL+1], ref1_l[SHL+1], ref1_h[SHL+1], ref2_l[SHL+1], ref2_h[SHL+1], ref1_aligned[OL+1], ref2_aligned[OL+1], r[PAVGL+1], r16_l[UL+1], r16_r[UL+1], c16_l[UL+1], c16_r[UL+1], dct16_l[PSL+1], dct16_r[PSL+1]
674	.rotp ld_stage[LL], sh_stage[SHL], or_stage[OL], pavg_stage[PAVGL], up_stage[UL], psub_stage[PSL], st_stage[1]
675
676
677//	software pipelined loop:
678//	ld_stage:   The values of ref1, ref2, cur are loaded
679//	sh_stage:   The misaligned values of ref1/2 are shifted...
680//	or_stage:   ...and copied together.
681//	pavg_stage: The average of ref1 and ref2 is computed.
682//	up_stage:   The result and the cur-values are converted to 16-bit.
683//	psub_stage: Those values are substracted...
684//	st_stage:   ...and stored at the dct-adresses.
685
686
687.Loop_8to16sub2:
688	{.mii
689		(ld_stage[0])	ld8 c[0] = [cur], stride
690		(sh_stage[0])	shr.u ref1_l[0] = ref1_vl[LL], offset_1
691		(sh_stage[0])	shl ref1_h[0] = ref1_vh[LL], aoffset_1
692	}
693	{.mii
694		(ld_stage[0])	ld8 ref1_vl[0] = [ref1_al], stride
695		(sh_stage[0])	shr.u ref2_l[0] = ref2_vl[LL], offset_2
696		(sh_stage[0])	shl ref2_h[0] = ref2_vh[LL], aoffset_2
697	}
698	{.mii
699		(ld_stage[0])	ld8 ref1_vh[0] = [ref1_ah], stride
700		(or_stage[0])	or ref1_aligned[0] = ref1_h[SHL], ref1_l[SHL]
701		(or_stage[0])	or ref2_aligned[0] = ref2_h[SHL], ref2_l[SHL]
702	}
703	{.mii
704		(ld_stage[0])	ld8 ref2_vl[0] = [ref2_al], stride
705		(pavg_stage[0])	pavg1.raz r[0] = ref1_aligned[OL], ref2_aligned[OL]
706		(up_stage[0])	unpack1.l r16_r[0] = zero, r[PAVGL]
707	}
708	{.mii
709		(ld_stage[0])	ld8 ref2_vh[0] = [ref2_ah], stride
710		(up_stage[0])	unpack1.h r16_l[0] = zero, r[PAVGL]
711		(up_stage[0])	unpack1.l c16_r[0] = zero, c[LL+SHL+OL+PAVGL]
712	}
713	{.mii
714		(st_stage[0])	st8 [dct_ar] = dct16_r[PSL], 16
715		(up_stage[0])	unpack1.h c16_l[0] = zero, c[LL+SHL+OL+PAVGL]
716		(psub_stage[0])	psub2.sss dct16_l[0] = c16_l[UL], r16_l[UL]
717	}
718	{.mib
719		(st_stage[0])	st8 [dct_al] = dct16_l[PSL], 16
720		(psub_stage[0])	psub2.sss dct16_r[0] = c16_r[UL], r16_r[UL]
721		br.ctop.sptk.few .Loop_8to16sub2 // Und hopp
722		;;
723	}
724
725//	*** Restore old LC and PRs ***
726	mov ar.lc = oldLC
727	mov pr = oldPR, -1
728
729	br.ret.sptk.many b0
730	.endp transfer_8to16sub2_ia64#
731