/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*	 LAPACK-TEST		: OK
**************************************************************************************/


/*
 * Main pass: walk A in groups of four LDA-strided lines.
 *
 * I holds the number of groups (M >> 2). For every group, A0..A3 are set to
 * four consecutive lines starting at A, and A is moved past them. The N
 * direction is then consumed as (N >> 4) blocks of 16, followed by optional
 * tails of 8, 4, 2 and 1 selected by the matching bits of N.
 *
 * Destinations:
 *   - 16-wide blocks are written through BO, which starts at B16 (the value
 *     of B when the group begins) and is stepped by M16 after each block.
 *   - The tails are written through B8, B4, B2 and B1. Those pointers are
 *     only advanced here; they are initialised outside this section.
 *
 * The COPY_4xK macros, SIZE, M16, PREA and PREB are defined elsewhere.
 * NOTE(review): presumably SIZE is the element size in bytes and the COPY
 * macros advance A0..A3 past what they read -- confirm in the macro file.
 */

	// sradi. (not srawi.) so that the whole 64-bit M is shifted, the same
	// way every N-derived count in this file is formed.
	sradi.		I,	M,	2
	ble		DCOPYT_L2_BEGIN			// I <= 0: no complete group of four


DCOPYT_L4_BEGIN:

	mr		A0,	A
	add		A1,	A0,	LDA
	add		A2,	A1,	LDA
	add		A3,	A2,	LDA
	add		A,	A3,	LDA			// A = start of the next group
	mr		B16,	B
	addi		B,	B,	64*SIZE			// step B by 64*SIZE (4 x 16) per group

	sradi.		J,	N,	4			// J = N >> 4, count of 16-wide blocks
	ble		DCOPYT_L4x8_BEGIN		// J <= 0: straight to the tails

	mr		BO,	B16
	addi		T2,	M16,	384			// set once; reused by every trip
	mtctr		J					// CTR is the loop counter, J stays untouched

	.align 5

DCOPYT_L4x16_LOOP:

	// T1 is rebuilt on every trip whereas T2 is not. NOTE(review):
	// presumably COPY_4x16 uses T1 as scratch -- confirm before hoisting.
	addi		T1,	M16,	256

	// Cache-touch hints at A0..A3 + PREA.
	dcbt	A0,	PREA
	dcbt	A1,	PREA
	dcbt	A2,	PREA
	dcbt	A3,	PREA

	// Cache-touch hints at BO + M16, BO + PREB, BO + M16 + 256 and
	// BO + M16 + 384.
	dcbt  BO,	M16
	dcbt  BO,	PREB
	dcbt  BO,	T1
	dcbt  BO,	T2

	COPY_4x16

	add		BO,	BO,	M16

	bdnz+		DCOPYT_L4x16_LOOP		// CTR -= 1, loop while CTR != 0 (hinted taken)

DCOPYT_L4x8_BEGIN:

	// andi. yields either zero or the tested bit, never a negative value,
	// so beq is the exact "bit clear" test for all the tails below.
	andi.		T1,	N,	8
	beq		DCOPYT_L4x4_BEGIN

	mr		BO,	B8

	COPY_4x8


	addi		B8,	B8,	32*SIZE			// 4 x 8

DCOPYT_L4x4_BEGIN:

	andi.		T1,	N,	4
	beq		DCOPYT_L4x2_BEGIN

	mr		BO,	B4

	COPY_4x4


	addi		B4,	B4,	16*SIZE			// 4 x 4

DCOPYT_L4x2_BEGIN:

	andi.		T1,	N,	2
	beq		DCOPYT_L4x1_BEGIN

	mr		BO,	B2

	COPY_4x2


	addi		B2,	B2,	8*SIZE			// 4 x 2

DCOPYT_L4x1_BEGIN:

	andi.		T1,	N,	1
	beq		DCOPYT_L4_END

	mr		BO,	B1

	COPY_4x1


	addi		B1,	B1,	4*SIZE			// 4 x 1

DCOPYT_L4_END:

	addic.		I,	I,	-1			// one group finished
	bgt		DCOPYT_L4_BEGIN			// more groups of four remain



/*
 * Two-line pass, executed only when bit 1 of M is set.
 *
 * A0 and A1 are set to two consecutive LDA-strided lines starting at A and
 * A is moved past them. B16 takes the current B, and B is stepped by
 * 32*SIZE (2 x 16). The N direction is consumed as (N >> 4) blocks of 16
 * written through BO (starting at B16, stepped by M16), then optional tails
 * of 8, 4, 2 and 1 written through B8, B4, B2 and B1, each of which is
 * advanced by the amount just written.
 *
 * The COPY_2xK macros, SIZE and M16 are defined elsewhere.
 * NOTE(review): presumably the COPY macros advance A0/A1 -- confirm in the
 * macro file.
 */
DCOPYT_L2_BEGIN:

	// andi. yields either zero or the tested bit, never a negative value,
	// so beq is the exact "bit clear" test here and for the tails below.
	andi.		T1,	M,	2
	beq		DCOPYT_L1_BEGIN

	mr		A0,	A
	add		A1,	A0,	LDA
	add		A,	A1,	LDA			// A = first line after this pair
	mr		B16,	B
	addi		B,	B,	32*SIZE			// 2 x 16

	sradi.		J,	N,	4			// J = N >> 4, count of 16-wide blocks
	ble		DCOPYT_L2x8_BEGIN		// J <= 0: straight to the tails

	mr		BO,	B16

DCOPYT_L2x16_LOOP:

	COPY_2x16

	add		BO,	BO,	M16

	addic.		J,	J,	-1			// J is the counter in this loop (no CTR)
	bgt		DCOPYT_L2x16_LOOP

DCOPYT_L2x8_BEGIN:

	andi.		T1,	N,	8
	beq		DCOPYT_L2x4_BEGIN

	mr		BO,	B8

	COPY_2x8


	addi		B8,	B8,	16*SIZE			// 2 x 8

DCOPYT_L2x4_BEGIN:

	andi.		T1,	N,	4
	beq		DCOPYT_L2x2_BEGIN

	mr		BO,	B4

	COPY_2x4


	addi		B4,	B4,	8*SIZE			// 2 x 4

DCOPYT_L2x2_BEGIN:

	andi.		T1,	N,	2
	beq		DCOPYT_L2x1_BEGIN

	mr		BO,	B2

	COPY_2x2


	addi		B2,	B2,	4*SIZE			// 2 x 2

DCOPYT_L2x1_BEGIN:

	andi.		T1,	N,	1
	beq		DCOPYT_L2_END

	mr		BO,	B1

	COPY_2x1


	addi		B1,	B1,	2*SIZE			// 2 x 1

DCOPYT_L2_END:


/*
 * Single-line pass, executed only when bit 0 of M is set; otherwise control
 * leaves through L999 (defined outside this section).
 *
 * A0 takes the current A and A is moved on by LDA. B16 takes the current B,
 * and B is stepped by 16*SIZE (1 x 16). The N direction is consumed as
 * (N >> 4) blocks of 16 written through BO (starting at B16, stepped by
 * M16), then optional tails of 8, 4, 2 and 1 written through B8, B4, B2 and
 * B1, each of which is advanced by the amount just written.
 *
 * The COPY_1xK macros, SIZE and M16 are defined elsewhere.
 * NOTE(review): presumably the COPY macros advance A0 -- confirm in the
 * macro file.
 */
DCOPYT_L1_BEGIN:

	// andi. yields either zero or the tested bit, never a negative value,
	// so beq is the exact "bit clear" test here and for the tails below.
	andi.		T1,	M,	1
	beq		L999

	mr		A0,	A
	add		A,	A0,	LDA			// A = line after this one
	mr		B16,	B
	addi		B,	B,	16*SIZE			// 1 x 16

	sradi.		J,	N,	4			// J = N >> 4, count of 16-wide blocks
	ble		DCOPYT_L1x8_BEGIN		// J <= 0: straight to the tails

	mr		BO,	B16

DCOPYT_L1x16_LOOP:

	COPY_1x16

	add		BO,	BO,	M16

	addic.		J,	J,	-1			// J is the counter in this loop (no CTR)
	bgt		DCOPYT_L1x16_LOOP

DCOPYT_L1x8_BEGIN:

	andi.		T1,	N,	8
	beq		DCOPYT_L1x4_BEGIN

	mr		BO,	B8

	COPY_1x8


	addi		B8,	B8,	8*SIZE			// 1 x 8

DCOPYT_L1x4_BEGIN:

	andi.		T1,	N,	4
	beq		DCOPYT_L1x2_BEGIN

	mr		BO,	B4

	COPY_1x4


	addi		B4,	B4,	4*SIZE			// 1 x 4

DCOPYT_L1x2_BEGIN:

	andi.		T1,	N,	2
	beq		DCOPYT_L1x1_BEGIN

	mr		BO,	B2

	COPY_1x2


	addi		B2,	B2,	2*SIZE			// 1 x 2

DCOPYT_L1x1_BEGIN:

	andi.		T1,	N,	1
	beq		DCOPYT_L1_END

	mr		BO,	B1

	COPY_1x1


	addi		B1,	B1,	1*SIZE			// 1 x 1

DCOPYT_L1_END:

