1
c
c     tce_sort_4
c
c     Scaled permutation of a four-index array.
c
c     (i,j,k,l) is packed into the code 1000*i+100*j+10*k+l and the
c     work is handed to the kernel transpose_<code>_loop_<order>.
c     The kernels visible in this file read unsorted with the index
c     of extent d running fastest and the one of extent a slowest,
c     and write sorted with index l fastest and index i slowest,
c     every element multiplied by factor.
c
c     unsorted  (in)  source buffer, a*b*c*d elements
c     sorted    (out) destination buffer, a*b*c*d elements
c     a,b,c,d   (in)  extents of indices 1..4 of unsorted
c     i,j,k,l   (in)  expected to be a permutation of 1,2,3,4
c     factor    (in)  multiplier applied to every element
c
c     <order> is the loop nest order that the transpose_<code>.log
c     tuning runs reported as best for that permutation.
c
c     An unsupported (i,j,k,l) only prints a diagnostic; sorted is
c     then left untouched.
c
      subroutine tce_sort_4(unsorted,sorted,a,b,c,d,
     &                      i,j,k,l,factor)
      implicit none
#include "util.fh"
#include "tce_main.fh"
      integer a,b,c,d
      integer i,j,k,l
      double precision sorted(a*b*c*d)
      double precision unsorted(a*b*c*d)
      double precision factor
      integer version
c
      version = 1000*i+100*j+10*k+l
c
c     Digits outside 1..4 could otherwise alias a valid code
c     (e.g. i=0,j=12,k=3,l=4 gives 1234); force the error branch.
c
      if (min(i,j,k,l).lt.1 .or. max(i,j,k,l).gt.4) version = 0
c
      select case (version)
c
c     best loop orders: 1234->2134 1243->1243 1324->1234
c                       1342->1342 1423->1423 1432->1342
c
      case (1234)
        call transpose_1234_loop_2134(unsorted,sorted,a,b,c,d,factor)
      case (1243)
        call transpose_1243_loop_1243(unsorted,sorted,a,b,c,d,factor)
      case (1324)
        call transpose_1324_loop_1234(unsorted,sorted,a,b,c,d,factor)
      case (1342)
        call transpose_1342_loop_1342(unsorted,sorted,a,b,c,d,factor)
      case (1423)
        call transpose_1423_loop_1423(unsorted,sorted,a,b,c,d,factor)
      case (1432)
        call transpose_1432_loop_1342(unsorted,sorted,a,b,c,d,factor)
c
c     best loop orders: 2134->2134 2143->2143 2314->2134
c                       2341->2341 2413->2413 2431->2341
c
      case (2134)
        call transpose_2134_loop_2134(unsorted,sorted,a,b,c,d,factor)
      case (2143)
        call transpose_2143_loop_2143(unsorted,sorted,a,b,c,d,factor)
      case (2314)
        call transpose_2314_loop_2134(unsorted,sorted,a,b,c,d,factor)
      case (2341)
        call transpose_2341_loop_2341(unsorted,sorted,a,b,c,d,factor)
      case (2413)
        call transpose_2413_loop_2413(unsorted,sorted,a,b,c,d,factor)
      case (2431)
        call transpose_2431_loop_2341(unsorted,sorted,a,b,c,d,factor)
c
c     best loop orders: 3124->1234 3142->1342 3214->2134
c                       3241->2341 3412->1342 3421->2341
c
      case (3124)
        call transpose_3124_loop_1234(unsorted,sorted,a,b,c,d,factor)
      case (3142)
        call transpose_3142_loop_1342(unsorted,sorted,a,b,c,d,factor)
      case (3214)
        call transpose_3214_loop_2134(unsorted,sorted,a,b,c,d,factor)
      case (3241)
        call transpose_3241_loop_2341(unsorted,sorted,a,b,c,d,factor)
      case (3412)
        call transpose_3412_loop_1342(unsorted,sorted,a,b,c,d,factor)
      case (3421)
        call transpose_3421_loop_2341(unsorted,sorted,a,b,c,d,factor)
c
c     best loop orders: 4123->1423 4132->1342 4213->2413
c                       4231->2341 4312->1342 4321->2341
c
      case (4123)
        call transpose_4123_loop_1423(unsorted,sorted,a,b,c,d,factor)
      case (4132)
        call transpose_4132_loop_1342(unsorted,sorted,a,b,c,d,factor)
      case (4213)
        call transpose_4213_loop_2413(unsorted,sorted,a,b,c,d,factor)
      case (4231)
        call transpose_4231_loop_2341(unsorted,sorted,a,b,c,d,factor)
      case (4312)
        call transpose_4312_loop_1342(unsorted,sorted,a,b,c,d,factor)
      case (4321)
        call transpose_4321_loop_2341(unsorted,sorted,a,b,c,d,factor)
c
      case default
        print*,'something is wrong...'
        print*,'tce_sort_4: unsupported permutation ',i,j,k,l
      end select

      return
      end
c
c     transpose_4321_loop_3241
c
c     Scaled index reversal, loops nested j3 > j2 > j4 > j1:
c
c        sorted(j1,j2,j3,j4) = factor * unsorted(j4,j3,j2,j1)
c
c     The j4 and j1 loops are unrolled by hand into 4x4 tiles.  The
c     mod(dim4,4) and mod(dim1,4) leftover values are handled by the
c     remainder loops that follow each tiled loop.
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim1,dim2,dim3,dim4)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_4321_loop_3241(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim1,dim2,dim3,dim4)
        integer xdim1,xdim4
        integer j1,j2,j3,j4
c
c     last index covered by complete tiles of four
c
        xdim1 = dim1-mod(dim1,4)
        xdim4 = dim4-mod(dim4,4)
!DEC$ ivdep
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ vector always
!DEC$ loop count min(24), max(40), avg(32)
        do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
         do j2 = 1,dim2
c
c     complete groups of four j4 values
c
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ vector always
          do j4 = 1,xdim4,4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ vector always
           do j1 = 1,xdim1,4
            sorted(j1  ,j2,j3,j4  ) = factor*unsorted(j4  ,j3,j2,j1  )
            sorted(j1+1,j2,j3,j4  ) = factor*unsorted(j4  ,j3,j2,j1+1)
            sorted(j1+2,j2,j3,j4  ) = factor*unsorted(j4  ,j3,j2,j1+2)
            sorted(j1+3,j2,j3,j4  ) = factor*unsorted(j4  ,j3,j2,j1+3)

            sorted(j1  ,j2,j3,j4+1) = factor*unsorted(j4+1,j3,j2,j1  )
            sorted(j1+1,j2,j3,j4+1) = factor*unsorted(j4+1,j3,j2,j1+1)
            sorted(j1+2,j2,j3,j4+1) = factor*unsorted(j4+1,j3,j2,j1+2)
            sorted(j1+3,j2,j3,j4+1) = factor*unsorted(j4+1,j3,j2,j1+3)

            sorted(j1  ,j2,j3,j4+2) = factor*unsorted(j4+2,j3,j2,j1  )
            sorted(j1+1,j2,j3,j4+2) = factor*unsorted(j4+2,j3,j2,j1+1)
            sorted(j1+2,j2,j3,j4+2) = factor*unsorted(j4+2,j3,j2,j1+2)
            sorted(j1+3,j2,j3,j4+2) = factor*unsorted(j4+2,j3,j2,j1+3)

            sorted(j1  ,j2,j3,j4+3) = factor*unsorted(j4+3,j3,j2,j1  )
            sorted(j1+1,j2,j3,j4+3) = factor*unsorted(j4+3,j3,j2,j1+1)
            sorted(j1+2,j2,j3,j4+3) = factor*unsorted(j4+3,j3,j2,j1+2)
            sorted(j1+3,j2,j3,j4+3) = factor*unsorted(j4+3,j3,j2,j1+3)
           end do
!DEC$ loop count min(0), max(4), avg(2)
!DEC$ vector always
           do j1 = xdim1+1,dim1
            sorted(j1,j2,j3,j4  ) = factor*unsorted(j4  ,j3,j2,j1)
            sorted(j1,j2,j3,j4+1) = factor*unsorted(j4+1,j3,j2,j1)
            sorted(j1,j2,j3,j4+2) = factor*unsorted(j4+2,j3,j2,j1)
            sorted(j1,j2,j3,j4+3) = factor*unsorted(j4+3,j3,j2,j1)
           end do
          end do
c
c     leftover j4 values, one at a time
c
!DEC$ loop count min(0), max(4), avg(2)
!DEC$ vector always
          do j4 = xdim4+1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ vector always
           do j1 = 1,xdim1,4
            sorted(j1  ,j2,j3,j4) = factor*unsorted(j4,j3,j2,j1  )
            sorted(j1+1,j2,j3,j4) = factor*unsorted(j4,j3,j2,j1+1)
            sorted(j1+2,j2,j3,j4) = factor*unsorted(j4,j3,j2,j1+2)
            sorted(j1+3,j2,j3,j4) = factor*unsorted(j4,j3,j2,j1+3)
           end do
!DEC$ loop count min(0), max(4), avg(2)
!DEC$ vector always
           do j1 = xdim1+1,dim1
            sorted(j1,j2,j3,j4) = factor*unsorted(j4,j3,j2,j1)
           end do
          end do
         end do
        end do
        return
        end
210c
211c
212c  ADD AUTO-GENERATED CODE HERE
213c
214c
c
c     transpose_1234_loop_2134
c
c     Scaled copy (identity permutation), loops j2 > j1 > j3 > j4:
c
c        sorted(j4,j3,j2,j1) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim4,dim3,dim2,dim1)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_1234_loop_2134(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim4,dim3,dim2,dim1)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j4 = 1,dim4
                sorted(j4,j3,j2,j1) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_1243_loop_1243
c
c     Scaled index permutation, loops nested j1 > j2 > j4 > j3:
c
c        sorted(j3,j4,j2,j1) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim3,dim4,dim2,dim1)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_1243_loop_1243(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim3,dim4,dim2,dim1)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j3 = 1,dim3
                sorted(j3,j4,j2,j1) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_1324_loop_1234
c
c     Scaled index permutation, loops nested j1 > j2 > j3 > j4:
c
c        sorted(j4,j2,j3,j1) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim4,dim2,dim3,dim1)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_1324_loop_1234(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim4,dim2,dim3,dim1)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j4 = 1,dim4
                sorted(j4,j2,j3,j1) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_1342_loop_1342
c
c     Scaled index permutation, loops nested j1 > j3 > j4 > j2:
c
c        sorted(j2,j4,j3,j1) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim2,dim4,dim3,dim1)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_1342_loop_1342(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim2,dim4,dim3,dim1)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j2 = 1,dim2
                sorted(j2,j4,j3,j1) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_1423_loop_1423
c
c     Scaled index permutation, loops nested j1 > j4 > j2 > j3:
c
c        sorted(j3,j2,j4,j1) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim3,dim2,dim4,dim1)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_1423_loop_1423(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim3,dim2,dim4,dim1)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j3 = 1,dim3
                sorted(j3,j2,j4,j1) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_1432_loop_1342
c
c     Scaled index permutation, loops nested j1 > j3 > j4 > j2:
c
c        sorted(j2,j3,j4,j1) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim2,dim3,dim4,dim1)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_1432_loop_1342(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim2,dim3,dim4,dim1)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j2 = 1,dim2
                sorted(j2,j3,j4,j1) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_2134_loop_2134
c
c     Scaled index permutation, loops nested j2 > j1 > j3 > j4:
c
c        sorted(j4,j3,j1,j2) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim4,dim3,dim1,dim2)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_2134_loop_2134(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim4,dim3,dim1,dim2)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j4 = 1,dim4
                sorted(j4,j3,j1,j2) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_2143_loop_2143
c
c     Scaled index permutation, loops nested j2 > j1 > j4 > j3:
c
c        sorted(j3,j4,j1,j2) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim3,dim4,dim1,dim2)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_2143_loop_2143(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim3,dim4,dim1,dim2)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j3 = 1,dim3
                sorted(j3,j4,j1,j2) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_2314_loop_2134
c
c     Scaled index permutation, loops nested j2 > j1 > j3 > j4:
c
c        sorted(j4,j1,j3,j2) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim4,dim1,dim3,dim2)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_2314_loop_2134(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim4,dim1,dim3,dim2)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j4 = 1,dim4
                sorted(j4,j1,j3,j2) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_2341_loop_2341
c
c     Scaled index permutation, loops nested j2 > j3 > j4 > j1:
c
c        sorted(j1,j4,j3,j2) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim1,dim4,dim3,dim2)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_2341_loop_2341(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim1,dim4,dim3,dim2)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j1 = 1,dim1
                sorted(j1,j4,j3,j2) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_2413_loop_2413
c
c     Scaled index permutation, loops nested j2 > j4 > j1 > j3:
c
c        sorted(j3,j1,j4,j2) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim3,dim1,dim4,dim2)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_2413_loop_2413(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim3,dim1,dim4,dim2)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j3 = 1,dim3
                sorted(j3,j1,j4,j2) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_2431_loop_2341
c
c     Scaled index permutation, loops nested j2 > j3 > j4 > j1:
c
c        sorted(j1,j3,j4,j2) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim1,dim3,dim4,dim2)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_2431_loop_2341(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim1,dim3,dim4,dim2)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j1 = 1,dim1
                sorted(j1,j3,j4,j2) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_3124_loop_1234
c
c     Scaled index permutation, loops nested j1 > j2 > j3 > j4:
c
c        sorted(j4,j2,j1,j3) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim4,dim2,dim1,dim3)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_3124_loop_1234(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim4,dim2,dim1,dim3)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j4 = 1,dim4
                sorted(j4,j2,j1,j3) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_3142_loop_1342
c
c     Scaled index permutation, loops nested j1 > j3 > j4 > j2:
c
c        sorted(j2,j4,j1,j3) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim2,dim4,dim1,dim3)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_3142_loop_1342(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim2,dim4,dim1,dim3)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j2 = 1,dim2
                sorted(j2,j4,j1,j3) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_3214_loop_2134
c
c     Scaled index permutation, loops nested j2 > j1 > j3 > j4:
c
c        sorted(j4,j1,j2,j3) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim4,dim1,dim2,dim3)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_3214_loop_2134(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim4,dim1,dim2,dim3)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j4 = 1,dim4
                sorted(j4,j1,j2,j3) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_3241_loop_2341
c
c     Scaled index permutation, loops nested j2 > j3 > j4 > j1:
c
c        sorted(j1,j4,j2,j3) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim1,dim4,dim2,dim3)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_3241_loop_2341(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim1,dim4,dim2,dim3)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j1 = 1,dim1
                sorted(j1,j4,j2,j3) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_3412_loop_1342
c
c     Scaled index permutation, loops nested j1 > j3 > j4 > j2:
c
c        sorted(j2,j1,j4,j3) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim2,dim1,dim4,dim3)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_3412_loop_1342(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim2,dim1,dim4,dim3)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j2 = 1,dim2
                sorted(j2,j1,j4,j3) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_3421_loop_2341
c
c     Scaled index permutation, loops nested j2 > j3 > j4 > j1:
c
c        sorted(j1,j2,j4,j3) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim1,dim2,dim4,dim3)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_3421_loop_2341(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim1,dim2,dim4,dim3)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j1 = 1,dim1
                sorted(j1,j2,j4,j3) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_4123_loop_1423
c
c     Scaled index permutation, loops nested j1 > j4 > j2 > j3:
c
c        sorted(j3,j2,j1,j4) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim3,dim2,dim1,dim4)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_4123_loop_1423(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim3,dim2,dim1,dim4)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j2 = 1,dim2
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j3 = 1,dim3
                sorted(j3,j2,j1,j4) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
c
c     transpose_4132_loop_1342
c
c     Scaled index permutation, loops nested j1 > j3 > j4 > j2:
c
c        sorted(j2,j3,j1,j4) = unsorted(j4,j3,j2,j1) * factor
c
c     unsorted  (in)  source, addressed as (dim4,dim3,dim2,dim1)
c     sorted    (out) target, addressed as (dim2,dim3,dim1,dim4)
c     dim1-dim4 (in)  extents of indices j1..j4
c     factor    (in)  multiplier applied to every element
c
c     The caller's flat buffers are viewed as rank-4 arrays here;
c     the element order in memory is that of the flat offset form.
c
        subroutine transpose_4132_loop_1342(unsorted,sorted,
     &                           dim1,dim2,dim3,dim4,factor)
        implicit none
        integer dim1,dim2,dim3,dim4
        double precision factor
        double precision unsorted(dim4,dim3,dim2,dim1)
        double precision sorted(dim2,dim3,dim1,dim4)
        integer j1,j2,j3,j4
!DEC$ prefetch sorted
!DEC$ prefetch unsorted
!DEC$ ivdep
!DEC$ loop count min(24), max(40), avg(32)
        do j1 = 1,dim1
!DEC$ loop count min(24), max(40), avg(32)
          do j3 = 1,dim3
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
            do j4 = 1,dim4
!DEC$ loop count min(24), max(40), avg(32)
!DEC$ unroll(8)
!DEC$ vector always
              do j2 = 1,dim2
                sorted(j2,j3,j1,j4) = unsorted(j4,j3,j2,j1)*factor
              end do
            end do
          end do
        end do
        return
        end
855        subroutine transpose_4213_loop_2413(unsorted,sorted,
856     &                           dim1,dim2,dim3,dim4,factor)
857        implicit none
858        integer dim1,dim2,dim3,dim4
859        integer old_offset,new_offset
860        integer j1,j2,j3,j4
861        double precision sorted(dim1*dim2*dim3*dim4)
862        double precision unsorted(dim1*dim2*dim3*dim4)
863        double precision factor
864!DEC$ prefetch sorted
865!DEC$ prefetch unsorted
866!DEC$ ivdep
867!DEC$ loop count min(24), max(40), avg(32)
868        do j2 = 1,dim2
869!DEC$ loop count min(24), max(40), avg(32)
870         do j4 = 1,dim4
871!DEC$ loop count min(24), max(40), avg(32)
872!DEC$ unroll(8)
873          do j1 = 1,dim1
874!DEC$ loop count min(24), max(40), avg(32)
875!DEC$ unroll(8)
876!DEC$ vector always
877           do j3 = 1,dim3
878            old_offset = j4+dim4*(j3-1+dim3*(j2-1+dim2*(j1-1)))
879            new_offset = j3+dim3*(j1-1+dim1*(j2-1+dim2*(j4-1)))
880            sorted(new_offset) = unsorted(old_offset) * factor
881           enddo
882          enddo
883         enddo
884        enddo
885        return
886        end
887        subroutine transpose_4231_loop_2341(unsorted,sorted,
888     &                           dim1,dim2,dim3,dim4,factor)
889        implicit none
890        integer dim1,dim2,dim3,dim4
891        integer old_offset,new_offset
892        integer j1,j2,j3,j4
893        double precision sorted(dim1*dim2*dim3*dim4)
894        double precision unsorted(dim1*dim2*dim3*dim4)
895        double precision factor
896!DEC$ prefetch sorted
897!DEC$ prefetch unsorted
898!DEC$ ivdep
899!DEC$ loop count min(24), max(40), avg(32)
900        do j2 = 1,dim2
901!DEC$ loop count min(24), max(40), avg(32)
902         do j3 = 1,dim3
903!DEC$ loop count min(24), max(40), avg(32)
904!DEC$ unroll(8)
905          do j4 = 1,dim4
906!DEC$ loop count min(24), max(40), avg(32)
907!DEC$ unroll(8)
908!DEC$ vector always
909           do j1 = 1,dim1
910            old_offset = j4+dim4*(j3-1+dim3*(j2-1+dim2*(j1-1)))
911            new_offset = j1+dim1*(j3-1+dim3*(j2-1+dim2*(j4-1)))
912            sorted(new_offset) = unsorted(old_offset) * factor
913           enddo
914          enddo
915         enddo
916        enddo
917        return
918        end
919        subroutine transpose_4312_loop_1342(unsorted,sorted,
920     &                           dim1,dim2,dim3,dim4,factor)
921        implicit none
922        integer dim1,dim2,dim3,dim4
923        integer old_offset,new_offset
924        integer j1,j2,j3,j4
925        double precision sorted(dim1*dim2*dim3*dim4)
926        double precision unsorted(dim1*dim2*dim3*dim4)
927        double precision factor
928!DEC$ prefetch sorted
929!DEC$ prefetch unsorted
930!DEC$ ivdep
931!DEC$ loop count min(24), max(40), avg(32)
932        do j1 = 1,dim1
933!DEC$ loop count min(24), max(40), avg(32)
934         do j3 = 1,dim3
935!DEC$ loop count min(24), max(40), avg(32)
936!DEC$ unroll(8)
937          do j4 = 1,dim4
938!DEC$ loop count min(24), max(40), avg(32)
939!DEC$ unroll(8)
940!DEC$ vector always
941           do j2 = 1,dim2
942            old_offset = j4+dim4*(j3-1+dim3*(j2-1+dim2*(j1-1)))
943            new_offset = j2+dim2*(j1-1+dim1*(j3-1+dim3*(j4-1)))
944            sorted(new_offset) = unsorted(old_offset) * factor
945           enddo
946          enddo
947         enddo
948        enddo
949        return
950        end
951        subroutine transpose_4321_loop_2341(unsorted,sorted,
952     &                           dim1,dim2,dim3,dim4,factor)
953        implicit none
954        integer dim1,dim2,dim3,dim4
955        integer old_offset,new_offset
956        integer j1,j2,j3,j4
957        double precision sorted(dim1*dim2*dim3*dim4)
958        double precision unsorted(dim1*dim2*dim3*dim4)
959        double precision factor
960!DEC$ prefetch sorted
961!DEC$ prefetch unsorted
962!DEC$ ivdep
963!DEC$ loop count min(24), max(40), avg(32)
964        do j2 = 1,dim2
965!DEC$ loop count min(24), max(40), avg(32)
966         do j3 = 1,dim3
967!DEC$ loop count min(24), max(40), avg(32)
968!DEC$ unroll(8)
969          do j4 = 1,dim4
970!DEC$ loop count min(24), max(40), avg(32)
971!DEC$ unroll(8)
972!DEC$ vector always
973           do j1 = 1,dim1
974            old_offset = j4+dim4*(j3-1+dim3*(j2-1+dim2*(j1-1)))
975            new_offset = j1+dim1*(j2-1+dim2*(j3-1+dim3*(j4-1)))
976            sorted(new_offset) = unsorted(old_offset) * factor
977           enddo
978          enddo
979         enddo
980        enddo
981        return
982        end
983c $Id$
984