/* Copyright (C) 2008-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include <spu_mfcio.h>
#include <spu_internals.h>
#include <spu_intrinsics.h>
#include <spu_cache.h>
extern unsigned long long __ea_local_store;
extern char __cache_tag_array_size;

#define LINE_SIZE 128
#define TAG_MASK (LINE_SIZE - 1)

#define WAYS 4
#define SET_MASK ((int) &__cache_tag_array_size - LINE_SIZE)

#define CACHE_LINES ((int) &__cache_tag_array_size /		\
		     sizeof (struct __cache_tag_array) * WAYS)
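
/* __cache_tag_array_size is presumably defined by the linker script;
   its *address* encodes the size of the tag array in bytes.  One
   struct __cache_tag_array describes one four-way set and occupies
   128 bytes, so with a power-of-two number of sets SET_MASK extracts
   (set index * 128) from an address, and CACHE_LINES is the number of
   sets times WAYS.

   A sketch of the arithmetic, assuming a 64 KB data cache:
   64 KB / 128 B = 512 lines = 128 sets, so the tag array is
   128 * 128 B = 16 KB and SET_MASK is 0x3f80.  */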

struct __cache_tag_array
{
  unsigned int tag_lo[WAYS];
  unsigned int tag_hi[WAYS];
  void *base[WAYS];
  int reserved[WAYS];
  vector unsigned short dirty_bits[WAYS];
};
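
/* Within one set the fields fall at quadword offsets 0 (tag_lo),
   16 (tag_hi), 32 (base) and 48 (reserved), with dirty_bits at
   64, 80, 96 and 112 for ways 0-3; the si_lqd/si_lqx accesses in
   __cache_fetch_dirty rely on these offsets directly.  */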

extern struct __cache_tag_array __cache_tag_array[];
extern char __cache[];

/* In order to make the code seem a little cleaner, and to avoid having
   64/32 bit ifdefs all over the place, we use macros.  */

#ifdef __EA64__
typedef unsigned long long addr;

#define CHECK_TAG(_entry, _way, _tag)			\
  ((_entry)->tag_lo[(_way)] == ((_tag) & 0xFFFFFFFF)	\
   && (_entry)->tag_hi[(_way)] == ((_tag) >> 32))

#define GET_TAG(_entry, _way) \
  ((unsigned long long)(_entry)->tag_hi[(_way)] << 32	\
   | (unsigned long long)(_entry)->tag_lo[(_way)])

#define SET_TAG(_entry, _way, _tag)			\
  (_entry)->tag_lo[(_way)] = (_tag) & 0xFFFFFFFF;	\
  (_entry)->tag_hi[(_way)] = (_tag) >> 32

#else /*__EA32__*/
typedef unsigned long addr;

#define CHECK_TAG(_entry, _way, _tag)			\
  ((_entry)->tag_lo[(_way)] == (_tag))

#define GET_TAG(_entry, _way)				\
  ((_entry)->tag_lo[(_way)])

#define SET_TAG(_entry, _way, _tag)			\
  (_entry)->tag_lo[(_way)] = (_tag)

#endif

/* In GET_ENTRY, we cast away the high 32 bits,
   as the tag is only in the low 32.  */

#define GET_ENTRY(_addr)						   \
  ((struct __cache_tag_array *)						   \
   si_to_uint (si_a (si_and (si_from_uint ((unsigned int) (addr) (_addr)), \
			     si_from_uint (SET_MASK)),			   \
	       si_from_uint ((unsigned int) __cache_tag_array))))

#define GET_CACHE_LINE(_addr, _way) \
  ((void *) (__cache + ((_addr) & SET_MASK) * WAYS) + ((_way) * LINE_SIZE))

#define CHECK_DIRTY(_vec) (si_to_uint (si_orx ((qword) (_vec))))
#define SET_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] = 1)
#define CHECK_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] == 1)
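
/* A tag_lo value of 1 can never collide with a real tag: tags are
   128-byte-aligned addresses, so their low seven bits are always
   zero.  That makes 1 a safe marker for an empty way.  */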

#define LS_FLAG 0x80000000
#define SET_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] |= LS_FLAG)
#define CHECK_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] & LS_FLAG)
#define GET_LRU(_entry, _way) ((_entry)->reserved[(_way)] & ~LS_FLAG)
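
/* Each reserved word doubles as an LRU counter (low bits) and an
   "is local store" flag (MSB).  __cache_fetch_dirty bumps all four
   counters of a set on every lookup and clears the counter of the
   way that hits, so the way with the largest GET_LRU value is the
   least recently used.  */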

static int dma_tag = 32;

static void
__cache_evict_entry (struct __cache_tag_array *entry, int way)
{
  addr tag = GET_TAG (entry, way);

  if (CHECK_DIRTY (entry->dirty_bits[way]) && !CHECK_IS_LS (entry, way))
    {
#ifdef NONATOMIC
      /* Non-atomic writes.  */
      unsigned int oldmask, mach_stat;
      char *line = ((void *) 0);

      /* Enter critical section.  */
      mach_stat = spu_readch (SPU_RdMachStat);
      spu_idisable ();

      /* Issue DMA request.  */
      line = GET_CACHE_LINE (entry->tag_lo[way], way);
      mfc_put (line, tag, LINE_SIZE, dma_tag, 0, 0);

      /* Wait for DMA completion.  */
      oldmask = mfc_read_tag_mask ();
      mfc_write_tag_mask (1 << dma_tag);
      mfc_read_tag_status_all ();
      mfc_write_tag_mask (oldmask);

      /* Leave critical section.  */
      if (__builtin_expect (mach_stat & 1, 0))
	spu_ienable ();
#else
      /* Allocate a buffer large enough that we know it has 128 bytes
         that are 128-byte aligned (for DMA).  */

      char buffer[LINE_SIZE + 127];
      qword *buf_ptr = (qword *) (((unsigned int) (buffer) + 127) & ~127);
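      /* buf_ptr above is the first 128-byte-aligned address inside the
         oversized stack buffer; getllar/putllc always transfer a full,
         naturally aligned 128-byte line.  */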
      qword *line = GET_CACHE_LINE (entry->tag_lo[way], way);
      qword bits;
      unsigned int mach_stat;

      /* Enter critical section.  */
      mach_stat = spu_readch (SPU_RdMachStat);
      spu_idisable ();

      do
	{
	  /* We atomically read the current memory into a buffer,
	     modify the dirty bytes in the buffer, and write it
	     back.  If the writeback fails, loop and try again.  */

	  mfc_getllar (buf_ptr, tag, 0, 0);
	  mfc_read_atomic_status ();

	  /* The method we're using to write 16 dirty bytes into
	     the buffer at a time uses fsmb, which in turn uses
	     the least significant 16 bits of word 0, so we
	     load the bits and rotate so that the first bit of
	     the bitmap is in the first bit that fsmb will use.  */

	  bits = (qword) entry->dirty_bits[way];
	  bits = si_rotqbyi (bits, -2);

	  /* si_fsmb creates the mask of dirty bytes.
	     Use selb to nab the appropriate bits.  */
	  buf_ptr[0] = si_selb (buf_ptr[0], line[0], si_fsmb (bits));

	  /* Rotate to the next 16-byte section of the line.  */
	  bits = si_rotqbyi (bits, 2);

	  buf_ptr[1] = si_selb (buf_ptr[1], line[1], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[2] = si_selb (buf_ptr[2], line[2], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[3] = si_selb (buf_ptr[3], line[3], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[4] = si_selb (buf_ptr[4], line[4], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[5] = si_selb (buf_ptr[5], line[5], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[6] = si_selb (buf_ptr[6], line[6], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);
	  buf_ptr[7] = si_selb (buf_ptr[7], line[7], si_fsmb (bits));
	  bits = si_rotqbyi (bits, 2);

	  mfc_putllc (buf_ptr, tag, 0, 0);
	}
      while (mfc_read_atomic_status ());
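      /* mfc_putllc succeeds only while the reservation taken by
	 mfc_getllar is still intact; a nonzero atomic status means the
	 line changed under us, so the read-modify-write is retried.  */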

      /* Leave critical section.  */
      if (__builtin_expect (mach_stat & 1, 0))
	spu_ienable ();
#endif
    }

  /* In any case, mark the lo tag with 1, which denotes empty.  */
  SET_EMPTY (entry, way);
  entry->dirty_bits[way] = (vector unsigned short) si_from_uint (0);
}

void
__cache_evict (__ea void *ea)
{
  addr tag = (addr) ea & ~TAG_MASK;
  struct __cache_tag_array *entry = GET_ENTRY (ea);
  int i = 0;

  /* Cycle through all the ways this address could be cached in and
     evict the way if it is found.  */

  for (i = 0; i < WAYS; i++)
    if (CHECK_TAG (entry, i, tag))
      __cache_evict_entry (entry, i);
}

static void *
__cache_fill (int way, addr tag)
{
  unsigned int oldmask, mach_stat;
  char *line = ((void *) 0);

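  /* Valid MFC tag-group IDs are 0-31, so dma_tag == 32 means "not yet
     reserved"; the tag is reserved lazily on the first fill below.  */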
  /* Reserve our DMA tag.  */
  if (dma_tag == 32)
    dma_tag = mfc_tag_reserve ();

  /* Enter critical section.  */
  mach_stat = spu_readch (SPU_RdMachStat);
  spu_idisable ();

  /* Issue DMA request.  */
  line = GET_CACHE_LINE (tag, way);
  mfc_get (line, tag, LINE_SIZE, dma_tag, 0, 0);

  /* Wait for DMA completion.  */
  oldmask = mfc_read_tag_mask ();
  mfc_write_tag_mask (1 << dma_tag);
  mfc_read_tag_status_all ();
  mfc_write_tag_mask (oldmask);

  /* Leave critical section.  */
  if (__builtin_expect (mach_stat & 1, 0))
    spu_ienable ();

  return (void *) line;
}

static void
__cache_miss (__ea void *ea, struct __cache_tag_array *entry, int way)
{

  addr tag = (addr) ea & ~TAG_MASK;
  unsigned int lru = 0;
  int i = 0;
  int idx = 0;

  /* If way >= 4, then there are no empty ways, so we must evict
     the least recently used entry.  */
  if (way >= 4)
    {
      for (i = 0; i < WAYS; i++)
	{
	  if (GET_LRU (entry, i) > lru)
	    {
	      lru = GET_LRU (entry, i);
	      idx = i;
	    }
	}
      __cache_evict_entry (entry, idx);
      way = idx;
    }

  /* Set the empty entry's tag and fill its cache line.  */

  SET_TAG (entry, way, tag);
  entry->reserved[way] = 0;

  /* Check if the address is just an effective address within the
     SPU's local store.  */

  /* Because the LS is not 256k aligned, we can't simply AND with a
     mask to compare; we must check the whole range.  */

  if ((addr) ea >= (addr) __ea_local_store
      && (addr) ea < (addr) (__ea_local_store + 0x40000))
    {
      SET_IS_LS (entry, way);
      entry->base[way] =
	(void *) ((unsigned int) ((addr) ea -
				  (addr) __ea_local_store) & ~0x7f);
    }
  else
    {
      entry->base[way] = __cache_fill (way, tag);
    }
}

void *
__cache_fetch_dirty (__ea void *ea, int n_bytes_dirty)
{
#ifdef __EA64__
  unsigned int tag_hi;
  qword etag_hi;
#endif
  unsigned int tag_lo;
  struct __cache_tag_array *entry;

  qword etag_lo;
  qword equal;
  qword bit_mask;
  qword way;

  /* In this first chunk we merely compute the entry pointer and tag.  */

  entry = GET_ENTRY (ea);

#ifndef __EA64__
  tag_lo =
    si_to_uint (si_andc
		(si_shufb
		 (si_from_uint ((addr) ea), si_from_uint (0),
		  si_from_uint (0x00010203)), si_from_uint (TAG_MASK)));
#else
  tag_lo =
    si_to_uint (si_andc
		(si_shufb
		 (si_from_ullong ((addr) ea), si_from_uint (0),
		  si_from_uint (0x04050607)), si_from_uint (TAG_MASK)));

  tag_hi =
    si_to_uint (si_shufb
		(si_from_ullong ((addr) ea), si_from_uint (0),
		 si_from_uint (0x00010203)));
#endif
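
  /* The shuffles above are just a branch-free way of building the tag
     words; as a rough scalar sketch, tag_lo is
     (unsigned int) (addr) ea & ~TAG_MASK and, for __EA64__, tag_hi is
     (unsigned int) ((addr) ea >> 32).  */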

  /* Increment the LRU counters in the reserved words (all four ways
     at once).  */
  si_stqd (si_ai (si_lqd (si_from_ptr (entry), 48), 1),
	   si_from_ptr (entry), 48);

missreturn:
  /* Check if the entry's lo_tag is equal to the address' lo_tag.  */
  etag_lo = si_lqd (si_from_ptr (entry), 0);
  equal = si_ceq (etag_lo, si_from_uint (tag_lo));
#ifdef __EA64__
  /* And the high tag too.  */
  etag_hi = si_lqd (si_from_ptr (entry), 16);
  equal = si_and (equal, (si_ceq (etag_hi, si_from_uint (tag_hi))));
#endif

  if (si_to_uint (si_orx (equal)) == 0)
    goto misshandler;

  if (n_bytes_dirty)
    {
      /* way = 0x40,0x50,0x60,0x70 for each way, which is also the
         offset of the appropriate dirty bits.  */
      way = si_shli (si_clz (si_gbb (equal)), 2);
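
      /* The matching word of EQUAL is all ones, so si_gbb sets four
	 consecutive bits, si_clz returns 16, 20, 24 or 28, and the
	 shift by two turns that into the quadword offset (0x40-0x70)
	 of this way's dirty_bits within the entry.  */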

      /* To create the bit_mask, we set it to all 1s (uint -1), then we
         shift it left by (128 - n_bytes_dirty) bits.  */

      bit_mask = si_from_uint (-1);

      bit_mask =
	si_shlqby (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) / 8));

      bit_mask =
	si_shlqbi (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) % 8));

      /* Rotate it around to the correct offset.  */
      bit_mask =
	si_rotqby (bit_mask,
		   si_from_uint (-1 * ((addr) ea & TAG_MASK) / 8));

      bit_mask =
	si_rotqbi (bit_mask,
		   si_from_uint (-1 * ((addr) ea & TAG_MASK) % 8));
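
      /* For example, with n_bytes_dirty == 4 and an offset of 16 bytes
	 into the line, the shifts leave only the top four bits of
	 bit_mask set and the rotates move them down to bits 16-19,
	 marking bytes 16-19 of the line dirty.  */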

      /* Update the dirty bits.  */
      si_stqx (si_or (si_lqx (si_from_ptr (entry), way), bit_mask),
	       si_from_ptr (entry), way);
    }

  /* We've definitely found the right entry, so set its LRU (reserved)
     word to 0 while maintaining the LS flag (MSB).  */

  si_stqd (si_andc
	   (si_lqd (si_from_ptr (entry), 48),
	    si_and (equal, si_from_uint (~(LS_FLAG)))),
	   si_from_ptr (entry), 48);

  return (void *)
    si_to_uint (si_a
		(si_orx
		 (si_and (si_lqd (si_from_ptr (entry), 32), equal)),
		 si_from_uint (((unsigned int) (addr) ea) & TAG_MASK)));

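  /* On a miss, si_ceqi (etag_lo, 1) finds ways still marked empty; the
     clz/gbb arithmetic below converts the first match into a way number
     0-3, or 4 when no way is empty, in which case __cache_miss evicts
     the least recently used way.  Once the line is in place we jump
     back and take the hit path.  */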
misshandler:
  equal = si_ceqi (etag_lo, 1);
  __cache_miss (ea, entry, (si_to_uint (si_clz (si_gbb (equal))) - 16) >> 2);
  goto missreturn;
}

void *
__cache_fetch (__ea void *ea)
{
  return __cache_fetch_dirty (ea, 0);
}

void
__cache_touch (__ea void *ea __attribute__ ((unused)))
{
  /* NO-OP for now.  */
}

void __cache_flush (void) __attribute__ ((destructor));
void
__cache_flush (void)
{
  struct __cache_tag_array *entry = __cache_tag_array;
  unsigned int i;
  int j;

  /* Cycle through each cache entry and evict all used ways.  */

  for (i = 0; i < CACHE_LINES / WAYS; i++)
    {
      for (j = 0; j < WAYS; j++)
	if (!CHECK_EMPTY (entry, j))
	  __cache_evict_entry (entry, j);

      entry++;
    }
}