1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "vpx_config.h"
12 #include "vp8_rtcd.h"
13 #include "vpx_ports/mem.h"
14 
/*
 * Six-tap coefficient table, defined outside this file.  The predict
 * functions in this file pick a row with the x/y offset and pass the row
 * pointer to the filter routines declared below.
 * NOTE(review): the 6 * 8 row length suggests each of the six taps is stored
 * replicated eight times for SIMD use -- confirm against the definition.
 */
extern const short vp8_six_tap_x86[8][6 * 8];

/*
 * The routines below are only declared here; their definitions are not in
 * this file (presumably the x86 assembly sources -- confirm).
 */

/*
 * First-pass routines: read 8-bit source pixels and write unsigned short
 * intermediate data for a following second pass.
 */
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);
extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);

/*
 * Used by the 16x16 SSE2 predictor in place of the first pass when xoffset
 * is zero; takes no filter argument.
 * NOTE(review): presumably just widens source bytes to unsigned short --
 * confirm against the implementation.
 */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);

/*
 * Second-pass routines: read the unsigned short intermediate data and write
 * 8-bit output pixels.
 */
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/*
 * Single-pass routines, called when only one pass is run: 8-bit source in,
 * 8-bit destination out, no intermediate buffer.
 */
extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_pitch,
                                              unsigned int output_height,
                                              const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
76 
77 #if HAVE_MMX
vp8_sixtap_predict4x4_mmx(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)78 void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
79                                int xoffset, int yoffset, unsigned char *dst_ptr,
80                                int dst_pitch) {
81   DECLARE_ALIGNED(16, unsigned short,
82                   FData2[16 * 16]); /* Temp data bufffer used in filtering */
83   const short *HFilter, *VFilter;
84   HFilter = vp8_six_tap_x86[xoffset];
85   vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
86                             src_pixels_per_line, 1, 9, 8, HFilter);
87   VFilter = vp8_six_tap_x86[yoffset];
88   vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
89                              VFilter);
90 }
91 #endif
92 
93 #if HAVE_SSE2
vp8_sixtap_predict16x16_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)94 void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
95                                   int src_pixels_per_line, int xoffset,
96                                   int yoffset, unsigned char *dst_ptr,
97                                   int dst_pitch) {
98   DECLARE_ALIGNED(16, unsigned short,
99                   FData2[24 * 24]); /* Temp data bufffer used in filtering */
100 
101   const short *HFilter, *VFilter;
102 
103   if (xoffset) {
104     if (yoffset) {
105       HFilter = vp8_six_tap_x86[xoffset];
106       vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
107                                    src_pixels_per_line, 1, 21, 32, HFilter);
108       VFilter = vp8_six_tap_x86[yoffset];
109       vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
110                                    dst_pitch, VFilter);
111     } else {
112       /* First-pass only */
113       HFilter = vp8_six_tap_x86[xoffset];
114       vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
115                                         dst_pitch, 16, HFilter);
116     }
117   } else {
118     /* Second-pass only */
119     VFilter = vp8_six_tap_x86[yoffset];
120     vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
121                                  src_pixels_per_line, 21, 32);
122     vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
123                                  dst_pitch, VFilter);
124   }
125 }
126 
vp8_sixtap_predict8x8_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)127 void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
128                                 int xoffset, int yoffset,
129                                 unsigned char *dst_ptr, int dst_pitch) {
130   DECLARE_ALIGNED(16, unsigned short,
131                   FData2[256]); /* Temp data bufffer used in filtering */
132   const short *HFilter, *VFilter;
133 
134   if (xoffset) {
135     if (yoffset) {
136       HFilter = vp8_six_tap_x86[xoffset];
137       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
138                                   src_pixels_per_line, 1, 13, 16, HFilter);
139       VFilter = vp8_six_tap_x86[yoffset];
140       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
141                                   dst_pitch, VFilter);
142     } else {
143       /* First-pass only */
144       HFilter = vp8_six_tap_x86[xoffset];
145       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
146                                        dst_pitch, 8, HFilter);
147     }
148   } else {
149     /* Second-pass only */
150     VFilter = vp8_six_tap_x86[yoffset];
151     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
152                                      src_pixels_per_line, dst_ptr, dst_pitch, 8,
153                                      VFilter);
154   }
155 }
156 
vp8_sixtap_predict8x4_sse2(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)157 void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
158                                 int xoffset, int yoffset,
159                                 unsigned char *dst_ptr, int dst_pitch) {
160   DECLARE_ALIGNED(16, unsigned short,
161                   FData2[256]); /* Temp data bufffer used in filtering */
162   const short *HFilter, *VFilter;
163 
164   if (xoffset) {
165     if (yoffset) {
166       HFilter = vp8_six_tap_x86[xoffset];
167       vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
168                                   src_pixels_per_line, 1, 9, 16, HFilter);
169       VFilter = vp8_six_tap_x86[yoffset];
170       vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
171                                   dst_pitch, VFilter);
172     } else {
173       /* First-pass only */
174       HFilter = vp8_six_tap_x86[xoffset];
175       vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
176                                        dst_pitch, 4, HFilter);
177     }
178   } else {
179     /* Second-pass only */
180     VFilter = vp8_six_tap_x86[yoffset];
181     vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
182                                      src_pixels_per_line, dst_ptr, dst_pitch, 4,
183                                      VFilter);
184   }
185 }
186 
187 #endif
188 
189 #if HAVE_SSSE3
190 
/*
 * SSSE3 six-tap filter routines.  Only declared here; their definitions are
 * not in this file (presumably the x86 assembly sources -- confirm).
 *
 * All six share one signature: 8-bit source and its stride, 8-bit output and
 * its pitch, the number of output rows, and a filter index.  The callers in
 * this file pass xoffset or yoffset as that index rather than a pointer to a
 * coefficient row.
 */

/* Horizontal filters for 16-, 8- and 4-pixel-wide blocks. */
extern void vp8_filter_block1d16_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

/* Vertical filters for 16-, 8- and 4-pixel-wide blocks. */
extern void vp8_filter_block1d16_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);
232 
vp8_sixtap_predict16x16_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)233 void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
234                                    int src_pixels_per_line, int xoffset,
235                                    int yoffset, unsigned char *dst_ptr,
236                                    int dst_pitch) {
237   DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
238 
239   if (xoffset) {
240     if (yoffset) {
241       vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
242                                     src_pixels_per_line, FData2, 16, 21,
243                                     xoffset);
244       vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
245                                     yoffset);
246     } else {
247       /* First-pass only */
248       vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
249                                     dst_pitch, 16, xoffset);
250     }
251   } else {
252     if (yoffset) {
253       /* Second-pass only */
254       vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
255                                     src_pixels_per_line, dst_ptr, dst_pitch, 16,
256                                     yoffset);
257     } else {
258       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
259        * yoffset==0) case correctly. Add copy function here to guarantee
260        * six-tap function handles all possible offsets. */
261       vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
262     }
263   }
264 }
265 
vp8_sixtap_predict8x8_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)266 void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
267                                  int src_pixels_per_line, int xoffset,
268                                  int yoffset, unsigned char *dst_ptr,
269                                  int dst_pitch) {
270   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
271 
272   if (xoffset) {
273     if (yoffset) {
274       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
275                                    src_pixels_per_line, FData2, 8, 13, xoffset);
276       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
277     } else {
278       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
279                                    dst_pitch, 8, xoffset);
280     }
281   } else {
282     if (yoffset) {
283       /* Second-pass only */
284       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
285                                    src_pixels_per_line, dst_ptr, dst_pitch, 8,
286                                    yoffset);
287     } else {
288       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
289        * yoffset==0) case correctly. Add copy function here to guarantee
290        * six-tap function handles all possible offsets. */
291       vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
292     }
293   }
294 }
295 
vp8_sixtap_predict8x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)296 void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
297                                  int src_pixels_per_line, int xoffset,
298                                  int yoffset, unsigned char *dst_ptr,
299                                  int dst_pitch) {
300   DECLARE_ALIGNED(16, unsigned char, FData2[256]);
301 
302   if (xoffset) {
303     if (yoffset) {
304       vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
305                                    src_pixels_per_line, FData2, 8, 9, xoffset);
306       vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
307     } else {
308       /* First-pass only */
309       vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
310                                    dst_pitch, 4, xoffset);
311     }
312   } else {
313     if (yoffset) {
314       /* Second-pass only */
315       vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
316                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
317                                    yoffset);
318     } else {
319       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
320        * yoffset==0) case correctly. Add copy function here to guarantee
321        * six-tap function handles all possible offsets. */
322       vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
323     }
324   }
325 }
326 
vp8_sixtap_predict4x4_ssse3(unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * dst_ptr,int dst_pitch)327 void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
328                                  int src_pixels_per_line, int xoffset,
329                                  int yoffset, unsigned char *dst_ptr,
330                                  int dst_pitch) {
331   DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
332 
333   if (xoffset) {
334     if (yoffset) {
335       vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
336                                    src_pixels_per_line, FData2, 4, 9, xoffset);
337       vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
338     } else {
339       vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
340                                    dst_pitch, 4, xoffset);
341     }
342   } else {
343     if (yoffset) {
344       vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
345                                    src_pixels_per_line, dst_ptr, dst_pitch, 4,
346                                    yoffset);
347     } else {
348       /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
349        * yoffset==0) case correctly. Add copy function here to guarantee
350        * six-tap function handles all possible offsets. */
351       int r;
352 
353       for (r = 0; r < 4; ++r) {
354         dst_ptr[0] = src_ptr[0];
355         dst_ptr[1] = src_ptr[1];
356         dst_ptr[2] = src_ptr[2];
357         dst_ptr[3] = src_ptr[3];
358         dst_ptr += dst_pitch;
359         src_ptr += src_pixels_per_line;
360       }
361     }
362   }
363 }
364 
365 #endif
366