1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10  *                                                                  *
11  ********************************************************************
12 
13   function:
14     last mod: $Id: mmxstate.c 16503 2009-08-22 18:14:02Z giles $
15 
16  ********************************************************************/
17 
18 /*MMX acceleration of complete fragment reconstruction algorithm.
19   Originally written by Rudolf Marek.*/
20 #include <string.h>
21 #include "x86int.h"
22 #include "mmxfrag.h"
23 #include "mmxloop.h"
24 
25 #if defined(OC_X86_ASM)
26 
/*Reconstructs one fragment into the OC_FRAME_SELF reference frame.
  First the inverse transform is applied in place to _dct_coeffs, then the
   result is written out by one of the MMX fragment reconstruction routines,
   chosen by the fragment's macro block mode and motion vector.
  _state:      The state supplying the fragment buffer offsets, fragment modes,
                motion vectors, strides, and reference frame buffers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The color plane the fragment lies in; used to look up the
                stride and passed on to oc_state_get_mv_offsets().
  _dct_coeffs: The 64 coefficients of the fragment.
               This buffer is overwritten.
  _last_zzi:   Values less than 2 select the DC-only fast path; otherwise it
                is forwarded unchanged to oc_idct8x8_mmx().
  _dc_quant:   The multiplier used to dequantize _dct_coeffs[0].*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
  int            mb_mode;
  /*Apply the inverse transform.*/
  /*Special case only having a DC component.*/
  if(_last_zzi<2){
    /*Note that this value must be unsigned, to keep the __asm__ block from
       sign-extending it when it puts it in a register.*/
    ogg_uint16_t p;
    /*We round this dequant product (and not any of the others) because there's
       no iDCT rounding.
      The addition is performed before the shift.*/
    p=(ogg_int16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
    /*Fill _dct_coeffs with p.
      The low 16 bits of p are replicated into all four words of mm0, which is
       then stored 16 times to cover all 128 bytes (64 coefficients).*/
    __asm__ __volatile__(
      /*mm0=0000 0000 0000 AAAA*/
      "movd %[p],%%mm0\n\t"
      /*mm0=0000 0000 AAAA AAAA*/
      "punpcklwd %%mm0,%%mm0\n\t"
      /*mm0=AAAA AAAA AAAA AAAA*/
      "punpckldq %%mm0,%%mm0\n\t"
      "movq %%mm0,(%[y])\n\t"
      "movq %%mm0,8(%[y])\n\t"
      "movq %%mm0,16(%[y])\n\t"
      "movq %%mm0,24(%[y])\n\t"
      "movq %%mm0,32(%[y])\n\t"
      "movq %%mm0,40(%[y])\n\t"
      "movq %%mm0,48(%[y])\n\t"
      "movq %%mm0,56(%[y])\n\t"
      "movq %%mm0,64(%[y])\n\t"
      "movq %%mm0,72(%[y])\n\t"
      "movq %%mm0,80(%[y])\n\t"
      "movq %%mm0,88(%[y])\n\t"
      "movq %%mm0,96(%[y])\n\t"
      "movq %%mm0,104(%[y])\n\t"
      "movq %%mm0,112(%[y])\n\t"
      "movq %%mm0,120(%[y])\n\t"
      :
      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
      /*The stores go through a pointer the compiler cannot see into.*/
      :"memory"
    );
  }
  else{
    /*Dequantize the DC coefficient.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
  mb_mode=_state->frags[_fragi].mb_mode;
  ystride=_state->ref_ystride[_pli];
  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
  /*Intra fragments need no predictor.*/
  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
  else{
    const unsigned char *ref;
    int                  mvoffsets[2];
    /*The predictor comes from the reference frame selected by the mode, at
       the same fragment offset as the destination.*/
    ref=
     _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
     +frag_buf_off;
    /*A return value greater than 1 means both offsets in mvoffsets are to be
       used, so the two-reference routine is called; otherwise only
       mvoffsets[0] is used.*/
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
     _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
       _dct_coeffs);
    }
    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
  }
}
96 
/*We copy these entire functions to inline the actual MMX routines so that we
   use only a single indirect call.*/
99 
100 /*Copies the fragments specified by the lists of fragment indices from one
101    frame to another.
102   _fragis:    A pointer to a list of fragment indices.
103   _nfragis:   The number of fragment indices to copy.
104   _dst_frame: The reference frame to copy to.
105   _src_frame: The reference frame to copy from.
106   _pli:       The color plane the fragments lie in.*/
oc_state_frag_copy_list_mmx(const oc_theora_state * _state,const ptrdiff_t * _fragis,ptrdiff_t _nfragis,int _dst_frame,int _src_frame,int _pli)107 void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
108  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
109  int _dst_frame,int _src_frame,int _pli){
110   const ptrdiff_t     *frag_buf_offs;
111   const unsigned char *src_frame_data;
112   unsigned char       *dst_frame_data;
113   ptrdiff_t            fragii;
114   int                  ystride;
115   dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
116   src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
117   ystride=_state->ref_ystride[_pli];
118   frag_buf_offs=_state->frag_buf_offs;
119   for(fragii=0;fragii<_nfragis;fragii++){
120     ptrdiff_t frag_buf_off;
121     frag_buf_off=frag_buf_offs[_fragis[fragii]];
122     OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
123      src_frame_data+frag_buf_off,ystride);
124   }
125 }
126 
127 /*Apply the loop filter to a given set of fragment rows in the given plane.
128   The filter may be run on the bottom edge, affecting pixels in the next row of
129    fragments, so this row also needs to be available.
130   _bv:        The bounding values array.
131   _refi:      The index of the frame buffer to filter.
132   _pli:       The color plane to filter.
133   _fragy0:    The Y coordinate of the first fragment row to filter.
134   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
oc_state_loop_filter_frag_rows_mmx(const oc_theora_state * _state,int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end)135 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
136  int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
137   OC_ALIGN8(unsigned char   ll[8]);
138   const oc_fragment_plane *fplane;
139   const oc_fragment       *frags;
140   const ptrdiff_t         *frag_buf_offs;
141   unsigned char           *ref_frame_data;
142   ptrdiff_t                fragi_top;
143   ptrdiff_t                fragi_bot;
144   ptrdiff_t                fragi0;
145   ptrdiff_t                fragi0_end;
146   int                      ystride;
147   int                      nhfrags;
148   memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
149   fplane=_state->fplanes+_pli;
150   nhfrags=fplane->nhfrags;
151   fragi_top=fplane->froffset;
152   fragi_bot=fragi_top+fplane->nfrags;
153   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
154   fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
155   ystride=_state->ref_ystride[_pli];
156   frags=_state->frags;
157   frag_buf_offs=_state->frag_buf_offs;
158   ref_frame_data=_state->ref_frame_data[_refi];
159   /*The following loops are constructed somewhat non-intuitively on purpose.
160     The main idea is: if a block boundary has at least one coded fragment on
161      it, the filter is applied to it.
162     However, the order that the filters are applied in matters, and VP3 chose
163      the somewhat strange ordering used below.*/
164   while(fragi0<fragi0_end){
165     ptrdiff_t fragi;
166     ptrdiff_t fragi_end;
167     fragi=fragi0;
168     fragi_end=fragi+nhfrags;
169     while(fragi<fragi_end){
170       if(frags[fragi].coded){
171         unsigned char *ref;
172         ref=ref_frame_data+frag_buf_offs[fragi];
173         if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
174         if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
175         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
176           OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
177         }
178         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
179           OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
180         }
181       }
182       fragi++;
183     }
184     fragi0+=nhfrags;
185   }
186 }
187 
188 #endif
189