1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10  *                                                                  *
11  ********************************************************************
12 
13   function:
14     last mod: $Id: mmxstate.c 16584 2009-09-26 19:35:55Z tterribe $
15 
16  ********************************************************************/
17 
18 /*MMX acceleration of complete fragment reconstruction algorithm.
19   Originally written by Rudolf Marek.*/
20 #include <string.h>
21 #include "x86int.h"
22 #include "mmxfrag.h"
23 #include "mmxloop.h"
24 
25 #if defined(OC_X86_ASM)
26 
/*Reconstructs one fragment into the OC_FRAME_SELF reference buffer.
  The coefficients are first inverse transformed in place (with a shortcut
   when only the DC coefficient is present), and the result is then handed to
   the intra or inter reconstruction routine selected by the fragment's
   macroblock mode.
  _state:      The decoder state to read buffers, modes, and motion vectors
                from.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The color plane the fragment lies in.
  _dct_coeffs: The 64 coefficients of the fragment.
               These are overwritten in place.
  _last_zzi:   Values less than 2 select the DC-only path; anything else is
                passed on to oc_idct8x8_mmx().
  _dc_quant:   The factor the DC coefficient is multiplied by.*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
  const unsigned char *ref;
  unsigned char       *dst;
  ptrdiff_t            frag_buf_off;
  int                  mvoffsets[2];
  int                  ystride;
  int                  mb_mode;
  /*Step 1: the inverse transform.*/
  if(_last_zzi>=2){
    /*General case: dequantize the DC coefficient and run the full iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
  }
  else{
    /*DC-only case: every output value is the same, so no iDCT is needed.*/
    /*This must be an unsigned type so that the movzx below zero-extends it
       instead of sign-extending it when loading it into a register.*/
    ogg_uint16_t p;
    ogg_int32_t  dc;
    /*Unlike the other dequantization products, this one is rounded, since
       there is no iDCT here to do the rounding for us.*/
    dc=_dct_coeffs[0]*(ogg_int32_t)_dc_quant;
    p=(ogg_int16_t)((dc+15)>>5);
    /*Replicate p into all 64 entries (128 bytes) of _dct_coeffs.*/
    __asm{
#define Y eax
#define P ecx
      mov Y,_dct_coeffs
      movzx P,p
      /*mm0=0000 0000 0000 AAAA*/
      movd mm0,P
      /*mm0=0000 0000 AAAA AAAA*/
      punpcklwd mm0,mm0
      /*mm0=AAAA AAAA AAAA AAAA*/
      punpckldq mm0,mm0
      movq [Y],mm0
      movq [8+Y],mm0
      movq [16+Y],mm0
      movq [24+Y],mm0
      movq [32+Y],mm0
      movq [40+Y],mm0
      movq [48+Y],mm0
      movq [56+Y],mm0
      movq [64+Y],mm0
      movq [72+Y],mm0
      movq [80+Y],mm0
      movq [88+Y],mm0
      movq [96+Y],mm0
      movq [104+Y],mm0
      movq [112+Y],mm0
      movq [120+Y],mm0
#undef Y
#undef P
    }
  }
  /*Step 2: locate the destination block and reconstruct into it.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
  mb_mode=_state->frags[_fragi].mb_mode;
  ystride=_state->ref_ystride[_pli];
  dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
  if(mb_mode==OC_MODE_INTRA){
    /*Intra fragments need no predictor.*/
    oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
    return;
  }
  /*Inter fragments predict from the reference frame chosen by the mode.*/
  ref=
   _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
   +frag_buf_off;
  /*More than one offset means two reference blocks are combined.*/
  if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
   _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])<=1){
    oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
  }
  else{
    oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
     _dct_coeffs);
  }
}
99 
/*We copy this entire function to inline the actual MMX routines so that we
   use only a single indirect call.*/
102 
103 /*Copies the fragments specified by the lists of fragment indices from one
104    frame to another.
105   _fragis:    A pointer to a list of fragment indices.
106   _nfragis:   The number of fragment indices to copy.
107   _dst_frame: The reference frame to copy to.
108   _src_frame: The reference frame to copy from.
109   _pli:       The color plane the fragments lie in.*/
oc_state_frag_copy_list_mmx(const oc_theora_state * _state,const ptrdiff_t * _fragis,ptrdiff_t _nfragis,int _dst_frame,int _src_frame,int _pli)110 void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
111  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
112  int _dst_frame,int _src_frame,int _pli){
113   const ptrdiff_t     *frag_buf_offs;
114   const unsigned char *src_frame_data;
115   unsigned char       *dst_frame_data;
116   ptrdiff_t            fragii;
117   int                  ystride;
118   dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
119   src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
120   ystride=_state->ref_ystride[_pli];
121   frag_buf_offs=_state->frag_buf_offs;
122   for(fragii=0;fragii<_nfragis;fragii++){
123     ptrdiff_t frag_buf_off;
124     frag_buf_off=frag_buf_offs[_fragis[fragii]];
125 #define SRC edx
126 #define DST eax
127 #define YSTRIDE ecx
128 #define YSTRIDE3 edi
129     OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
130      src_frame_data+frag_buf_off,ystride);
131 #undef SRC
132 #undef DST
133 #undef YSTRIDE
134 #undef YSTRIDE3
135   }
136 }
137 
138 /*Apply the loop filter to a given set of fragment rows in the given plane.
139   The filter may be run on the bottom edge, affecting pixels in the next row of
140    fragments, so this row also needs to be available.
141   _bv:        The bounding values array.
142   _refi:      The index of the frame buffer to filter.
143   _pli:       The color plane to filter.
144   _fragy0:    The Y coordinate of the first fragment row to filter.
145   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
oc_state_loop_filter_frag_rows_mmx(const oc_theora_state * _state,int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end)146 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
147  int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
148   OC_ALIGN8(unsigned char  ll[8]);
149   const oc_fragment_plane *fplane;
150   const oc_fragment       *frags;
151   const ptrdiff_t         *frag_buf_offs;
152   unsigned char           *ref_frame_data;
153   ptrdiff_t                fragi_top;
154   ptrdiff_t                fragi_bot;
155   ptrdiff_t                fragi0;
156   ptrdiff_t                fragi0_end;
157   int                      ystride;
158   int                      nhfrags;
159   memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
160   fplane=_state->fplanes+_pli;
161   nhfrags=fplane->nhfrags;
162   fragi_top=fplane->froffset;
163   fragi_bot=fragi_top+fplane->nfrags;
164   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
165   fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
166   ystride=_state->ref_ystride[_pli];
167   frags=_state->frags;
168   frag_buf_offs=_state->frag_buf_offs;
169   ref_frame_data=_state->ref_frame_data[_refi];
170   /*The following loops are constructed somewhat non-intuitively on purpose.
171     The main idea is: if a block boundary has at least one coded fragment on
172      it, the filter is applied to it.
173     However, the order that the filters are applied in matters, and VP3 chose
174      the somewhat strange ordering used below.*/
175   while(fragi0<fragi0_end){
176     ptrdiff_t fragi;
177     ptrdiff_t fragi_end;
178     fragi=fragi0;
179     fragi_end=fragi+nhfrags;
180     while(fragi<fragi_end){
181       if(frags[fragi].coded){
182         unsigned char *ref;
183         ref=ref_frame_data+frag_buf_offs[fragi];
184 #define PIX eax
185 #define YSTRIDE3 edi
186 #define YSTRIDE ecx
187 #define LL edx
188 #define D esi
189 #define D_WORD si
190         if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
191         if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
192         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
193           OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
194         }
195         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
196           OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
197         }
198 #undef PIX
199 #undef YSTRIDE3
200 #undef YSTRIDE
201 #undef LL
202 #undef D
203 #undef D_WORD
204       }
205       fragi++;
206     }
207     fragi0+=nhfrags;
208   }
209 }
210 
211 #endif
212