1 // Copyright (c) 2012-2020 Intel Corporation
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in all
11 // copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 // SOFTWARE.
20 #include "../include/genx_me_common.h"
// Compile-time switches for the bidirectional kernels in this file.
//
// COMPLEX_BIDIR: when non-zero, the bidirectional kernels run two separate
// single-reference IME searches (one per reference surface) instead of one
// dual-reference search; it also selects the imeOut row count and the
// reference-window size used by those kernels.
#define COMPLEX_BIDIR 1
// INVERTMOTION: only consulted when COMPLEX_BIDIR is non-zero. When non-zero,
// the second search window is re-positioned from the forward 16x16 vector
// scaled by backwardRefDist / forwardRefDist before the second IME call.
#define INVERTMOTION  1

// 4 x 32-byte matrix holding the input payload (uniIn) that the kernels fill
// via the VME_SET_* macros and hand to run_vme_ime / run_vme_fbr.
typedef matrix<uchar, 4, 32> UniIn;
25 
26 _GENX_ inline
SetRef(vector_ref<short,2>,vector<short,2> mv_predictor,vector_ref<char,2> searchWindow,vector<uchar,2>,vector_ref<short,2> reference)27 void SetRef(
28     vector_ref<short, 2> /*source*/,     // IN:  SourceX, SourceY
29     vector<short, 2>     mv_predictor,      // IN:  mv predictor
30     vector_ref<char, 2> searchWindow,   // IN:  reference window w/h
31     vector<uchar, 2>    /*picSize*/,    // IN:  pic size w/h
32     vector_ref<short, 2> reference
33 )      // OUT: Ref0X, Ref0Y
34 {
35     vector<short, 2>
36         Width = (searchWindow - 16) >> 1,
37         MaxMvLen,
38         mask,
39         res,
40         otherRes;
41 
42     // set up parameters
43     MaxMvLen[0] = 0x7fff / 4;
44     MaxMvLen[1] = 0x7fff / 4;
45 
46     // fields and MBAFF are not supported
47     // remove quater pixel fraction
48     mv_predictor = mv_predictor >> 2;
49 
50     //
51     // set the reference position
52     //
53     reference = mv_predictor;
54     reference[1] &= -2;
55     reference -= Width;
56 
57     res      = MaxMvLen - Width;
58     mask     = (mv_predictor > res);
59     otherRes = MaxMvLen - (searchWindow - 16);
60     reference.merge(otherRes, mask);
61 
62     res      = -res;
63     mask     = (mv_predictor < res);
64     otherRes = -MaxMvLen;
65     reference.merge(otherRes, mask);
66 }
67 
68 extern "C" _GENX_MAIN_
MeP16_1MV_MRE(SurfaceIndex SURF_CONTROL,SurfaceIndex SURF_SRC_AND_REF,SurfaceIndex SURF_DIST16x16,SurfaceIndex SURF_MV16x16,uint start_xy,uchar blSize)69 void MeP16_1MV_MRE(
70     SurfaceIndex SURF_CONTROL,
71     SurfaceIndex SURF_SRC_AND_REF,
72     SurfaceIndex SURF_DIST16x16,
73     SurfaceIndex SURF_MV16x16,
74     uint         start_xy,
75     uchar        blSize
76 )
77 {
78     vector<uint, 1>
79         start_mbXY = start_xy;
80     uint
81         mbX = get_thread_origin_x() + start_mbXY.format<ushort>()[0],
82         mbY = get_thread_origin_y() + start_mbXY.format<ushort>()[1],
83         x   = mbX * blSize,
84         y   = mbY * blSize;
85 
86     vector<uchar, 96> control;
87     read(SURF_CONTROL, 0, control);
88 
89     uchar
90         maxNumSu = control.format<uchar>()[56],
91         lenSp    = control.format<uchar>()[57];
92     ushort
93         width      = control.format<ushort>()[30],
94         height     = control.format<ushort>()[31],
95         mre_width  = control.format<ushort>()[33],
96         mre_height = control.format<ushort>()[34],
97         precision  = control.format<ushort>()[36];
98 
99     cm_assert(x > width);
100     // read MB record data
101     UniIn
102         uniIn = 0;
103     matrix<uchar, 9, 32>
104         imeOut;
105     matrix<uchar, 2, 32>
106         imeIn = 0;
107     matrix<uchar, 4, 32>
108         fbrIn;
109 
110     // declare parameters for VME
111     matrix<uint, 16, 2>
112         costs = 0;
113     vector<short, 2>
114         mvPred = 0,
115         mvPred2 = 0;
116     uchar
117         x_r = 64,
118         y_r = 32;
119 
120     // load search path
121     imeIn.select<2, 1, 32, 1>(0) = control.select<64, 1>(0);
122 
123     // M0.2
124     VME_SET_UNIInput_SrcX(uniIn, x);
125     VME_SET_UNIInput_SrcY(uniIn, y);
126 
127     // M0.3 various prediction parameters
128     VME_SET_DWORD(uniIn, 0, 3, 0x76a40000); // BMEDisableFBR=1 InterSAD=2 8x8 16x16
129                                             //VME_SET_DWORD(uniIn, 0, 3, 0x76a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x76: 8x8 16x16
130                                             //VME_SET_UNIInput_BMEDisableFBR(uniIn);
131                                             // M1.1 MaxNumMVs
132     VME_SET_UNIInput_MaxNumMVs(uniIn, 32);
133     // M0.5 Reference Window Width & Height
134     VME_SET_UNIInput_RefW(uniIn, x_r);//48);
135     VME_SET_UNIInput_RefH(uniIn, y_r);//40);
136     VME_SET_UNIInput_EarlyImeSuccessEn(uniIn);
137 
138     // M0.0 Ref0X, Ref0Y
139     vector_ref<short, 2>
140         sourceXY = uniIn.row(0).format<short>().select<2, 1>(4);
141     vector<uchar, 2>
142         widthHeight;
143     widthHeight[0] = (height >> 4) - 1;
144     widthHeight[1] = (width >> 4);
145     vector_ref<char, 2>
146         searchWindow = uniIn.row(0).format<char>().select<2, 1>(22);
147 
148     vector_ref<short, 2>
149         ref0XY = uniIn.row(0).format<short>().select<2, 1>(0);
150     SetRef(sourceXY, mvPred, searchWindow, widthHeight, ref0XY);
151 
152     vector_ref<short, 2>
153         ref1XY = uniIn.row(0).format<short>().select<2, 1>(2);
154     SetRef(sourceXY, mvPred2, searchWindow, widthHeight, ref1XY);
155 
156     // M1.0-3 Search path parameters & start centers & MaxNumMVs again!!!
157     VME_SET_UNIInput_AdaptiveEn(uniIn);
158     VME_SET_UNIInput_T8x8FlagForInterEn(uniIn);
159     VME_SET_UNIInput_MaxNumMVs(uniIn, 0x3f);
160     VME_SET_UNIInput_MaxNumSU(uniIn, maxNumSu);
161     VME_SET_UNIInput_LenSP(uniIn, lenSp);
162     //VME_SET_UNIInput_BiWeight(uniIn, 32);
163 
164     // M1.2 Start0X, Start0Y
165     vector<char, 2>
166         start0 = searchWindow;
167     start0 = ((start0 - 16) >> 3) & 0x0f;
168     uniIn.row(1)[10] = start0[0] | (start0[1] << 4);
169 
170     uniIn.row(1)[6] = 0x20;
171     uniIn.row(1)[31] = 0x1;
172 
173     vector<short, 2>
174         ref0 = uniIn.row(0).format<short>().select<2, 1>(0);
175     vector<ushort, 16>
176         costCenter = uniIn.row(3).format<ushort>().select<16, 1>(0);
177 
178     vector<short, 2>
179         mv16;
180     matrix<uint, 1, 1>
181         dist16x16;
182     run_vme_ime(uniIn, imeIn,
183         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
184         SURF_SRC_AND_REF, ref0XY, NULL, costCenter, imeOut);
185     VME_GET_IMEOutput_Rec0_16x16_Mv(imeOut, mv16);
186     VME_GET_IMEOutput_Rec0_16x16_Distortion(imeOut, dist16x16);
187 
188     // distortions calculated before updates (subpel, bidir search)
189     write(SURF_DIST16x16, mbX * DIST_SIZE, mbY, dist16x16); //16x16 Forward SAD
190 
191     if (precision)
192     {//QPEL
193         VME_SET_UNIInput_SubPelMode(uniIn, 3);
194         VME_CLEAR_UNIInput_BMEDisableFBR(uniIn);
195         SLICE(fbrIn.format<uint>(), 1, 16, 2) = 0; // zero L1 motion vectors
196         VME_SET_UNIInput_FBRMbModeInput(uniIn, 0);
197         VME_SET_UNIInput_FBRSubMBShapeInput(uniIn, 0);
198         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 3);
199         matrix<uchar, 7, 32>
200             fbrOut16x16;
201         fbrIn.format<uint, 4, 8>().select<4, 1, 4, 2>(0, 0) = mv16.format<uint>()[0]; // motion vectors 16x16
202         run_vme_fbr(uniIn, fbrIn, SURF_SRC_AND_REF, 0, 0, 0, fbrOut16x16);
203         VME_GET_FBROutput_Rec0_16x16_Mv(fbrOut16x16, mv16);
204         VME_GET_FBROutput_Dist_16x16_Bi(fbrOut16x16, dist16x16);
205     }
206 
207     // distortions Actual complete distortion
208     //write(SURF_DIST16x16, mbX * DIST_SIZE, mbY, dist16x16);
209 
210     // motion vectors
211     write(SURF_MV16x16, mbX * MVDATA_SIZE, mbY, mv16);      //16x16mv Ref0
212 }
213 
214 extern "C" _GENX_MAIN_
MeP16_1MV_MRE_8x8(SurfaceIndex SURF_CONTROL,SurfaceIndex SURF_SRC_AND_REF,SurfaceIndex SURF_DIST8x8,SurfaceIndex SURF_MV8x8,uint start_xy,uchar blSize)215 void MeP16_1MV_MRE_8x8(
216     SurfaceIndex SURF_CONTROL,
217     SurfaceIndex SURF_SRC_AND_REF,
218     SurfaceIndex SURF_DIST8x8,
219     SurfaceIndex SURF_MV8x8,
220     uint         start_xy,
221     uchar        blSize
222 )
223 {
224     vector<uint, 1>
225         start_mbXY = start_xy;
226     uint
227         mbX = get_thread_origin_x() + start_mbXY.format<ushort>()[0],
228         mbY = get_thread_origin_y() + start_mbXY.format<ushort>()[1],
229         x   = mbX * blSize,
230         y   = mbY * blSize;
231 
232     vector<uchar, 96>
233         control;
234     read(SURF_CONTROL, 0, control);
235 
236     uchar
237         maxNumSu = control.format<uchar>()[56],
238         lenSp = control.format<uchar>()[57];
239     ushort
240         width      = control.format<ushort>()[30],
241         height     = control.format<ushort>()[31],
242         mre_width  = control.format<ushort>()[33],
243         mre_height = control.format<ushort>()[34],
244         precision  = control.format<ushort>()[36];
245 
246 
247     // read MB record data
248     UniIn
249         uniIn = 0;
250     matrix<uchar, 9, 32>
251         imeOut;
252     matrix<uchar, 2, 32>
253         imeIn = 0;
254     matrix<uchar, 4, 32>
255         fbrIn;
256 
257     // declare parameters for VME
258     matrix<uint, 16, 2>
259         costs = 0;
260     vector<short, 2>
261         mvPred = 0,
262         mvPred2 = 0;
263     //read(SURF_MV16x16, mbX * MVDATA_SIZE, mbY, mvPred); // these pred MVs will be updated later here
264     uchar
265         x_r = 64,
266         y_r = 32;
267 
268     // load search path
269     imeIn.select<2, 1, 32, 1>(0) = control.select<64, 1>(0);
270 
271     // M0.2
272     VME_SET_UNIInput_SrcX(uniIn, x);
273     VME_SET_UNIInput_SrcY(uniIn, y);
274 
275     // M0.3 various prediction parameters
276     //VME_SET_DWORD(uniIn, 0, 3, 0x76a40000); // BMEDisableFBR=1 InterSAD=2 8x8 16x16
277     //VME_SET_DWORD(uniIn, 0, 3, 0x76a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x76: 8x8 16x16
278     VME_SET_DWORD(uniIn, 0, 3, 0x77a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x77: 8x8
279                                             //VME_SET_UNIInput_BMEDisableFBR(uniIn);
280                                             // M1.1 MaxNumMVs
281     VME_SET_UNIInput_MaxNumMVs(uniIn, 32);
282     // M0.5 Reference Window Width & Height
283     VME_SET_UNIInput_RefW(uniIn, x_r);//48);
284     VME_SET_UNIInput_RefH(uniIn, y_r);//40);
285 
286                                       // M0.0 Ref0X, Ref0Y
287     vector_ref<short, 2>
288         sourceXY = uniIn.row(0).format<short>().select<2, 1>(4);
289     vector<uchar, 2>
290         widthHeight;
291     widthHeight[0] = (height >> 4) - 1;
292     widthHeight[1] = (width >> 4);
293     vector_ref<char, 2>
294         searchWindow = uniIn.row(0).format<char>().select<2, 1>(22);
295 
296     vector_ref<short, 2>
297         ref0XY = uniIn.row(0).format<short>().select<2, 1>(0);
298     SetRef(sourceXY, mvPred, searchWindow, widthHeight, ref0XY);
299 
300     // M1.0-3 Search path parameters & start centers & MaxNumMVs again!!!
301     VME_SET_UNIInput_AdaptiveEn(uniIn);
302     VME_SET_UNIInput_T8x8FlagForInterEn(uniIn);
303     VME_SET_UNIInput_MaxNumMVs(uniIn, 0x3f);
304     VME_SET_UNIInput_MaxNumSU(uniIn, maxNumSu);
305     VME_SET_UNIInput_LenSP(uniIn, lenSp);
306     //VME_SET_UNIInput_BiWeight(uniIn, 32);
307 
308     // M1.2 Start0X, Start0Y
309     vector<char, 2>
310         start0 = searchWindow;
311     start0 = ((start0 - 16) >> 3) & 0x0f;
312     uniIn.row(1)[10] = start0[0] | (start0[1] << 4);
313 
314     uniIn.row(1)[6] = 0x20;
315     uniIn.row(1)[31] = 0x1;
316 
317     vector<short, 2>
318         ref0 = uniIn.row(0).format<short>().select<2, 1>(0);
319     vector<ushort, 16>
320         costCenter = uniIn.row(3).format<ushort>().select<16, 1>(0);
321 
322     VME_SET_UNIInput_EarlyImeSuccessEn(uniIn);
323     matrix<short, 2, 4>
324         mv8;
325     vector<uint, 4>
326         dist8;
327 
328     run_vme_ime(uniIn, imeIn,
329         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
330         SURF_SRC_AND_REF, ref0XY, NULL, costCenter, imeOut);
331     mv8   = imeOut.row(8).format<short>().select<8, 1>(8); // 4 MVs
332     dist8 = imeOut.row(7).format<ushort>().select<4, 1>(4);
333     // distortions Integer search results
334     // 8x8
335     write(SURF_DIST8x8, mbX * DIST_SIZE * 2, mbY * 2, dist8.format<uint, 2, 2>());     //8x8 Forward SAD
336     if (precision)
337     {//QPEL
338         VME_SET_UNIInput_SubPelMode(uniIn, 3);
339         VME_CLEAR_UNIInput_BMEDisableFBR(uniIn);
340         SLICE(fbrIn.format<uint>(), 1, 16, 2) = 0; // zero L1 motion vectors
341         matrix<uchar, 7, 32> fbrOut8x8;
342         VME_SET_UNIInput_FBRMbModeInput(uniIn, 3);
343         VME_SET_UNIInput_FBRSubMBShapeInput(uniIn, 0);
344         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 3);
345         fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(0, 0) = mv8.format<uint>()[0]; // motion vectors 8x8_0
346         fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(1, 0) = mv8.format<uint>()[1]; // motion vectors 8x8_1
347         fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(2, 0) = mv8.format<uint>()[2]; // motion vectors 8x8_2
348         fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(3, 0) = mv8.format<uint>()[3]; // motion vectors 8x8_3
349         run_vme_fbr(uniIn, fbrIn, SURF_SRC_AND_REF, 3, 0, 0, fbrOut8x8);
350         VME_GET_FBROutput_Rec0_8x8_4Mv(fbrOut8x8, mv8.format<uint>());
351         VME_GET_FBROutput_Dist_8x8_Bi(fbrOut8x8, dist8);
352     }
353 
354     // distortions actual complete distortion calculation
355     // 8x8
356     //write(SURF_DIST8x8  , mbX * DIST_SIZE * 2  , mbY * 2, dist8.format<uint,2,2>());     //8x8 Bidir distortions
357 
358     // motion vectors
359     // 8x8
360     write(SURF_MV8x8, mbX * MVDATA_SIZE * 2, mbY * 2, mv8);       //8x8mvs  Ref0
361 }
362 
363 extern "C" _GENX_MAIN_
MeP16bi_1MV2_MRE(SurfaceIndex SURF_CONTROL,SurfaceIndex SURF_SRC_AND_REF,SurfaceIndex SURF_SRC_AND_REF2,SurfaceIndex SURF_DIST16x16,SurfaceIndex SURF_MV16x16,SurfaceIndex SURF_MV16x16_2,uint start_xy,uchar blSize,char forwardRefDist,char backwardRefDist)364 void MeP16bi_1MV2_MRE(
365     SurfaceIndex SURF_CONTROL,
366     SurfaceIndex SURF_SRC_AND_REF,
367     SurfaceIndex SURF_SRC_AND_REF2,
368     SurfaceIndex SURF_DIST16x16,
369     SurfaceIndex SURF_MV16x16,
370     SurfaceIndex SURF_MV16x16_2,
371     uint         start_xy,
372     uchar        blSize,
373     char         forwardRefDist,
374     char         backwardRefDist
375 )
376 {
377     vector<uint, 1>
378         start_mbXY = start_xy;
379     uint
380         mbX = get_thread_origin_x() + start_mbXY.format<ushort>()[0],
381         mbY = get_thread_origin_y() + start_mbXY.format<ushort>()[1],
382         x   = mbX * blSize,
383         y   = mbY * blSize;
384 
385     vector<uchar, 96>
386         control;
387     read(SURF_CONTROL, 0, control);
388 
389     uchar
390         maxNumSu = control.format<uchar>()[56],
391         lenSp = control.format<uchar>()[57];
392     ushort
393         width      = control.format<ushort>()[30],
394         height     = control.format<ushort>()[31],
395         mre_width  = control.format<ushort>()[33],
396         mre_height = control.format<ushort>()[34],
397         precision  = control.format<ushort>()[36];
398 
399     // read MB record data
400     UniIn
401         uniIn = 0;
402 #if COMPLEX_BIDIR
403     matrix<uchar, 9, 32>
404         imeOut;
405 #else
406     matrix<uchar, 11, 32>
407         imeOut;
408 #endif
409     matrix<uchar, 2, 32>
410         imeIn = 0;
411     matrix<uchar, 4, 32>
412         fbrIn;
413 
414     // declare parameters for VME
415     matrix<uint, 16, 2> costs = 0;
416     vector<short, 2>
417         mvPred  = 0,
418         mvPred2 = 0;
419     //read(SURF_MV16x16, mbX * MVDATA_SIZE, mbY, mvPred); // these pred MVs will be updated later here
420 
421 #if COMPLEX_BIDIR
422     uchar
423         x_r = 64,
424         y_r = 32;
425 #else
426     uchar
427         x_r = 32,
428         y_r = 32;
429 #endif
430 
431     // load search path
432     imeIn.select<2, 1, 32, 1>(0) = control.select<64, 1>(0);
433 
434     // M0.2
435     VME_SET_UNIInput_SrcX(uniIn, x);
436     VME_SET_UNIInput_SrcY(uniIn, y);
437 
438     // M0.3 various prediction parameters
439 #if COMPLEX_BIDIR
440     VME_SET_DWORD(uniIn, 0, 3, 0x76a40000); // BMEDisableFBR=1 InterSAD=2 8x8 16x16
441 #else
442     VME_SET_DWORD(uniIn, 0, 3, 0x76a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x76: 8x8 16x16
443 #endif
444     //VME_SET_UNIInput_BMEDisableFBR(uniIn);
445     // M1.1 MaxNumMVs
446     VME_SET_UNIInput_MaxNumMVs(uniIn, 32);
447     // M0.5 Reference Window Width & Height
448     VME_SET_UNIInput_RefW(uniIn, x_r);//48);
449     VME_SET_UNIInput_RefH(uniIn, y_r);//40);
450 
451     // M0.0 Ref0X, Ref0Y
452     vector_ref<short, 2>
453         sourceXY = uniIn.row(0).format<short>().select<2, 1>(4);
454     vector<uchar, 2>
455         widthHeight;
456     widthHeight[0] = (height >> 4) - 1;
457     widthHeight[1] = (width >> 4);
458     vector_ref<char, 2>
459         searchWindow = uniIn.row(0).format<char>().select<2, 1>(22);
460 
461     vector_ref<short, 2>
462         ref0XY = uniIn.row(0).format<short>().select<2, 1>(0);
463     SetRef(sourceXY, mvPred, searchWindow, widthHeight, ref0XY);
464 
465     vector_ref<short, 2>
466         ref1XY = uniIn.row(0).format<short>().select<2, 1>(2);
467 
468     // M1.0-3 Search path parameters & start centers & MaxNumMVs again!!!
469     VME_SET_UNIInput_AdaptiveEn(uniIn);
470     VME_SET_UNIInput_T8x8FlagForInterEn(uniIn);
471     VME_SET_UNIInput_MaxNumMVs(uniIn, 0x3f);
472     VME_SET_UNIInput_MaxNumSU(uniIn, maxNumSu);
473     VME_SET_UNIInput_LenSP(uniIn, lenSp);
474     //VME_SET_UNIInput_BiWeight(uniIn, 32);
475 
476     // M1.2 Start0X, Start0Y
477     vector<char, 2>
478         start0 = searchWindow;
479     start0 = ((start0 - 16) >> 3) & 0x0f;
480     uniIn.row(1)[10] = start0[0] | (start0[1] << 4);
481 
482     uniIn.row(1)[6] = 0x20;
483     uniIn.row(1)[31] = 0x1;
484 
485     vector<short, 2>
486         ref0 = uniIn.row(0).format<short>().select<2, 1>(0);
487     vector<ushort, 16>
488         costCenter = uniIn.row(3).format<ushort>().select<16, 1>(0);
489 
490     VME_SET_UNIInput_EarlyImeSuccessEn(uniIn);
491     vector<short, 2>
492         mv16, mv16_2;
493     matrix<uint, 1, 1>
494         dist16x16,
495         dist16x16_2;
496 #if COMPLEX_BIDIR
497     run_vme_ime(uniIn, imeIn,
498         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
499         SURF_SRC_AND_REF, ref0XY, NULL, costCenter, imeOut);
500     VME_GET_IMEOutput_Rec0_16x16_Mv(imeOut, mv16);
501     VME_GET_IMEOutput_Rec0_16x16_Distortion(imeOut, dist16x16);
502 
503     mvPred2 = mv16 * backwardRefDist / forwardRefDist;
504     SetRef(sourceXY, mvPred2, searchWindow, widthHeight, ref1XY);
505     run_vme_ime(uniIn, imeIn,
506         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
507         SURF_SRC_AND_REF2, ref1XY, NULL, costCenter, imeOut);
508     VME_GET_IMEOutput_Rec0_16x16_Mv(imeOut, mv16_2);
509     VME_GET_IMEOutput_Rec0_16x16_Distortion(imeOut, dist16x16_2);
510 #else
511     run_vme_ime(uniIn, imeIn,
512         VME_STREAM_OUT, VME_SEARCH_DUAL_REF_DUAL_REC,
513         SURF_SRC_AND_REF, ref0XY, ref1XY, costCenter, imeOut);
514 
515     VME_GET_IMEOutput_Rec0_16x16_Mv(imeOut, mv16);
516     VME_GET_IMEOutput_Rec0_16x16_Distortion(imeOut, dist16x16);
517 
518     VME_GET_IMEOutput_Rec1_16x16_Mv(imeOut, mv16_2);
519     VME_GET_IMEOutput_Rec1_16x16_Distortion(imeOut, dist16x16_2);
520 #endif
521     // distortions calculated before updates (subpel, bidir search)
522     write(SURF_DIST16x16, mbX * DIST_SIZE, mbY, dist16x16); //16x16 Forward SAD
523 
524     if (precision)//QPEL
525         VME_SET_UNIInput_SubPelMode(uniIn, 3);
526     else
527         VME_SET_UNIInput_SubPelMode(uniIn, 0);
528     VME_SET_UNIInput_BiWeight(uniIn, 32);
529 
530     VME_CLEAR_UNIInput_BMEDisableFBR(uniIn);
531     SLICE(fbrIn.format<uint>(), 1, 16, 2) = 0; // zero L1 motion vectors
532     VME_SET_UNIInput_FBRMbModeInput(uniIn, 0);
533     VME_SET_UNIInput_FBRSubMBShapeInput(uniIn, 0);
534     if (precision)//QPEL
535         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 3);
536     else
537         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 0);
538 
539     matrix<uchar, 7, 32>
540         fbrOut16x16;
541     fbrIn.format<uint, 4, 8>().select<4, 1, 4, 2>(0, 0) = mv16.format<uint>()[0]; // motion vectors 16x16
542     fbrIn.format<uint, 4, 8>().select<4, 1, 4, 2>(0, 1) = mv16_2.format<uint>()[0];
543     run_vme_fbr(uniIn, fbrIn, SURF_SRC_AND_REF, 0, 0, 170, fbrOut16x16);
544     VME_GET_FBROutput_Rec0_16x16_Mv(fbrOut16x16, mv16);
545     VME_GET_FBROutput_Rec1_16x16_Mv(fbrOut16x16, mv16_2);
546     VME_GET_FBROutput_Dist_16x16_Bi(fbrOut16x16, dist16x16);
547 
548 
549     // distortions Actual complete distortion
550     //write(SURF_DIST16x16, mbX * DIST_SIZE, mbY, dist16x16);
551 
552     // motion vectors
553     write(SURF_MV16x16, mbX * MVDATA_SIZE, mbY, mv16);      //16x16mv Ref0
554     write(SURF_MV16x16_2, mbX * MVDATA_SIZE, mbY, mv16_2);    //16x16mv Ref1
555 }
556 
557 extern "C" _GENX_MAIN_
MeP16bi_1MV2_MRE_8x8(SurfaceIndex SURF_CONTROL,SurfaceIndex SURF_SRC_AND_REF,SurfaceIndex SURF_SRC_AND_REF2,SurfaceIndex SURF_DIST8x8,SurfaceIndex SURF_MV8x8,SurfaceIndex SURF_MV8x8_2,uint start_xy,uchar blSize,char forwardRefDist,char backwardRefDist)558 void MeP16bi_1MV2_MRE_8x8(
559     SurfaceIndex SURF_CONTROL,
560     SurfaceIndex SURF_SRC_AND_REF,
561     SurfaceIndex SURF_SRC_AND_REF2,
562     SurfaceIndex SURF_DIST8x8,
563     SurfaceIndex SURF_MV8x8,
564     SurfaceIndex SURF_MV8x8_2,
565     uint         start_xy,
566     uchar        blSize,
567     char         forwardRefDist,
568     char         backwardRefDist
569 )
570 {
571     vector<uint, 1>
572         start_mbXY = start_xy;
573     uint
574         mbX = get_thread_origin_x() + start_mbXY.format<ushort>()[0],
575         mbY = get_thread_origin_y() + start_mbXY.format<ushort>()[1],
576         x = mbX * blSize,
577         y = mbY * blSize;
578 
579     vector<uchar, 96>
580         control;
581     read(SURF_CONTROL, 0, control);
582 
583     uchar
584         maxNumSu = control.format<uchar>()[56],
585         lenSp    = control.format<uchar>()[57];
586     ushort
587         width      = control.format<ushort>()[30],
588         height     = control.format<ushort>()[31],
589         mre_width  = control.format<ushort>()[33],
590         mre_height = control.format<ushort>()[34],
591         precision  = control.format<ushort>()[36];
592     // read MB record data
593 #if CMRT_EMU
594     if (x >= width)
595         return;
596     cm_assert(x < width);
597 #endif
598     UniIn
599         uniIn = 0;
600 #if COMPLEX_BIDIR
601     matrix<uchar, 9, 32>
602         imeOut;
603 #else
604     matrix<uchar, 11, 32>
605         imeOut;
606 #endif
607     matrix<uchar, 2, 32>
608         imeIn = 0;
609     matrix<uchar, 4, 32>
610         fbrIn;
611 
612     // declare parameters for VME
613     matrix<uint, 16, 2>
614         costs = 0;
615     vector<short, 2>
616         mvPred  = 0,
617         mvPred2 = 0;
618     //read(SURF_MV16x16, mbX * MVDATA_SIZE, mbY, mvPred); // these pred MVs will be updated later here
619 #if COMPLEX_BIDIR
620     uchar x_r = 48;
621     uchar y_r = 40;
622 #else
623     uchar
624         x_r = 32,
625         y_r = 32;
626 #endif
627 
628     // load search path
629     imeIn.select<2, 1, 32, 1>(0) = control.select<64, 1>(0);
630 
631     // M0.2
632     VME_SET_UNIInput_SrcX(uniIn, x);
633     VME_SET_UNIInput_SrcY(uniIn, y);
634 
635     // M0.3 various prediction parameters
636     //VME_SET_DWORD(uniIn, 0, 3, 0x76a40000); // BMEDisableFBR=1 InterSAD=2 8x8 16x16
637     VME_SET_DWORD(uniIn, 0, 3, 0x76a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x76: 8x8 16x16
638     //VME_SET_DWORD(uniIn, 0, 3, 0x77a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x77: 8x8
639     //VME_SET_UNIInput_BMEDisableFBR(uniIn);
640     // M1.1 MaxNumMVs
641     VME_SET_UNIInput_MaxNumMVs(uniIn, 32);
642     // M0.5 Reference Window Width & Height
643     VME_SET_UNIInput_RefW(uniIn, x_r);//48);
644     VME_SET_UNIInput_RefH(uniIn, y_r);//40);
645 
646     // M0.0 Ref0X, Ref0Y
647     vector_ref<short, 2>
648         sourceXY = uniIn.row(0).format<short>().select<2, 1>(4);
649     vector<uchar, 2>
650         widthHeight;
651     widthHeight[0] = (height >> 4) - 1;
652     widthHeight[1] = (width >> 4);
653     vector_ref<char, 2>
654         searchWindow = uniIn.row(0).format<char>().select<2, 1>(22);
655 
656     vector_ref<short, 2>
657         ref0XY = uniIn.row(0).format<short>().select<2, 1>(0);
658     SetRef(sourceXY, mvPred, searchWindow, widthHeight, ref0XY);
659 
660     vector_ref<short, 2>
661         ref1XY = uniIn.row(0).format<short>().select<2, 1>(2);
662     SetRef(sourceXY, mvPred2, searchWindow, widthHeight, ref1XY);
663 
664     // M1.0-3 Search path parameters & start centers & MaxNumMVs again!!!
665     VME_SET_UNIInput_AdaptiveEn(uniIn);
666     VME_SET_UNIInput_T8x8FlagForInterEn(uniIn);
667     VME_SET_UNIInput_MaxNumMVs(uniIn, 0x3f);
668     VME_SET_UNIInput_MaxNumSU(uniIn, maxNumSu);
669     VME_SET_UNIInput_LenSP(uniIn, lenSp);
670     //VME_SET_UNIInput_BiWeight(uniIn, 32);
671 
672     // M1.2 Start0X, Start0Y
673     vector<char, 2>
674         start0 = searchWindow;
675     start0 = ((start0 - 16) >> 3) & 0x0f;
676     uniIn.row(1)[10] = start0[0] | (start0[1] << 4);
677 
678     uniIn.row(1)[6] = 0x20;
679     uniIn.row(1)[31] = 0x1;
680 
681     vector<short, 2>
682         ref0 = uniIn.row(0).format<short>().select<2, 1>(0);
683     vector<ushort, 16>
684         costCenter = uniIn.row(3).format<ushort>().select<16, 1>(0);
685     VME_SET_UNIInput_EarlyImeSuccessEn(uniIn);
686 #if COMPLEX_BIDIR
687     matrix<short, 2, 4>
688         mv8,
689         mv8_2;
690 #else
691     matrix<uint, 2, 2>
692         mv8,
693         mv8_2;
694 #endif
695     vector<uint, 4>
696         dist8,
697         dist8_2;
698 #if COMPLEX_BIDIR
699     run_vme_ime(uniIn, imeIn,
700         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
701         SURF_SRC_AND_REF, ref0XY, NULL, costCenter, imeOut);
702     mv8   = imeOut.row(8).format<short>().select<8, 1>(8); // 4 MVs
703     dist8 = imeOut.row(7).format<ushort>().select<4, 1>(4);
704     vector<short, 2>
705         mv16;
706     VME_GET_IMEOutput_Rec0_16x16_Mv(imeOut, mv16);
707 
708 #if !INVERTMOTION
709     run_vme_ime(uniIn, imeIn,
710         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
711         SURF_SRC_AND_REF2, ref1XY, NULL, costCenter, imeOut);
712     mv8_2   = imeOut.row(8).format<short>().select<8, 1>(8); // 4 MVs
713     dist8_2 = imeOut.row(7).format<ushort>().select<4, 1>(4);
714 
715 #else
716     mvPred2 = mv16 * backwardRefDist / forwardRefDist;
717     SetRef(sourceXY, mvPred2, searchWindow, widthHeight, ref1XY);
718     run_vme_ime(uniIn, imeIn,
719         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
720         SURF_SRC_AND_REF2, ref1XY, NULL, costCenter, imeOut);
721     mv8_2   = imeOut.row(8).format<short>().select<8, 1>(8); // 4 MVs
722     dist8_2 = imeOut.row(7).format<ushort>().select<4, 1>(4);
723     //mv8_2 = mv8 * -1;
724 #endif
725 #else
726     run_vme_ime(uniIn, imeIn,
727         VME_STREAM_OUT, VME_SEARCH_DUAL_REF_DUAL_REC,
728         SURF_SRC_AND_REF, ref0XY, ref1XY, costCenter, imeOut);
729 
730     //VME_GET_IMEOutput_Rec0_16x16_Mv(imeOut, mv16);
731     VME_GET_IMEOutput_Rec0_8x8_4Mv(imeOut, mv8);
732     //VME_GET_IMEOutput_Rec0_16x16_Distortion(imeOut, dist16x16);
733     VME_GET_IMEOutput_Rec0_8x8_4Distortion(imeOut, dist8);
734 
735     //VME_GET_IMEOutput_Rec1_16x16_Mv(imeOut, mv16_2);
736     VME_GET_IMEOutput_Rec1_8x8_4Mv(imeOut, mv8_2);
737     //VME_GET_IMEOutput_Rec1_16x16_Distortion(imeOut, dist16x16_2);
738     VME_GET_IMEOutput_Rec1_8x8_4Distortion(imeOut, dist8_2);
739 #endif
740 
741 
742     // distortions Integer search results
743     // 8x8
744     write(SURF_DIST8x8, mbX * DIST_SIZE * 2, mbY * 2, dist8.format<uint, 2, 2>());     //8x8 Forward SAD
745 
746 
747     if (precision)//QPEL
748         VME_SET_UNIInput_SubPelMode(uniIn, 3);
749     else
750         VME_SET_UNIInput_SubPelMode(uniIn, 0);
751     VME_SET_UNIInput_BiWeight(uniIn, 32);
752     VME_CLEAR_UNIInput_BMEDisableFBR(uniIn);
753     SLICE(fbrIn.format<uint>(), 1, 16, 2) = 0; // zero L1 motion vectors
754     matrix<uchar, 7, 32> fbrOut8x8;
755     VME_SET_UNIInput_FBRMbModeInput(uniIn, 3);
756     VME_SET_UNIInput_FBRSubMBShapeInput(uniIn, 0);
757     if (precision)//QPEL
758         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 3);
759     else
760         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 0);
761     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(0, 0) = mv8.format<uint>()[0]; // motion vectors 8x8_0
762     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(1, 0) = mv8.format<uint>()[1]; // motion vectors 8x8_1
763     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(2, 0) = mv8.format<uint>()[2]; // motion vectors 8x8_2
764     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(3, 0) = mv8.format<uint>()[3]; // motion vectors 8x8_3
765     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(0, 1) = mv8_2.format<uint>()[0]; // motion vectors 8x8_2_0
766     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(1, 1) = mv8_2.format<uint>()[1]; // motion vectors 8x8_2_1
767     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(2, 1) = mv8_2.format<uint>()[2]; // motion vectors 8x8_2_2
768     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(3, 1) = mv8_2.format<uint>()[3]; // motion vectors 8x8_2_3
769     run_vme_fbr(uniIn, fbrIn, SURF_SRC_AND_REF, 3, 0, 170, fbrOut8x8);
770     VME_GET_FBROutput_Rec0_8x8_4Mv(fbrOut8x8, mv8.format<uint>());
771     VME_GET_FBROutput_Rec1_8x8_4Mv(fbrOut8x8, mv8_2.format<uint>());
772     VME_GET_FBROutput_Dist_8x8_Bi(fbrOut8x8, dist8);
773 
774 
775     // distortions actual complete distortion calculation
776     // 8x8
777     //write(SURF_DIST8x8  , mbX * DIST_SIZE * 2  , mbY * 2, dist8.format<uint,2,2>());     //8x8 Bidir distortions
778 
779     // motion vectors
780     // 8x8
781     write(SURF_MV8x8, mbX * MVDATA_SIZE * 2, mbY * 2, mv8);       //8x8mvs  Ref0
782     write(SURF_MV8x8_2, mbX * MVDATA_SIZE * 2, mbY * 2, mv8_2);     //8x8mvs  Ref1
783 }
784 
785 extern "C" _GENX_MAIN_
MeP16_1ME_2BiRef_MRE_8x8(SurfaceIndex SURF_CONTROL,SurfaceIndex SURF_SRC_AND_REF,SurfaceIndex SURF_SRC_AND_REF2,SurfaceIndex SURF_DIST8x8,SurfaceIndex SURF_MV8x8,SurfaceIndex SURF_MV8x8_2,uint start_xy,uchar blSize,char forwardRefDist,char backwardRefDist)786 void MeP16_1ME_2BiRef_MRE_8x8(
787     SurfaceIndex SURF_CONTROL,
788     SurfaceIndex SURF_SRC_AND_REF,
789     SurfaceIndex SURF_SRC_AND_REF2,
790     SurfaceIndex SURF_DIST8x8,
791     SurfaceIndex SURF_MV8x8,
792     SurfaceIndex SURF_MV8x8_2,
793     uint         start_xy,
794     uchar        blSize,
795     char         forwardRefDist,
796     char         backwardRefDist
797 )
798 {
799     vector<uint, 1>
800         start_mbXY = start_xy;
801     uint
802         mbX = get_thread_origin_x() + start_mbXY.format<ushort>()[0],
803         mbY = get_thread_origin_y() + start_mbXY.format<ushort>()[1],
804         x = mbX * blSize,
805         y = mbY * blSize;
806 
807     vector<uchar, 96>
808         control;
809     read(SURF_CONTROL, 0, control);
810 
811     uchar
812         maxNumSu = control.format<uchar>()[56],
813         lenSp = control.format<uchar>()[57];
814     ushort
815         width = control.format<ushort>()[30],
816         height = control.format<ushort>()[31],
817         mre_width = control.format<ushort>()[33],
818         mre_height = control.format<ushort>()[34],
819         precision = control.format<ushort>()[36];
820     // read MB record data
821 #if CMRT_EMU
822     if (x >= width)
823         return;
824     cm_assert(x < width);
825 #endif
826     UniIn
827         uniIn = 0;
828 #if COMPLEX_BIDIR
829     matrix<uchar, 9, 32>
830         imeOut;
831 #else
832     matrix<uchar, 11, 32>
833         imeOut;
834 #endif
835     matrix<uchar, 2, 32>
836         imeIn = 0;
837     matrix<uchar, 4, 32>
838         fbrIn;
839 
840     // declare parameters for VME
841     matrix<uint, 16, 2>
842         costs = 0;
843     vector<short, 2>
844         mvPred = 0,
845         mvPred2 = 0;
846     //read(SURF_MV16x16, mbX * MVDATA_SIZE, mbY, mvPred); // these pred MVs will be updated later here
847 #if COMPLEX_BIDIR
848     uchar x_r = 48;
849     uchar y_r = 40;
850 #else
851     uchar
852         x_r = 32,
853         y_r = 32;
854 #endif
855 
856     // load search path
857     imeIn.select<2, 1, 32, 1>(0) = control.select<64, 1>(0);
858 
859     // M0.2
860     VME_SET_UNIInput_SrcX(uniIn, x);
861     VME_SET_UNIInput_SrcY(uniIn, y);
862 
863     // M0.3 various prediction parameters
864     //VME_SET_DWORD(uniIn, 0, 3, 0x76a40000); // BMEDisableFBR=1 InterSAD=2 8x8 16x16
865     VME_SET_DWORD(uniIn, 0, 3, 0x76a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x76: 8x8 16x16
866     //VME_SET_DWORD(uniIn, 0, 3, 0x77a00000); // BMEDisableFBR=0 InterSAD=2 SubMbPartMask=0x77: 8x8
867     //VME_SET_UNIInput_BMEDisableFBR(uniIn);
868     // M1.1 MaxNumMVs
869     VME_SET_UNIInput_MaxNumMVs(uniIn, 32);
870     // M0.5 Reference Window Width & Height
871     VME_SET_UNIInput_RefW(uniIn, x_r);//48);
872     VME_SET_UNIInput_RefH(uniIn, y_r);//40);
873 
874     // M0.0 Ref0X, Ref0Y
875     vector_ref<short, 2>
876         sourceXY = uniIn.row(0).format<short>().select<2, 1>(4);
877     vector<uchar, 2>
878         widthHeight;
879     widthHeight[0] = (height >> 4) - 1;
880     widthHeight[1] = (width >> 4);
881     vector_ref<char, 2>
882         searchWindow = uniIn.row(0).format<char>().select<2, 1>(22);
883 
884     vector_ref<short, 2>
885         ref0XY = uniIn.row(0).format<short>().select<2, 1>(0);
886     SetRef(sourceXY, mvPred, searchWindow, widthHeight, ref0XY);
887 
888     vector_ref<short, 2>
889         ref1XY = uniIn.row(0).format<short>().select<2, 1>(2);
890     SetRef(sourceXY, mvPred2, searchWindow, widthHeight, ref1XY);
891 
892     // M1.0-3 Search path parameters & start centers & MaxNumMVs again!!!
893     VME_SET_UNIInput_AdaptiveEn(uniIn);
894     VME_SET_UNIInput_T8x8FlagForInterEn(uniIn);
895     VME_SET_UNIInput_MaxNumMVs(uniIn, 0x3f);
896     VME_SET_UNIInput_MaxNumSU(uniIn, maxNumSu);
897     VME_SET_UNIInput_LenSP(uniIn, lenSp);
898     //VME_SET_UNIInput_BiWeight(uniIn, 32);
899 
900     // M1.2 Start0X, Start0Y
901     vector<char, 2>
902         start0 = searchWindow;
903     start0 = ((start0 - 16) >> 3) & 0x0f;
904     uniIn.row(1)[10] = start0[0] | (start0[1] << 4);
905 
906     uniIn.row(1)[6] = 0x20;
907     uniIn.row(1)[31] = 0x1;
908 
909     vector<short, 2>
910         ref0 = uniIn.row(0).format<short>().select<2, 1>(0);
911     vector<ushort, 16>
912         costCenter = uniIn.row(3).format<ushort>().select<16, 1>(0);
913     VME_SET_UNIInput_EarlyImeSuccessEn(uniIn);
914 
915     matrix<short, 2, 4>
916         mv8,
917         mv8_2;
918 
919     vector<uint, 4>
920         dist8,
921         dist8_2;
922 
923     run_vme_ime(uniIn, imeIn,
924         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
925         SURF_SRC_AND_REF, ref0XY, NULL, costCenter, imeOut);
926     mv8 = imeOut.row(8).format<short>().select<8, 1>(8); // 4 MVs
927     dist8 = imeOut.row(7).format<ushort>().select<4, 1>(4);
928     vector<short, 2>
929         mv16;
930     VME_GET_IMEOutput_Rec0_16x16_Mv(imeOut, mv16);
931 
932 
933 #if 0
934     mvPred2 = -mv16;
935     // M0.5 Reference Window Width & Height
936     VME_SET_UNIInput_RefW(uniIn, 32);//48);
937     VME_SET_UNIInput_RefH(uniIn, 32);//40);
938     SetRef(sourceXY, mvPred2, searchWindow, widthHeight, ref1XY);
939     run_vme_ime(uniIn, imeIn,
940         VME_STREAM_OUT, VME_SEARCH_SINGLE_REF_SINGLE_REC_SINGLE_START,
941         SURF_SRC_AND_REF2, ref1XY, NULL, costCenter, imeOut);
942     mv8_2 = imeOut.row(8).format<short>().select<8, 1>(8); // 4 MVs
943     dist8_2 = imeOut.row(7).format<ushort>().select<4, 1>(4);
944 #else
945     mv8_2 = -mv8; // 4 MVs
946 #endif
947     // distortions Integer search results
948     // 8x8
949     write(SURF_DIST8x8, mbX * DIST_SIZE * 2, mbY * 2, dist8.format<uint, 2, 2>());     //8x8 Forward SAD
950 
951 
952     if (precision)//QPEL
953         VME_SET_UNIInput_SubPelMode(uniIn, 3);
954     else
955         VME_SET_UNIInput_SubPelMode(uniIn, 0);
956     VME_SET_UNIInput_BiWeight(uniIn, 32);
957     VME_CLEAR_UNIInput_BMEDisableFBR(uniIn);
958     SLICE(fbrIn.format<uint>(), 1, 16, 2) = 0; // zero L1 motion vectors
959     matrix<uchar, 7, 32> fbrOut8x8;
960     VME_SET_UNIInput_FBRMbModeInput(uniIn, 3);
961     VME_SET_UNIInput_FBRSubMBShapeInput(uniIn, 0);
962     if (precision)//QPEL
963         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 3);
964     else
965         VME_SET_UNIInput_FBRSubPredModeInput(uniIn, 0);
966     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(0, 0) = mv8.format<uint>()[0]; // motion vectors 8x8_0
967     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(1, 0) = mv8.format<uint>()[1]; // motion vectors 8x8_1
968     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(2, 0) = mv8.format<uint>()[2]; // motion vectors 8x8_2
969     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(3, 0) = mv8.format<uint>()[3]; // motion vectors 8x8_3
970     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(0, 1) = mv8_2.format<uint>()[0]; // motion vectors 8x8_2_0
971     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(1, 1) = mv8_2.format<uint>()[1]; // motion vectors 8x8_2_1
972     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(2, 1) = mv8_2.format<uint>()[2]; // motion vectors 8x8_2_2
973     fbrIn.format<uint, 4, 8>().select<1, 1, 4, 2>(3, 1) = mv8_2.format<uint>()[3]; // motion vectors 8x8_2_3
974     run_vme_fbr(uniIn, fbrIn, SURF_SRC_AND_REF, 3, 0, 170, fbrOut8x8);
975     VME_GET_FBROutput_Rec0_8x8_4Mv(fbrOut8x8, mv8.format<uint>());
976     VME_GET_FBROutput_Rec1_8x8_4Mv(fbrOut8x8, mv8_2.format<uint>());
977     VME_GET_FBROutput_Dist_8x8_Bi(fbrOut8x8, dist8);
978 
979 
980     // distortions actual complete distortion calculation
981     // 8x8
982     //write(SURF_DIST8x8  , mbX * DIST_SIZE * 2  , mbY * 2, dist8.format<uint,2,2>());     //8x8 Bidir distortions
983 
984     // motion vectors
985     // 8x8
986     write(SURF_MV8x8, mbX * MVDATA_SIZE * 2, mbY * 2, mv8);       //8x8mvs  Ref0
987     write(SURF_MV8x8_2, mbX * MVDATA_SIZE * 2, mbY * 2, mv8_2);     //8x8mvs  Ref1
988 }