1 /***************************************************************** 2 * gavl - a general purpose audio/video processing library 3 * 4 * Copyright (c) 2001 - 2011 Members of the Gmerlin project 5 * gmerlin-general@lists.sourceforge.net 6 * http://gmerlin.sourceforge.net 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation, either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 * *****************************************************************/ 21 22 /* SSE Optimized scaling (linear, y) */ 23 24 25 26 static void (FUNC_NAME)(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start) 27 { 28 int i, imax; 29 uint8_t * src; 30 uint8_t * src_start; 31 uint8_t * dst; 32 33 /* Load factor */ 34 movss_m2r(ctx->table_v.pixels[scanline].factor_f[0], xmm0); 35 shufps_r2ri(xmm0, xmm0, 0x00); 36 37 dst = dest_start; 38 39 src_start = 40 ctx->src + ctx->table_v.pixels[scanline].index * ctx->src_stride; 41 42 /* While source is not aligned... */ 43 imax = (((long)(src_start)) % 16)/4; 44 45 for(i = 0; i < imax; i++) 46 { 47 src = src_start; 48 49 movss_m2r(*src, xmm1); 50 movss_m2r(*(src+ctx->src_stride), xmm2); 51 subss_r2r(xmm2, xmm1); 52 mulss_r2r(xmm0, xmm1); 53 addss_r2r(xmm2, xmm1); 54 movss_r2m(xmm1, *dst); 55 dst+=4; 56 src_start+=4; 57 } 58 59 /* SSE routines scale 8 numbers (= 32 bytes) at once */ 60 imax = (ctx->dst_size * 4 * WIDTH_MUL - (dst - dest_start)) / /* Bytes left */ 61 (32); /* Bytes processed at once */ 62 63 for(i = 0; i < imax; i++) 64 { 65 src = src_start; 66 67 movaps_m2r(*src, xmm1); 68 movaps_m2r(*(src+ctx->src_stride), xmm2); 69 70 movaps_m2r(*(src+16), xmm3); 71 movaps_m2r(*(src+ctx->src_stride+16), xmm4); 72 73 subps_r2r(xmm2, xmm1); 74 mulps_r2r(xmm0, xmm1); 75 addps_r2r(xmm2, xmm1); 76 movups_r2m(xmm1, *dst); 77 78 subps_r2r(xmm4, xmm3); 79 mulps_r2r(xmm0, xmm3); 80 addps_r2r(xmm4, xmm3); 81 movups_r2m(xmm3, *(dst+16)); 82 83 dst += 32; 84 src_start += 32; 85 } 86 87 imax = (ctx->dst_size * 4 * WIDTH_MUL - (dst - dest_start)) / 4; 88 89 // imax = (ctx->dst_size * WIDTH_MUL); 90 91 if(!imax) 92 return; 93 94 for(i = 0; i < imax; i++) 95 { 96 src = src_start; 97 98 movss_m2r(*src, xmm1); 99 movss_m2r(*(src+ctx->src_stride), xmm2); 100 subss_r2r(xmm2, xmm1); 101 mulss_r2r(xmm0, xmm1); 102 addss_r2r(xmm2, xmm1); 103 movss_r2m(xmm1, *dst); 104 105 dst+=4; 106 src_start+=4; 107 } 108 } 109 110 #undef FUNC_NAME 111 #undef NUM_TAPS 112 #undef WIDTH_MUL 113 #undef ACCUM 114 #undef ACCUM_C 115 #undef OUTPUT 116 #undef OUTPUT_C 117 118 #ifdef INIT_GLOBAL 119 #undef INIT_GLOBAL 120 #endif 121 122 #ifdef INIT_C 123 #undef INIT_C 124 #endif 125 126 #undef INIT 127 128