1 #include <string.h>
2 #include <math.h>
3 
4 // Define a few macros for CPU dependent instructions.
5 // I suspect I don't really understand how the C macro preprocessor works but
6 // this seems to get the job done.          // TRB 7/01
7 
8 // BEFORE USING THESE YOU MUST SET:
9 
10 // #define SIMD_TYPE MMXEXT            (or MMX or 3DNOW)
11 
// Macros for the packed unsigned byte average (pavgb) operation.
//      V_PAVGB(mmr1, mmr2, mmr work register, smask) mmr2 may = mmrw if you can trash it
//          mmr1  - first source operand, receives the result
//          mmr2  - second source operand
//          mmrw  - scratch register; only the plain MMX variant touches it
//          smask - only used by the plain MMX variant: must clear bit 0 of every
//                  byte so the 16-bit shift cannot leak a bit into the byte below
//                  (presumably 0xFE in each byte -- confirm where the mask is defined)
//
// pavgb (MMXEXT) and pavgusb (3DNow!) round up: (a + b + 1) >> 1.
// Plain MMX has no such instruction, so the same value is built from the identity
//      (a + b + 1) >> 1  ==  (a | b) - ((a ^ b) >> 1)
// Adding (a >> 1) + (b >> 1) instead would drop both low bits and come out one
// lower in many cases (the average of 255 and 255 would be 254), so it is not used.
#define V_PAVGB_MMX(mmr1, mmr2, mmrw, smask) \
    "movq    " mmr2 ", " mmrw "\n\t"     /* w = b (no-op when mmr2 is mmrw)       */ \
    "pxor    " mmr1 ", " mmrw "\n\t"     /* w = a ^ b                             */ \
    "por     " mmrw ", " mmr1 "\n\t"     /* a = a | (a ^ b) = a | b               */ \
    "pand    " smask ", " mmrw "\n\t"    /* clear bit 0 of each byte before shift */ \
    "psrlw   $1, " mmrw "\n\t"           /* w = (a ^ b) >> 1, byte by byte        */ \
    "psubb   " mmrw ", " mmr1 "\n\t"     /* a = (a | b) - ((a ^ b) >> 1)          */
#define V_PAVGB_MMXEXT(mmr1, mmr2, mmrw, smask)   "pavgb   " mmr2 ", " mmr1 "\n\t"
#define V_PAVGB_3DNOW(mmr1, mmr2, mmrw, smask)    "pavgusb " mmr2 ", " mmr1 "\n\t"

// Dispatch on SIMD_TYPE.  The V_PAVGB2 hop is required: it lets SIMD_TYPE be
// macro-expanded (to MMX, MMXEXT or 3DNOW) before V_PAVGB3 pastes it onto the
// V_PAVGB_ prefix; pasting directly would produce V_PAVGB_SIMD_TYPE.
#define V_PAVGB(mmr1, mmr2, mmrw, smask)             V_PAVGB2(mmr1, mmr2, mmrw, smask, SIMD_TYPE)
#define V_PAVGB2(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type)
#define V_PAVGB3(mmr1, mmr2, mmrw, smask, simd_type) V_PAVGB_##simd_type(mmr1, mmr2, mmrw, smask)
27 
// Packed unsigned byte maximum.
//      V_PMAXUB(mmr1, mmr2)    result replaces mmr1
// SIMD_TYPE picks the variant; the V_PMAXUB2 hop lets SIMD_TYPE expand before
// V_PMAXUB3 pastes it onto the V_PMAXUB_ prefix.
#define V_PMAXUB(mmr1, mmr2)             V_PMAXUB2(mmr1, mmr2, SIMD_TYPE)
#define V_PMAXUB2(mmr1, mmr2, simd_type) V_PMAXUB3(mmr1, mmr2, simd_type)
#define V_PMAXUB3(mmr1, mmr2, simd_type) V_PMAXUB_##simd_type(mmr1, mmr2)

// Plain MMX: a saturating subtract followed by adding the subtrahend back
// leaves the larger of the two bytes.
#define V_PMAXUB_MMX(mmr1, mmr2)                                                   \
    "psubusb " mmr2 ", " mmr1 "\n\t"    /* a = (a > b) ? a - b : 0 */              \
    "paddusb " mmr2 ", " mmr1 "\n\t"    /* a = a + b, i.e. max(a, b) */
#define V_PMAXUB_MMXEXT(mmr1, mmr2)     "pmaxub " mmr2 ", " mmr1 "\n\t"
#define V_PMAXUB_3DNOW(mmr1, mmr2)      V_PMAXUB_MMX(mmr1, mmr2)   /* same sequence as plain MMX */
37 
// Packed unsigned byte minimum.
//      V_PMINUB(mmr1, mmr2, mmr work register)     mmr2 may NOT = mmrw
// The result replaces mmr1.  Only the plain MMX sequence (which the 3DNOW
// variant reuses) writes to the work register.
#define V_PMINUB(mmr1, mmr2, mmrw)             V_PMINUB2(mmr1, mmr2, mmrw, SIMD_TYPE)
#define V_PMINUB2(mmr1, mmr2, mmrw, simd_type) V_PMINUB3(mmr1, mmr2, mmrw, simd_type)
#define V_PMINUB3(mmr1, mmr2, mmrw, simd_type) V_PMINUB_##simd_type(mmr1, mmr2, mmrw)

// Plain MMX: push a up by (255 - b) with saturation, then take it back down;
// bytes that hit the 255 ceiling (a > b) come back as b, the others as a.
#define V_PMINUB_MMX(mmr1, mmr2, mmrw)                                             \
    "pcmpeqb " mmrw ", " mmrw "\n\t"    /* w = 0xFF in every byte */               \
    "psubusb " mmr2 ", " mmrw "\n\t"    /* w = 255 - b */                          \
    "paddusb " mmrw ", " mmr1 "\n\t"    /* a = min(a + (255 - b), 255) */          \
    "psubusb " mmrw ", " mmr1 "\n\t"    /* a = a - (255 - b), i.e. min(a, b) */
#define V_PMINUB_MMXEXT(mmr1, mmr2, mmrw)   "pminub " mmr2 ", " mmr1 "\n\t"
#define V_PMINUB_3DNOW(mmr1, mmr2, mmrw)    V_PMINUB_MMX(mmr1, mmr2, mmrw)   /* same sequence as plain MMX */
50 
// 64-bit store, using the movntq instruction where the CPU has it.
//      V_MOVNTQ(mmr1, mmr2)    mmr1 is the destination, mmr2 the source
// Only the MMXEXT variant emits movntq; MMX and 3DNOW emit an ordinary movq.
#define V_MOVNTQ(mmr1, mmr2)             V_MOVNTQ2(mmr1, mmr2, SIMD_TYPE)
#define V_MOVNTQ2(mmr1, mmr2, simd_type) V_MOVNTQ3(mmr1, mmr2, simd_type)   /* lets SIMD_TYPE expand first */
#define V_MOVNTQ3(mmr1, mmr2, simd_type) V_MOVNTQ_##simd_type(mmr1, mmr2)

#define V_MOVNTQ_MMXEXT(mmr1, mmr2)      "movntq " mmr2 ", " mmr1 "\n\t"
#define V_MOVNTQ_MMX(mmr1, mmr2)         "movq   " mmr2 ", " mmr1 "\n\t"
#define V_MOVNTQ_3DNOW(mmr1, mmr2)       "movq   " mmr2 ", " mmr1 "\n\t"
59 
60 // end of macros
61 
62 #ifdef IS_SSE2
63 
// MERGE4PIXavg(PADDR1, PADDR2) -- SSE2 flavour.
// Emits asm text that averages the 16 bytes at PADDR1 with the 16 bytes at
// PADDR2 and, byte by byte, keeps that average only where the two sources
// differ by no more than the best difference recorded so far.
//   in/out   xmm5   best candidate bytes so far
//   in/out   xmm7   difference belonging to each byte of xmm5 (smaller wins)
//   scratch  xmm0, xmm1, xmm2, xmm3
// On a tie (new difference == recorded difference) the new candidate is taken.
#define MERGE4PIXavg(PADDR1, PADDR2)                                                          \
    /* load both sources (unaligned loads) and take private copies */                         \
    "movdqu  " PADDR1 ",   %%xmm0\n\t"     /* xmm0 = a */                                     \
    "movdqu  " PADDR2 ",   %%xmm1\n\t"     /* xmm1 = b */                                     \
    "movdqa  %%xmm0,     %%xmm2\n\t"       /* xmm2 = a */                                     \
    "movdqa  %%xmm1,     %%xmm3\n\t"       /* xmm3 = b */                                     \
    /* |a - b| from two saturating subtractions; one of them is always zero */                \
    "psubusb %%xmm1,     %%xmm2\n\t"       /* xmm2 = sat(a - b) */                            \
    "psubusb %%xmm0,     %%xmm3\n\t"       /* xmm3 = sat(b - a) */                            \
    "por     %%xmm3,     %%xmm2\n\t"       /* xmm2 = |a - b|, the new difference */           \
    "pavgb   %%xmm1,     %%xmm0\n\t"       /* xmm0 = rounded average, the new candidate */    \
    /* build the two selection masks */                                                       \
    "movdqa  %%xmm2,     %%xmm3\n\t"       /* xmm3 = new difference */                        \
    "pxor    %%xmm1,     %%xmm1\n\t"       /* xmm1 = 0 */                                     \
    "psubusb %%xmm7,     %%xmm3\n\t"       /* nonzero only where new diff > recorded diff */  \
    "pcmpeqb %%xmm1,     %%xmm3\n\t"       /* xmm3 = FF where the new candidate wins */       \
    "pcmpeqb %%xmm3,     %%xmm1\n\t"       /* xmm1 = FF where the recorded one stays */       \
    /* blend candidate and difference into the running best */                                \
    "pand    %%xmm3,     %%xmm0\n\t"       /* winning new bytes */                            \
    "pand    %%xmm3,     %%xmm2\n\t"       /* ... and their differences */                    \
    "pand    %%xmm1,     %%xmm5\n\t"       /* surviving recorded bytes */                     \
    "pand    %%xmm1,     %%xmm7\n\t"       /* ... and their differences */                    \
    "por     %%xmm0,     %%xmm5\n\t"       /* xmm5 = merged best bytes */                     \
    "por     %%xmm2,     %%xmm7\n\t"       /* xmm7 = merged best differences */
84 
// MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) -- SSE2 flavour.
// Same merge as MERGE4PIXavg, except that each side is first formed as the
// rounded average of two 16-byte loads: a = avg(PADDR1A, PADDR1B) and
// b = avg(PADDR2A, PADDR2B).
//   in/out   xmm5   best candidate bytes so far
//   in/out   xmm7   difference belonging to each byte of xmm5 (smaller wins)
//   scratch  xmm0, xmm1, xmm2, xmm3
// On a tie (new difference == recorded difference) the new candidate is taken.
#define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B)                                     \
    /* load the four sources (unaligned loads) and pre-average each pair */                   \
    "movdqu  " PADDR1A ",   %%xmm0\n\t"    /* xmm0 = first half of side 1 */                  \
    "movdqu  " PADDR2A ",   %%xmm1\n\t"    /* xmm1 = first half of side 2 */                  \
    "movdqu  " PADDR1B ",   %%xmm2\n\t"    /* xmm2 = second half of side 1 */                 \
    "movdqu  " PADDR2B ",   %%xmm3\n\t"    /* xmm3 = second half of side 2 */                 \
    "pavgb   %%xmm2,      %%xmm0\n\t"      /* xmm0 = a */                                     \
    "pavgb   %%xmm3,      %%xmm1\n\t"      /* xmm1 = b */                                     \
    /* |a - b| from two saturating subtractions; one of them is always zero */                \
    "movdqa  %%xmm0,      %%xmm2\n\t"      /* xmm2 = a */                                     \
    "movdqa  %%xmm1,      %%xmm3\n\t"      /* xmm3 = b */                                     \
    "psubusb %%xmm1,      %%xmm2\n\t"      /* xmm2 = sat(a - b) */                            \
    "psubusb %%xmm0,      %%xmm3\n\t"      /* xmm3 = sat(b - a) */                            \
    "por     %%xmm3,      %%xmm2\n\t"      /* xmm2 = |a - b|, the new difference */           \
    "pavgb   %%xmm1,      %%xmm0\n\t"      /* xmm0 = rounded average, the new candidate */    \
    /* build the two selection masks */                                                       \
    "movdqa  %%xmm2,      %%xmm3\n\t"      /* xmm3 = new difference */                        \
    "pxor    %%xmm1,      %%xmm1\n\t"      /* xmm1 = 0 */                                     \
    "psubusb %%xmm7,      %%xmm3\n\t"      /* nonzero only where new diff > recorded diff */  \
    "pcmpeqb %%xmm1,      %%xmm3\n\t"      /* xmm3 = FF where the new candidate wins */       \
    "pcmpeqb %%xmm3,      %%xmm1\n\t"      /* xmm1 = FF where the recorded one stays */       \
    /* blend candidate and difference into the running best */                                \
    "pand    %%xmm3,      %%xmm0\n\t"      /* winning new bytes */                            \
    "pand    %%xmm3,      %%xmm2\n\t"      /* ... and their differences */                    \
    "pand    %%xmm1,      %%xmm5\n\t"      /* surviving recorded bytes */                     \
    "pand    %%xmm1,      %%xmm7\n\t"      /* ... and their differences */                    \
    "por     %%xmm0,      %%xmm5\n\t"      /* xmm5 = merged best bytes */                     \
    "por     %%xmm2,      %%xmm7\n\t"      /* xmm7 = merged best differences */
109 
// RESET_CHROMA (SSE2): ORs the _UVMask operand into xmm7, the register the
// MERGE4PIX* macros use for the recorded per-byte differences.
// NOTE(review): presumably _UVMask has the chroma bytes set so those
// differences are forced high again -- confirm at the definition of _UVMask.
#define RESET_CHROMA    "por " _UVMask ", %%xmm7\n\t"
111 
112 #else // ifdef IS_SSE2
113 
// MERGE4PIXavg(PADDR1, PADDR2) -- MMX flavour.
// Emits asm text that averages the 8 bytes at PADDR1 with the 8 bytes at
// PADDR2 and, byte by byte, keeps that average only where the two sources
// differ by no more than the best difference recorded so far.
//   in/out   mm5   best candidate bytes so far
//   in/out   mm7   difference belonging to each byte of mm5 (smaller wins)
//   scratch  mm0, mm1, mm2, mm3
// The average goes through V_PAVGB, so the instruction(s) used depend on SIMD_TYPE.
// On a tie (new difference == recorded difference) the new candidate is taken.
#define MERGE4PIXavg(PADDR1, PADDR2)                                                          \
    /* load both sources and take private copies */                                           \
    "movq    " PADDR1 ",   %%mm0\n\t"      /* mm0 = a */                                      \
    "movq    " PADDR2 ",   %%mm1\n\t"      /* mm1 = b */                                      \
    "movq    %%mm0,      %%mm2\n\t"        /* mm2 = a */                                      \
    "movq    %%mm1,      %%mm3\n\t"        /* mm3 = b */                                      \
    /* |a - b| from two saturating subtractions; one of them is always zero */                \
    "psubusb %%mm1,      %%mm2\n\t"        /* mm2 = sat(a - b) */                             \
    "psubusb %%mm0,      %%mm3\n\t"        /* mm3 = sat(b - a) */                             \
    "por     %%mm3,      %%mm2\n\t"        /* mm2 = |a - b|, the new difference */            \
    V_PAVGB ("%%mm0", "%%mm1", "%%mm3", _ShiftMask) /* mm0 = average; mm3 is free scratch */  \
    /* build the two selection masks */                                                       \
    "movq    %%mm2,      %%mm3\n\t"        /* mm3 = new difference */                         \
    "pxor    %%mm1,      %%mm1\n\t"        /* mm1 = 0 */                                      \
    "psubusb %%mm7,      %%mm3\n\t"        /* nonzero only where new diff > recorded diff */  \
    "pcmpeqb %%mm1,      %%mm3\n\t"        /* mm3 = FF where the new candidate wins */        \
    "pcmpeqb %%mm3,      %%mm1\n\t"        /* mm1 = FF where the recorded one stays */        \
    /* blend candidate and difference into the running best */                                \
    "pand    %%mm3,      %%mm0\n\t"        /* winning new bytes */                            \
    "pand    %%mm3,      %%mm2\n\t"        /* ... and their differences */                    \
    "pand    %%mm1,      %%mm5\n\t"        /* surviving recorded bytes */                     \
    "pand    %%mm1,      %%mm7\n\t"        /* ... and their differences */                    \
    "por     %%mm0,      %%mm5\n\t"        /* mm5 = merged best bytes */                      \
    "por     %%mm2,      %%mm7\n\t"        /* mm7 = merged best differences */
134 
// MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B) -- MMX flavour.
// Same merge as MERGE4PIXavg, except that each side is first formed as the
// average of two 8-byte loads: a = avg(PADDR1A, PADDR1B) and
// b = avg(PADDR2A, PADDR2B).
//   in/out   mm5   best candidate bytes so far
//   in/out   mm7   difference belonging to each byte of mm5 (smaller wins)
//   scratch  mm0, mm1, mm2, mm3
// All averages go through V_PAVGB, so the instruction(s) used depend on SIMD_TYPE.
// On a tie (new difference == recorded difference) the new candidate is taken.
#define MERGE4PIXavgH(PADDR1A, PADDR1B, PADDR2A, PADDR2B)                                     \
    /* load the four sources and pre-average each pair */                                     \
    "movq    " PADDR1A ",   %%mm0\n\t"     /* mm0 = first half of side 1 */                   \
    "movq    " PADDR2A ",   %%mm1\n\t"     /* mm1 = first half of side 2 */                   \
    "movq    " PADDR1B ",   %%mm2\n\t"     /* mm2 = second half of side 1 */                  \
    "movq    " PADDR2B ",   %%mm3\n\t"     /* mm3 = second half of side 2 */                  \
    V_PAVGB("%%mm0", "%%mm2", "%%mm2", _ShiftMask)   /* mm0 = a; mm2 doubles as scratch */    \
    V_PAVGB("%%mm1", "%%mm3", "%%mm3", _ShiftMask)   /* mm1 = b; mm3 doubles as scratch */    \
    /* |a - b| from two saturating subtractions; one of them is always zero */                \
    "movq    %%mm0,       %%mm2\n\t"       /* mm2 = a */                                      \
    "movq    %%mm1,       %%mm3\n\t"       /* mm3 = b */                                      \
    "psubusb %%mm1,       %%mm2\n\t"       /* mm2 = sat(a - b) */                             \
    "psubusb %%mm0,       %%mm3\n\t"       /* mm3 = sat(b - a) */                             \
    "por     %%mm3,       %%mm2\n\t"       /* mm2 = |a - b|, the new difference */            \
    V_PAVGB("%%mm0", "%%mm1", "%%mm3", _ShiftMask)   /* mm0 = average, the new candidate */   \
    /* build the two selection masks */                                                       \
    "movq    %%mm2,       %%mm3\n\t"       /* mm3 = new difference */                         \
    "pxor    %%mm1,       %%mm1\n\t"       /* mm1 = 0 */                                      \
    "psubusb %%mm7,       %%mm3\n\t"       /* nonzero only where new diff > recorded diff */  \
    "pcmpeqb %%mm1,       %%mm3\n\t"       /* mm3 = FF where the new candidate wins */        \
    "pcmpeqb %%mm3,       %%mm1\n\t"       /* mm1 = FF where the recorded one stays */        \
    /* blend candidate and difference into the running best */                                \
    "pand    %%mm3,       %%mm0\n\t"       /* winning new bytes */                            \
    "pand    %%mm3,       %%mm2\n\t"       /* ... and their differences */                    \
    "pand    %%mm1,       %%mm5\n\t"       /* surviving recorded bytes */                     \
    "pand    %%mm1,       %%mm7\n\t"       /* ... and their differences */                    \
    "por     %%mm0,       %%mm5\n\t"       /* mm5 = merged best bytes */                      \
    "por     %%mm2,       %%mm7\n\t"       /* mm7 = merged best differences */
159 
// RESET_CHROMA (MMX): ORs the _UVMask operand into mm7, the register the
// MERGE4PIX* macros use for the recorded per-byte differences.
// NOTE(review): presumably _UVMask has the chroma bytes set so those
// differences are forced high again -- confirm at the definition of _UVMask.
#define RESET_CHROMA    "por " _UVMask ", %%mm7\n\t"
161 
162 #endif
163 
164 
165