1{
2    Free Pascal version of the Hermes pixel conversion library.
3    Copyright (C) 2012, 2013  Nikolay Nikolov (nickysn@users.sourceforge.net)
4
5    This library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version
9    with the following modification:
10
11    As a special exception, the copyright holders of this library give you
12    permission to link this library with independent modules to produce an
13    executable, regardless of the license terms of these independent modules,and
14    to copy and distribute the resulting executable under terms of your choice,
15    provided that you also meet, for each linked independent module, the terms
16    and conditions of the license of that module. An independent module is a
17    module which is not derived from or based on this library. If you modify
18    this library, you may extend this exception to your version of the library,
19    but you are not obligated to do so. If you do not wish to do so, delete this
20    exception statement from your version.
21
22    This library is distributed in the hope that it will be useful,
23    but WITHOUT ANY WARRANTY; without even the implied warranty of
24    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
25    Lesser General Public License for more details.
26
27    You should have received a copy of the GNU Lesser General Public
28    License along with this library; if not, write to the Free Software
29    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
30}
31
32{$ASMMODE intel}
33
34{ -------------------------------------------------------------------------
35
36                             NORMAL CONVERTERS
37
38  ------------------------------------------------------------------------- }
39
{ Converts rows of 8-bit indexed pixels to 32-bit pixels via a lookup table.

  For every pixel of every row the routine performs the equivalent of

      PUint32(dest)^ := lookup[source^];
      Inc(source);
      Inc(dest, 4);

  Fields of iface^ that are read:
    s_pixels  - address of the first source byte
    d_pixels  - address of the first destination dword
    lookup    - table of 32-bit entries, indexed by the source byte
    s_width   - number of pixels per row
    s_height  - number of rows; counted down to 0 in place (side effect)
    s_add     - byte offset added to the source pointer after each row
    d_add     - byte offset added to the destination pointer after each row

  All destination writes use non-temporal stores (movnti / movntdq); an
  sfence is issued once after the last row.

  Nothing is done when s_width or s_height is not positive. }
procedure ConvertX86_64_index8_32(iface: PHermesConverterInterface); cdecl;
label
  preloop_start, preloop_skip, loop_start, loop_pre_remainder, loop_remainder, done;
var
  source, dest: PUint8;
  lookup: PUint32;
  s_width: int64;
begin
  source := iface^.s_pixels;
  dest := iface^.d_pixels;
  lookup := iface^.lookup;
  s_width := iface^.s_width;
  { The row loop below is a repeat..until that tests s_height for equality
    with 0 after decrementing it, so it must never be entered with a
    non-positive height (or with nothing to convert per row). }
  if (s_width <= 0) or (iface^.s_height <= 0) then
    exit;
  repeat
    { Register roles inside the block:
        rsi = source pointer          rdi = destination pointer
        rbx = lookup table base       rcx = loop counter
        r8  = pixels left over after the 4-pixel loop (0..3)
        rax, r9, r10, r11 = zero-extended source bytes (table indices)
        edx, xmm0..xmm3   = looked-up 32-bit pixels }
    asm
        mov rsi, [source]
        mov rdi, [dest]
        mov rbx, [lookup]
        mov rcx, [s_width]

        { Pre-loop: convert one pixel at a time until the destination is
          16-byte aligned (needed by movntdq below) or the row is finished.
          rcx = pixels remaining in the row, always > 0 at preloop_start. }
preloop_start:
        test rdi, 15
        jz preloop_skip

        movzx rax, byte [rsi]
        mov edx, dword [rbx + rax * 4]
        movnti [rdi], edx
        inc rsi
        add rdi, 4
        sub rcx, 1
        jz done
        jmp preloop_start

        { Split the remaining pixel count:
          rcx = number of 4-pixel groups, r8 = leftover pixels (0..3). }
preloop_skip:
        mov r8, rcx
        and r8, 3
        shr rcx, 2
        test rcx, rcx
        jz loop_pre_remainder

        { Main loop: look up 4 pixels, pack them into xmm0 in source order
          and write them with a single 16-byte non-temporal store. }
        align 16
loop_start:
        movzx rax, byte [rsi]
        movzx r9, byte [rsi + 1]
        movzx r10, byte [rsi + 2]
        movzx r11, byte [rsi + 3]

        movd xmm0, dword [rbx + rax * 4]
        movd xmm1, dword [rbx + r9 * 4]
        movd xmm2, dword [rbx + r10 * 4]
        movd xmm3, dword [rbx + r11 * 4]
        { xmm0 = p0,p1  xmm2 = p2,p3  then  xmm0 = p0,p1,p2,p3 }
        punpckldq xmm0, xmm1
        punpckldq xmm2, xmm3
        punpcklqdq xmm0, xmm2

        movntdq [rdi], xmm0
        add rsi, 4
        add rdi, 16
        { full 64-bit counter, matching the width it was loaded with }
        sub rcx, 1
        jnz loop_start

        { Remainder: the 0..3 pixels that did not fill a group of four. }
loop_pre_remainder:
        mov rcx, r8
        test rcx, rcx
        jz done
loop_remainder:
        movzx rax, byte [rsi]
        mov edx, dword [rbx + rax * 4]
        movnti [rdi], edx
        inc rsi
        add rdi, 4
        sub rcx, 1
        jnz loop_remainder

        { Hand the advanced pointers back to the Pascal row loop. }
done:
        mov [source], rsi
        mov [dest], rdi
    end;
    { Step over the padding between the end of this row and the next. }
    Inc(source, iface^.s_add);
    Inc(dest, iface^.d_add);
    Dec(iface^.s_height);
  until iface^.s_height = 0;
  { Order the non-temporal stores issued above before returning. }
  asm
    sfence
  end;
end;
138