1(* charmetrics1.sml *)
2
3(* This decodes stuff from the "charmetrics.sml" hash tables to give
4 * information about characters in the STIX (and cmuntt and odokai)
5 * fonts. This is a component of ACN adjustments to the code here so that
6 * use is made of Unicode fonts not the original TeX ones...
7 *)
8
9(* To perform boolean operations on 32-bit integers I need to turn things
10   into LargeWord items. This looks clumsy and maybe something simpler
11   would be OK (eg maybe I do not need to go via LargeInt) but of this
12   works I will feel OK. At least this seems to work on Poly/ML - but
13   in general SML may not provide very good guarantees about the width
14   of integers that it supports.
15 *)
16
17fun andb(a, b) =
18   Int.fromLarge (
19      LargeWord.toLargeInt (
20         LargeWord.andb (
21            LargeWord.fromLargeInt (
22               LargeInt.toLarge a),
23            LargeWord.fromLargeInt (
24               LargeInt.toLarge b))));
25
26fun orb(a, b) =
27   Int.fromLarge (
28      LargeWord.toLargeInt (
29         LargeWord.orb (
30            LargeWord.fromLargeInt (
31               LargeInt.toLarge a),
32            LargeWord.fromLargeInt (
33               LargeInt.toLarge b))));
34
35fun notb a =
36   Int.fromLarge (
37      LargeWord.toLargeInt (
38         LargeWord.notb (
39            LargeWord.fromLargeInt (
40               LargeInt.toLarge a))));
41
42fun <<(a, n) =
43   Int.fromLarge (
44      LargeWord.toLargeInt (
45         LargeWord.<< (
46            LargeWord.fromLargeInt (
47               LargeInt.toLarge a),
48               Word.fromInt n)));
49
50fun >>(a, n) =
51   Int.fromLarge (
52      LargeWord.toLargeInt (
53         LargeWord.>> (
54            LargeWord.fromLargeInt (
55               LargeInt.toLarge a),
56               Word.fromInt n)));
57
58fun getv v n = Vector.sub(v, n);
59
60(* The hash table access code here has to be keyed to cod ein a file
61 * called "charmetrics.cpp" in the CSL part of the Reduce source code.
62 * The stuff there accesses raw font metric information (which is in turn
63 * extracted from the .otf font files via a collection of messy steps) and
64 * picks hash parameters that let me end up with especially compact
65 * metric tables. Well even in the form used here the tables consume
66 * quite a lot of space, but I believe they represent a good balance
67 * between spaec saving and compact representation given that the Unicode
68 * fonts concerned have many thousands of characters (specifically I have
69 * over 30,000 codepoints with measurements).
70 *)
71
72fun lookupchar fontnum codepoint =
73  let
74    val cp1 =
75(* I first map the font code and (21-bit) codepoint so that codepoints use
76   only 16 bits, with certain invalid or unused ranges mapped onto the
77   value 0xffff (which does not represent a valid character). The messy
78   set of tests here are mostly present because in a hash table I need to
79   store keys (as well as values) and if I did not compress the keys
80   somewhat I would be very tight on bits when fitting my table into
81   a collection of 32-bit words.
82 *)
83      if fontnum < 2 then
84        if andb(codepoint, 0xd800) = 0xd800 then 0xffff
85        else if codepoint >= 0x10000 then
86          if codepoint < 0x10800 then 0xd800 + andb(codepoint, 0xfff)
87          else 0xffff
88        else codepoint
89      else if codepoint >= 0x4000 andalso codepoint < 0x8000 then 0xffff
90      else if codepoint >= 0x1d8000 andalso codepoint < 0x1d9000 then
91        0x5000 + andb(codepoint, 0xfff)
92      else if codepoint >= 0x10000 then 0xffff
93      else codepoint
94(* I combine the (reduced) codempoint with the font number to get a key *)
95    val fullkey = <<(fontnum, 16) + cp1
96    val key = >>(fullkey, 2)
97(* My first hash location is merely the key reduced modulo the hash table
98   size. This is clearly rather cheap and simplistic, but turns out to
99   be good enough.
100 *)
101    val h1 = key mod hashsize
102    val w = getv metrics_hash h1;
103    val v = andb((getv w 0), 0x7ffff)
104  in
105(* If I find my character at its first choice location can unpick information
106   and the sub-function lookupchar1 does that.
107 *)
108    if v = key then lookupchar1 fullkey w fontnum
109    else let
110(* A second hash function is an offset version of the key modulo a different
111   number. These two numbers were chosen following an exhausive search to
112   find values that led to high hash table occupancy.
113 *)
114      val h2 = (key mod CHAR_METRICS_MODULUS) + CHAR_METRICS_OFFSET;
115      val w = getv metrics_hash h2;
116      val v = andb((getv w 0), 0x7ffff)
117    in
118      if v = key then lookupchar1 fullkey w fontnum
119      else let
120(* A third (and final) hash function is simply the sum of the previous two,
121   obviously reduced modulo the table size.
122 *)
123        val h3 = (h1 + h2) mod hashsize;
124        val w = getv metrics_hash h3;
125        val v = andb((getv w 0), 0x7ffff)
126      in
127        if v = key then lookupchar1 fullkey w fontnum
128(* If the character was not present I will return NONE *)
129        else NONE
130      end
131    end
132  end
133
134and lookupchar1 fullkey row fontnum =
135  let
136    val v = 2*andb(fullkey, 3)
137    val wlo = getv row (v+2)
138  in
139    if wlo = 0 then NONE
140    else
141      let
142        val whi = getv row (v+3)
143        val width = andb(>>(whi, 19), 0x1fff)
144        val llx   = andb(>>(whi, 6),  0x1fff) - 3000
145        val lly   = andb(>>(wlo, 26), 0x3f) +
146                    andb(<<(whi, 6),  0xfc0) - 1000
147        val urx   = andb(>>(wlo, 13), 0x1fff) - 500
148        val ury   = andb(wlo, 0x1fff) - 1000
149        val ki =
150          if v = 0      then andb(>>(getv row 0, 19), 0x7ff)
151          else if v = 2 then andb(>>(getv row 0, 30), 0x3) +
152                             andb(<<(getv row 1, 2), 0x7fc)
153          else if v = 4 then andb(>>(getv row 1, 9), 0x7ff)
154          else (* v = 6 *)   andb(>>(getv row 1, 20), 0x7ff);
155        val kerninfo =
156          if ki = 0 then 0
157          else ki + (getv fontkern fontnum)
158      in
159        SOME (width, llx, lly, urx, ury, kerninfo)
160      end
161  end;
162
163
164(* Given the output from lookupchar you can then the following with a
165   second codepoint (which must be in the same font) to find an adjustment
166   to the space between the two characters so that they are properly
167   kerned. The initial metric information provides an index into the
168   main kern table, and the code then does a linear search starting from
169   there to seek information relevant to the succesor character. This
170   is sort of reasonable because any one character only has a modest
171   number of successor thet kern with it. The table in fact contains both
172   kern and ligature information, and a bit flags a particular entry to
173   note which sort of adjustment is being recorded in any given word.
174 *)
175
176fun lookupkernadjustment NONE cp = 0
177  | lookupkernadjustment (SOME (width, llx, lly, urx, ury, kerninfo)) cp =
178  if kerninfo = 0 then 0
179  else let
180    fun seek i =
181      let
182        val w = getv kerntable i
183      in
184        if andb(w, 0x001fffff) = cp andalso
185           andb(w, 0x00200000) = 0 then
186          let
187            val w1 = andb(>>(w, 23), 0x1ff)
188          in
189            if andb(w1, 0x100) = 0 then w1
190            else w1 - 0x200
191          end
192        else if andb(w, 0x00400000) <> 0 then 0
193        else seek (i + 1)
194      end
195   in
196     seek kerninfo
197   end;
198
199(* For instance if you have just looked up "f" and you now go
200   (lookupligature ... "i") you should get back SOME ("fi") where the
201   result is the numeric codepoint for an f-i-ligature. If no ligature
202   is available you get back NONE.
203 *)
204
205fun lookupligature NONE cp = NONE
206  | lookupligature (SOME (width, llx, lly, urx, ury, kerninfo)) cp =
207  if kerninfo = 0 then NONE
208  else let
209    fun seek i =
210      let
211        val w = getv kerntable i
212      in
213        if andb(w, 0x001fffff) = cp andalso
214           andb(w, 0x00200000) <> 0 then SOME(andb(>>(w, 23), 0x1ff))
215        else if andb(w, 0x00400000) <> 0 then NONE
216        else seek (i + 1)
217      end
218   in
219     seek kerninfo
220   end;
221
222(* On Original TeX accent positions above characters were adjusted using
223   information that was stored in the kern table. Here I have a separate
224   table for accent offsets, arranged as a hash table that uses at worst
225   two probes. This returns 0 if there us no adjustment needed (or
226   know adjustment know about). Note that the values in the table
227   are treated as 32-bit signed numbers. This information is only
228   present in the STIXMath font (of the fonts I am using) and so I do
229   not worry about other fonts here!
230 *)
231
232fun extend11 n =
233  if andb(n, 0x400) = 0 then n
234  else n - 0x800;
235
236fun accentposition key =
237  let
238    val h1 = key mod TOPCENTRE_SIZE
239    val w = getv topcentre_hash h1
240    val v = andb(w, 0x1fffff)
241    in
242      if v = key then extend11(>>(w, 21))
243      else
244        let
245          val h2 = key mod TOPCENTRE_MODULUS + TOPCENTRE_OFFSET
246          val w = getv topcentre_hash h2
247          val v = andb(w, 0x1fffffff)
248        in
249          if v = key then extend11(>>(w, 21))
250          else 0
251        end
252    end;
253
254(* The the STIXMath font there are tables for "variants" and for
255   "extended characters". The first copes with single glyphs that
256   render as a range of sized of (sor instance) parentheses. The
257   latter gives information about a set of characters that can be used
258   to build up seriously over-sized delimiters as in
259      /   top hook
260      |   top extension
261     <    middle part
262      |   lower extension
263      \   lower hook
264   At present I do not really understand the intent of all the
265   data present and associated with the parts! As with accent
266   positions the data I have here only applies in the STIXMath font.
267   (variants cp) returns a Vector of length 6 where each element
268   is either a codepoint or zero (or NONE if no variants are
269   available at all).
270*)
271
272fun variants key =
273  let
274    val h1 = key mod VARIANT_SIZE
275    val w = getv variant_hash h1
276    val v = getv w 0
277    in
278      if v = key then SOME w
279      else
280        let
281          val h2 = key mod VARIANT_MODULUS + VARIANT_OFFSET
282          val w = getv variant_hash h2
283          val v = getv w 0
284          in
285            if v = key then SOME w
286            else
287              let
288                val h3 = (h1 + h2) mod VARIANT_SIZE
289                val w = getv variant_hash h1
290                val v = getv w 0
291              in
292                if v = key then SOME w
293                else NONE
294              end
295        end
296  end;
297
298(* Same sort of idea but for extensions and you get back a longer vector *)
299
300fun extension key =
301  let
302    val h1 = key mod EXTENSION_SIZE
303    val w = getv extension_hash h1
304    val v = getv w 0
305    in
306      if v = key then SOME w
307      else
308        let
309          val h2 = key mod EXTENSION_MODULUS + EXTENSION_OFFSET
310          val w = getv extension_hash h2
311          val v = getv w 0
312          in
313            if v = key then SOME w
314            else
315              let
316                val h3 = (h1 + h2) mod EXTENSION_SIZE
317                val w = getv extension_hash h1
318                val v = getv w 0
319              in
320                if v = key then SOME w
321                else NONE
322              end
323        end
324  end;
325
326
327(* end of Metrics.sml *)
328
329
330