1#!/bin/sh
2
# Stop at the first command that exits with a non-zero status.
set -o errexit
# Directory portion of the path this script was invoked with; the generator
# functions below use it to find the helper scripts under regex/.
D=$(dirname "$0")
5
# Abort the script unless the command named by $1 can be resolved.
#
# Returns 0 when `command -v` finds the name. Otherwise prints a
# "DEPENDENCY MISSING" message to stderr and exits with status 1.
requires() {
    cmd="$1"
    if command -v "$cmd" > /dev/null 2>&1; then
        return 0
    fi
    echo "DEPENDENCY MISSING: $cmd must be installed" >&2
    exit 1
}
14
# Report whether the first argument equals any of the remaining arguments.
#
# $1 is the value to look for; every argument after it is a candidate.
# Returns 0 on the first exact string match and 1 when nothing matches.
array_exists() {
    needle="$1"
    shift

    while [ "$#" -gt 0 ]; do
        el="$1"
        shift
        # A quoted case pattern is matched literally, so this is plain
        # string equality (no globbing on either side).
        case "$el" in
            "$needle") return 0 ;;
        esac
    done
    return 1
}
27
# Generate the forward and reverse grapheme DFAs.
#
# The regex is whatever "$D/regex/grapheme.sh" prints. Both ucd-generate
# invocations receive the same flags, the src/unicode/fsm/ directory and
# that regex; the reverse one additionally gets --reverse --longest.
graphemes() {
    regex="$(sh "$D/regex/grapheme.sh")"

    # Keep the arguments shared by both invocations in this function's
    # positional parameters so they are spelled out only once.
    set -- --sparse --minimize --anchored --state-size 2 \
        src/unicode/fsm/ "$regex"

    echo "generating forward grapheme DFA"
    ucd-generate dfa --name GRAPHEME_BREAK_FWD "$@"

    echo "generating reverse grapheme DFA"
    ucd-generate dfa --name GRAPHEME_BREAK_REV --reverse --longest "$@"
}
46
# Generate the forward word DFA.
#
# The regex is whatever "$D/regex/word.sh" prints; it is handed to
# ucd-generate together with the src/unicode/fsm/ directory.
words() {
    regex="$(sh "$D/regex/word.sh")"

    echo "generating forward word DFA (this can take a while)"
    # Hold the DFA flags in the positional parameters to keep the
    # invocation on a single line.
    set -- --sparse --minimize --anchored --state-size 4
    ucd-generate dfa --name WORD_BREAK_FWD "$@" src/unicode/fsm/ "$regex"
}
57
# Generate the forward sentence DFA.
#
# The regex is whatever "$D/regex/sentence.sh" prints; it is handed to
# ucd-generate together with the src/unicode/fsm/ directory.
sentences() {
    regex="$(sh "$D/regex/sentence.sh")"

    echo "generating forward sentence DFA (this can take a while)"
    ucd-generate dfa --name SENTENCE_BREAK_FWD \
        --minimize --sparse --anchored --state-size 4 \
        src/unicode/fsm/ "$regex"
}
69
# Generate the reverse DFA that matches a single regional indicator.
#
# It finds all occurrences of regional indicators, which the reverse
# grapheme iterator and the reverse word iterator handle as a special case.
regional_indicator() {
    echo "generating regional indicator DFA"
    ucd-generate dfa --name REGIONAL_INDICATOR_REV --reverse \
        --classes --minimize --anchored --premultiply --state-size 1 \
        src/unicode/fsm/ '\p{gcb=Regional_Indicator}'
}
82
# Generate the forward DFA for the regex \w (a single word character).
simple_word() {
    echo "generating forward simple word DFA"
    ucd-generate dfa --name SIMPLE_WORD_FWD \
        --sparse --minimize --state-size 2 \
        src/unicode/fsm/ '\w'
}
91
# Generate the anchored forward and reverse DFAs for the regex \s+.
#
# NOTE(review): the forward DFA is built with --state-size 1 and the reverse
# one with --state-size 2; confirm that the difference is intentional.
whitespace() {
    echo "generating forward whitespace DFA"
    ucd-generate dfa --name WHITESPACE_ANCHORED_FWD \
        --anchored --classes --premultiply --minimize --state-size 1 \
        src/unicode/fsm/ '\s+'

    echo "generating reverse whitespace DFA"
    ucd-generate dfa --name WHITESPACE_ANCHORED_REV --reverse \
        --anchored --classes --premultiply --minimize --state-size 2 \
        src/unicode/fsm/ '\s+'
}
108
# Parse the command line and run the requested DFA generators.
#
# Arguments:
#   -h, --help        print a usage line to stderr and exit
#   --list-commands   print every known command, one per line, and exit
#   all               run every known command (also the default when no
#                     arguments are given)
#   <command> ...     run only the named commands, in the order given
#
# Returns 0 when every requested command was recognized, and 1 when at least
# one was not. An unrecognized command is reported on stderr and skipped; the
# remaining commands still run.
main() {
    if array_exists "-h" "$@" || array_exists "--help" "$@"; then
        echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
        exit
    fi

    commands="
        graphemes
        sentences
        words
        regional-indicator
        simple-word
        whitespace
    "
    if array_exists "--list-commands" "$@"; then
        for cmd in $commands; do
            echo "$cmd"
        done
        exit
    fi

    # ucd-generate is used to compile regexes into DFAs.
    requires ucd-generate

    mkdir -p src/unicode/fsm/

    cmds=$*
    if [ $# -eq 0 ] || array_exists "all" "$@"; then
        cmds=$commands
    fi

    # Becomes 1 as soon as an unknown command is seen, so the caller can
    # detect a typo from the exit status instead of only from stderr.
    rc=0
    for cmd in $cmds; do
        # $commands is deliberately unquoted: word splitting turns the
        # whitespace-separated list into one argument per command name.
        if array_exists "$cmd" $commands; then
            # Command names use dashes; the functions implementing them
            # use underscores.
            fun="$(echo "$cmd" | sed 's/-/_/g')"
            # $cmd was validated against $commands above, so the name can
            # be invoked directly; no eval is needed.
            "$fun"
        else
            echo "unrecognized command: $cmd" >&2
            rc=1
        fi
    done
    return "$rc"
}
148
149main "$@"
150