#!/bin/sh

# Generates DFA source files under src/unicode/fsm/ by invoking
# `ucd-generate dfa` once per automaton.
#
# Usage: <this-script> [--list-commands] [<command>] ...
#
# With no commands, or when "all" is given, every known command is run.
# Use --list-commands to print the known command names.
#
# The output directory is relative to the current working directory, while
# the regex helper scripts (regex/*.sh) are located relative to the directory
# containing this script.
#
# Exit status is non-zero if a dependency is missing, if any invoked tool
# fails (set -e), or if any requested command is not recognized.

set -e
D="$(dirname "$0")"

# Convenience function for checking that a command exists.
#
# $1 - name of the command to look up.
#
# Prints a message to stderr and exits the script with status 1 when the
# command cannot be found.
requires() {
    # POSIX sh has no function-local variables, so use a name that cannot
    # collide with the `cmd` loop variable used in main.
    required_cmd="$1"
    if ! command -v "$required_cmd" > /dev/null 2>&1; then
        echo "DEPENDENCY MISSING: $required_cmd must be installed" >&2
        exit 1
    fi
}

# Test if a list of words ($2 and onward) contains a particular element ($1).
#
# Returns 0 when an exact string match is found and 1 otherwise.
array_exists() {
    needle="$1"
    shift

    for el in "$@"; do
        if [ "$el" = "$needle" ]; then
            return 0
        fi
    done
    return 1
}

# Generate the forward and reverse grapheme DFAs from the regex printed by
# regex/grapheme.sh.
graphemes() {
    regex="$(sh "$D/regex/grapheme.sh")"

    echo "generating forward grapheme DFA"
    ucd-generate dfa \
        --name GRAPHEME_BREAK_FWD \
        --sparse --minimize --anchored --state-size 2 \
        src/unicode/fsm/ \
        "$regex"

    echo "generating reverse grapheme DFA"
    ucd-generate dfa \
        --name GRAPHEME_BREAK_REV \
        --reverse --longest \
        --sparse --minimize --anchored --state-size 2 \
        src/unicode/fsm/ \
        "$regex"
}

# Generate the forward word DFA from the regex printed by regex/word.sh.
words() {
    regex="$(sh "$D/regex/word.sh")"

    echo "generating forward word DFA (this can take a while)"
    ucd-generate dfa \
        --name WORD_BREAK_FWD \
        --sparse --minimize --anchored --state-size 4 \
        src/unicode/fsm/ \
        "$regex"
}

# Generate the forward sentence DFA from the regex printed by
# regex/sentence.sh.
sentences() {
    regex="$(sh "$D/regex/sentence.sh")"

    echo "generating forward sentence DFA (this can take a while)"
    ucd-generate dfa \
        --name SENTENCE_BREAK_FWD \
        --minimize \
        --sparse --anchored --state-size 4 \
        src/unicode/fsm/ \
        "$regex"
}

regional_indicator() {
    # For finding all occurrences of regional indicators. This is used to
    # handle regional indicators as a special case for the reverse grapheme
    # iterator and the reverse word iterator.
    echo "generating regional indicator DFA"
    ucd-generate dfa \
        --name REGIONAL_INDICATOR_REV \
        --reverse \
        --classes --minimize --anchored --premultiply --state-size 1 \
        src/unicode/fsm/ \
        '\p{gcb=Regional_Indicator}'
}

# Generate the forward DFA matching a single \w.
simple_word() {
    echo "generating forward simple word DFA"
    ucd-generate dfa \
        --name SIMPLE_WORD_FWD \
        --sparse --minimize --state-size 2 \
        src/unicode/fsm/ \
        '\w'
}

# Generate the anchored forward and reverse DFAs matching \s+.
whitespace() {
    echo "generating forward whitespace DFA"
    ucd-generate dfa \
        --name WHITESPACE_ANCHORED_FWD \
        --anchored --classes --premultiply --minimize --state-size 1 \
        src/unicode/fsm/ \
        '\s+'

    echo "generating reverse whitespace DFA"
    ucd-generate dfa \
        --name WHITESPACE_ANCHORED_REV \
        --reverse \
        --anchored --classes --premultiply --minimize --state-size 2 \
        src/unicode/fsm/ \
        '\s+'
}

# Entry point: handles -h/--help and --list-commands, then runs each requested
# command (or every command when none is given or "all" is present).
#
# Returns 0 when every requested command was recognized, 1 otherwise. A
# failing generator aborts the script immediately because of `set -e`.
main() {
    if array_exists "-h" "$@" || array_exists "--help" "$@"; then
        echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
        exit
    fi

    # Whitespace-separated list of the commands this script knows about. Each
    # name maps to the function of the same name with '-' replaced by '_'.
    commands="
        graphemes
        sentences
        words
        regional-indicator
        simple-word
        whitespace
    "
    if array_exists "--list-commands" "$@"; then
        for cmd in $commands; do
            echo "$cmd"
        done
        exit
    fi

    # ucd-generate is used to compile regexes into DFAs.
    requires ucd-generate

    mkdir -p src/unicode/fsm/

    cmds=$*
    if [ $# -eq 0 ] || array_exists "all" "$@"; then
        cmds=$commands
    fi

    # Remember whether any requested command was unknown so the script can
    # report failure through its exit status, while still running the rest.
    rc=0
    # $cmds and $commands are deliberately unquoted: both are
    # whitespace-separated lists that must be split into words.
    for cmd in $cmds; do
        if array_exists "$cmd" $commands; then
            fun="$(printf '%s\n' "$cmd" | sed 's/-/_/g')"
            # The name was validated against $commands above, so it can be
            # invoked directly; no eval is needed.
            "$fun"
        else
            echo "unrecognized command: $cmd" >&2
            rc=1
        fi
    done
    return "$rc"
}

main "$@"