function r = do_highbyte_escapes(s)
%DO_HIGHBYTE_ESCAPES  Convert "\x"-escaped strings to bytes
%   Convert substrings of the form '\xNM' into characters.
%   Here NM is a two-digit hex string (upper or lower case).  Typically
%   sequences of these represent the bytes of UTF-8 encoded characters.
%
%   Input:   s   char row vector, possibly containing '\xNM' escapes.
%   Output:  r   char row vector with every escape replaced by the single
%                byte it denotes.  On Matlab the resulting byte sequence
%                is additionally decoded from UTF-8 into native chars.
%
%   Example:
%   >> s = 'aaa\xe2\x8c\x88bbb\xe2\x8c\x89ccc';
%   >> do_highbyte_escapes(s)
%      ans = aaa⌈bbb⌉ccc
%
%
%   But be careful for escaped backslashes that happen to be followed
%   by an 'x'; substrings with an even number of backslashes such as
%   '\\x' or '\\\\x' should not be converted.  Examples:
%   >> s = 'aaa \xe2\x8c\x88 bbb \\xe2\\x8c\\\\x89 ccc';
%   >> do_highbyte_escapes(s)
%      ans = aaa ⌈ bbb \\xe2\\x8c\\\\x89 ccc
%
%   >> s = 'aaa \\\xe2\x8c\x88 bbb';
%   >> do_highbyte_escapes(s)
%      ans = aaa \\⌈ bbb
%
%   A '\x' that is not followed by two hex digits (e.g., '\xzz') is not
%   an escape and is left untouched.
%
%
%   Copyright 2016-2017 Colin B. Macdonald
%
%   Copying and distribution of this file, with or without modification,
%   are permitted in any medium without royalty provided the copyright
%   notice and this notice are preserved.  This file is offered as-is,
%   without any warranty.


  % Pad the string with one char so that the look-behind below can also
  % match an escape at the very start of the input.  The pad is removed
  % again before returning.
  s = ['_' s];

  TE = regexp(s, '(?<=[^\\])(?:\\\\)*\\x([0-9a-fA-F]{2})', 'tokenExtents');
  %               1.  2.    3.         4.  5.
  % explanation:
  % 1. look behind ...
  % 2. ... for anything that isn't '\'
  % 3. zero or more pairs '\\' (escaped backslashes, kept verbatim)
  % 4. the literal '\x'
  % 5. exactly two hex digits as the (only) token

  if (isempty(TE))
    r = s(2:end);   % nothing to convert; just drop the pad
    return
  end

  % One row [first last] per escape: the positions of its two hex digits.
  ext = vertcat(TE{:});
  first = ext(:,1);

  % Read the hex digits straight out of s (one row per escape).  This
  % avoids the 'names' output of regexp, whose shape differs between
  % Matlab and various Octave versions.
  hexdigits = [reshape(s(first), [], 1), reshape(s(first+1), [], 1)];
  bytes = char (hex2dec (hexdigits));

  % Each escape occupies positions first-2:first+1, namely '\', 'x', N, M.
  % Overwrite the '\' with the decoded byte, then delete 'x', N, M and
  % the leading pad character, all in one go.
  r = s;
  r(first - 2) = bytes;
  r([1; first - 1; first; first + 1]) = [];

  if (~ exist ('OCTAVE_VERSION', 'builtin'))
    % Matlab is not UTF-8 internally: decode the bytes explicitly as
    % UTF-8 rather than relying on the platform's default encoding.
    r = native2unicode (uint8 (r), 'UTF-8');
  end

end
80
81
82%% Note: tests in private/ execute but generally fail with __run_test_suite__
83% So these are commented out as of 2017-08
84
85%%!test
86%%! s = 'a\\\xe2\x8c\x88y\xe2\x8c\x89b';
87%%! r = 'a\\⌈y⌉b';
88%%! assert (do_highbyte_escapes(s), r)
89
90%%!test
91%%! s = '\\xe2';
92%%! r = '\\xe2';
93%%! assert (do_highbyte_escapes(s), r)
94
95%%!test
96%%! s = '\xe2\x8c\x88';
97%%! r = '⌈';
98%%! assert (do_highbyte_escapes(s), r)
99
100%%!test
101%%! s = '\\\xe2\x8c\x88';
102%%! r = '\\⌈';
103%%! assert (do_highbyte_escapes(s), r)
104