function r = do_highbyte_escapes(s)
%DO_HIGHBYTE_ESCAPES  Convert "\x"-escaped strings to bytes
%   Convert sequences of the form '\xNM' into characters.
%   Here NM is a two-char hex string.  Typically sequences of these
%   represent utf-8 characters.
%
%   Example:
%   >> s = 'aaa\xe2\x8c\x88bbb\xe2\x8c\x89ccc';
%   >> do_highbyte_escapes(s)
%   ans = aaa⌈bbb⌉ccc
%
%
%   But be careful for escaped backslashes that happen to be followed
%   by an 'x'; substrings with an even number of backslashes such as
%   '\\x' or '\\\\x' should not be converted.  Examples:
%   >> s = 'aaa \xe2\x8c\x88 bbb \\xe2\\x8c\\\\x89 ccc';
%   >> do_highbyte_escapes(s)
%   ans = aaa ⌈ bbb \\xe2\\x8c\\\\x89 ccc
%
%   >> s = 'aaa \\\xe2\x8c\x88 bbb';
%   >> do_highbyte_escapes(s)
%   ans = aaa \\⌈ bbb
%
%   Only '\x' followed by two hexadecimal digits is treated as an
%   escape; anything else (for example '\xzz') is left untouched.
%
%   Input:   s  a char row vector, possibly containing '\xNM' escapes.
%   Output:  r  the string with each escape replaced by a single char
%               whose code is the value NM.  On Matlab the resulting
%               bytes are additionally decoded as UTF-8.
%
%
%   Copyright 2016-2017 Colin B. Macdonald
%
%   Copying and distribution of this file, with or without modification,
%   are permitted in any medium without royalty provided the copyright
%   notice and this notice are preserved.  This file is offered as-is,
%   without any warranty.


  % pad the string with one char in case string starts with \x
  % (the look-behind below needs a character to look at)
  padded = ['_' s];

  last = regexp (padded, '(?<=[^\\])(?:\\\\)*\\x[0-9a-fA-F]{2}', 'end');
  %                       1.    2.     3.      4.      5.
  % explanation:
  %   1. look behind ...
  %   2. ... for anything that isn't '\'
  %   3. zero or more pairs '\\'
  %   4. the '\x' that introduces the escape
  %   5. exactly two hex digits
  %
  % "last" holds, for each match, the index of its second hex digit, so
  % the escape itself occupies padded(last-3:last) == '\xNM'.  Any
  % leading pairs of backslashes come before that and are kept as-is.

  if (isempty (last))
    r = padded(2:end);
    return
  end

  % Read the hex digits directly from the string (one row per escape)
  % and convert them all at once.  This does not depend on the shape of
  % the "names" output of regexp, which differs between Octave versions
  % and Matlab.
  bytes = hex2dec ([padded(last-1).' padded(last).']);

  % Overwrite the backslash of each escape with the decoded byte, then
  % drop the padding char and the now-redundant 'x' and two hex digits.
  padded(last-3) = char (bytes.');
  keep = true (size (padded));
  keep(1) = false;
  keep([last-2, last-1, last]) = false;
  r = padded(keep);

  if (~ exist ('OCTAVE_VERSION', 'builtin'))
    % matlab is not UTF-8 internally: decode the byte sequence.  Name
    % the encoding explicitly, otherwise the platform default is used.
    % NOTE: uint8() saturates at 255, so this assumes s held no chars
    % above 255 outside of the escapes.
    r = native2unicode (uint8 (r), 'UTF-8');
  end

end


%% Note: tests in private/ execute but generally fail with __run_test_suite__
% So these are commented out as of 2017-08

%%!test
%%! s = 'a\\\xe2\x8c\x88y\xe2\x8c\x89b';
%%! r = 'a\\⌈y⌉b';
%%! assert (do_highbyte_escapes(s), r)

%%!test
%%! s = '\\xe2';
%%! r = '\\xe2';
%%! assert (do_highbyte_escapes(s), r)

%%!test
%%! s = '\xe2\x8c\x88';
%%! r = '⌈';
%%! assert (do_highbyte_escapes(s), r)

%%!test
%%! s = '\\\xe2\x8c\x88';
%%! r = '\\⌈';
%%! assert (do_highbyte_escapes(s), r)

%%!test
%%! % not followed by two hex digits: left alone
%%! s = 'a\xzzb';
%%! assert (do_highbyte_escapes(s), s)