########################################################################
##
## Copyright (C) 2016-2021 The Octave Project Developers
##
## See the file COPYRIGHT.md in the top-level directory of this
## distribution or <https://octave.org/copyright/>.
##
## This file is part of Octave.
##
## Octave is free software: you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <https://www.gnu.org/licenses/>.
##
########################################################################

## -*- texinfo -*-
## @deftypefn  {} {@var{utf8_str} =} native2unicode (@var{native_bytes}, @var{codepage})
## @deftypefnx {} {@var{utf8_str} =} native2unicode (@var{native_bytes})
## Convert byte stream @var{native_bytes} to UTF-8 using @var{codepage}.
##
## The numbers in the vector @var{native_bytes} are rounded and clipped to
## integers between 0 and 255.  This byte stream is then mapped into the
## code page given by the string @var{codepage} and returned in the string
## @var{utf8_str}.  Octave uses UTF-8 as its internal encoding.  The string
## @var{codepage} must be an identifier of a valid code page.  Examples of
## valid code pages are @qcode{"ISO-8859-1"}, @qcode{"Shift-JIS"}, or
## @qcode{"UTF-16"}.  For a list of supported code pages, see
## @url{https://www.gnu.org/software/libiconv}.  If @var{codepage} is omitted
## or empty, the system default codepage is used.
##
## If @var{native_bytes} is a string vector, it is returned as is.
##
## If @var{native_bytes} is an empty numeric array, an empty string is
## returned.
##
## @seealso{unicode2native}
## @end deftypefn

function utf8_str = native2unicode (native_bytes, codepage = "")

  if (nargin < 1 || nargin > 2)
    print_usage ();
  endif

  ## Character input is passed through untouched; CODEPAGE is not
  ## inspected in this case.
  if (ischar (native_bytes))
    utf8_str = native_bytes;
    return;
  endif

  ## An empty numeric array is not a vector and would be rejected by the
  ## check below.  Return an empty string for it instead of raising an
  ## error, so that empty data can be passed through without special-casing
  ## at the call site.
  if (isnumeric (native_bytes) && isempty (native_bytes))
    utf8_str = "";
    return;
  endif

  if (! isnumeric (native_bytes) || ! isvector (native_bytes))
    error ("native2unicode: NATIVE_BYTES must be a numeric vector");
  endif

  if (! ischar (codepage))
    error ("native2unicode: CODEPAGE must be a string");
  endif

  ## FIXME: Would it be better to do this by converting to uint8?  Or to
  ##        let __native2unicode__ do the clipping?  Multiple steps here
  ##        means looping through the data and allocating memory multiple
  ##        times.

  ## Round to the nearest integer, then saturate to the byte range [0, 255].
  native_bytes = round (native_bytes);
  native_bytes(native_bytes < 0) = 0;
  native_bytes(native_bytes > 255) = 255;

  utf8_str = __native2unicode__ (native_bytes, codepage);

  ## The input is a vector at this point, so "not a row" means a column
  ## vector: transpose the converted string in that case.
  if (! isrow (native_bytes))
    utf8_str = utf8_str.';
  endif

endfunction


## "ЄЅІЇЈЉЊ"
%!testif HAVE_ICONV
%! assert (double (native2unicode (164:170, 'ISO-8859-5')),
%!         [208 132 208 133 208 134 208 135 208 136 208 137 208 138]);
## ["ЄЅІ" 0 "ЇЈЉЊ"]
%!testif HAVE_ICONV
%! assert (double (native2unicode ([164:166 0 167:170], 'ISO-8859-5')),
%!         [208 132 208 133 208 134 0 208 135 208 136 208 137 208 138]);

%!assert (native2unicode ("foobar"), "foobar");
%!assert <*54384> (double (native2unicode ([0 0 120.3 0 0 122.6 0 0])),
%!                 [0 0 120 0 0 123 0 0]);

## Empty numeric input yields an empty string
%!assert (native2unicode ([]), "")
%!assert (ischar (native2unicode (zeros (1, 0))))

%!error <Invalid call> native2unicode ()
%!error <Invalid call> native2unicode (1, 'ISO-8859-1', 'test')
%!error <NATIVE_BYTES must be a numeric vector> native2unicode ([1 2; 3 4])
%!error <NATIVE_BYTES must be a numeric vector> native2unicode ({1 2 3 4})
%!error <CODEPAGE must be a string> native2unicode (164:170, 123)
%!testif HAVE_ICONV
%! fail ("native2unicode (234, 'foo')",
%!       "converting from codepage 'foo' to UTF-8");