1########################################################################
2##
3## Copyright (C) 2016-2021 The Octave Project Developers
4##
5## See the file COPYRIGHT.md in the top-level directory of this
6## distribution or <https://octave.org/copyright/>.
7##
8## This file is part of Octave.
9##
10## Octave is free software: you can redistribute it and/or modify it
11## under the terms of the GNU General Public License as published by
12## the Free Software Foundation, either version 3 of the License, or
13## (at your option) any later version.
14##
15## Octave is distributed in the hope that it will be useful, but
16## WITHOUT ANY WARRANTY; without even the implied warranty of
17## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18## GNU General Public License for more details.
19##
20## You should have received a copy of the GNU General Public License
21## along with Octave; see the file COPYING.  If not, see
22## <https://www.gnu.org/licenses/>.
23##
24########################################################################
25
26## -*- texinfo -*-
27## @deftypefn  {} {@var{utf8_str} =} native2unicode (@var{native_bytes}, @var{codepage})
28## @deftypefnx {} {@var{utf8_str} =} native2unicode (@var{native_bytes})
29## Convert byte stream @var{native_bytes} to UTF-8 using @var{codepage}.
30##
## The numbers in the vector @var{native_bytes} are rounded and clipped to
## integers between 0 and 255.  This byte stream is then interpreted as text
## encoded in the code page given by the string @var{codepage}, converted to
## UTF-8, and returned in the string @var{utf8_str}.  Octave uses UTF-8 as its
## internal encoding.  The string @var{codepage} must be an identifier of a
## valid code page.  Examples for valid code pages are @qcode{"ISO-8859-1"},
## @qcode{"Shift-JIS"}, or @qcode{"UTF-16"}.  For a list of supported code
## pages, see @url{https://www.gnu.org/software/libiconv}.  If @var{codepage}
## is omitted or empty, the system default codepage is used.
40##
## If @var{native_bytes} is a character array, it is returned as is and
## @var{codepage} is ignored.
42##
43## @seealso{unicode2native}
44## @end deftypefn
45
function utf8_str = native2unicode (native_bytes, codepage = "")

  ## Exactly one or two inputs are accepted.
  if (nargin < 1 || nargin > 2)
    print_usage ();
  endif

  ## Character input is handed back untouched.  Note that CODEPAGE is not
  ## validated (or used) on this path.
  if (ischar (native_bytes))
    utf8_str = native_bytes;
    return;
  endif

  ## Anything else must be a numeric 1xN or Nx1 array.
  if (! isnumeric (native_bytes) || ! isvector (native_bytes))
    error ("native2unicode: NATIVE_BYTES must be a numeric vector");
  endif

  if (! ischar (codepage))
    error ("native2unicode: CODEPAGE must be a string");
  endif

  ## FIXME: Would it be better to do this by converting to uint8?  Or to
  ## let __native2unicode__ do the clipping?  Multiple steps here means
  ## looping through the data and allocating memory multiple times.

  ## Round to the nearest integer, then clip into the byte range [0, 255].
  ## Logical-mask assignment is used so that values which compare false
  ## against both limits (e.g., NaN) are passed on unchanged.
  native_bytes = round (native_bytes);
  native_bytes(native_bytes < 0) = 0;
  native_bytes(native_bytes > 255) = 255;

  ## An empty CODEPAGE is forwarded as is; per the help text the system
  ## default code page is then used.
  utf8_str = __native2unicode__ (native_bytes, codepage);

  ## Give column input a column result.
  ## NOTE(review): this presumes __native2unicode__ always returns a row
  ## vector -- confirm against its implementation.
  if (! isrow (native_bytes))
    utf8_str = utf8_str.';
  endif

endfunction
80
81
## Cyrillic text encoded as ISO-8859-5: "ЄЅІЇЈЉЊ"
%!testif HAVE_ICONV
%! assert (double (native2unicode (164:170, 'ISO-8859-5')),
%!         [208 132 208 133 208 134 208 135 208 136 208 137 208 138]);
## An embedded NUL byte must be preserved: ["ЄЅІ" 0 "ЇЈЉЊ"]
%!testif HAVE_ICONV
%! assert (double (native2unicode ([164:166 0 167:170], 'ISO-8859-5')),
%!         [208 132 208 133 208 134 0 208 135 208 136 208 137 208 138]);

## Values outside [0, 255] are clipped to 0 and 255 before conversion
## (byte 255 in ISO-8859-1 is U+00FF, i.e., [195 191] in UTF-8).
%!testif HAVE_ICONV
%! assert (double (native2unicode ([-5 300], 'ISO-8859-1')), [0 195 191]);

## Column vector input produces a column character array.
%!testif HAVE_ICONV
%! assert (native2unicode ([102; 111; 111], 'ISO-8859-1'), ["f"; "o"; "o"]);

## Character input is returned unchanged.
%!assert (native2unicode ("foobar"), "foobar");
## Non-integer values are rounded to the nearest integer.
%!assert <*54384> (double (native2unicode ([0 0 120.3 0 0 122.6 0 0])),
%!        [0 0 120 0 0 123 0 0]);

## Input validation
%!error <Invalid call> native2unicode ()
%!error <Invalid call> native2unicode (1, 'ISO-8859-1', 'test')
%!error <NATIVE_BYTES must be a numeric vector> native2unicode ([1 2; 3 4])
%!error <NATIVE_BYTES must be a numeric vector> native2unicode ({1 2 3 4})
%!error <CODEPAGE must be a string> native2unicode (164:170, 123)
## An unknown code page name is reported by the conversion routine.
%!testif HAVE_ICONV
%! fail ("native2unicode (234, 'foo')",
%!       "converting from codepage 'foo' to UTF-8");
103