1 /* UnicodeReader.java --
2    Copyright (C) 2005  Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 package gnu.xml.stream;
39 
40 import java.io.IOException;
41 import java.io.Reader;
42 
/**
 * A reader that converts UTF-16 characters to Unicode code points.
 * <p>
 * Characters are pulled from an underlying {@link Reader}. A high surrogate
 * (U+D800..U+DBFF) immediately followed by a low surrogate (U+DC00..U+DFFF)
 * is combined into a single supplementary code point; a surrogate that is
 * not part of such a pair causes an {@link IOException}.
 *
 * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
 */
public class UnicodeReader
{

  /** The underlying source of UTF-16 chars. */
  final Reader in;

  /**
   * Creates a code point reader on top of the given character reader.
   *
   * @param in the underlying reader supplying UTF-16 chars
   */
  UnicodeReader(Reader in)
  {
    this.in = in;
  }

  /**
   * Marks the current position in the underlying reader.
   *
   * @param limit the number of code points that may be read before the
   * mark is invalidated; twice this value is passed to the underlying
   * reader because a code point may occupy two UTF-16 chars
   * @throws IOException if the underlying reader fails to set the mark
   */
  public void mark(int limit)
    throws IOException
  {
    in.mark(limit * 2);
  }

  /**
   * Resets the underlying reader to the most recent mark.
   *
   * @throws IOException if the underlying reader cannot be reset
   */
  public void reset()
    throws IOException
  {
    in.reset();
  }

  /**
   * Reads a single Unicode code point.
   *
   * @return the code point read, or -1 if the underlying reader reported
   * end of stream
   * @throws IOException if an unpaired surrogate is encountered or the
   * underlying reader fails
   */
  public int read()
    throws IOException
  {
    int ret = in.read();
    if (ret == -1)
      return ret;
    if (isHighSurrogate(ret))
      {
        // A high surrogate must be followed directly by a low surrogate.
        int low = in.read();
        if (!isLowSurrogate(low))
          throw unpairedSurrogate(ret);
        return Character.toCodePoint((char) ret, (char) low);
      }
    if (isLowSurrogate(ret))
      throw unpairedSurrogate(ret);
    return ret;
  }

  /**
   * Reads code points into a portion of an array.
   * <p>
   * Up to <code>len</code> chars are requested from the underlying reader
   * in one call. If the last char obtained is a high surrogate, one more
   * char is read to complete the pair, so the number of code points stored
   * never exceeds <code>len</code>.
   *
   * @param buf the destination buffer
   * @param off the index in <code>buf</code> at which to start storing
   * @param len the maximum number of chars to request from the underlying
   * reader
   * @return the number of code points stored in <code>buf</code>, or the
   * value returned by the underlying reader if it was not positive
   * (-1 at end of stream)
   * @throws IOException if an unpaired surrogate is encountered or the
   * underlying reader fails
   */
  public int read(int[] buf, int off, int len)
    throws IOException
  {
    if (len == 0)
      return 0;
    char[] chars = new char[len];
    int ret = in.read(chars, 0, len);
    if (ret <= 0)
      return ret;
    int j = off;
    for (int i = 0; i < ret; i++)
      {
        char c = chars[i];
        if (isHighSurrogate(c))
          {
            // The partner is the next buffered char, or, when the high
            // surrogate is the last char obtained, the next char of the
            // underlying stream.
            int low = (i + 1 < ret) ? chars[++i] : in.read();
            if (!isLowSurrogate(low))
              throw unpairedSurrogate(c);
            buf[j++] = Character.toCodePoint(c, (char) low);
          }
        else if (isLowSurrogate(c))
          throw unpairedSurrogate(c);
        else
          buf[j++] = c;
      }
    // Report how many code points were stored, not the end index.
    return j - off;
  }

  /**
   * Closes the underlying reader.
   *
   * @throws IOException if the underlying reader fails to close
   */
  public void close()
    throws IOException
  {
    in.close();
  }

  /**
   * Returns the specified UTF-16 text as an array of Unicode code
   * points. The returned array has exactly one element per code point.
   *
   * @param text the UTF-16 text to convert
   * @return the code points of <code>text</code>, in order
   * @throws IOException if the text contains an unpaired surrogate
   */
  public static int[] toCodePointArray(String text)
    throws IOException
  {
    int len = text.length();
    int[] buf = new int[len];
    int j = 0;
    for (int i = 0; i < len; i++)
      {
        char c = text.charAt(i);
        if (isHighSurrogate(c))
          {
            if (i + 1 >= len || !isLowSurrogate(text.charAt(i + 1)))
              throw unpairedSurrogate(c);
            buf[j++] = Character.toCodePoint(c, text.charAt(++i));
          }
        else if (isLowSurrogate(c))
          throw unpairedSurrogate(c);
        else
          buf[j++] = c;
      }
    if (j < len)
      {
        // Surrogate pairs collapsed into single code points: trim the
        // result so that it carries no trailing zero entries.
        int[] trimmed = new int[j];
        System.arraycopy(buf, 0, trimmed, 0, j);
        buf = trimmed;
      }
    return buf;
  }

  /** Tests whether the value lies in the high surrogate range. */
  private static boolean isHighSurrogate(int c)
  {
    return c >= 0xd800 && c < 0xdc00;
  }

  /** Tests whether the value lies in the low surrogate range. */
  private static boolean isLowSurrogate(int c)
  {
    return c >= 0xdc00 && c < 0xe000;
  }

  /** Creates the exception reported for an unpaired surrogate. */
  private static IOException unpairedSurrogate(int c)
  {
    return new IOException("unpaired surrogate: U+" +
                           Integer.toHexString(c));
  }

}
206