1-- ------------------------------------------------------------
2
3{- |
4   Module     : Data.Char.Properties.XMLCharProps
5   Copyright  : Copyright (C) 2010 - Uwe Schmidt
6   License    : MIT
7
8   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
9   Stability  : stable
10   Portability: portable
11
12   XML character properties
13
14-}
15
16-- ------------------------------------------------------------
17
18module Data.Char.Properties.XMLCharProps
19    ( isXmlChar
20    , isXmlCharCR
21    , isXml1ByteChar
22    , isXmlLatin1Char
23    , isXmlSpaceChar
24    , isXmlSpaceCharCR
25    , isXml11SpaceChar
26    , isXmlNameChar
27    , isXmlNameStartChar
28    , isXmlNCNameChar
29    , isXmlNCNameStartChar
30    , isXmlPubidChar
31    , isXmlLetter
32    , isXmlBaseChar
33    , isXmlIdeographicChar
34    , isXmlCombiningChar
35    , isXmlDigit
36    , isXmlExtender
37    , isXmlControlOrPermanentlyUndefined
38
39    , charPropXmlChar
40    , charPropXmlCharCR
41    , charPropXml1ByteChar
42    , charPropXmlLatin1Char
43    , charPropXmlSpaceChar
44    , charPropXmlSpaceCharCR
45    , charPropXml11SpaceChar
46    , charPropXmlNameChar
47    , charPropXmlNameStartChar
48    , charPropXmlNCNameChar
49    , charPropXmlNCNameStartChar
50    , charPropXmlPubidChar
51    , charPropXmlLetter
52    , charPropXmlBaseChar
53    , charPropXmlIdeographicChar
54    , charPropXmlCombiningChar
55    , charPropXmlDigit
56    , charPropXmlExtender
57    , charPropXmlControlOrPermanentlyUndefined
58    )
59where
60
61import Data.Set.CharSet
62
63-- ------------------------------------------------------------
64
-- |
-- checking for valid XML characters
--
-- Legal are TAB, LF, CR and the ranges U+0020..U+D7FF, U+E000..U+FFFD
-- and U+10000..U+10FFFF. The test is hand-coded with comparisons;
-- 'charPropXmlChar' holds the same ranges as a 'CharSet'.

isXmlChar :: Char -> Bool
isXmlChar c
    | c < ' '        = c == '\n' || c == '\t' || c == '\r'      -- the only legal chars below space
    | c <= '\55295'  = True                                     -- U+0020 .. U+D7FF
    | c < '\57344'   = False                                    -- U+D800 .. U+DFFF is excluded
    | c <= '\65533'  = True                                     -- U+E000 .. U+FFFD
    | otherwise      = c >= '\65536'                            -- U+10000 and above; '\1114111' is maxBound

{-# INLINE isXmlChar #-}
87
-- |
-- the valid XML characters as a list of (lower bound, upper bound) pairs,
-- the table form of the comparisons hand-coded in 'isXmlChar'

charPropXmlChar :: CharSet
charPropXmlChar
    = [ ('\x0009', '\x000A')            -- TAB, LF
      , ('\x000D', '\x000D')            -- CR
      , ('\x0020', '\xD7FF')            -- space up to the char before the excluded U+D800..U+DFFF
      , ('\xE000', '\xFFFD')            -- rest of the 16 bit range, without U+FFFE and U+FFFF
      , ('\x10000', '\x10FFFF')         -- everything above the 16 bit range
      ]
96
-- |
-- checking for valid XML characters, except CR
--
-- Same ranges as 'isXmlChar', but carriage return is rejected.
-- The test is hand-coded with comparisons; 'charPropXmlCharCR'
-- holds the same ranges as a 'CharSet'.

isXmlCharCR :: Char -> Bool
isXmlCharCR c
    | c < ' '        = c == '\n' || c == '\t'                   -- CR is deliberately missing
    | c <= '\55295'  = True                                     -- U+0020 .. U+D7FF
    | c < '\57344'   = False                                    -- U+D800 .. U+DFFF is excluded
    | c <= '\65533'  = True                                     -- U+E000 .. U+FFFD
    | otherwise      = c >= '\65536'                            -- U+10000 and above; '\1114111' is maxBound

{-# INLINE isXmlCharCR #-}
119
-- |
-- the valid XML characters without CR as a list of (lower bound, upper bound) pairs,
-- the table form of the comparisons hand-coded in 'isXmlCharCR'

charPropXmlCharCR :: CharSet
charPropXmlCharCR
    = [ ('\x0009', '\x000A')            -- TAB, LF (no entry for CR, U+000D)
      , ('\x0020', '\xD7FF')            -- space up to the char before the excluded U+D800..U+DFFF
      , ('\xE000', '\xFFFD')            -- rest of the 16 bit range, without U+FFFE and U+FFFF
      , ('\x10000', '\x10FFFF')         -- everything above the 16 bit range
      ]
127
-- |
-- check for a legal 1 byte XML char
--
-- membership test against 'charPropXml1ByteChar'

isXml1ByteChar :: Char -> Bool
isXml1ByteChar c = elemCS c charPropXml1ByteChar
{-# INLINE isXml1ByteChar #-}
134
-- |
-- the legal 1 byte XML chars: TAB, LF, CR and the range U+0020..U+007F

charPropXml1ByteChar :: CharSet
charPropXml1ByteChar = unionCS ctrlChars asciiRange
  where
    ctrlChars  = stringCS ['\x09', '\x0A', '\x0D']      -- TAB, LF, CR
    asciiRange = [ ('\x20', '\x7F') ]                   -- space .. DEL
140
-- |
-- test for a legal latin1 XML char
--
-- membership test against 'charPropXmlLatin1Char'

isXmlLatin1Char :: Char -> Bool
isXmlLatin1Char c = elemCS c charPropXmlLatin1Char
{-# INLINE isXmlLatin1Char #-}
147
-- |
-- the legal latin1 XML chars: 'charPropXml1ByteChar' extended by U+0080..U+00FF

charPropXmlLatin1Char :: CharSet
charPropXmlLatin1Char = unionCS charPropXml1ByteChar upperHalf
  where
    upperHalf = [ ('\x80', '\xFF') ]                    -- the chars with the 8th bit set
153
-- |
-- checking for XML space character: \\\n, \\\r, \\\t and \" \"
--
-- hand-coded test; 'charPropXmlSpaceChar' holds the same chars as a 'CharSet'

isXmlSpaceChar :: Char -> Bool
isXmlSpaceChar c
    = case c of
        ' '  -> True
        '\n' -> True
        '\t' -> True
        '\r' -> True
        _    -> False
{-# INLINE isXmlSpaceChar #-}
171
-- |
-- the XML space characters as a 'CharSet'

charPropXmlSpaceChar :: CharSet
charPropXmlSpaceChar = stringCS blanks
  where
    blanks = ['\x20', '\x09', '\x0D', '\x0A']           -- space, TAB, CR, LF
175
-- |
-- checking for XML space character: \\\n, \\\t and \" \"
--
-- Like 'isXmlSpaceChar', but carriage return is not accepted.
-- Hand-coded test; 'charPropXmlSpaceCharCR' holds the same chars as a 'CharSet'.

isXmlSpaceCharCR :: Char -> Bool
isXmlSpaceCharCR c
    = case c of
        ' '  -> True
        '\n' -> True
        '\t' -> True
        _    -> False
{-# INLINE isXmlSpaceCharCR #-}
191
-- |
-- the XML space characters without CR as a 'CharSet'

charPropXmlSpaceCharCR :: CharSet
charPropXmlSpaceCharCR = stringCS blanks
  where
    blanks = ['\x20', '\x09', '\x0A']                   -- space, TAB, LF
195
-- |
-- checking for XML1.1 space character: additional space 0x85 and 0x2028
--
-- membership test against 'charPropXml11SpaceChar'
--
-- see also : 'isXmlSpaceChar'

isXml11SpaceChar :: Char -> Bool
isXml11SpaceChar c = elemCS c charPropXml11SpaceChar
203
-- |
-- the XML1.1 space characters as a 'CharSet'

charPropXml11SpaceChar :: CharSet
charPropXml11SpaceChar = stringCS blanks
  where
    blanks = ['\x09', '\x0A', '\x0D', '\x20', '\x85', '\x2028']    -- TAB, LF, CR, space, U+0085, U+2028
207
-- |
-- checking for XML name character
--
-- Chars up to \'z\' are decided by plain comparisons (fast path for ASCII),
-- chars from U+00B7 upwards are looked up in 'charPropXmlNameChar',
-- everything in between is rejected.

isXmlNameChar :: Char -> Bool
isXmlNameChar c
    | c > 'z'   = c >= '\183' && c `elemCS` charPropXmlNameChar         -- '\183' is U+00B7
    | otherwise = asciiLetter || asciiDigit || c `elem` ['-', '.', ':', '_']
  where
    asciiLetter = c >= 'a' || ( 'A' <= c && c <= 'Z' )                  -- only used when c <= 'z'
    asciiDigit  = '0' <= c && c <= '9'
{-# INLINE isXmlNameChar #-}
226
-- |
-- the XML name characters: letters, digits, \'-\', \'.\', \':\', \'_\',
-- combining chars and extenders

charPropXmlNameChar :: CharSet
charPropXmlNameChar
    = charPropXmlLetter
      `unionCS` charPropXmlDigit
      `unionCS` hyphenAndDot
      `unionCS` colonAndUnderscore
      `unionCS` charPropXmlCombiningChar
      `unionCS` charPropXmlExtender
  where
    hyphenAndDot       = singleCS '\x2D' `unionCS` singleCS '\x2E'      -- '-' | '.'
    colonAndUnderscore = singleCS '\x3A' `unionCS` singleCS '\x5F'      -- ':' | '_'
240
-- |
-- checking for XML name start character
--
-- Chars up to \'z\' are decided by plain comparisons (fast path for ASCII),
-- chars from U+00C0 upwards are looked up in 'charPropXmlNameStartChar',
-- everything in between is rejected.
--
-- see also : 'isXmlNameChar'

isXmlNameStartChar :: Char -> Bool
isXmlNameStartChar c
    | c > 'z'   = c >= '\192' && c `elemCS` charPropXmlNameStartChar    -- '\192' is U+00C0
    | otherwise = asciiLetter || c == ':' || c == '_'
  where
    asciiLetter = c >= 'a' || ( 'A' <= c && c <= 'Z' )                  -- only used when c <= 'z'
{-# INLINE isXmlNameStartChar #-}
259
-- |
-- the XML name start characters: letters, \':\' and \'_\'

charPropXmlNameStartChar :: CharSet
charPropXmlNameStartChar
    = charPropXmlLetter `unionCS` colon `unionCS` underscore            -- Letter | ':' | '_'
  where
    colon      = singleCS '\x3A'
    underscore = singleCS '\x5F'
267
-- |
-- checking for XML NCName character: no \":\" allowed
--
-- Chars up to \'z\' are decided by plain comparisons (fast path for ASCII),
-- chars from U+00B7 upwards are looked up in 'charPropXmlNameChar'
-- (the colon lies below that bound, so it is never accepted here),
-- everything in between is rejected.
--
-- see also : 'isXmlNameChar'

isXmlNCNameChar :: Char -> Bool
isXmlNCNameChar c
    | c > 'z'   = c >= '\183' && c `elemCS` charPropXmlNameChar         -- '\183' is U+00B7
    | otherwise = asciiLetter || asciiDigit || c `elem` ['-', '.', '_']
  where
    asciiLetter = c >= 'a' || ( 'A' <= c && c <= 'Z' )                  -- only used when c <= 'z'
    asciiDigit  = '0' <= c && c <= '9'
{-# INLINE isXmlNCNameChar #-}
288
-- |
-- the XML NCName characters: 'charPropXmlNameChar' without \':\'

charPropXmlNCNameChar :: CharSet
charPropXmlNCNameChar = diffCS charPropXmlNameChar colon
  where
    colon = singleCS '\x3A'                                             -- no :
294
-- |
-- checking for XML NCName start character: no \":\" allowed
--
-- Chars up to \'z\' are decided by plain comparisons (fast path for ASCII),
-- chars from U+00C0 upwards are looked up in 'charPropXmlNameStartChar'
-- (the colon lies below that bound, so it is never accepted here),
-- everything in between is rejected.
--
-- see also : 'isXmlNameChar', 'isXmlNCNameChar'

isXmlNCNameStartChar :: Char -> Bool
isXmlNCNameStartChar c
    | c > 'z'   = c >= '\192' && c `elemCS` charPropXmlNameStartChar    -- '\192' is U+00C0
    | otherwise = asciiLetter || c == '_'
  where
    asciiLetter = c >= 'a' || ( 'A' <= c && c <= 'Z' )                  -- only used when c <= 'z'
{-# INLINE isXmlNCNameStartChar #-}
313
-- |
-- the XML NCName start characters: 'charPropXmlNameStartChar' without \':\'

charPropXmlNCNameStartChar :: CharSet
charPropXmlNCNameStartChar = diffCS charPropXmlNameStartChar colon
  where
    colon = singleCS '\x3A'                                             -- no :
319
-- |
-- checking for XML public id character
--
-- membership test against 'charPropXmlPubidChar'

isXmlPubidChar :: Char -> Bool
isXmlPubidChar c = elemCS c charPropXmlPubidChar
325
-- |
-- the XML public id characters: ASCII digits and letters, space, CR, LF
-- and a fixed list of punctuation chars

charPropXmlPubidChar :: CharSet
charPropXmlPubidChar
    = digits `unionCS` upperCase `unionCS` lowerCase `unionCS` others
  where
    digits    = rangeCS '0' '9'
    upperCase = rangeCS 'A' 'Z'
    lowerCase = rangeCS 'a' 'z'
    others    = stringCS " \r\n-'()+,./:=?;!*#@$_%"
335
-- |
-- checking for XML letter
--
-- membership test against 'charPropXmlLetter'

isXmlLetter :: Char -> Bool
isXmlLetter c = elemCS c charPropXmlLetter
{-# INLINE isXmlLetter #-}
342
-- |
-- the XML letters: base characters and ideographic characters

charPropXmlLetter :: CharSet
charPropXmlLetter = unionCS charPropXmlBaseChar charPropXmlIdeographicChar
348
-- |
-- checking for XML base character
--
-- membership test against 'charPropXmlBaseChar'

isXmlBaseChar :: Char -> Bool
isXmlBaseChar c = elemCS c charPropXmlBaseChar
354
-- |
-- the XML base characters as a list of (lower bound, upper bound) pairs
-- in ascending order; single characters are written as a pair with equal bounds.
--
-- The entries are grouped below by the Unicode block they belong to.

charPropXmlBaseChar           :: CharSet
charPropXmlBaseChar
    = [ ('\x0041', '\x005A')            -- ASCII upper case letters
      , ('\x0061', '\x007A')            -- ASCII lower case letters
      -- Latin-1 Supplement, Latin Extended-A and -B
      , ('\x00C0', '\x00D6')
      , ('\x00D8', '\x00F6')
      , ('\x00F8', '\x0131')
      , ('\x0134', '\x013E')
      , ('\x0141', '\x0148')
      , ('\x014A', '\x017E')
      , ('\x0180', '\x01C3')
      , ('\x01CD', '\x01F0')
      , ('\x01F4', '\x01F5')
      , ('\x01FA', '\x0217')
      -- IPA Extensions and Spacing Modifier Letters
      , ('\x0250', '\x02A8')
      , ('\x02BB', '\x02C1')
      -- Greek
      , ('\x0386', '\x0386')
      , ('\x0388', '\x038A')
      , ('\x038C', '\x038C')
      , ('\x038E', '\x03A1')
      , ('\x03A3', '\x03CE')
      , ('\x03D0', '\x03D6')
      , ('\x03DA', '\x03DA')
      , ('\x03DC', '\x03DC')
      , ('\x03DE', '\x03DE')
      , ('\x03E0', '\x03E0')
      , ('\x03E2', '\x03F3')
      -- Cyrillic
      , ('\x0401', '\x040C')
      , ('\x040E', '\x044F')
      , ('\x0451', '\x045C')
      , ('\x045E', '\x0481')
      , ('\x0490', '\x04C4')
      , ('\x04C7', '\x04C8')
      , ('\x04CB', '\x04CC')
      , ('\x04D0', '\x04EB')
      , ('\x04EE', '\x04F5')
      , ('\x04F8', '\x04F9')
      -- Armenian
      , ('\x0531', '\x0556')
      , ('\x0559', '\x0559')
      , ('\x0561', '\x0586')
      -- Hebrew
      , ('\x05D0', '\x05EA')
      , ('\x05F0', '\x05F2')
      -- Arabic
      , ('\x0621', '\x063A')
      , ('\x0641', '\x064A')
      , ('\x0671', '\x06B7')
      , ('\x06BA', '\x06BE')
      , ('\x06C0', '\x06CE')
      , ('\x06D0', '\x06D3')
      , ('\x06D5', '\x06D5')
      , ('\x06E5', '\x06E6')
      -- Devanagari
      , ('\x0905', '\x0939')
      , ('\x093D', '\x093D')
      , ('\x0958', '\x0961')
      -- Bengali
      , ('\x0985', '\x098C')
      , ('\x098F', '\x0990')
      , ('\x0993', '\x09A8')
      , ('\x09AA', '\x09B0')
      , ('\x09B2', '\x09B2')
      , ('\x09B6', '\x09B9')
      , ('\x09DC', '\x09DD')
      , ('\x09DF', '\x09E1')
      , ('\x09F0', '\x09F1')
      -- Gurmukhi
      , ('\x0A05', '\x0A0A')
      , ('\x0A0F', '\x0A10')
      , ('\x0A13', '\x0A28')
      , ('\x0A2A', '\x0A30')
      , ('\x0A32', '\x0A33')
      , ('\x0A35', '\x0A36')
      , ('\x0A38', '\x0A39')
      , ('\x0A59', '\x0A5C')
      , ('\x0A5E', '\x0A5E')
      , ('\x0A72', '\x0A74')
      -- Gujarati
      , ('\x0A85', '\x0A8B')
      , ('\x0A8D', '\x0A8D')
      , ('\x0A8F', '\x0A91')
      , ('\x0A93', '\x0AA8')
      , ('\x0AAA', '\x0AB0')
      , ('\x0AB2', '\x0AB3')
      , ('\x0AB5', '\x0AB9')
      , ('\x0ABD', '\x0ABD')
      , ('\x0AE0', '\x0AE0')
      -- Oriya
      , ('\x0B05', '\x0B0C')
      , ('\x0B0F', '\x0B10')
      , ('\x0B13', '\x0B28')
      , ('\x0B2A', '\x0B30')
      , ('\x0B32', '\x0B33')
      , ('\x0B36', '\x0B39')
      , ('\x0B3D', '\x0B3D')
      , ('\x0B5C', '\x0B5D')
      , ('\x0B5F', '\x0B61')
      -- Tamil
      , ('\x0B85', '\x0B8A')
      , ('\x0B8E', '\x0B90')
      , ('\x0B92', '\x0B95')
      , ('\x0B99', '\x0B9A')
      , ('\x0B9C', '\x0B9C')
      , ('\x0B9E', '\x0B9F')
      , ('\x0BA3', '\x0BA4')
      , ('\x0BA8', '\x0BAA')
      , ('\x0BAE', '\x0BB5')
      , ('\x0BB7', '\x0BB9')
      -- Telugu
      , ('\x0C05', '\x0C0C')
      , ('\x0C0E', '\x0C10')
      , ('\x0C12', '\x0C28')
      , ('\x0C2A', '\x0C33')
      , ('\x0C35', '\x0C39')
      , ('\x0C60', '\x0C61')
      -- Kannada
      , ('\x0C85', '\x0C8C')
      , ('\x0C8E', '\x0C90')
      , ('\x0C92', '\x0CA8')
      , ('\x0CAA', '\x0CB3')
      , ('\x0CB5', '\x0CB9')
      , ('\x0CDE', '\x0CDE')
      , ('\x0CE0', '\x0CE1')
      -- Malayalam
      , ('\x0D05', '\x0D0C')
      , ('\x0D0E', '\x0D10')
      , ('\x0D12', '\x0D28')
      , ('\x0D2A', '\x0D39')
      , ('\x0D60', '\x0D61')
      -- Thai
      , ('\x0E01', '\x0E2E')
      , ('\x0E30', '\x0E30')
      , ('\x0E32', '\x0E33')
      , ('\x0E40', '\x0E45')
      -- Lao
      , ('\x0E81', '\x0E82')
      , ('\x0E84', '\x0E84')
      , ('\x0E87', '\x0E88')
      , ('\x0E8A', '\x0E8A')
      , ('\x0E8D', '\x0E8D')
      , ('\x0E94', '\x0E97')
      , ('\x0E99', '\x0E9F')
      , ('\x0EA1', '\x0EA3')
      , ('\x0EA5', '\x0EA5')
      , ('\x0EA7', '\x0EA7')
      , ('\x0EAA', '\x0EAB')
      , ('\x0EAD', '\x0EAE')
      , ('\x0EB0', '\x0EB0')
      , ('\x0EB2', '\x0EB3')
      , ('\x0EBD', '\x0EBD')
      , ('\x0EC0', '\x0EC4')
      -- Tibetan
      , ('\x0F40', '\x0F47')
      , ('\x0F49', '\x0F69')
      -- Georgian
      , ('\x10A0', '\x10C5')
      , ('\x10D0', '\x10F6')
      -- Hangul Jamo
      , ('\x1100', '\x1100')
      , ('\x1102', '\x1103')
      , ('\x1105', '\x1107')
      , ('\x1109', '\x1109')
      , ('\x110B', '\x110C')
      , ('\x110E', '\x1112')
      , ('\x113C', '\x113C')
      , ('\x113E', '\x113E')
      , ('\x1140', '\x1140')
      , ('\x114C', '\x114C')
      , ('\x114E', '\x114E')
      , ('\x1150', '\x1150')
      , ('\x1154', '\x1155')
      , ('\x1159', '\x1159')
      , ('\x115F', '\x1161')
      , ('\x1163', '\x1163')
      , ('\x1165', '\x1165')
      , ('\x1167', '\x1167')
      , ('\x1169', '\x1169')
      , ('\x116D', '\x116E')
      , ('\x1172', '\x1173')
      , ('\x1175', '\x1175')
      , ('\x119E', '\x119E')
      , ('\x11A8', '\x11A8')
      , ('\x11AB', '\x11AB')
      , ('\x11AE', '\x11AF')
      , ('\x11B7', '\x11B8')
      , ('\x11BA', '\x11BA')
      , ('\x11BC', '\x11C2')
      , ('\x11EB', '\x11EB')
      , ('\x11F0', '\x11F0')
      , ('\x11F9', '\x11F9')
      -- Latin Extended Additional
      , ('\x1E00', '\x1E9B')
      , ('\x1EA0', '\x1EF9')
      -- Greek Extended
      , ('\x1F00', '\x1F15')
      , ('\x1F18', '\x1F1D')
      , ('\x1F20', '\x1F45')
      , ('\x1F48', '\x1F4D')
      , ('\x1F50', '\x1F57')
      , ('\x1F59', '\x1F59')
      , ('\x1F5B', '\x1F5B')
      , ('\x1F5D', '\x1F5D')
      , ('\x1F5F', '\x1F7D')
      , ('\x1F80', '\x1FB4')
      , ('\x1FB6', '\x1FBC')
      , ('\x1FBE', '\x1FBE')
      , ('\x1FC2', '\x1FC4')
      , ('\x1FC6', '\x1FCC')
      , ('\x1FD0', '\x1FD3')
      , ('\x1FD6', '\x1FDB')
      , ('\x1FE0', '\x1FEC')
      , ('\x1FF2', '\x1FF4')
      , ('\x1FF6', '\x1FFC')
      -- Letterlike Symbols and Number Forms
      , ('\x2126', '\x2126')
      , ('\x212A', '\x212B')
      , ('\x212E', '\x212E')
      , ('\x2180', '\x2182')
      -- Hiragana, Katakana, Bopomofo
      , ('\x3041', '\x3094')
      , ('\x30A1', '\x30FA')
      , ('\x3105', '\x312C')
      -- Hangul Syllables
      , ('\xAC00', '\xD7A3')
      ]
559
-- |
-- checking for XML ideographic character
--
-- membership test against 'charPropXmlIdeographicChar'

isXmlIdeographicChar :: Char -> Bool
isXmlIdeographicChar c = elemCS c charPropXmlIdeographicChar
{-# INLINE isXmlIdeographicChar #-}
566
-- |
-- the XML ideographic characters as a list of (lower bound, upper bound) pairs

charPropXmlIdeographicChar    :: CharSet
charPropXmlIdeographicChar
    = [ ('\x3007', '\x3007')            -- ideographic number zero
      , ('\x3021', '\x3029')            -- Hangzhou numerals one to nine
      , ('\x4E00', '\x9FA5')            -- CJK unified ideographs
      ]
573
-- |
-- checking for XML combining character
--
-- membership test against 'charPropXmlCombiningChar'

isXmlCombiningChar :: Char -> Bool
isXmlCombiningChar c = elemCS c charPropXmlCombiningChar
579
-- |
-- the XML combining characters as a list of (lower bound, upper bound) pairs
-- in ascending order; single characters are written as a pair with equal bounds.
--
-- Some neighbouring entries touch each other (e.g. U+06D6..U+06DC and U+06DD..U+06DF);
-- they are kept as separate entries here.
-- The entries are grouped below by the Unicode block they belong to.

charPropXmlCombiningChar      :: CharSet
charPropXmlCombiningChar
    -- Combining Diacritical Marks
    = [ ('\x0300', '\x0345')
      , ('\x0360', '\x0361')
      -- Cyrillic
      , ('\x0483', '\x0486')
      -- Hebrew
      , ('\x0591', '\x05A1')
      , ('\x05A3', '\x05B9')
      , ('\x05BB', '\x05BD')
      , ('\x05BF', '\x05BF')
      , ('\x05C1', '\x05C2')
      , ('\x05C4', '\x05C4')
      -- Arabic
      , ('\x064B', '\x0652')
      , ('\x0670', '\x0670')
      , ('\x06D6', '\x06DC')
      , ('\x06DD', '\x06DF')
      , ('\x06E0', '\x06E4')
      , ('\x06E7', '\x06E8')
      , ('\x06EA', '\x06ED')
      -- Devanagari
      , ('\x0901', '\x0903')
      , ('\x093C', '\x093C')
      , ('\x093E', '\x094C')
      , ('\x094D', '\x094D')
      , ('\x0951', '\x0954')
      , ('\x0962', '\x0963')
      -- Bengali
      , ('\x0981', '\x0983')
      , ('\x09BC', '\x09BC')
      , ('\x09BE', '\x09BE')
      , ('\x09BF', '\x09BF')
      , ('\x09C0', '\x09C4')
      , ('\x09C7', '\x09C8')
      , ('\x09CB', '\x09CD')
      , ('\x09D7', '\x09D7')
      , ('\x09E2', '\x09E3')
      -- Gurmukhi
      , ('\x0A02', '\x0A02')
      , ('\x0A3C', '\x0A3C')
      , ('\x0A3E', '\x0A3E')
      , ('\x0A3F', '\x0A3F')
      , ('\x0A40', '\x0A42')
      , ('\x0A47', '\x0A48')
      , ('\x0A4B', '\x0A4D')
      , ('\x0A70', '\x0A71')
      -- Gujarati
      , ('\x0A81', '\x0A83')
      , ('\x0ABC', '\x0ABC')
      , ('\x0ABE', '\x0AC5')
      , ('\x0AC7', '\x0AC9')
      , ('\x0ACB', '\x0ACD')
      -- Oriya
      , ('\x0B01', '\x0B03')
      , ('\x0B3C', '\x0B3C')
      , ('\x0B3E', '\x0B43')
      , ('\x0B47', '\x0B48')
      , ('\x0B4B', '\x0B4D')
      , ('\x0B56', '\x0B57')
      -- Tamil
      , ('\x0B82', '\x0B83')
      , ('\x0BBE', '\x0BC2')
      , ('\x0BC6', '\x0BC8')
      , ('\x0BCA', '\x0BCD')
      , ('\x0BD7', '\x0BD7')
      -- Telugu
      , ('\x0C01', '\x0C03')
      , ('\x0C3E', '\x0C44')
      , ('\x0C46', '\x0C48')
      , ('\x0C4A', '\x0C4D')
      , ('\x0C55', '\x0C56')
      -- Kannada
      , ('\x0C82', '\x0C83')
      , ('\x0CBE', '\x0CC4')
      , ('\x0CC6', '\x0CC8')
      , ('\x0CCA', '\x0CCD')
      , ('\x0CD5', '\x0CD6')
      -- Malayalam
      , ('\x0D02', '\x0D03')
      , ('\x0D3E', '\x0D43')
      , ('\x0D46', '\x0D48')
      , ('\x0D4A', '\x0D4D')
      , ('\x0D57', '\x0D57')
      -- Thai
      , ('\x0E31', '\x0E31')
      , ('\x0E34', '\x0E3A')
      , ('\x0E47', '\x0E4E')
      -- Lao
      , ('\x0EB1', '\x0EB1')
      , ('\x0EB4', '\x0EB9')
      , ('\x0EBB', '\x0EBC')
      , ('\x0EC8', '\x0ECD')
      -- Tibetan
      , ('\x0F18', '\x0F19')
      , ('\x0F35', '\x0F35')
      , ('\x0F37', '\x0F37')
      , ('\x0F39', '\x0F39')
      , ('\x0F3E', '\x0F3E')
      , ('\x0F3F', '\x0F3F')
      , ('\x0F71', '\x0F84')
      , ('\x0F86', '\x0F8B')
      , ('\x0F90', '\x0F95')
      , ('\x0F97', '\x0F97')
      , ('\x0F99', '\x0FAD')
      , ('\x0FB1', '\x0FB7')
      , ('\x0FB9', '\x0FB9')
      -- Combining Diacritical Marks for Symbols
      , ('\x20D0', '\x20DC')
      , ('\x20E1', '\x20E1')
      -- CJK tone marks and kana voiced sound marks
      , ('\x302A', '\x302F')
      , ('\x3099', '\x3099')
      , ('\x309A', '\x309A')
      ]
678
-- |
-- checking for XML digit
--
-- membership test against 'charPropXmlDigit'

isXmlDigit :: Char -> Bool
isXmlDigit c = elemCS c charPropXmlDigit
684
-- |
-- the XML digits as a list of (lower bound, upper bound) pairs,
-- one entry per script

charPropXmlDigit              :: CharSet
charPropXmlDigit
    = [ ('\x0030', '\x0039')            -- ASCII digits
      , ('\x0660', '\x0669')            -- Arabic-Indic
      , ('\x06F0', '\x06F9')            -- Extended Arabic-Indic
      , ('\x0966', '\x096F')            -- Devanagari
      , ('\x09E6', '\x09EF')            -- Bengali
      , ('\x0A66', '\x0A6F')            -- Gurmukhi
      , ('\x0AE6', '\x0AEF')            -- Gujarati
      , ('\x0B66', '\x0B6F')            -- Oriya
      , ('\x0BE7', '\x0BEF')            -- Tamil (entry starts at U+0BE7, only nine chars)
      , ('\x0C66', '\x0C6F')            -- Telugu
      , ('\x0CE6', '\x0CEF')            -- Kannada
      , ('\x0D66', '\x0D6F')            -- Malayalam
      , ('\x0E50', '\x0E59')            -- Thai
      , ('\x0ED0', '\x0ED9')            -- Lao
      , ('\x0F20', '\x0F29')            -- Tibetan
      ]
703
-- |
-- checking for XML extender
--
-- membership test against 'charPropXmlExtender'

isXmlExtender :: Char -> Bool
isXmlExtender c = elemCS c charPropXmlExtender
709
-- |
-- the XML extenders as a list of (lower bound, upper bound) pairs
--
-- The first entry, U+00B7, is the lowest char above \'z\' that is contained
-- in 'charPropXmlNameChar'; 'isXmlNameChar' and 'isXmlNCNameChar' use it
-- as the lower bound for their table lookup.

charPropXmlExtender           :: CharSet
charPropXmlExtender
    = [ ('\x00B7', '\x00B7')            -- middle dot
      , ('\x02D0', '\x02D0')            -- modifier letter triangular colon
      , ('\x02D1', '\x02D1')            -- modifier letter half triangular colon
      , ('\x0387', '\x0387')            -- Greek ano teleia
      , ('\x0640', '\x0640')            -- Arabic tatweel
      , ('\x0E46', '\x0E46')            -- Thai maiyamok
      , ('\x0EC6', '\x0EC6')            -- Lao ko la
      , ('\x3005', '\x3005')            -- ideographic iteration mark
      , ('\x3031', '\x3035')            -- vertical kana repeat marks
      , ('\x309D', '\x309E')            -- Hiragana iteration marks
      , ('\x30FC', '\x30FE')            -- Katakana prolonged sound mark and iteration marks
      ]
724
-- |
-- checking for XML control or permanently discouraged char
--
-- see Errata to XML1.0 (http:\/\/www.w3.org\/XML\/xml-V10-2e-errata) No 46
--
-- Document authors are encouraged to avoid \"compatibility characters\",
-- as defined in section 6.8 of [Unicode] (see also D21 in section 3.6 of [Unicode3]).
-- Further characters are discouraged, because they are either control characters
-- or permanently undefined Unicode characters. This predicate is a membership test
-- against the table of those characters, 'charPropXmlControlOrPermanentlyUndefined'.

isXmlControlOrPermanentlyUndefined :: Char -> Bool
isXmlControlOrPermanentlyUndefined c = elemCS c charPropXmlControlOrPermanentlyUndefined
738
-- |
-- the discouraged XML characters as a list of (lower bound, upper bound) pairs:
-- control characters and permanently undefined Unicode characters
--
-- The table contains
--
-- * DEL and the control chars U+0080..U+009F, with a gap at U+0085,
--   which 'charPropXml11SpaceChar' lists as a space char
--
-- * the permanently undefined block U+FDD0..U+FDEF
--
-- * the last two code points of every plane from 1 to 16
--
-- The second entry used to end at U+FDDF, as printed in the XML 1.0 second edition
-- errata. The permanently undefined block consists of 32 code points and
-- ends at U+FDEF; later editions of XML 1.0 list the full range, so U+FDE0..U+FDEF
-- are included here.

charPropXmlControlOrPermanentlyUndefined      :: CharSet
charPropXmlControlOrPermanentlyUndefined
    = [ ('\x7F', '\x84')                -- DEL and control chars below U+0085
      , ('\x86', '\x9F')                -- control chars above U+0085
      , ('\xFDD0', '\xFDEF')            -- permanently undefined block, all 32 code points
      , ('\x1FFFE', '\x1FFFF')          -- last two code points of plane 1 ...
      , ('\x2FFFE', '\x2FFFF')
      , ('\x3FFFE', '\x3FFFF')
      , ('\x4FFFE', '\x4FFFF')
      , ('\x5FFFE', '\x5FFFF')
      , ('\x6FFFE', '\x6FFFF')
      , ('\x7FFFE', '\x7FFFF')
      , ('\x8FFFE', '\x8FFFF')
      , ('\x9FFFE', '\x9FFFF')
      , ('\xAFFFE', '\xAFFFF')
      , ('\xBFFFE', '\xBFFFF')
      , ('\xCFFFE', '\xCFFFF')
      , ('\xDFFFE', '\xDFFFF')
      , ('\xEFFFE', '\xEFFFF')
      , ('\xFFFFE', '\xFFFFF')
      , ('\x10FFFE', '\x10FFFF')        -- ... up to plane 16
      ]
761
762-- ------------------------------------------------------------
763