1 /*
2 * xml_getencoding.c
3 *
4 * Copyright (c) Chris Putnam 2007-2020
5 *
6 * Source code released under the GPL version 2
7 *
8 */
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12
13 #include <R.h>
14
15 #include "charsets.h"
16 #include "str.h"
17 #include "str_conv.h"
18 #include "xml.h"
19 #include "xml_encoding.h"
20
21 static int
xml_getencodingr(xml * node)22 xml_getencodingr( xml *node )
23 {
24 int n = CHARSET_UNKNOWN, m;
25 str *s;
26 char *t;
27
28 if ( xml_tag_matches( node, "xml" ) ) {
29 s = xml_attribute( node, "encoding" );
30 if ( str_has_value( s ) ) {
31 t = str_cstr( s );
32 if ( !strcasecmp( t, "UTF-8" ) )
33 n = CHARSET_UNICODE;
34 else if ( !strcasecmp( t, "UTF8" ) )
35 n = CHARSET_UNICODE;
36 else if ( !strcasecmp( t, "GB18030" ) )
37 n = CHARSET_GB18030;
38 else n = charset_find( t );
39 if ( n==CHARSET_UNKNOWN ) {
40 REprintf( "Warning: did not recognize encoding '%s'\n", t );
41 }
42 }
43 }
44 if ( node->down ) {
45 m = xml_getencodingr( node->down );
46 if ( m!=CHARSET_UNKNOWN ) n = m;
47 }
48 if ( node->next ) {
49 m = xml_getencodingr( node->next );
50 if ( m!=CHARSET_UNKNOWN ) n = m;
51 }
52
53 return n;
54 }
55
56 int
xml_getencoding(str * s)57 xml_getencoding( str *s )
58 {
59 int file_charset = CHARSET_UNKNOWN;
60 str descriptor;
61 xml descriptxml;
62 char *p, *q;
63
64 p = strstr( str_cstr( s ), "<?xml" );
65 if ( !p ) p = strstr( str_cstr( s ), "<?XML" );
66 if ( p ) {
67 q = strstr( p, "?>" );
68 if ( q ) {
69 str_init( &descriptor );
70 str_segcpy( &descriptor, p, q+2 );
71 xml_init( &descriptxml );
72 xml_parse( str_cstr( &descriptor ), &descriptxml );
73 file_charset = xml_getencodingr( &descriptxml );
74 xml_free( &descriptxml );
75 str_free( &descriptor );
76 str_segdel( s, p, q+2 );
77 }
78 }
79 return file_charset;
80 }
81