1 /*
2  * xml_getencoding.c
3  *
4  * Copyright (c) Chris Putnam 2007-2020
5  *
6  * Source code released under the GPL version 2
7  *
8  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include <R.h>

#include "charsets.h"
#include "str.h"
#include "str_conv.h"
#include "xml.h"
#include "xml_encoding.h"
20 
21 static int
xml_getencodingr(xml * node)22 xml_getencodingr( xml *node )
23 {
24 	int n = CHARSET_UNKNOWN, m;
25 	str *s;
26 	char *t;
27 
28 	if ( xml_tag_matches( node, "xml" ) ) {
29 		s = xml_attribute( node, "encoding" );
30 		if ( str_has_value( s ) ) {
31 			t = str_cstr( s );
32 			if ( !strcasecmp( t, "UTF-8" ) )
33 				n = CHARSET_UNICODE;
34 			else if ( !strcasecmp( t, "UTF8" ) )
35 				n = CHARSET_UNICODE;
36 			else if ( !strcasecmp( t, "GB18030" ) )
37 				n = CHARSET_GB18030;
38 			else n = charset_find( t );
39 			if ( n==CHARSET_UNKNOWN ) {
40 				REprintf( "Warning: did not recognize encoding '%s'\n", t );
41 			}
42 		}
43 	}
44         if ( node->down ) {
45 		m = xml_getencodingr( node->down );
46 		if ( m!=CHARSET_UNKNOWN ) n = m;
47 	}
48         if ( node->next ) {
49 		m = xml_getencodingr( node->next );
50 		if ( m!=CHARSET_UNKNOWN ) n = m;
51 	}
52 
53 	return n;
54 }
55 
56 int
xml_getencoding(str * s)57 xml_getencoding( str *s )
58 {
59 	int file_charset = CHARSET_UNKNOWN;
60 	str descriptor;
61 	xml descriptxml;
62 	char *p, *q;
63 
64 	p = strstr( str_cstr( s ), "<?xml" );
65 	if ( !p ) p = strstr( str_cstr( s ), "<?XML" );
66 	if ( p ) {
67 		q = strstr( p, "?>" );
68 		if ( q ) {
69 			str_init( &descriptor );
70 			str_segcpy( &descriptor, p, q+2 );
71 			xml_init( &descriptxml );
72 			xml_parse( str_cstr( &descriptor ), &descriptxml );
73 			file_charset = xml_getencodingr( &descriptxml );
74 			xml_free( &descriptxml );
75 			str_free( &descriptor );
76 			str_segdel( s, p, q+2 );
77 		}
78 	}
79 	return file_charset;
80 }
81