1 /*
2  * xml_getencoding.c
3  *
4  * Copyright (c) Chris Putnam 2007-2020
5  *
6  * Source code released under the GPL version 2
7  *
8  */
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include "charsets.h"
13 #include "str.h"
14 #include "str_conv.h"
15 #include "xml.h"
16 #include "xml_encoding.h"
17 
18 static int
xml_getencodingr(xml * node)19 xml_getencodingr( xml *node )
20 {
21 	int n = CHARSET_UNKNOWN, m;
22 	str *s;
23 	char *t;
24 
25 	if ( xml_tag_matches( node, "xml" ) ) {
26 		s = xml_attribute( node, "encoding" );
27 		if ( str_has_value( s ) ) {
28 			t = str_cstr( s );
29 			if ( !strcasecmp( t, "UTF-8" ) )
30 				n = CHARSET_UNICODE;
31 			else if ( !strcasecmp( t, "UTF8" ) )
32 				n = CHARSET_UNICODE;
33 			else if ( !strcasecmp( t, "GB18030" ) )
34 				n = CHARSET_GB18030;
35 			else n = charset_find( t );
36 			if ( n==CHARSET_UNKNOWN ) {
37 				// Patch: Disable output logging
38 			}
39 		}
40 	}
41         if ( node->down ) {
42 		m = xml_getencodingr( node->down );
43 		if ( m!=CHARSET_UNKNOWN ) n = m;
44 	}
45         if ( node->next ) {
46 		m = xml_getencodingr( node->next );
47 		if ( m!=CHARSET_UNKNOWN ) n = m;
48 	}
49 
50 	return n;
51 }
52 
53 int
xml_getencoding(str * s)54 xml_getencoding( str *s )
55 {
56 	int file_charset = CHARSET_UNKNOWN;
57 	str descriptor;
58 	xml descriptxml;
59 	char *p, *q;
60 
61 	p = strstr( str_cstr( s ), "<?xml" );
62 	if ( !p ) p = strstr( str_cstr( s ), "<?XML" );
63 	if ( p ) {
64 		q = strstr( p, "?>" );
65 		if ( q ) {
66 			str_init( &descriptor );
67 			str_segcpy( &descriptor, p, q+2 );
68 			xml_init( &descriptxml );
69 			xml_parse( str_cstr( &descriptor ), &descriptxml );
70 			file_charset = xml_getencodingr( &descriptxml );
71 			xml_free( &descriptxml );
72 			str_free( &descriptor );
73 			str_segdel( s, p, q+2 );
74 		}
75 	}
76 	return file_charset;
77 }
78