/* mmutf8fix.c
 * Fix invalid UTF-8 sequences. This module began as a very simple replacer
 * of every byte outside printable US-ASCII, which also breaks valid
 * multi-byte UTF-8 encoding; that behaviour is still available as mode
 * "controlcharacters". Mode "utf-8" (the default) really detects invalid
 * UTF-8. In the longer term the module could also be evolved into an
 * any-charset-to-UTF8 converter. But first let's see if it really gets
 * into widespread enough use.
8  *
9  * Copyright 2013-2016 Adiscon GmbH.
10  *
11  * This file is part of rsyslog.
12  *
13  * Licensed under the Apache License, Version 2.0 (the "License");
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  *       http://www.apache.org/licenses/LICENSE-2.0
18  *       -or-
19  *       see COPYING.ASL20 in the source distribution
20  *
21  * Unless required by applicable law or agreed to in writing, software
22  * distributed under the License is distributed on an "AS IS" BASIS,
23  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24  * See the License for the specific language governing permissions and
25  * limitations under the License.
26  */
27 #include "config.h"
28 #include "rsyslog.h"
29 #include <stdio.h>
30 #include <stdarg.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <assert.h>
34 #include <signal.h>
35 #include <errno.h>
36 #include <unistd.h>
37 #include <stdint.h>
38 #include "conf.h"
39 #include "syslogd-types.h"
40 #include "srUtils.h"
41 #include "template.h"
42 #include "module-template.h"
43 #include "errmsg.h"
44 
/* Module identification. All three macros are defined outside this file
 * (presumably module-template.h - confirm there for the exact expansion);
 * "mmutf8fix" is the name handed to MODULE_CNFNAME.
 */
MODULE_TYPE_OUTPUT
MODULE_TYPE_NOKEEP
MODULE_CNFNAME("mmutf8fix")


/* expansion not visible here; presumably the static data the other
 * module-template macros in this file rely on - TODO confirm */
DEF_OMOD_STATIC_DATA
51 
/* Operation modes. One of them is stored in instanceData.mode; it is chosen
 * by the "mode" action parameter and evaluated in doAction. */
#define MODE_CC 0	 /* "controlcharacters": replace every byte outside 32..126 (see doCC) */
#define MODE_UTF8 1	 /* "utf-8": replace only the bytes of invalid UTF-8 sequences (see doUTF8) */
55 
/* Per-action configuration. Filled with defaults by setInstParamDefaults()
 * and then overridden from the action parameters in newActInst. */
typedef struct _instanceData {
	uchar replChar;		/* byte written over every byte that is to be replaced */
	uint8_t mode;		/* operations mode: MODE_CC or MODE_UTF8 */
} instanceData;
61 
/* Per-worker data. This module keeps no worker-private state; the only
 * member is the pointer to the action's instance data, which doAction
 * reads as pWrkrData->pData. */
typedef struct wrkrInstanceData {
	instanceData *pData;	/* configuration of the action this worker serves */
} wrkrInstanceData_t;
65 
/* Module-global configuration. The module has no global settings of its
 * own; it only keeps the link to the overall configuration object. */
struct modConfData_s {
	rsconf_t *pConf;	/* our overall config object */
};
/* set in beginCnfLoad; not read anywhere else in this file */
static modConfData_t *loadModConf = NULL;/* modConf ptr to use for the current load process */
/* set in activateCnf; not read anywhere else in this file */
static modConfData_t *runModConf = NULL;/* modConf ptr to use for the current exec process */
71 
72 
/* tables for interfacing with the v6 config system */
/* action (instance) parameters
 * Each entry is { name, handler type, flags }. Both parameters carry flags
 * value 0 (presumably "no special flags", i.e. optional - confirm against
 * the cnfparamdescr definition). newActInst walks this table by index, so
 * an entry added here needs a matching branch there.
 */
static struct cnfparamdescr actpdescr[] = {
	{ "mode", eCmdHdlrGetWord, 0 },			/* "utf-8" or "controlcharacters" */
	{ "replacementchar", eCmdHdlrGetChar, 0 }	/* byte used to overwrite offending bytes */
};
/* parameter block handed to nvlstGetParams() and cnfparamvalsDestruct():
 * version tag, number of descriptors, descriptor table */
static struct cnfparamblk actpblk =
	{ CNFPARAMBLK_VERSION,
	  sizeof(actpdescr)/sizeof(struct cnfparamdescr),
	  actpdescr
	};
84 
/* Start of a config load: remember the module config object that is being
 * loaded and link it to the overall config object. pModConf and pConf are
 * supplied by the BEGINbeginCnfLoad macro, which is defined elsewhere. */
BEGINbeginCnfLoad
CODESTARTbeginCnfLoad
	loadModConf = pModConf;
	pModConf->pConf = pConf;
ENDbeginCnfLoad
90 
/* End of a config load: the module adds no code of its own here; whatever
 * happens comes from the macro expansion. */
BEGINendCnfLoad
CODESTARTendCnfLoad
ENDendCnfLoad
94 
/* Config check hook: no module-specific validation is performed. */
BEGINcheckCnf
CODESTARTcheckCnf
ENDcheckCnf
98 
/* Config activation: record the module config object that is now the
 * running one. pModConf is supplied by the macro. */
BEGINactivateCnf
CODESTARTactivateCnf
	runModConf = pModConf;
ENDactivateCnf
103 
/* Config teardown hook: modConfData_s holds nothing this module allocated,
 * so no cleanup code is written here. */
BEGINfreeCnf
CODESTARTfreeCnf
ENDfreeCnf
107 
108 
/* Instance creation: no module-specific code; how the instanceData object
 * comes into existence is determined by the macro expansion (not visible
 * here). Defaults are applied afterwards by newActInst. */
BEGINcreateInstance
CODESTARTcreateInstance
ENDcreateInstance
112 
113 
/* Worker-instance creation: no module-specific code. doAction relies on
 * pWrkrData->pData being valid, so the macro expansion presumably sets it -
 * TODO confirm in module-template.h. */
BEGINcreateWrkrInstance
CODESTARTcreateWrkrInstance
ENDcreateWrkrInstance
117 
118 
/* Feature query: no feature is explicitly accepted here, so the answer is
 * whatever default the macro expansion provides - TODO confirm its value. */
BEGINisCompatibleWithFeature
CODESTARTisCompatibleWithFeature
ENDisCompatibleWithFeature
122 
123 
/* Instance destruction: instanceData has no pointer members, so there is
 * no module-specific cleanup code. */
BEGINfreeInstance
CODESTARTfreeInstance
ENDfreeInstance
127 
128 
/* Worker-instance destruction: the worker data only points at the shared
 * instance data; no module-specific cleanup code is written here. */
BEGINfreeWrkrInstance
CODESTARTfreeWrkrInstance
ENDfreeWrkrInstance
132 
133 
134 static inline void
setInstParamDefaults(instanceData * pData)135 setInstParamDefaults(instanceData *pData)
136 {
137 	pData->mode = MODE_UTF8;
138 	pData->replChar = ' ';
139 }
140 
141 BEGINnewActInst
142 	struct cnfparamvals *pvals;
143 	int i;
144 CODESTARTnewActInst
145 	DBGPRINTF("newActInst (mmutf8fix)\n");
146 	if((pvals = nvlstGetParams(lst, &actpblk, NULL)) == NULL) {
147 		ABORT_FINALIZE(RS_RET_MISSING_CNFPARAMS);
148 	}
149 
150 	CODE_STD_STRING_REQUESTnewActInst(1)
151 	CHKiRet(OMSRsetEntry(*ppOMSR, 0, NULL, OMSR_TPL_AS_MSG));
152 	CHKiRet(createInstance(&pData));
153 	setInstParamDefaults(pData);
154 
155 	for(i = 0 ; i < actpblk.nParams ; ++i) {
156 		if(!pvals[i].bUsed)
157 			continue;
158 		if(!strcmp(actpblk.descr[i].name, "mode")) {
159 			if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8",
160 					 sizeof("utf-8")-1)) {
161 				pData->mode = MODE_UTF8;
162 			} else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters",
163 					 sizeof("controlcharacters")-1)) {
164 				pData->mode = MODE_CC;
165 			} else {
166 				char *cstr = es_str2cstr(pvals[i].val.d.estr, NULL);
167 				LogError(0, RS_RET_INVLD_MODE,
168 					"mmutf8fix: invalid mode '%s' - ignored",
169 					cstr);
170 				free(cstr);
171 			}
172 		} else if(!strcmp(actpblk.descr[i].name, "replacementchar")) {
173 			pData->replChar = es_getBufAddr(pvals[i].val.d.estr)[0];
174 		} else {
175 			dbgprintf("mmutf8fix: program error, non-handled "
176 			  "param '%s'\n", actpblk.descr[i].name);
177 		}
178 	}
179 
180 CODE_STD_FINALIZERnewActInst
181 	cnfparamvalsDestruct(pvals, &actpblk);
182 ENDnewActInst
183 
184 
/* Debug dump of an instance: the module prints nothing of its own. */
BEGINdbgPrintInstInfo
CODESTARTdbgPrintInstInfo
ENDdbgPrintInstInfo
188 
189 
/* Resume hook: no module-specific code; the result is whatever the macro
 * expansion returns by default. */
BEGINtryResume
CODESTARTtryResume
ENDtryResume
193 
194 
195 static void
doCC(instanceData * pData,uchar * msg,int lenMsg)196 doCC(instanceData *pData, uchar *msg, int lenMsg)
197 {
198 	int i;
199 
200 	for(i = 0 ; i < lenMsg ; ++i) {
201 		if(msg[i] < 32 || msg[i] > 126) {
202 			msg[i] = pData->replChar;
203 		}
204 	}
205 }
206 
207 /* fix an invalid multibyte sequence */
208 static void
fixInvldMBSeq(instanceData * pData,uchar * msg,int lenMsg,int strtIdx,int cnt)209 fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int cnt)
210 {
211 	int i, endIdx;
212 
213 	/* Actually strtIdx + cnt will not exceed msgLen,
214 	   but this check does bring peace of mind */
215 	endIdx = strtIdx + cnt;
216 	if(endIdx > lenMsg)
217 		endIdx = lenMsg;
218 	for(i = strtIdx ; i < endIdx ; ++i)
219 		msg[i] = pData->replChar;
220 }
221 
222 static void
doUTF8(instanceData * pData,uchar * msg,int lenMsg)223 doUTF8(instanceData *pData, uchar *msg, int lenMsg)
224 {
225 	uchar c;
226 	int8_t bytesLeft = 0;
227 	uint32_t codepoint;
228 	int strtIdx = 0;
229 	int i;
230 
231 	for(i = 0 ; i < lenMsg ; ++i) {
232 		c = msg[i];
233 		if(bytesLeft) {
234 			if((c & 0xc0) != 0x80) {
235 				/* invalid continuation byte, invalidate all bytes
236 				   up to (but not including) the current byte
237 				   startIdx is always set if bytesLeft is set */
238 				fixInvldMBSeq(pData, msg, lenMsg, strtIdx, i - strtIdx);
239 				bytesLeft = 0;
240 				goto startOfSequence;
241 			} else {
242 				codepoint = (codepoint << 6) | (c & 0x3f);
243 				--bytesLeft;
244 				if(bytesLeft == 0) {
245 					int seqLen = i - strtIdx + 1;
246 
247 					if (
248 					    /* an overlong encoding? (a codepoint must use only
249 					       the minimum number of bytes to represent its value) */
250 					    (((2 == seqLen) && (codepoint < 0x80)) ||
251 					     ((3 == seqLen) && (codepoint < 0x800)) ||
252 					     ((4 == seqLen) && (codepoint < 0x10000)))
253 					    ||
254 					    /* UTF-16 surrogates? */
255 					    ((codepoint >= 0xD800) && (codepoint <= 0xDFFF))
256 					    ||
257 					    /* too-large codepoint? */
258 					    (codepoint > 0x10FFFF)
259 					) {
260 						/* sequence invalid, invalidate all bytes
261 						   startIdx is always set if bytesLeft is set */
262 						fixInvldMBSeq(pData, msg, lenMsg, strtIdx, seqLen);
263 					}
264 				}
265 			}
266 		} else {
267 startOfSequence:
268 			if((c & 0x80) == 0) {
269 				/* 1-byte sequence, US-ASCII */
270 				; /* nothing to do, all well */
271 			} else if((c & 0xe0) == 0xc0) {
272 				/* 2-byte sequence */
273 				strtIdx = i;
274 				bytesLeft = 1;
275 				codepoint = c & 0x1f;
276 			} else if((c & 0xf0) == 0xe0) {
277 				/* 3-byte sequence */
278 				strtIdx = i;
279 				bytesLeft = 2;
280 				codepoint = c & 0x0f;
281 			} else if((c & 0xf8) == 0xf0) {
282 				/* 4-byte sequence */
283 				strtIdx = i;
284 				bytesLeft = 3;
285 				codepoint = c & 0x07;
286 			} else {   /* invalid, either:
287 				      - stray continuation byte (0x80 <= x <= 0xBF)
288 				      - 5&6 byte sequence start (x >= 0xF8) forbidden by RFC3629
289 				    */
290 				msg[i] = pData->replChar;
291 			}
292 		}
293 	}
294 	if (bytesLeft) {
295 		/* invalid, there was not enough bytes to complete a sequence
296 		   startIdx is always set if bytesLeft is set */
297 		fixInvldMBSeq(pData, msg, lenMsg, strtIdx, i - strtIdx);
298 	}
299 }
300 
301 BEGINdoAction_NoStrings
302 	smsg_t **ppMsg = (smsg_t **) pMsgData;
303 	smsg_t *pMsg = ppMsg[0];
304 	uchar *msg;
305 	int lenMsg;
306 CODESTARTdoAction
307 	lenMsg = getMSGLen(pMsg);
308 	msg = getMSG(pMsg);
309 	if(pWrkrData->pData->mode == MODE_CC) {
310 		doCC(pWrkrData->pData, msg, lenMsg);
311 	} else {
312 		doUTF8(pWrkrData->pData, msg, lenMsg);
313 	}
314 ENDdoAction
315 
316 
/* going by the macro name, this module offers no legacy-style configuration;
 * the expansion is defined elsewhere - TODO confirm */
NO_LEGACY_CONF_parseSelectorAct
318 
319 
/* Module unload hook: the module holds no resources that it would have to
 * release itself, so no code is written here. */
BEGINmodExit
CODESTARTmodExit
ENDmodExit
323 
324 
/* Entry-point lookup, composed entirely of query macros defined elsewhere.
 * Going by their names they cover the standard output-module set, its v8
 * additions and the two conf2 sets - confirm in module-template.h. */
BEGINqueryEtryPt
CODESTARTqueryEtryPt
CODEqueryEtryPt_STD_OMOD_QUERIES
CODEqueryEtryPt_STD_OMOD8_QUERIES
CODEqueryEtryPt_STD_CONF2_OMOD_QUERIES
CODEqueryEtryPt_STD_CONF2_QUERIES
ENDqueryEtryPt
332 
333 
/* Module initialisation: report the module interface version this file was
 * built against and write the rsyslog version to the debug log. The effect
 * of CODEmodInit_QueryRegCFSLineHdlr is defined by the macro (not visible
 * here). */
BEGINmodInit()
CODESTARTmodInit
	*ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */
CODEmodInit_QueryRegCFSLineHdlr
	DBGPRINTF("mmutf8fix: module compiled with rsyslog version %s.\n", VERSION);
ENDmodInit
340