1 /* mmutf8fix.c
2 * fix invalid UTF8 sequences. This is begun as a very simple replacer
3 * of non-control characters, and actually breaks some UTF-8 encoding
4 * right now. If the module turns out to be useful, it should be enhanced
5 * to support modes that really detect invalid UTF8. In the longer term
6 * it could also be evolved into an any-charset-to-UTF8 converter. But
7 * first let's see if it really gets into widespread enough use.
8 *
9 * Copyright 2013-2016 Adiscon GmbH.
10 *
11 * This file is part of rsyslog.
12 *
13 * Licensed under the Apache License, Version 2.0 (the "License");
14 * you may not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 * http://www.apache.org/licenses/LICENSE-2.0
18 * -or-
19 * see COPYING.ASL20 in the source distribution
20 *
21 * Unless required by applicable law or agreed to in writing, software
22 * distributed under the License is distributed on an "AS IS" BASIS,
23 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 * See the License for the specific language governing permissions and
25 * limitations under the License.
26 */
27 #include "config.h"
28 #include "rsyslog.h"
29 #include <stdio.h>
30 #include <stdarg.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <assert.h>
34 #include <signal.h>
35 #include <errno.h>
36 #include <unistd.h>
37 #include <stdint.h>
38 #include "conf.h"
39 #include "syslogd-types.h"
40 #include "srUtils.h"
41 #include "template.h"
42 #include "module-template.h"
43 #include "errmsg.h"
44
/* Module declaration boilerplate. None of these macros is defined in this
 * file (presumably they come from module-template.h -- TODO confirm). Going by
 * their names they mark this as an output-type module of the "no keep" kind
 * and set "mmutf8fix" as its configuration name.
 */
MODULE_TYPE_OUTPUT
MODULE_TYPE_NOKEEP
MODULE_CNFNAME("mmutf8fix")


/* presumably the static data common to all output modules -- verify against
 * the macro definition */
DEF_OMOD_STATIC_DATA
51
/* define operation modes we have (selected via the "mode" action parameter) */
#define MODE_CC 0	/* replace every byte outside printable US-ASCII (32..126);
			 * this hits control characters, DEL and all bytes >= 0x80 */
#define MODE_UTF8 1	/* do real UTF-8 fixing: replace only bytes that belong to
			 * invalid UTF-8 sequences */
55
/* config variables: per-action settings, initialised by setInstParamDefaults()
 * and overridden from the action parameters in newActInst */
typedef struct _instanceData {
	uchar replChar;	/* byte written over every offending message byte */
	uint8_t mode;	/* operations mode: MODE_CC or MODE_UTF8 */
} instanceData;
61
/* per-worker data: holds nothing but the pointer to the action's instance
 * data, which doAction follows to reach mode and replChar */
typedef struct wrkrInstanceData {
	instanceData *pData;
} wrkrInstanceData_t;
65
/* module-global configuration; this module keeps no settings of its own,
 * only the back pointer to the overall config */
struct modConfData_s {
	rsconf_t *pConf;	/* our overall config object */
};
/* set in beginCnfLoad */
static modConfData_t *loadModConf = NULL;/* modConf ptr to use for the current load process */
/* set in activateCnf */
static modConfData_t *runModConf = NULL;/* modConf ptr to use for the current exec process */
71
72
/* tables for interfacing with the v6 config system */
/* action (instance) parameters
 * newActInst matches against exactly these two names. Going by the handler
 * names, "mode" is parsed as a word and "replacementchar" as a character.
 * NOTE(review): the third field is 0 for both entries -- presumably "no
 * flags", i.e. neither parameter is mandatory; confirm against cnfparamdescr.
 */
static struct cnfparamdescr actpdescr[] = {
	{ "mode", eCmdHdlrGetWord, 0 },
	{ "replacementchar", eCmdHdlrGetChar, 0 }
};
/* parameter block passed to nvlstGetParams() and cnfparamvalsDestruct();
 * the entry count is computed from the array so it always matches it */
static struct cnfparamblk actpblk =
	{ CNFPARAMBLK_VERSION,
	  sizeof(actpdescr)/sizeof(struct cnfparamdescr),
	  actpdescr
	};
84
/* beginCnfLoad entry point: remembers the module config object that is being
 * loaded in loadModConf and links it to the overall config object.
 * pModConf and pConf are not declared in this file; presumably the BEGIN
 * macro supplies them -- verify.
 */
BEGINbeginCnfLoad
CODESTARTbeginCnfLoad
	loadModConf = pModConf;
	pModConf->pConf = pConf;
ENDbeginCnfLoad
90
/* endCnfLoad entry point: the module adds no code of its own here; whatever
 * happens comes from the template macros (not visible in this file). */
BEGINendCnfLoad
CODESTARTendCnfLoad
ENDendCnfLoad
94
/* checkCnf entry point: no module-specific checks are performed; the body
 * consists solely of the template macros (not visible in this file). */
BEGINcheckCnf
CODESTARTcheckCnf
ENDcheckCnf
98
/* activateCnf entry point: records the module config object as the one used
 * for execution (runModConf). pModConf is presumably supplied by the BEGIN
 * macro -- verify.
 */
BEGINactivateCnf
CODESTARTactivateCnf
	runModConf = pModConf;
ENDactivateCnf
103
/* freeCnf entry point: modConfData_s holds no module-allocated members, and
 * no module-specific cleanup code is present between the template macros. */
BEGINfreeCnf
CODESTARTfreeCnf
ENDfreeCnf
107
108
/* createInstance entry point, called from newActInst as
 * createInstance(&pData). No module-specific initialisation is done here;
 * presumably the macros allocate the instanceData object -- TODO confirm. */
BEGINcreateInstance
CODESTARTcreateInstance
ENDcreateInstance
112
113
/* createWrkrInstance entry point: no module-specific code. Presumably the
 * macros allocate wrkrInstanceData_t and set its pData member (doAction
 * relies on pWrkrData->pData being valid) -- verify. */
BEGINcreateWrkrInstance
CODESTARTcreateWrkrInstance
ENDcreateWrkrInstance
117
118
/* isCompatibleWithFeature entry point: the module tests for no feature
 * itself, so the answer is whatever the template macros default to
 * (not visible in this file). */
BEGINisCompatibleWithFeature
CODESTARTisCompatibleWithFeature
ENDisCompatibleWithFeature
122
123
/* freeInstance entry point: instanceData contains only plain values
 * (replChar, mode), and no module-specific cleanup code is present. */
BEGINfreeInstance
CODESTARTfreeInstance
ENDfreeInstance
127
128
/* freeWrkrInstance entry point: no module-specific cleanup code is present;
 * any release of the worker data is left to the template macros. */
BEGINfreeWrkrInstance
CODESTARTfreeWrkrInstance
ENDfreeWrkrInstance
132
133
134 static inline void
setInstParamDefaults(instanceData * pData)135 setInstParamDefaults(instanceData *pData)
136 {
137 pData->mode = MODE_UTF8;
138 pData->replChar = ' ';
139 }
140
141 BEGINnewActInst
142 struct cnfparamvals *pvals;
143 int i;
144 CODESTARTnewActInst
145 DBGPRINTF("newActInst (mmutf8fix)\n");
146 if((pvals = nvlstGetParams(lst, &actpblk, NULL)) == NULL) {
147 ABORT_FINALIZE(RS_RET_MISSING_CNFPARAMS);
148 }
149
150 CODE_STD_STRING_REQUESTnewActInst(1)
151 CHKiRet(OMSRsetEntry(*ppOMSR, 0, NULL, OMSR_TPL_AS_MSG));
152 CHKiRet(createInstance(&pData));
153 setInstParamDefaults(pData);
154
155 for(i = 0 ; i < actpblk.nParams ; ++i) {
156 if(!pvals[i].bUsed)
157 continue;
158 if(!strcmp(actpblk.descr[i].name, "mode")) {
159 if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8",
160 sizeof("utf-8")-1)) {
161 pData->mode = MODE_UTF8;
162 } else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters",
163 sizeof("controlcharacters")-1)) {
164 pData->mode = MODE_CC;
165 } else {
166 char *cstr = es_str2cstr(pvals[i].val.d.estr, NULL);
167 LogError(0, RS_RET_INVLD_MODE,
168 "mmutf8fix: invalid mode '%s' - ignored",
169 cstr);
170 free(cstr);
171 }
172 } else if(!strcmp(actpblk.descr[i].name, "replacementchar")) {
173 pData->replChar = es_getBufAddr(pvals[i].val.d.estr)[0];
174 } else {
175 dbgprintf("mmutf8fix: program error, non-handled "
176 "param '%s'\n", actpblk.descr[i].name);
177 }
178 }
179
180 CODE_STD_FINALIZERnewActInst
181 cnfparamvalsDestruct(pvals, &actpblk);
182 ENDnewActInst
183
184
/* dbgPrintInstInfo entry point: the module prints no instance details of its
 * own; the body consists solely of the template macros. */
BEGINdbgPrintInstInfo
CODESTARTdbgPrintInstInfo
ENDdbgPrintInstInfo
188
189
/* tryResume entry point: no module-specific resume logic; the result is
 * whatever the template macros default to (not visible in this file). */
BEGINtryResume
CODESTARTtryResume
ENDtryResume
193
194
195 static void
doCC(instanceData * pData,uchar * msg,int lenMsg)196 doCC(instanceData *pData, uchar *msg, int lenMsg)
197 {
198 int i;
199
200 for(i = 0 ; i < lenMsg ; ++i) {
201 if(msg[i] < 32 || msg[i] > 126) {
202 msg[i] = pData->replChar;
203 }
204 }
205 }
206
207 /* fix an invalid multibyte sequence */
208 static void
fixInvldMBSeq(instanceData * pData,uchar * msg,int lenMsg,int strtIdx,int cnt)209 fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int cnt)
210 {
211 int i, endIdx;
212
213 /* Actually strtIdx + cnt will not exceed msgLen,
214 but this check does bring peace of mind */
215 endIdx = strtIdx + cnt;
216 if(endIdx > lenMsg)
217 endIdx = lenMsg;
218 for(i = strtIdx ; i < endIdx ; ++i)
219 msg[i] = pData->replChar;
220 }
221
222 static void
doUTF8(instanceData * pData,uchar * msg,int lenMsg)223 doUTF8(instanceData *pData, uchar *msg, int lenMsg)
224 {
225 uchar c;
226 int8_t bytesLeft = 0;
227 uint32_t codepoint;
228 int strtIdx = 0;
229 int i;
230
231 for(i = 0 ; i < lenMsg ; ++i) {
232 c = msg[i];
233 if(bytesLeft) {
234 if((c & 0xc0) != 0x80) {
235 /* invalid continuation byte, invalidate all bytes
236 up to (but not including) the current byte
237 startIdx is always set if bytesLeft is set */
238 fixInvldMBSeq(pData, msg, lenMsg, strtIdx, i - strtIdx);
239 bytesLeft = 0;
240 goto startOfSequence;
241 } else {
242 codepoint = (codepoint << 6) | (c & 0x3f);
243 --bytesLeft;
244 if(bytesLeft == 0) {
245 int seqLen = i - strtIdx + 1;
246
247 if (
248 /* an overlong encoding? (a codepoint must use only
249 the minimum number of bytes to represent its value) */
250 (((2 == seqLen) && (codepoint < 0x80)) ||
251 ((3 == seqLen) && (codepoint < 0x800)) ||
252 ((4 == seqLen) && (codepoint < 0x10000)))
253 ||
254 /* UTF-16 surrogates? */
255 ((codepoint >= 0xD800) && (codepoint <= 0xDFFF))
256 ||
257 /* too-large codepoint? */
258 (codepoint > 0x10FFFF)
259 ) {
260 /* sequence invalid, invalidate all bytes
261 startIdx is always set if bytesLeft is set */
262 fixInvldMBSeq(pData, msg, lenMsg, strtIdx, seqLen);
263 }
264 }
265 }
266 } else {
267 startOfSequence:
268 if((c & 0x80) == 0) {
269 /* 1-byte sequence, US-ASCII */
270 ; /* nothing to do, all well */
271 } else if((c & 0xe0) == 0xc0) {
272 /* 2-byte sequence */
273 strtIdx = i;
274 bytesLeft = 1;
275 codepoint = c & 0x1f;
276 } else if((c & 0xf0) == 0xe0) {
277 /* 3-byte sequence */
278 strtIdx = i;
279 bytesLeft = 2;
280 codepoint = c & 0x0f;
281 } else if((c & 0xf8) == 0xf0) {
282 /* 4-byte sequence */
283 strtIdx = i;
284 bytesLeft = 3;
285 codepoint = c & 0x07;
286 } else { /* invalid, either:
287 - stray continuation byte (0x80 <= x <= 0xBF)
288 - 5&6 byte sequence start (x >= 0xF8) forbidden by RFC3629
289 */
290 msg[i] = pData->replChar;
291 }
292 }
293 }
294 if (bytesLeft) {
295 /* invalid, there was not enough bytes to complete a sequence
296 startIdx is always set if bytesLeft is set */
297 fixInvldMBSeq(pData, msg, lenMsg, strtIdx, i - strtIdx);
298 }
299 }
300
301 BEGINdoAction_NoStrings
302 smsg_t **ppMsg = (smsg_t **) pMsgData;
303 smsg_t *pMsg = ppMsg[0];
304 uchar *msg;
305 int lenMsg;
306 CODESTARTdoAction
307 lenMsg = getMSGLen(pMsg);
308 msg = getMSG(pMsg);
309 if(pWrkrData->pData->mode == MODE_CC) {
310 doCC(pWrkrData->pData, msg, lenMsg);
311 } else {
312 doUTF8(pWrkrData->pData, msg, lenMsg);
313 }
314 ENDdoAction
315
316
/* going by its name, this macro supplies a parseSelectorAct entry point for a
 * module without legacy-format configuration -- TODO confirm against the
 * macro definition */
NO_LEGACY_CONF_parseSelectorAct
318
319
/* modExit entry point: the module has no cleanup code of its own; the body
 * consists solely of the template macros. */
BEGINmodExit
CODESTARTmodExit
ENDmodExit
323
324
/* queryEtryPt entry point: built entirely from the standard query macros.
 * Judging by their names they cover the generic output-module, OMOD8 and
 * config-v2 entry points; presumably each maps an entry-point name to the
 * matching function in this file -- verify against the macro definitions.
 */
BEGINqueryEtryPt
CODESTARTqueryEtryPt
CODEqueryEtryPt_STD_OMOD_QUERIES
CODEqueryEtryPt_STD_OMOD8_QUERIES
CODEqueryEtryPt_STD_CONF2_OMOD_QUERIES
CODEqueryEtryPt_STD_CONF2_QUERIES
ENDqueryEtryPt
332
333
/* modInit entry point: reports CURR_MOD_IF_VERSION as the interface version
 * this module provides and emits a debug line with the rsyslog version it was
 * compiled against. ipIFVersProvided is presumably supplied by the BEGIN
 * macro -- verify.
 */
BEGINmodInit()
CODESTARTmodInit
	*ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */
CODEmodInit_QueryRegCFSLineHdlr
	DBGPRINTF("mmutf8fix: module compiled with rsyslog version %s.\n", VERSION);
ENDmodInit
340