1 /* Licensed to the Apache Software Foundation (ASF) under one or more
2  * contributor license agreements.  See the NOTICE file distributed with
3  * this work for additional information regarding copyright ownership.
4  * The ASF licenses this file to You under the Apache License, Version 2.0
5  * (the "License"); you may not use this file except in compliance with
6  * the License.  You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * simple hokey charset recoding configuration module
19  *
20  * See mod_ebcdic and mod_charset for more thought-out examples.  This
21  * one is just so Jeff can learn how a module works and experiment with
22  * basic character set recoding configuration.
23  *
24  * !!!This is an extremely cheap ripoff of mod_charset.c from Russian Apache!!!
25  */
26 
27 #include "httpd.h"
28 #include "http_config.h"
29 
30 #include "http_core.h"
31 #include "http_log.h"
32 #include "http_main.h"
33 #include "http_protocol.h"
34 #include "http_request.h"
35 #include "util_charset.h"
36 #include "apr_buckets.h"
37 #include "util_filter.h"
38 #include "apr_strings.h"
39 #include "apr_lib.h"
40 #include "apr_xlate.h"
41 #define APR_WANT_STRFUNC
42 #include "apr_want.h"
43 
/* Size of the buffer that receives translated response data; the output
 * filter declares it as a local array (char tmp[OUTPUT_XLATE_BUF_SIZE]).
 */
#define OUTPUT_XLATE_BUF_SIZE (16*1024)
/* Size of the buffer used for request-body translation; find_code_page()
 * allocates charset_filter_ctx_t.tmp with this many bytes.
 */
#define INPUT_XLATE_BUF_SIZE  (8*1024)

/* xlate_brigade() stops translating as soon as fewer than this many bytes
 * are still free in the translation buffer, so the caller can flush it.
 */
#define XLATE_MIN_BUFF_LEFT 128

/* Capacity of charset_filter_ctx_t.buf, where a character split across two
 * buckets is reassembled.  set_aside_partial_char() refuses a leftover of
 * this many bytes or more and reports EES_LIMIT instead.
 */
#define FATTEST_CHAR  8
54 
/* Extended error status codes.  A filter context carries one of these in
 * addition to the apr_status_t returned by the translation helpers, so that
 * log_xlate_error() can pick a specific message.
 */
typedef enum {
    EES_INIT = 0,   /* no error info yet; value must be 0 for easy init
                     * (contexts are allocated with apr_pcalloc())
                     */
    EES_LIMIT,      /* built-in restriction encountered, e.g. a partial
                     * character too wide for charset_filter_ctx_t.buf
                     */
    EES_INCOMPLETE_CHAR, /* incomplete multi-byte char at end of content */
    EES_BUCKET_READ, /* apr_bucket_read() failed in xlate_brigade() */
    EES_DOWNSTREAM, /* something bad happened in a filter below xlate
                     * (ap_pass_brigade() returned an error)
                     */
    EES_BAD_INPUT   /* input data invalid */
} ees_t;
66 
/* Registered name of the output translation filter.  The same string is
 * used to look the filter up in r->output_filters and to add it with
 * ap_add_output_filter().
 */
#define XLATEOUT_FILTER_NAME "XLATEOUT"
/* Registered name of the input translation filter; used the same way for
 * r->input_filters and ap_add_input_filter().
 */
#define XLATEIN_FILTER_NAME  "XLATEIN"
71 
/* Per-directory configuration.  Instances come from apr_pcalloc(), so a
 * NULL charset pointer or a *_INIT enum value means "not set in this
 * container"; merge_charset_dir_conf() then inherits the enclosing value.
 */
typedef struct charset_dir_t {
    const char *charset_source; /* source encoding (CharsetSourceEnc) */
    const char *charset_default; /* how to ship on wire (CharsetDefault) */
    /** module does ap_add_*_filter()?  IA_NOIMPADD makes
     *  xlate_insert_filter() return without adding anything.
     */
    enum {IA_INIT, IA_IMPADD, IA_NOIMPADD} implicit_add;
    /** treat all mimetypes as text?  (CharsetOptions
     *  TranslateAllMimeTypes / NoTranslateAllMimeTypes)
     */
    enum {FX_INIT, FX_FORCE, FX_NOFORCE} force_xlate;
} charset_dir_t;
80 
/* charset_filter_ctx_t is created for each filter instance; because the same
 * filter code is used for translating in both directions, we need this context
 * data to tell the filter which translation handle to use; it also can hold a
 * character which was split between buckets
 */
typedef struct charset_filter_ctx_t {
    apr_xlate_t *xlate;     /* translation handle; NULL until opened */
    int is_sb;              /* single-byte translation? */
    charset_dir_t *dc;      /* directory config this instance translates for */
    ees_t ees;              /* extended error status */
    apr_size_t saved;       /* number of bytes of a partial char held in buf;
                             * 0 when nothing is pending
                             */
    char buf[FATTEST_CHAR]; /* we want to be able to build a complete char here */
    int ran;                /* has filter instance run before? */
    int noop;               /* should we pass brigades through unchanged? */
    char *tmp;              /* buffer for input filtering
                             * (INPUT_XLATE_BUF_SIZE bytes, input contexts only)
                             */
    apr_bucket_brigade *bb; /* input buckets we couldn't finish translating */
    apr_bucket_brigade *tmpbb; /* used for passing downstream; emptied again
                                * after every ap_pass_brigade() call
                                */
} charset_filter_ctx_t;
99 
/* charset_req_t is available via r->request_config if any translation is
 * being performed.  find_code_page() creates it; when that hook declines
 * early, no charset_req_t exists for the request.
 */
typedef struct charset_req_t {
    charset_dir_t *dc;      /* directory config in effect for the request */
    /* Preallocated filter contexts.  input_ctx stays NULL unless the
     * request method is PUT or POST.
     */
    charset_filter_ctx_t *output_ctx, *input_ctx;
} charset_req_t;
107 
/* Forward declaration of this module's record so the functions below can
 * pass its address to ap_get_module_config() / ap_set_module_config().
 * NOTE(review): the defining declaration is presumably further down the
 * file, outside the part reviewed here.
 */
module AP_MODULE_DECLARE_DATA charset_lite_module;
109 
/* Per-directory configuration constructor.
 *
 * Returns a zero-filled charset_dir_t allocated from pool p; the second
 * argument is not looked at.  Zero fill leaves both charset names unset
 * and both option enums at their *_INIT value.
 */
static void *create_charset_dir_conf(apr_pool_t *p, char *dummy)
{
    return apr_pcalloc(p, sizeof(charset_dir_t));
}
116 
/* Per-directory configuration merger.
 *
 * basev is the configuration of the enclosing container, overridesv the
 * one of the current container.  The result, allocated from p, takes each
 * setting from the current container when it was given there and from the
 * enclosing container otherwise.
 */
static void *merge_charset_dir_conf(apr_pool_t *p, void *basev, void *overridesv)
{
    const charset_dir_t *parent = basev;
    const charset_dir_t *child = overridesv;
    charset_dir_t *merged = apr_pcalloc(p, sizeof(*merged));

    /* Begin with everything inherited, then let explicit settings win. */
    *merged = *parent;

    if (child->charset_default) {
        merged->charset_default = child->charset_default;
    }
    if (child->charset_source) {
        merged->charset_source = child->charset_source;
    }
    if (child->implicit_add != IA_INIT) {
        merged->implicit_add = child->implicit_add;
    }
    if (child->force_xlate != FX_INIT) {
        merged->force_xlate = child->force_xlate;
    }

    return merged;
}
137 
138 /* CharsetSourceEnc charset
139  */
add_charset_source(cmd_parms * cmd,void * in_dc,const char * name)140 static const char *add_charset_source(cmd_parms *cmd, void *in_dc,
141                                       const char *name)
142 {
143     charset_dir_t *dc = in_dc;
144 
145     dc->charset_source = name;
146     return NULL;
147 }
148 
149 /* CharsetDefault charset
150  */
add_charset_default(cmd_parms * cmd,void * in_dc,const char * name)151 static const char *add_charset_default(cmd_parms *cmd, void *in_dc,
152                                        const char *name)
153 {
154     charset_dir_t *dc = in_dc;
155 
156     dc->charset_default = name;
157     return NULL;
158 }
159 
160 /* CharsetOptions optionflag...
161  */
add_charset_options(cmd_parms * cmd,void * in_dc,const char * flag)162 static const char *add_charset_options(cmd_parms *cmd, void *in_dc,
163                                        const char *flag)
164 {
165     charset_dir_t *dc = in_dc;
166 
167     if (!strcasecmp(flag, "ImplicitAdd")) {
168         dc->implicit_add = IA_IMPADD;
169     }
170     else if (!strcasecmp(flag, "NoImplicitAdd")) {
171         dc->implicit_add = IA_NOIMPADD;
172     }
173     else if (!strcasecmp(flag, "TranslateAllMimeTypes")) {
174         dc->force_xlate = FX_FORCE;
175     }
176     else if (!strcasecmp(flag, "NoTranslateAllMimeTypes")) {
177         dc->force_xlate = FX_NOFORCE;
178     }
179     else {
180         return apr_pstrcat(cmd->temp_pool,
181                            "Invalid CharsetOptions option: ",
182                            flag,
183                            NULL);
184     }
185 
186     return NULL;
187 }
188 
189 /* find_code_page() is a fixup hook that checks if the module is
190  * configured and the input or output potentially need to be translated.
191  * If so, context is initialized for the filters.
192  */
find_code_page(request_rec * r)193 static int find_code_page(request_rec *r)
194 {
195     charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
196                                              &charset_lite_module);
197     charset_req_t *reqinfo;
198     charset_filter_ctx_t *input_ctx, *output_ctx;
199     apr_status_t rv;
200 
201     ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
202                   "uri: %s file: %s method: %d "
203                   "imt: %s flags: %s%s%s %s->%s",
204                   r->uri,
205                   r->filename ? r->filename : "(none)",
206                   r->method_number,
207                   r->content_type ? r->content_type : "(unknown)",
208                   r->main     ? "S" : "",    /* S if subrequest */
209                   r->prev     ? "R" : "",    /* R if redirect */
210                   r->proxyreq ? "P" : "",    /* P if proxy */
211                   dc->charset_source, dc->charset_default);
212 
213     /* If we don't have a full directory configuration, bail out.
214      */
215     if (!dc->charset_source || !dc->charset_default) {
216         ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01448)
217                       "incomplete configuration: src %s, dst %s",
218                       dc->charset_source ? dc->charset_source : "unspecified",
219                       dc->charset_default ? dc->charset_default : "unspecified");
220         return DECLINED;
221     }
222 
223     /* catch proxy requests */
224     if (r->proxyreq) {
225         return DECLINED;
226     }
227 
228     /* mod_rewrite indicators */
229     if (r->filename
230         && (!strncmp(r->filename, "redirect:", 9)
231             || !strncmp(r->filename, "gone:", 5)
232             || !strncmp(r->filename, "passthrough:", 12)
233             || !strncmp(r->filename, "forbidden:", 10))) {
234         return DECLINED;
235     }
236 
237     /* no translation when server and network charsets are set to the same value */
238     if (!strcasecmp(dc->charset_source, dc->charset_default)) {
239         return DECLINED;
240     }
241 
242     /* Get storage for the request data and the output filter context.
243      * We rarely need the input filter context, so allocate that separately.
244      */
245     reqinfo = (charset_req_t *)apr_pcalloc(r->pool,
246                                            sizeof(charset_req_t) +
247                                            sizeof(charset_filter_ctx_t));
248     output_ctx = (charset_filter_ctx_t *)(reqinfo + 1);
249 
250     reqinfo->dc = dc;
251     output_ctx->dc = dc;
252     output_ctx->tmpbb = apr_brigade_create(r->pool,
253                                            r->connection->bucket_alloc);
254     ap_set_module_config(r->request_config, &charset_lite_module, reqinfo);
255 
256     reqinfo->output_ctx = output_ctx;
257 
258     switch (r->method_number) {
259     case M_PUT:
260     case M_POST:
261         /* Set up input translation.  Note: A request body can be included
262          * with the OPTIONS method, but for now we don't set up translation
263          * of it.
264          */
265         input_ctx = apr_pcalloc(r->pool, sizeof(charset_filter_ctx_t));
266         input_ctx->bb = apr_brigade_create(r->pool,
267                                            r->connection->bucket_alloc);
268         input_ctx->tmp = apr_palloc(r->pool, INPUT_XLATE_BUF_SIZE);
269         input_ctx->dc = dc;
270         reqinfo->input_ctx = input_ctx;
271         rv = apr_xlate_open(&input_ctx->xlate, dc->charset_source,
272                             dc->charset_default, r->pool);
273         if (rv != APR_SUCCESS) {
274             ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(01449)
275                           "can't open translation %s->%s",
276                           dc->charset_default, dc->charset_source);
277             return HTTP_INTERNAL_SERVER_ERROR;
278         }
279         if (apr_xlate_sb_get(input_ctx->xlate, &input_ctx->is_sb) != APR_SUCCESS) {
280             input_ctx->is_sb = 0;
281         }
282     }
283 
284     return DECLINED;
285 }
286 
/* Reports whether a filter named filter_name (compared without regard to
 * case) occurs anywhere in the chain starting at filter_list.
 *
 * Returns 1 if found, 0 otherwise, including for an empty chain.  The
 * request argument is not used.
 */
static int configured_in_list(request_rec *r, const char *filter_name,
                              struct ap_filter_t *filter_list)
{
    struct ap_filter_t *cur;

    for (cur = filter_list; cur != NULL; cur = cur->next) {
        if (strcasecmp(filter_name, cur->frec->name) == 0) {
            return 1;
        }
    }

    return 0;
}
300 
/* Returns 1 if a filter named filter_name is already present in the
 * request's input filter chain, 0 otherwise.
 */
static int configured_on_input(request_rec *r, const char *filter_name)
{
    struct ap_filter_t *chain = r->input_filters;

    return configured_in_list(r, filter_name, chain);
}
305 
/* Returns 1 if a filter named filter_name is already present in the
 * request's output filter chain, 0 otherwise.
 */
static int configured_on_output(request_rec *r, const char *filter_name)
{
    struct ap_filter_t *chain = r->output_filters;

    return configured_in_list(r, filter_name, chain);
}
310 
311 /* xlate_insert_filter() is a filter hook which decides whether or not
312  * to insert a translation filter for the current request.
313  */
xlate_insert_filter(request_rec * r)314 static void xlate_insert_filter(request_rec *r)
315 {
316     /* Hey... don't be so quick to use reqinfo->dc here; reqinfo may be NULL */
317     charset_req_t *reqinfo = ap_get_module_config(r->request_config,
318                                                   &charset_lite_module);
319     charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
320                                              &charset_lite_module);
321 
322     if (dc && (dc->implicit_add == IA_NOIMPADD)) {
323         ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, r,
324                       "xlate output filter not added implicitly because "
325                       "CharsetOptions included 'NoImplicitAdd'");
326         return;
327     }
328 
329     if (reqinfo) {
330         if (reqinfo->output_ctx && !configured_on_output(r, XLATEOUT_FILTER_NAME)) {
331             ap_add_output_filter(XLATEOUT_FILTER_NAME, reqinfo->output_ctx, r,
332                                  r->connection);
333         }
334         ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
335                       "xlate output filter not added implicitly because %s",
336                       !reqinfo->output_ctx ?
337                       "no output configuration available" :
338                       "another module added the filter");
339 
340         if (reqinfo->input_ctx && !configured_on_input(r, XLATEIN_FILTER_NAME)) {
341             ap_add_input_filter(XLATEIN_FILTER_NAME, reqinfo->input_ctx, r,
342                                 r->connection);
343         }
344         ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, r,
345                       "xlate input filter not added implicitly because %s",
346                       !reqinfo->input_ctx ?
347                       "no input configuration available" :
348                       "another module added the filter");
349     }
350 }
351 
352 /* stuff that sucks that I know of:
353  *
354  * bucket handling:
355  *  why create an eos bucket when we see it come down the stream?  just send the one
356  *  passed as input...  news flash: this will be fixed when xlate_out_filter() starts
357  *  using the more generic xlate_brigade()
358  *
359  * translation mechanics:
360  *   we don't handle characters that straddle more than two buckets; an error
361  *   will be generated
362  */
363 
send_bucket_downstream(ap_filter_t * f,apr_bucket * b)364 static apr_status_t send_bucket_downstream(ap_filter_t *f, apr_bucket *b)
365 {
366     charset_filter_ctx_t *ctx = f->ctx;
367     apr_status_t rv;
368 
369     APR_BRIGADE_INSERT_TAIL(ctx->tmpbb, b);
370     rv = ap_pass_brigade(f->next, ctx->tmpbb);
371     if (rv != APR_SUCCESS) {
372         ctx->ees = EES_DOWNSTREAM;
373     }
374     apr_brigade_cleanup(ctx->tmpbb);
375     return rv;
376 }
377 
378 /* send_downstream() is passed the translated data; it puts it in a single-
379  * bucket brigade and passes the brigade to the next filter
380  */
send_downstream(ap_filter_t * f,const char * tmp,apr_size_t len)381 static apr_status_t send_downstream(ap_filter_t *f, const char *tmp, apr_size_t len)
382 {
383     request_rec *r = f->r;
384     conn_rec *c = r->connection;
385     apr_bucket *b;
386 
387     b = apr_bucket_transient_create(tmp, len, c->bucket_alloc);
388     return send_bucket_downstream(f, b);
389 }
390 
send_eos(ap_filter_t * f)391 static apr_status_t send_eos(ap_filter_t *f)
392 {
393     request_rec *r = f->r;
394     conn_rec *c = r->connection;
395     apr_bucket_brigade *bb;
396     apr_bucket *b;
397     charset_filter_ctx_t *ctx = f->ctx;
398     apr_status_t rv;
399 
400     bb = apr_brigade_create(r->pool, c->bucket_alloc);
401     b = apr_bucket_eos_create(c->bucket_alloc);
402     APR_BRIGADE_INSERT_TAIL(bb, b);
403     rv = ap_pass_brigade(f->next, bb);
404     if (rv != APR_SUCCESS) {
405         ctx->ees = EES_DOWNSTREAM;
406     }
407     return rv;
408 }
409 
set_aside_partial_char(charset_filter_ctx_t * ctx,const char * partial,apr_size_t partial_len)410 static apr_status_t set_aside_partial_char(charset_filter_ctx_t *ctx,
411                                            const char *partial,
412                                            apr_size_t partial_len)
413 {
414     apr_status_t rv;
415 
416     if (sizeof(ctx->buf) > partial_len) {
417         ctx->saved = partial_len;
418         memcpy(ctx->buf, partial, partial_len);
419         rv = APR_SUCCESS;
420     }
421     else {
422         rv = APR_INCOMPLETE;
423         ctx->ees = EES_LIMIT; /* we don't handle chars this wide which straddle
424                                * buckets
425                                */
426     }
427     return rv;
428 }
429 
finish_partial_char(charset_filter_ctx_t * ctx,const char ** cur_str,apr_size_t * cur_len,char ** out_str,apr_size_t * out_len)430 static apr_status_t finish_partial_char(charset_filter_ctx_t *ctx,
431                                         /* input buffer: */
432                                         const char **cur_str,
433                                         apr_size_t *cur_len,
434                                         /* output buffer: */
435                                         char **out_str,
436                                         apr_size_t *out_len)
437 {
438     apr_status_t rv;
439     apr_size_t tmp_input_len;
440 
441     /* Keep adding bytes from the input string to the saved string until we
442      *    1) finish the input char
443      *    2) get an error
444      * or 3) run out of bytes to add
445      */
446 
447     do {
448         ctx->buf[ctx->saved] = **cur_str;
449         ++ctx->saved;
450         ++*cur_str;
451         --*cur_len;
452         tmp_input_len = ctx->saved;
453         rv = apr_xlate_conv_buffer(ctx->xlate,
454                                    ctx->buf,
455                                    &tmp_input_len,
456                                    *out_str,
457                                    out_len);
458     } while (rv == APR_INCOMPLETE && *cur_len);
459 
460     if (rv == APR_SUCCESS) {
461         ctx->saved = 0;
462     }
463     else {
464         ctx->ees = EES_LIMIT; /* code isn't smart enough to handle chars
465                                * straddling more than two buckets
466                                */
467     }
468 
469     return rv;
470 }
471 
/* Writes one error-level log entry describing why the translation filter
 * is failing, chosen by the extended status in the filter context.
 *
 * rv is the status to log alongside the message.  It is replaced by 0 for
 * the conditions this module detects itself (limit, bad input, bucket read
 * failure, incomplete character) and logged as given otherwise.
 *
 * For an incomplete character the message ends with the saved bytes in
 * hex, truncated if necessary so the text fits the local message buffer.
 */
static void log_xlate_error(ap_filter_t *f, apr_status_t rv)
{
    charset_filter_ctx_t *ctx = f->ctx;
    char msgbuf[100];
    const char *msg;

    switch (ctx->ees) {
    case EES_INCOMPLETE_CHAR: {
        apr_size_t prefix_len;
        apr_size_t max_bytes;
        apr_size_t nbytes;

        rv = 0;
        strcpy(msgbuf, APLOGNO(02196) "xlate filter - incomplete char at end of input - ");
        prefix_len = strlen(msgbuf);

        /* Each byte becomes two hex digits and the terminating NUL needs
         * a slot too, so only this many bytes fit behind the prefix.
         */
        max_bytes = (sizeof(msgbuf) - prefix_len - 1) / 2;
        nbytes = ctx->saved > max_bytes ? max_bytes : ctx->saved;

        ap_bin2hex(ctx->buf, nbytes, msgbuf + prefix_len);
        msg = msgbuf;
        break;
    }
    case EES_LIMIT:
        rv = 0;
        msg = APLOGNO(02193) "xlate filter - a built-in restriction was encountered";
        break;
    case EES_BAD_INPUT:
        rv = 0;
        msg = APLOGNO(02194) "xlate filter - an input character was invalid";
        break;
    case EES_BUCKET_READ:
        rv = 0;
        msg = APLOGNO(02195) "xlate filter - bucket read routine failed";
        break;
    case EES_DOWNSTREAM:
        msg = APLOGNO(02197) "xlate filter - an error occurred in a lower filter";
        break;
    default:
        msg = APLOGNO(02198) "xlate filter - returning error";
        break;
    }

    ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(02997) "%s", msg);
}
513 
514 /* chk_filter_chain() is called once per filter instance; it tries to
515  * determine if the current filter instance should be disabled because
516  * its translation is incompatible with the translation of an existing
517  * instance of the translate filter
518  *
519  * Example bad scenario:
520  *
521  *   configured filter chain for the request:
522  *     INCLUDES XLATEOUT(8859-1->UTS-16)
523  *   configured filter chain for the subrequest:
524  *     XLATEOUT(8859-1->UTS-16)
525  *
526  *   When the subrequest is processed, the filter chain will be
527  *     XLATEOUT(8859-1->UTS-16) XLATEOUT(8859-1->UTS-16)
528  *   This makes no sense, so the instance of XLATEOUT added for the
529  *   subrequest will be noop-ed.
530  *
531  * Example good scenario:
532  *
533  *   configured filter chain for the request:
534  *     INCLUDES XLATEOUT(8859-1->UTS-16)
535  *   configured filter chain for the subrequest:
536  *     XLATEOUT(IBM-1047->8859-1)
537  *
538  *   When the subrequest is processed, the filter chain will be
539  *     XLATEOUT(IBM-1047->8859-1) XLATEOUT(8859-1->UTS-16)
540  *   This makes sense, so the instance of XLATEOUT added for the
541  *   subrequest will be left alone and it will translate from
542  *   IBM-1047->8859-1.
543  */
chk_filter_chain(ap_filter_t * f)544 static void chk_filter_chain(ap_filter_t *f)
545 {
546     ap_filter_t *curf;
547     charset_filter_ctx_t *curctx, *last_xlate_ctx = NULL,
548         *ctx = f->ctx;
549     int output = !strcasecmp(f->frec->name, XLATEOUT_FILTER_NAME);
550 
551     if (ctx->noop) {
552         return;
553     }
554 
555     /* walk the filter chain; see if it makes sense for our filter to
556      * do any translation
557      */
558     curf = output ? f->r->output_filters : f->r->input_filters;
559     while (curf) {
560         if (!strcasecmp(curf->frec->name, f->frec->name) &&
561             curf->ctx) {
562             curctx = (charset_filter_ctx_t *)curf->ctx;
563             if (!last_xlate_ctx) {
564                 last_xlate_ctx = curctx;
565             }
566             else {
567                 if (strcmp(last_xlate_ctx->dc->charset_default,
568                            curctx->dc->charset_source)) {
569                     /* incompatible translation
570                      * if our filter instance is incompatible with an instance
571                      * already in place, noop our instance
572                      * Notes:
573                      * . We are only willing to noop our own instance.
574                      * . It is possible to noop another instance which has not
575                      *   yet run, but this is not currently implemented.
576                      *   Hopefully it will not be needed.
577                      * . It is not possible to noop an instance which has
578                      *   already run.
579                      */
580                     if (last_xlate_ctx == f->ctx) {
581                         last_xlate_ctx->noop = 1;
582                         if (APLOGrtrace1(f->r)) {
583                             const char *symbol = output ? "->" : "<-";
584 
585                             ap_log_rerror(APLOG_MARK, APLOG_DEBUG,
586                                           0, f->r, APLOGNO(01451)
587                                           "%s %s - disabling "
588                                           "translation %s%s%s; existing "
589                                           "translation %s%s%s",
590                                           f->r->uri ? "uri" : "file",
591                                           f->r->uri ? f->r->uri : f->r->filename,
592                                           last_xlate_ctx->dc->charset_source,
593                                           symbol,
594                                           last_xlate_ctx->dc->charset_default,
595                                           curctx->dc->charset_source,
596                                           symbol,
597                                           curctx->dc->charset_default);
598                         }
599                     }
600                     else {
601                         const char *symbol = output ? "->" : "<-";
602 
603                         ap_log_rerror(APLOG_MARK, APLOG_ERR,
604                                       0, f->r, APLOGNO(01452)
605                                       "chk_filter_chain() - can't disable "
606                                       "translation %s%s%s; existing "
607                                       "translation %s%s%s",
608                                       last_xlate_ctx->dc->charset_source,
609                                       symbol,
610                                       last_xlate_ctx->dc->charset_default,
611                                       curctx->dc->charset_source,
612                                       symbol,
613                                       curctx->dc->charset_default);
614                     }
615                     break;
616                 }
617             }
618         }
619         curf = curf->next;
620     }
621 }
622 
/* xlate_brigade() is used to filter request and response bodies
 *
 * It reads data buckets from the front of the brigade, translates their
 * contents into the caller's buffer and deletes each bucket once it has
 * been fully consumed.
 *
 * we'll stop when one of the following occurs:
 * . we run out of buckets
 * . we run out of space in the output buffer (fewer than
 *   XLATE_MIN_BUFF_LEFT bytes remain)
 * . we hit an error or metadata
 *
 * inputs:
 *   ctx:              filter context; supplies the translation handle and
 *                     holds any partial character carried between buckets
 *   bb:               brigade to process
 *   buffer:           storage to hold the translated characters
 *   buffer_avail:     size of buffer
 *
 * outputs:
 *   return value:     APR_SUCCESS or some error code; on error ctx->ees
 *                     says more about what went wrong
 *   bb:               we've removed any buckets representing the
 *                     translated characters; the eos bucket, if
 *                     present, will be left in the brigade
 *   buffer:           filled in with translated characters
 *   buffer_avail:     updated with the bytes remaining
 *   hit_eos:          did we hit an EOS bucket?  (set only when EOS is the
 *                     first bucket left in the brigade on return)
 */
static apr_status_t xlate_brigade(charset_filter_ctx_t *ctx,
                                  apr_bucket_brigade *bb,
                                  char *buffer,
                                  apr_size_t *buffer_avail,
                                  int *hit_eos)
{
    apr_bucket *b = NULL; /* set to NULL only to quiet some gcc */
    apr_bucket *consumed_bucket;
    const char *bucket;
    apr_size_t bytes_in_bucket; /* total bytes read from current bucket */
    apr_size_t bucket_avail;    /* bytes left in current bucket */
    apr_status_t rv = APR_SUCCESS;

    *hit_eos = 0;
    bucket_avail = 0;
    consumed_bucket = NULL;
    while (1) {
        if (!bucket_avail) { /* no bytes left to process in the current bucket... */
            /* ...so drop the bucket we just finished and fetch the next */
            if (consumed_bucket) {
                apr_bucket_delete(consumed_bucket);
                consumed_bucket = NULL;
            }
            b = APR_BRIGADE_FIRST(bb);
            /* Stop at the end of the brigade or at any metadata bucket;
             * a metadata bucket stays in the brigade for the caller.
             */
            if (b == APR_BRIGADE_SENTINEL(bb) ||
                APR_BUCKET_IS_METADATA(b)) {
                break;
            }
            rv = apr_bucket_read(b, &bucket, &bytes_in_bucket, APR_BLOCK_READ);
            if (rv != APR_SUCCESS) {
                ctx->ees = EES_BUCKET_READ;
                break;
            }
            bucket_avail = bytes_in_bucket;
            consumed_bucket = b;   /* for axing when we're done reading it */
        }
        /* A bucket that yielded zero bytes skips this part and is deleted
         * on the next pass through the loop.
         */
        if (bucket_avail) {
            /* We've got data, so translate it. */
            if (ctx->saved) {
                /* Rats... we need to finish a partial character from the previous
                 * bucket.
                 *
                 * Strangely, finish_partial_char() increments the input buffer
                 * pointer but does not increment the output buffer pointer.
                 */
                apr_size_t old_buffer_avail = *buffer_avail;
                rv = finish_partial_char(ctx,
                                         &bucket, &bucket_avail,
                                         &buffer, buffer_avail);
                /* advance the output pointer by what was just produced */
                buffer += old_buffer_avail - *buffer_avail;
            }
            else {
                apr_size_t old_buffer_avail = *buffer_avail;
                apr_size_t old_bucket_avail = bucket_avail;
                rv = apr_xlate_conv_buffer(ctx->xlate,
                                           bucket, &bucket_avail,
                                           buffer,
                                           buffer_avail);
                /* both counters now hold what is left, so the differences
                 * are the bytes produced and consumed
                 */
                buffer  += old_buffer_avail - *buffer_avail;
                bucket  += old_bucket_avail - bucket_avail;

                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
                    /* We need to save the final byte(s) for next time; we can't
                     * convert it until we look at the next bucket.
                     */
                    rv = set_aside_partial_char(ctx, bucket, bucket_avail);
                    /* the leftover now lives in ctx->buf (or an error was
                     * flagged), so this bucket counts as used up
                     */
                    bucket_avail = 0;
                }
            }
            if (rv != APR_SUCCESS) {
                /* bad input byte or partial char too big to store */
                break;
            }
            if (*buffer_avail < XLATE_MIN_BUFF_LEFT) {
                /* if any data remains in the current bucket, split there */
                if (bucket_avail) {
                    apr_bucket_split(b, bytes_in_bucket - bucket_avail);
                }
                /* b now covers only translated bytes; the untranslated
                 * rest, if any, stays in the brigade for the next call
                 */
                apr_bucket_delete(b);
                break;
            }
        }
    }

    if (!APR_BRIGADE_EMPTY(bb)) {
        b = APR_BRIGADE_FIRST(bb);
        if (APR_BUCKET_IS_EOS(b)) {
            /* Leave the eos bucket in the brigade for reporting to
             * subsequent filters.
             */
            *hit_eos = 1;
            if (ctx->saved) {
                /* Oops... we have a partial char from the previous bucket
                 * that won't be completed because there's no more data.
                 */
                rv = APR_INCOMPLETE;
                ctx->ees = EES_INCOMPLETE_CHAR;
            }
        }
    }

    return rv;
}
747 
748 /* xlate_out_filter() handles (almost) arbitrary conversions from one charset
749  * to another...
750  * translation is determined in the fixup hook (find_code_page), which is
751  * where the filter's context data is set up... the context data gives us
752  * the translation handle
753  */
xlate_out_filter(ap_filter_t * f,apr_bucket_brigade * bb)754 static apr_status_t xlate_out_filter(ap_filter_t *f, apr_bucket_brigade *bb)
755 {
756     charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
757                                                   &charset_lite_module);
758     charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
759                                              &charset_lite_module);
760     charset_filter_ctx_t *ctx = f->ctx;
761     apr_bucket *dptr, *consumed_bucket;
762     const char *cur_str;
763     apr_size_t cur_len, cur_avail;
764     char tmp[OUTPUT_XLATE_BUF_SIZE];
765     apr_size_t space_avail;
766     int done;
767     apr_status_t rv = APR_SUCCESS;
768 
769     if (!ctx) {
770         /* this is SetOutputFilter path; grab the preallocated context,
771          * if any; note that if we decided not to do anything in an earlier
772          * handler, we won't even have a reqinfo
773          */
774         if (reqinfo) {
775             ctx = f->ctx = reqinfo->output_ctx;
776             reqinfo->output_ctx = NULL; /* prevent SNAFU if user coded us twice
777                                          * in the filter chain; we can't have two
778                                          * instances using the same context
779                                          */
780         }
781         if (!ctx) {                   /* no idea how to translate; don't do anything */
782             ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
783             ctx->dc = dc;
784             ctx->noop = 1;
785         }
786     }
787 
788     /* Check the mime type to see if translation should be performed.
789      */
790     if (!ctx->noop && ctx->xlate == NULL) {
791         const char *mime_type = f->r->content_type;
792 
793         if (mime_type && (ap_cstr_casecmpn(mime_type, "text/", 5) == 0 ||
794 #if APR_CHARSET_EBCDIC
795         /* On an EBCDIC machine, be willing to translate mod_autoindex-
796          * generated output.  Otherwise, it doesn't look too cool.
797          *
798          * XXX This isn't a perfect fix because this doesn't trigger us
799          * to convert from the charset of the source code to ASCII.  The
800          * general solution seems to be to allow a generator to set an
801          * indicator in the r specifying that the body is coded in the
802          * implementation character set (i.e., the charset of the source
803          * code).  This would get several different types of documents
804          * translated properly: mod_autoindex output, mod_status output,
805          * mod_info output, hard-coded error documents, etc.
806          */
807             strcmp(mime_type, DIR_MAGIC_TYPE) == 0 ||
808 #endif
809             ap_cstr_casecmpn(mime_type, "message/", 8) == 0 ||
810             dc->force_xlate == FX_FORCE)) {
811 
812             rv = apr_xlate_open(&ctx->xlate,
813                                 dc->charset_default, dc->charset_source, f->r->pool);
814             if (rv != APR_SUCCESS) {
815                 ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01453)
816                               "can't open translation %s->%s",
817                               dc->charset_source, dc->charset_default);
818                 ctx->noop = 1;
819             }
820             else {
821                 if (apr_xlate_sb_get(ctx->xlate, &ctx->is_sb) != APR_SUCCESS) {
822                     ctx->is_sb = 0;
823                 }
824             }
825         }
826         else {
827             ctx->noop = 1;
828             if (mime_type) {
829                 ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
830                               "mime type is %s; no translation selected",
831                               mime_type);
832             }
833         }
834     }
835 
836     ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
837                   "xlate_out_filter() - "
838                   "charset_source: %s charset_default: %s",
839                   dc && dc->charset_source ? dc->charset_source : "(none)",
840                   dc && dc->charset_default ? dc->charset_default : "(none)");
841 
842     if (!ctx->ran) {  /* filter never ran before */
843         chk_filter_chain(f);
844         ctx->ran = 1;
845         if (!ctx->noop && !ctx->is_sb) {
846             /* We're not converting between two single-byte charsets, so unset
847              * Content-Length since it is unlikely to remain the same.
848              */
849             apr_table_unset(f->r->headers_out, "Content-Length");
850         }
851     }
852 
853     if (ctx->noop) {
854         return ap_pass_brigade(f->next, bb);
855     }
856 
857     dptr = APR_BRIGADE_FIRST(bb);
858     done = 0;
859     cur_len = 0;
860     space_avail = sizeof(tmp);
861     consumed_bucket = NULL;
862     while (!done) {
863         if (!cur_len) { /* no bytes left to process in the current bucket... */
864             if (consumed_bucket) {
865                 apr_bucket_delete(consumed_bucket);
866                 consumed_bucket = NULL;
867             }
868             if (dptr == APR_BRIGADE_SENTINEL(bb)) {
869                 break;
870             }
871             if (APR_BUCKET_IS_EOS(dptr)) {
872                 cur_len = -1; /* XXX yuck, but that tells us to send
873                                  * eos down; when we minimize our bb construction
874                                  * we'll fix this crap */
875                 if (ctx->saved) {
876                     /* Oops... we have a partial char from the previous bucket
877                      * that won't be completed because there's no more data.
878                      */
879                     rv = APR_INCOMPLETE;
880                     ctx->ees = EES_INCOMPLETE_CHAR;
881                 }
882                 break;
883             }
884             if (APR_BUCKET_IS_METADATA(dptr)) {
885                 apr_bucket *metadata_bucket;
886                 metadata_bucket = dptr;
887                 dptr = APR_BUCKET_NEXT(dptr);
888                 APR_BUCKET_REMOVE(metadata_bucket);
889                 rv = send_bucket_downstream(f, metadata_bucket);
890                 if (rv != APR_SUCCESS) {
891                     done = 1;
892                 }
893                 continue;
894             }
895             rv = apr_bucket_read(dptr, &cur_str, &cur_len, APR_BLOCK_READ);
896             if (rv != APR_SUCCESS) {
897                 ctx->ees = EES_BUCKET_READ;
898                 break;
899             }
900             consumed_bucket = dptr; /* for axing when we're done reading it */
901             dptr = APR_BUCKET_NEXT(dptr); /* get ready for when we access the
902                                           * next bucket */
903         }
904         /* Try to fill up our tmp buffer with translated data. */
905         cur_avail = cur_len;
906 
907         if (cur_len) { /* maybe we just hit the end of a pipe (len = 0) ? */
908             if (ctx->saved) {
909                 /* Rats... we need to finish a partial character from the previous
910                  * bucket.
911                  */
912                 char *tmp_tmp;
913 
914                 tmp_tmp = tmp + sizeof(tmp) - space_avail;
915                 rv = finish_partial_char(ctx,
916                                          &cur_str, &cur_len,
917                                          &tmp_tmp, &space_avail);
918             }
919             else {
920                 rv = apr_xlate_conv_buffer(ctx->xlate,
921                                            cur_str, &cur_avail,
922                                            tmp + sizeof(tmp) - space_avail, &space_avail);
923 
924                 /* Update input ptr and len after consuming some bytes */
925                 cur_str += cur_len - cur_avail;
926                 cur_len = cur_avail;
927 
928                 if (rv == APR_INCOMPLETE) { /* partial character at end of input */
929                     /* We need to save the final byte(s) for next time; we can't
930                      * convert it until we look at the next bucket.
931                      */
932                     rv = set_aside_partial_char(ctx, cur_str, cur_len);
933                     cur_len = 0;
934                 }
935             }
936         }
937 
938         if (rv != APR_SUCCESS) {
939             /* bad input byte or partial char too big to store */
940             done = 1;
941         }
942 
943         if (space_avail < XLATE_MIN_BUFF_LEFT) {
944             /* It is time to flush, as there is not enough space left in the
945              * current output buffer to bother with converting more data.
946              */
947             rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
948             if (rv != APR_SUCCESS) {
949                 done = 1;
950             }
951 
952             /* tmp is now empty */
953             space_avail = sizeof(tmp);
954         }
955     }
956 
957     if (rv == APR_SUCCESS) {
958         if (space_avail < sizeof(tmp)) { /* gotta write out what we converted */
959             rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
960         }
961     }
962     if (rv == APR_SUCCESS) {
963         if (cur_len == -1) {
964             rv = send_eos(f);
965         }
966     }
967     else {
968         log_xlate_error(f, rv);
969     }
970 
971     return rv;
972 }
973 
xlate_in_filter(ap_filter_t * f,apr_bucket_brigade * bb,ap_input_mode_t mode,apr_read_type_e block,apr_off_t readbytes)974 static apr_status_t xlate_in_filter(ap_filter_t *f, apr_bucket_brigade *bb,
975                                     ap_input_mode_t mode, apr_read_type_e block,
976                                     apr_off_t readbytes)
977 {
978     apr_status_t rv;
979     charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
980                                                   &charset_lite_module);
981     charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
982                                              &charset_lite_module);
983     charset_filter_ctx_t *ctx = f->ctx;
984     apr_size_t buffer_size;
985     int hit_eos;
986 
987     /* just get out of the way of things we don't want. */
988     if (mode != AP_MODE_READBYTES) {
989         return ap_get_brigade(f->next, bb, mode, block, readbytes);
990     }
991 
992     if (!ctx) {
993         /* this is SetInputFilter path; grab the preallocated context,
994          * if any; note that if we decided not to do anything in an earlier
995          * handler, we won't even have a reqinfo
996          */
997         if (reqinfo) {
998             ctx = f->ctx = reqinfo->input_ctx;
999             reqinfo->input_ctx = NULL; /* prevent SNAFU if user coded us twice
1000                                         * in the filter chain; we can't have two
1001                                         * instances using the same context
1002                                         */
1003         }
1004         if (!ctx) {                   /* no idea how to translate; don't do anything */
1005             ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
1006             ctx->dc = dc;
1007             ctx->noop = 1;
1008         }
1009     }
1010 
1011     ap_log_rerror(APLOG_MARK, APLOG_TRACE6, 0, f->r,
1012                  "xlate_in_filter() - "
1013                  "charset_source: %s charset_default: %s",
1014                  dc && dc->charset_source ? dc->charset_source : "(none)",
1015                  dc && dc->charset_default ? dc->charset_default : "(none)");
1016 
1017     if (!ctx->ran) {  /* filter never ran before */
1018         chk_filter_chain(f);
1019         ctx->ran = 1;
1020         if (!ctx->noop && !ctx->is_sb
1021             && apr_table_get(f->r->headers_in, "Content-Length")) {
1022             /* A Content-Length header is present, but it won't be valid after
1023              * conversion because we're not converting between two single-byte
1024              * charsets.  This will affect most CGI scripts and may affect
1025              * some modules.
1026              * Content-Length can't be unset here because that would break
1027              * being able to read the request body.
1028              * Processing of chunked request bodies is not impacted by this
1029              * filter since the length was not declared anyway.
1030              */
1031             ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r,
1032                           "Request body length may change, resulting in "
1033                           "misprocessing by some modules or scripts");
1034         }
1035     }
1036 
1037     if (ctx->noop) {
1038         return ap_get_brigade(f->next, bb, mode, block, readbytes);
1039     }
1040 
1041     if (APR_BRIGADE_EMPTY(ctx->bb)) {
1042         if ((rv = ap_get_brigade(f->next, bb, mode, block,
1043                                  readbytes)) != APR_SUCCESS) {
1044             return rv;
1045         }
1046     }
1047     else {
1048         APR_BRIGADE_PREPEND(bb, ctx->bb); /* first use the leftovers */
1049     }
1050 
1051     buffer_size = INPUT_XLATE_BUF_SIZE;
1052     rv = xlate_brigade(ctx, bb, ctx->tmp, &buffer_size, &hit_eos);
1053     if (rv == APR_SUCCESS) {
1054         if (!hit_eos) {
1055             /* move anything leftover into our context for next time;
1056              * we don't currently "set aside" since the data came from
1057              * down below, but I suspect that for long-term we need to
1058              * do that
1059              */
1060             APR_BRIGADE_CONCAT(ctx->bb, bb);
1061         }
1062         if (buffer_size < INPUT_XLATE_BUF_SIZE) { /* do we have output? */
1063             apr_bucket *e;
1064 
1065             e = apr_bucket_heap_create(ctx->tmp,
1066                                        INPUT_XLATE_BUF_SIZE - buffer_size,
1067                                        NULL, f->r->connection->bucket_alloc);
1068             /* make sure we insert at the head, because there may be
1069              * an eos bucket already there, and the eos bucket should
1070              * come after the data
1071              */
1072             APR_BRIGADE_INSERT_HEAD(bb, e);
1073         }
1074         else {
1075             /* XXX need to get some more data... what if the last brigade
1076              * we got had only the first byte of a multibyte char?  we need
1077              * to grab more data from the network instead of returning an
1078              * empty brigade
1079              */
1080         }
1081         /* If we have any metadata at the head of ctx->bb, go ahead and move it
1082          * onto the end of bb to be returned to our caller.
1083          */
1084         if (!APR_BRIGADE_EMPTY(ctx->bb)) {
1085             apr_bucket *b = APR_BRIGADE_FIRST(ctx->bb);
1086             while (b != APR_BRIGADE_SENTINEL(ctx->bb)
1087                    && APR_BUCKET_IS_METADATA(b)) {
1088                 APR_BUCKET_REMOVE(b);
1089                 APR_BRIGADE_INSERT_TAIL(bb, b);
1090                 b = APR_BRIGADE_FIRST(ctx->bb);
1091             }
1092         }
1093     }
1094     else {
1095         log_xlate_error(f, rv);
1096     }
1097 
1098     return rv;
1099 }
1100 
1101 static const command_rec cmds[] =
1102 {
1103     AP_INIT_TAKE1("CharsetSourceEnc",
1104                   add_charset_source,
1105                   NULL,
1106                   OR_FILEINFO,
1107                   "source (html,cgi,ssi) file charset"),
1108     AP_INIT_TAKE1("CharsetDefault",
1109                   add_charset_default,
1110                   NULL,
1111                   OR_FILEINFO,
1112                   "name of default charset"),
1113     AP_INIT_ITERATE("CharsetOptions",
1114                     add_charset_options,
1115                     NULL,
1116                     OR_FILEINFO,
1117                     "valid options: ImplicitAdd, NoImplicitAdd, TranslateAllMimeTypes, "
1118                     "NoTranslateAllMimeTypes"),
1119     {NULL}
1120 };
1121 
/* Registers this module's hooks and filters with the server.
 *
 * p - pool supplied by the caller; not used here
 */
static void charset_register_hooks(apr_pool_t *p)
{
    /* find_code_page runs in the fixups phase */
    ap_hook_fixups(find_code_page,
                   NULL, NULL, APR_HOOK_MIDDLE);

    /* xlate_insert_filter is hooked with APR_HOOK_REALLY_LAST ordering */
    ap_hook_insert_filter(xlate_insert_filter,
                          NULL, NULL, APR_HOOK_REALLY_LAST);

    /* both translation filters are registered as resource-level filters */
    ap_register_output_filter(XLATEOUT_FILTER_NAME,
                              xlate_out_filter,
                              NULL,
                              AP_FTYPE_RESOURCE);
    ap_register_input_filter(XLATEIN_FILTER_NAME,
                             xlate_in_filter,
                             NULL,
                             AP_FTYPE_RESOURCE);
}
1131 
1132 AP_DECLARE_MODULE(charset_lite) =
1133 {
1134     STANDARD20_MODULE_STUFF,
1135     create_charset_dir_conf,
1136     merge_charset_dir_conf,
1137     NULL,
1138     NULL,
1139     cmds,
1140     charset_register_hooks
1141 };
1142 
1143