1 /*
2 ** Copyright 2011 Double Precision, Inc. See COPYING for
3 ** distribution information.
4 */
5
6 #include "rfc2045_config.h"
7 #include "rfc3676parser.h"
8 #include <stdlib.h>
9 #include <string.h>
10
/*
** Base target width used when rewrapping non-flowed lines. The line's quote
** level is subtracted from it, but the resulting target is never below 20.
*/
#define NONFLOWED_WRAP_REDUCE 74

/*
** A non-flowed line gets forcibly rewrapped once its collected width exceeds
** its target width by more than this amount.
*/
#define NONFLOWED_THRESHOLD_EXCEEDED 30
14
15
16 static void emit_line_begin(rfc3676_parser_t handle);
17
18 static void emit_line_contents(rfc3676_parser_t handle,
19 const char32_t *uc,
20 size_t cnt);
21
22 static void emit_line_flowed_wrap(rfc3676_parser_t handle);
23
24 static void emit_line_end(rfc3676_parser_t handle);
25
26
27 static void nonflowed_line_begin(rfc3676_parser_t handle);
28
29 static void nonflowed_line_contents(rfc3676_parser_t handle,
30 const char32_t *uc,
31 size_t cnt);
32
33 static void nonflowed_line_end(rfc3676_parser_t handle);
34
35 static int nonflowed_line_process(int linebreak_opportunity,
36 char32_t ch, void *dummy);
37
/*
** Dispatch the three logical-line events through the handlers that are
** currently installed in the parser handle. Each macro expands to one call
** of a function returning void, and is meant to be used as a statement.
** Note that the handle argument is evaluated twice.
*/
#define EMIT_LINE_BEGIN(h) \
    ((h)->line_begin_handler((h)))

#define EMIT_LINE_CONTENTS(h, uc, cnt) \
    ((h)->line_content_handler((h), (uc), (cnt)))

#define EMIT_LINE_END(h) \
    ((h)->line_end_handler((h)))
49
/*
** Parser state. rfc3676parser_init() allocates it, and
** rfc3676parser_deinit() frees it.
*/
struct rfc3676_parser_struct {

    /* Copy of the settings and callbacks passed to rfc3676parser_init() */
    struct rfc3676_parser_info info;

    /* Character set converter; its output is given to parse_unicode() */
    unicode_convert_handle_t uhandle;

    /* Non-zero after an error; rfc3676parser() then ignores more input */
    int errflag;

    /* Receive raw text stream, converted to unicode */
    size_t (*line_handler)(rfc3676_parser_t,
                           const char32_t *ptr, size_t cnt);

    /*
    ** Receive mostly raw text stream: CRs that precede an LF
    ** are removed from the stream received by content_handler.
    */
    size_t (*content_handler)(rfc3676_parser_t,
                              const char32_t *ptr, size_t cnt);

    /* Number of leading '>' characters counted on the current line */
    size_t quote_level;

    /* How many characters of the "-- " signature marker matched so far */
    size_t sig_block_index;

    /*
    ** Flag: previous line ended in a flowed space, and the previous
    ** line's quoting level was this.
    */
    int has_previous_quote_level;
    size_t previous_quote_level;

    /*
    ** Flag: current line was flowed into from a previous line with the
    ** same quoting level.
    */
    int was_previous_quote_level;

    /* A line has begun */
    void (*line_begin_handler)(rfc3676_parser_t handle);

    /* Content of this line */
    void (*line_content_handler)(rfc3676_parser_t handle,
                                 const char32_t *uc,
                                 size_t cnt);

    /* End of this line */
    void (*line_end_handler)(rfc3676_parser_t handle);


    /*
    ** When non-flowed text is getting rewrapped, we utilize the services
    ** of the unicode_lbc_info API. The handle is created when a non-flowed
    ** line begins, and reset to NULL when that line ends.
    */

    unicode_lbc_info_t lb;

    struct unicode_buf nonflowed_line;
    /* Collect unflowed line until it reaches the given size */

    struct unicode_buf nonflowed_next_word;
    /* Collects unicode stream until a linebreaking opportunity */

    size_t nonflowed_line_target_width;
    /* Targeted width of nonflowed lines */

    size_t nonflowed_line_width; /* Width of nonflowed_line */

    size_t nonflowed_next_word_width; /* Width of nonflowed_next_word */

    /*
    ** Current handler of non-flowed content: receives each character
    ** reported by the linebreaking API, together with its width.
    */
    void (*nonflowed_line_process)(struct rfc3676_parser_struct *handle,
                                   int linebreak_opportunity,
                                   char32_t ch,
                                   size_t ch_width);

    /* Current end-of-line handler of non-flowed content */
    void (*nonflowed_line_end)(struct rfc3676_parser_struct *handle);
};
124
125 static int parse_unicode(const char *, size_t, void *);
126
127 static size_t scan_crlf(rfc3676_parser_t handle,
128 const char32_t *ptr, size_t cnt);
129
130 static size_t scan_crlf_seen_cr(rfc3676_parser_t handle,
131 const char32_t *ptr, size_t cnt);
132
133 static size_t start_of_line(rfc3676_parser_t handle,
134 const char32_t *ptr, size_t cnt);
135
136 static size_t count_quote_level(rfc3676_parser_t handle,
137 const char32_t *ptr, size_t cnt);
138
139 static size_t counted_quote_level(rfc3676_parser_t handle,
140 const char32_t *ptr, size_t cnt);
141
142 static size_t check_signature_block(rfc3676_parser_t handle,
143 const char32_t *ptr, size_t cnt);
144
145 static size_t start_content_line(rfc3676_parser_t handle,
146 const char32_t *ptr, size_t cnt);
147
148 static size_t scan_content_line(rfc3676_parser_t handle,
149 const char32_t *ptr, size_t cnt);
150
151 static size_t seen_sig_block(rfc3676_parser_t handle,
152 const char32_t *ptr, size_t cnt);
153
154 static size_t seen_notsig_block(rfc3676_parser_t handle,
155 const char32_t *ptr, size_t cnt);
156
157 static size_t seen_content_sp(rfc3676_parser_t handle,
158 const char32_t *ptr, size_t cnt);
159
160
161 /*
162 ** The top layer initializes the conversion to unicode.
163 */
164
rfc3676parser_init(const struct rfc3676_parser_info * info)165 rfc3676_parser_t rfc3676parser_init(const struct rfc3676_parser_info *info)
166 {
167 rfc3676_parser_t handle=
168 (rfc3676_parser_t)calloc(1,
169 sizeof(struct rfc3676_parser_struct));
170
171 if (!handle)
172 return NULL;
173
174 handle->info=*info;
175 if ((handle->uhandle=unicode_convert_init(info->charset,
176 unicode_u_ucs4_native,
177 parse_unicode,
178 handle)) == NULL)
179 {
180 free(handle);
181 return NULL;
182 }
183
184 if (!handle->info.isflowed)
185 handle->info.isdelsp=0; /* Sanity check */
186
187 handle->line_handler=scan_crlf;
188 handle->content_handler=start_of_line;
189 handle->has_previous_quote_level=0;
190 handle->previous_quote_level=0;
191
192 handle->line_begin_handler=emit_line_begin;
193 handle->line_content_handler=emit_line_contents;
194 handle->line_end_handler=emit_line_end;
195
196 unicode_buf_init(&handle->nonflowed_line, (size_t)-1);
197 unicode_buf_init(&handle->nonflowed_next_word, (size_t)-1);
198
199 if (!handle->info.isflowed)
200 {
201 handle->line_begin_handler=nonflowed_line_begin;
202 handle->line_content_handler=nonflowed_line_contents;
203 handle->line_end_handler=nonflowed_line_end;
204 }
205 return handle;
206 }
207
/*
** Feed the next chunk of raw text to the parser.
**
** The text is handed to the character set converter, whose output reaches
** parse_unicode(). If an error was recorded earlier, the text is ignored
** and that error code is returned; otherwise the converter's return value
** is returned.
*/
int rfc3676parser(rfc3676_parser_t handle,
                  const char *txt,
                  size_t txt_cnt)
{
    int previous_error=handle->errflag;

    if (previous_error == 0)
        return unicode_convert(handle->uhandle, txt, txt_cnt);

    return previous_error;
}
219
220 /*
221 ** Convert char stream from iconv into char32_ts, then pass them to the
222 ** current handler, until all converted char32_ts are consumed.
223 */
224
parse_unicode(const char * ucs4,size_t nbytes,void * arg)225 static int parse_unicode(const char *ucs4, size_t nbytes, void *arg)
226 {
227 rfc3676_parser_t handle=(rfc3676_parser_t)arg;
228 char32_t ucs4buf[128];
229 const char32_t *p;
230
231 /* Keep going until there's an error, or everything is consumed. */
232
233 while (handle->errflag == 0 && nbytes)
234 {
235 /* Do it in pieces, using the temporary char32_t buffer */
236
237 size_t cnt=nbytes;
238
239 if (cnt > sizeof(ucs4buf))
240 cnt=sizeof(ucs4buf);
241
242 memcpy(ucs4buf, ucs4, cnt);
243
244 ucs4 += cnt;
245 nbytes -= cnt;
246
247 cnt /= sizeof(char32_t);
248 p=ucs4buf;
249
250 /* Keep feeding it to the current handler */
251
252 while (handle->errflag == 0 && cnt)
253 {
254 size_t n=(*handle->line_handler)(handle, p, cnt);
255
256 if (handle->errflag == 0)
257 {
258 cnt -= n;
259 p += n;
260 }
261 }
262 }
263
264 return handle->errflag;
265 }
266
/*
** Finish parsing and destroy the handle.
**
** The unicode conversion is finished first. If neither the conversion nor
** earlier parsing reported an error, the current line handler gets invoked
** with a NULL pointer, which serves as the end-of-file indication. A
** linebreaking handle that is still open gets closed, both buffers are
** released, and the handle is freed.
**
** Returns 0, or the first non-zero code from: the converter, the parser's
** error flag, the linebreaking handle.
*/
int rfc3676parser_deinit(rfc3676_parser_t handle, int *errptr)
{
    int result=unicode_convert_deinit(handle->uhandle, errptr);

    /* Signal EOF only when everything went fine so far */
    if (result == 0 && handle->errflag == 0)
        (*handle->line_handler)(handle, NULL, 0);

    if (result == 0)
        result=handle->errflag;

    if (handle->lb != NULL)
    {
        int lb_result=unicode_lbc_end(handle->lb);

        if (result == 0)
            result=lb_result;
    }

    unicode_buf_deinit(&handle->nonflowed_line);
    unicode_buf_deinit(&handle->nonflowed_next_word);

    free(handle);
    return result;
}
296
297 /*
298 ** Look for a CR that might precede an LF.
299 */
300
scan_crlf(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)301 static size_t scan_crlf(rfc3676_parser_t handle,
302 const char32_t *ptr, size_t cnt)
303 {
304 size_t i;
305
306 if (ptr == NULL)
307 {
308 if (handle->errflag == 0)
309 (*handle->content_handler)(handle, NULL, 0);
310 return 0;
311 }
312
313 for (i=0; ptr && i<cnt; ++i)
314 {
315 if (ptr[i] == '\r')
316 break;
317 }
318
319 if (i)
320 {
321 size_t consumed=0;
322
323 while (i && handle->errflag == 0)
324 {
325 size_t n=(*handle->content_handler)(handle, ptr, i);
326
327 ptr += n;
328 consumed += n;
329 i -= n;
330 }
331 return consumed;
332 }
333
334 /* Consume the first character, the CR */
335
336 handle->line_handler=scan_crlf_seen_cr;
337 return 1;
338 }
339
340 /*
341 ** Check the first character after a CR.
342 */
343
scan_crlf_seen_cr(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)344 static size_t scan_crlf_seen_cr(rfc3676_parser_t handle,
345 const char32_t *ptr, size_t cnt)
346 {
347 char32_t cr='\r';
348
349 handle->line_handler=scan_crlf;
350
351 if (ptr == NULL || *ptr != '\n')
352 {
353 /*
354 ** CR was not followed by a NL.
355 ** Restore it in the char stream.
356 */
357
358 while (handle->errflag == 0)
359 if ((*handle->content_handler)(handle, &cr, 1))
360 break;
361 }
362
363 return scan_crlf(handle, ptr, cnt);
364 }
365
366 /*
367 ** From this point on, CRLF are collapsed into NLs, so don't need to worry
368 ** about them.
369 */
370
371
372 /*
373 ** Check for an EOF indication at the start of the line.
374 */
375
start_of_line(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)376 static size_t start_of_line(rfc3676_parser_t handle,
377 const char32_t *ptr, size_t cnt)
378 {
379 if (ptr == NULL)
380 {
381 if (handle->has_previous_quote_level)
382 EMIT_LINE_END(handle); /* Last line was flowed */
383
384 return cnt; /* EOF */
385 }
386
387 /* Begin counting the quote level */
388
389 handle->content_handler=count_quote_level;
390 handle->quote_level=0;
391 return count_quote_level(handle, ptr, cnt);
392 }
393
394 /*
395 ** Count leading > in flowed content.
396 */
397
count_quote_level(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)398 static size_t count_quote_level(rfc3676_parser_t handle,
399 const char32_t *ptr, size_t cnt)
400 {
401 size_t i;
402
403 if (ptr == NULL) /* EOF, pretend that the quote level was counted */
404 return (handle->content_handler=counted_quote_level)
405 (handle, ptr, cnt);
406
407 for (i=0; i<cnt; ++i)
408 {
409 if (ptr[i] != '>' || !handle->info.isflowed)
410 {
411 handle->content_handler=counted_quote_level;
412
413 if (i == 0)
414 return counted_quote_level(handle, ptr, cnt);
415 break;
416 }
417 ++handle->quote_level;
418 }
419
420 return i;
421 }
422
423 /*
424 ** This line's quote level has now been counted.
425 */
426
counted_quote_level(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)427 static size_t counted_quote_level(rfc3676_parser_t handle,
428 const char32_t *ptr, size_t cnt)
429 {
430 handle->was_previous_quote_level=0;
431
432 /*
433 ** If the previous line was flowed and this line has the same
434 ** quote level, make the flow official.
435 */
436
437 if (handle->has_previous_quote_level &&
438 handle->quote_level == handle->previous_quote_level)
439 {
440 /* Remember that this line was flowed into */
441 handle->was_previous_quote_level=1;
442 }
443 else
444 {
445 /*
446 ** If the previous line was flowed, but this line carries
447 ** a different quote level, force-terminate the previous
448 ** line, before beginning this line.
449 */
450 if (handle->has_previous_quote_level)
451 EMIT_LINE_END(handle);
452
453 EMIT_LINE_BEGIN(handle);
454 }
455
456 handle->has_previous_quote_level=0;
457 /* Assume this line won't be flowed, until shown otherwise */
458
459
460 if (!handle->info.isflowed)
461 {
462 /*
463 ** No space-stuffing, or sig block checking, if this is not
464 ** flowed content.
465 */
466 handle->content_handler=scan_content_line;
467 return scan_content_line(handle, ptr, cnt);
468 }
469
470
471 handle->content_handler=start_content_line;
472
473 if (ptr != NULL && *ptr == ' ')
474 return 1; /* Remove stuffed space */
475
476 return start_content_line(handle, ptr, cnt);
477 }
478
479 /*
480 ** Minor deviation from RFC3676, but this fixes a lot of broken text.
481 **
482 ** If the previous line was flowed, but this is an empty line (optionally
483 ** space-stuffed), unflow the last line (make it fixed), and this becomes
484 ** a fixed line too. Example:
485 **
486 ** this is the last end of a paragraph[SPACE]
487 ** [SPACE]
488 ** This is the first line of the next paragraph.
489 **
490 ** Strict RFC3676 rules will parse this as a flowed line, then a fixed line,
491 ** resulting in no paragraph breaks.
492 */
493
start_content_line(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)494 static size_t start_content_line(rfc3676_parser_t handle,
495 const char32_t *ptr, size_t cnt)
496 {
497 /*
498 ** We'll start scanning for the signature block, as soon as
499 ** this check is done.
500 */
501 handle->content_handler=check_signature_block;
502 handle->sig_block_index=0;
503
504 if (ptr && *ptr == '\n' && handle->was_previous_quote_level)
505 {
506 EMIT_LINE_END(handle);
507 EMIT_LINE_BEGIN(handle);
508 handle->was_previous_quote_level=0;
509 }
510
511 return check_signature_block(handle, ptr, cnt);
512 }
513
514
515 static const char32_t sig_block[]={'-', '-', ' '};
516
517 /* Checking for a magical sig block */
518
check_signature_block(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)519 static size_t check_signature_block(rfc3676_parser_t handle,
520 const char32_t *ptr, size_t cnt)
521 {
522 if (ptr && *ptr == sig_block[handle->sig_block_index])
523 {
524 if (++handle->sig_block_index == sizeof(sig_block)
525 /sizeof(sig_block[0]))
526
527 /* Well, it's there, but does a NL follow? */
528 handle->content_handler=seen_sig_block;
529 return 1;
530 }
531
532 return seen_notsig_block(handle, ptr, cnt);
533 }
534
/*
** The complete sig block marker was seen at the start of the line. It only
** counts if the line ends right here, with an NL or at EOF. Anything else
** makes this an ordinary line.
*/
static size_t seen_sig_block(rfc3676_parser_t handle,
                             const char32_t *ptr, size_t cnt)
{
    if (ptr != NULL && *ptr != '\n')
        return seen_notsig_block(handle, ptr, cnt);

    /*
    ** If the previous line was flowed, the sig block is not
    ** considered to be flowable-into content, so terminate
    ** the previous line before emitting the sig block.
    */
    if (handle->was_previous_quote_level)
    {
        EMIT_LINE_END(handle);
        EMIT_LINE_BEGIN(handle);
        handle->was_previous_quote_level=0;
    }

    handle->content_handler=start_of_line;

    /* Pass through the sig block, as a line of its own */
    EMIT_LINE_CONTENTS(handle, sig_block,
                       sizeof(sig_block)/sizeof(sig_block[0]));
    EMIT_LINE_END(handle);

    /* The NL gets consumed; at EOF there's nothing to consume */
    return ptr != NULL ? 1:0;
}
565
566 /* This is not a sig block line */
567
seen_notsig_block(rfc3676_parser_t handle,const char32_t * newptr,size_t newcnt)568 static size_t seen_notsig_block(rfc3676_parser_t handle,
569 const char32_t *newptr, size_t newcnt)
570 {
571 const char32_t *ptr;
572 size_t i;
573
574 if (handle->was_previous_quote_level)
575 emit_line_flowed_wrap(handle);
576
577 handle->content_handler=scan_content_line;
578
579 ptr=sig_block;
580 i=handle->sig_block_index;
581
582 while (i && handle->errflag == 0)
583 {
584 size_t n=(*handle->content_handler)(handle, ptr, i);
585
586 ptr += n;
587 i -= n;
588 }
589
590 return (*handle->content_handler)(handle, newptr, newcnt);
591 }
592
593 /*
594 ** Pass through the line, until encountering an NL, or a space in flowable
595 ** content.
596 */
597
scan_content_line(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)598 static size_t scan_content_line(rfc3676_parser_t handle,
599 const char32_t *ptr, size_t cnt)
600 {
601 size_t i;
602
603 for (i=0; ptr && i<cnt && ptr[i] != '\n' &&
604 (ptr[i] != ' ' || !handle->info.isflowed); ++i)
605 ;
606
607 /* Pass through anything before the NL or potentially flowable SP */
608
609 if (i)
610 EMIT_LINE_CONTENTS(handle, ptr, i);
611
612 if (i)
613 return i;
614
615 if (ptr && ptr[i] == ' ')
616 {
617 handle->content_handler=seen_content_sp;
618 return 1;
619 }
620
621 /* NL. This line does not flow */
622 EMIT_LINE_END(handle);
623
624 handle->content_handler=start_of_line;
625
626 return ptr ? 1:0;
627 }
628
/*
** A space was seen in flowed content, and held back. An NL right after it
** makes this a flowed line; anything else makes it an ordinary space.
*/
static size_t seen_content_sp(rfc3676_parser_t handle,
                              const char32_t *ptr, size_t cnt)
{
    const char32_t space=' ';

    if (ptr != NULL && *ptr == '\n')
    {
        /* NL after a SP -- flowed line. The space stays unless delsp */
        if (!handle->info.isdelsp)
            EMIT_LINE_CONTENTS(handle, &space, 1);

        handle->has_previous_quote_level=1;
        handle->previous_quote_level=handle->quote_level;
        handle->content_handler=start_of_line;
        return 1;
    }

    /*
    ** SP was not followed by the NL. Pass through the space,
    ** then resume scanning.
    */
    handle->content_handler=scan_content_line;
    EMIT_LINE_CONTENTS(handle, &space, 1);
    return scan_content_line(handle, ptr, cnt);
}
656
657 /**************************************************************************/
658
659 /*
660 ** At this point, the processing has reduced to the following API:
661 **
662 ** + begin logical line
663 **
664 ** + contents of the logical line (multiple consecutive invocations)
665 **
666 ** + the logical line has flowed onto the next physical line
667 **
668 ** + end of logical line
669 **
** The third one, logical line flowed, is normally used for flowed text,
** by definition. But it may also get used if non-flowed text gets
** rewrapped when broken formatting is detected.
673 **
674 ** Provide default implementations of the other three API calls that
675 ** simply invoke the corresponding user callback.
676 */
677
/*
** Invoke the user's line_begin callback with the current quote level, and
** record its return value as the error flag. Does nothing after an error.
*/
static void emit_line_begin(rfc3676_parser_t handle)
{
    if (handle->errflag != 0)
        return;

    handle->errflag=(*handle->info.line_begin)(handle->quote_level,
                                               handle->info.arg);
}
684
/*
** Invoke the user's line_flowed_notify callback, if one was given, and
** record its return value as the error flag. Does nothing after an error.
*/
static void emit_line_flowed_wrap(rfc3676_parser_t handle)
{
    if (handle->errflag != 0 || !handle->info.line_flowed_notify)
        return;

    handle->errflag=(*handle->info.line_flowed_notify)(handle->info.arg);
}
691
/*
** Invoke the user's line_contents callback with a piece of the logical
** line, and record its return value as the error flag. Does nothing for
** empty content, or after an error.
*/
static void emit_line_contents(rfc3676_parser_t handle,
                               const char32_t *uc,
                               size_t cnt)
{
    if (cnt == 0 || handle->errflag != 0)
        return;

    handle->errflag=(*handle->info.line_contents)(uc, cnt,
                                                  handle->info.arg);
}
700
/*
** Invoke the user's line_end callback, and record its return value as the
** error flag. Does nothing after an error.
*/
static void emit_line_end(rfc3676_parser_t handle)
{
    if (handle->errflag != 0)
        return;

    handle->errflag=(*handle->info.line_end)(handle->info.arg);
}
706
707 /*
708 ** When processing a non-flowed text, handle broken mail formatters (I'm
709 ** looking at you, Apple Mail) that spew out quoted-printable content with
710 ** each decoded line forming a single paragraph. This is heuristically
711 ** detected by looking for lines that exceed a wrapping threshold, then
712 ** rewrapping them.
713 **
714 ** Redefine the three line API calls to launder the logical line via
715 ** the linebreak API.
716 */
717
718 static void initial_nonflowed_line(rfc3676_parser_t handle,
719 int linebreak_opportunity,
720 char32_t ch,
721 size_t ch_width);
722
723 static void initial_nonflowed_end(rfc3676_parser_t handle);
724
725 static void begin_forced_rewrap(rfc3676_parser_t handle);
726
727 /*
728 ** A non-flowed line begins. Initialize the linebreaking module.
729 */
nonflowed_line_begin(rfc3676_parser_t handle)730 static void nonflowed_line_begin(rfc3676_parser_t handle)
731 {
732 if (handle->lb)
733 {
734 /* Just in case */
735
736 int rc=unicode_lbc_end(handle->lb);
737
738 if (rc && handle->errflag == 0)
739 handle->errflag=rc;
740 }
741
742 if ((handle->lb=unicode_lbc_init(nonflowed_line_process, handle))
743 == NULL)
744 {
745 if (handle->errflag == 0)
746 handle->errflag=-1;
747 }
748
749 if (handle->lb)
750 unicode_lbc_set_opts(handle->lb,
751 UNICODE_LB_OPT_PRBREAK
752 | UNICODE_LB_OPT_SYBREAK);
753
754 unicode_buf_clear(&handle->nonflowed_line);
755 unicode_buf_clear(&handle->nonflowed_next_word);
756
757 handle->nonflowed_line_width=0;
758 handle->nonflowed_next_word_width=0;
759
760 handle->nonflowed_line_process=initial_nonflowed_line;
761 handle->nonflowed_line_end=initial_nonflowed_end;
762 emit_line_begin(handle); /* Fallthru - user callback */
763
764 handle->nonflowed_line_target_width=
765 handle->quote_level < NONFLOWED_WRAP_REDUCE - 20 ?
766 NONFLOWED_WRAP_REDUCE - handle->quote_level:20;
767 }
768
769 /*
770 ** Process contents of non-flowed lines. The contents are submitted to the
771 ** linebreaking API.
772 */
773
nonflowed_line_contents(rfc3676_parser_t handle,const char32_t * uc,size_t cnt)774 static void nonflowed_line_contents(rfc3676_parser_t handle,
775 const char32_t *uc,
776 size_t cnt)
777 {
778 if (!handle->lb)
779 return;
780
781 while (cnt)
782 {
783 if (handle->errflag == 0)
784 handle->errflag=unicode_lbc_next(handle->lb, *uc);
785
786 ++uc;
787 --cnt;
788 }
789 }
790
791 /*
792 ** End of non-flowed content. Terminate the linebreaking API, then invoke
793 ** the current end-of-line handler.
794 */
nonflowed_line_end(rfc3676_parser_t handle)795 static void nonflowed_line_end(rfc3676_parser_t handle)
796 {
797 if (handle->lb)
798 {
799 int rc=unicode_lbc_end(handle->lb);
800
801 if (rc && handle->errflag == 0)
802 handle->errflag=rc;
803
804 handle->lb=NULL;
805 }
806
807 (*handle->nonflowed_line_end)(handle);
808 emit_line_end(handle); /* FALLTHRU */
809 }
810
811 /*
812 ** Callback from the linebreaking API, gives us the next unicode character
813 ** and its linebreak property. Look up the unicode character's width, then
814 ** invoke the current handler.
815 */
nonflowed_line_process(int linebreak_opportunity,char32_t ch,void * dummy)816 static int nonflowed_line_process(int linebreak_opportunity,
817 char32_t ch, void *dummy)
818 {
819 rfc3676_parser_t handle=(rfc3676_parser_t)dummy;
820
821 (*handle->nonflowed_line_process)(handle, linebreak_opportunity, ch,
822 unicode_wcwidth(ch));
823
824 return 0;
825 }
826
827 /*
828 ** Collecting initial nonflowed line.
829 */
830
initial_nonflowed_line(rfc3676_parser_t handle,int linebreak_opportunity,char32_t ch,size_t ch_width)831 static void initial_nonflowed_line(rfc3676_parser_t handle,
832 int linebreak_opportunity,
833 char32_t ch,
834 size_t ch_width)
835 {
836 /*
837 ** Collect words into nonflowed_line as long as it fits within the
838 ** targeted width.
839 */
840 if (linebreak_opportunity != UNICODE_LB_NONE &&
841 handle->nonflowed_line_width + handle->nonflowed_next_word_width
842 <= handle->nonflowed_line_target_width)
843 {
844 unicode_buf_append_buf(&handle->nonflowed_line,
845 &handle->nonflowed_next_word);
846 handle->nonflowed_line_width +=
847 handle->nonflowed_next_word_width;
848
849 unicode_buf_clear(&handle->nonflowed_next_word);
850 handle->nonflowed_next_word_width=0;
851 }
852
853 /*
854 ** Add the character to the growing word.
855 **
856 ** If the line's size now exceeds the target width by quite a bit,
857 ** we've had enough!
858 */
859
860 unicode_buf_append(&handle->nonflowed_next_word, &ch, 1);
861 handle->nonflowed_next_word_width += ch_width;
862
863 if (handle->nonflowed_line_width + handle->nonflowed_next_word_width
864 > handle->nonflowed_line_target_width
865 + NONFLOWED_THRESHOLD_EXCEEDED)
866 begin_forced_rewrap(handle);
867 }
868
869 /*
870 ** End of line handler. The line did not reach its threshold, so output it.
871 */
initial_nonflowed_end(rfc3676_parser_t handle)872 static void initial_nonflowed_end(rfc3676_parser_t handle)
873 {
874 emit_line_contents(handle,
875 unicode_buf_ptr(&handle->nonflowed_line),
876 unicode_buf_len(&handle->nonflowed_line));
877
878 emit_line_contents(handle,
879 unicode_buf_ptr(&handle->nonflowed_next_word),
880 unicode_buf_len(&handle->nonflowed_next_word));
881 }
882
883 /*
884 ** Check for the abnormal situation where we're ready to wrap something but
885 ** nonflowed_line is empty because all this text did not have a linebreaking
886 ** opportunity.
887 */
888
check_abnormal_line(rfc3676_parser_t handle)889 static void check_abnormal_line(rfc3676_parser_t handle)
890 {
891 size_t n, i;
892 const char32_t *p;
893
894 if (unicode_buf_len(&handle->nonflowed_line) > 0)
895 return;
896
897 /* Extreme times call for extreme measures */
898
899 n=unicode_buf_len(&handle->nonflowed_next_word);
900 p=unicode_buf_ptr(&handle->nonflowed_next_word);
901
902 for (i=n; i>0; --i)
903 {
904 if (i < n && unicode_grapheme_break(p[i-1], p[i]))
905 {
906 n=i;
907 break;
908 }
909 }
910
911 unicode_buf_append(&handle->nonflowed_line, p, n);
912 unicode_buf_remove(&handle->nonflowed_next_word, 0, n);
913
914 /*
915 ** Recalculate the width of the growing word, now.
916 */
917
918 handle->nonflowed_next_word_width=0;
919 p=unicode_buf_ptr(&handle->nonflowed_next_word);
920
921 for (i=0; i<unicode_buf_len(&handle->nonflowed_next_word); ++i)
922 handle->nonflowed_next_word_width +=
923 unicode_wcwidth(p[i]);
924 }
925
926 /*
927 ** We've decided that the line is too long, so begin rewrapping it.
928 */
929
930 static void forced_rewrap_line(rfc3676_parser_t handle,
931 int linebreak_opportunity,
932 char32_t ch,
933 size_t ch_width);
934
935 static void forced_rewrap_end(rfc3676_parser_t handle);
936
937 /*
938 ** Emit nonflowed_line as the rewrapped line. Clear the buffer.
939 */
emit_rewrapped_line(rfc3676_parser_t handle)940 static void emit_rewrapped_line(rfc3676_parser_t handle)
941 {
942 check_abnormal_line(handle);
943 emit_line_contents(handle, unicode_buf_ptr(&handle->nonflowed_line),
944 unicode_buf_len(&handle->nonflowed_line));
945
946 emit_line_flowed_wrap(handle);
947
948 /* nonflowed_line is now empty */
949 unicode_buf_clear(&handle->nonflowed_line);
950 handle->nonflowed_line_width=0;
951 }
952
/*
** The line is too long. Install the handlers that do the rewrapping for the
** rest of this line, then emit what was collected so far as the first
** rewrapped line.
*/
static void begin_forced_rewrap(rfc3676_parser_t handle)
{
    handle->nonflowed_line_process=forced_rewrap_line;
    handle->nonflowed_line_end=forced_rewrap_end;

    emit_rewrapped_line(handle);
}
959
/*
** Handler for the characters of a line that is being rewrapped.
**
** At a linebreaking opportunity the collected word moves into
** nonflowed_line. If the word does not fit within the target width, the
** line gets emitted first. The character is then added to the growing word.
*/
static void forced_rewrap_line(rfc3676_parser_t handle,
                               int linebreak_opportunity,
                               char32_t ch,
                               size_t ch_width)
{
    struct unicode_buf *line=&handle->nonflowed_line;
    struct unicode_buf *word=&handle->nonflowed_next_word;
    const size_t target_width=handle->nonflowed_line_target_width;

    if (linebreak_opportunity != UNICODE_LB_NONE)
    {
        /* Found a linebreaking opportunity */

        if (handle->nonflowed_line_width
            + handle->nonflowed_next_word_width > target_width)
        {
            /* Accumulated word is too long */
            emit_rewrapped_line(handle);
        }

        unicode_buf_append_buf(line, word);
        handle->nonflowed_line_width +=
            handle->nonflowed_next_word_width;

        unicode_buf_clear(word);
        handle->nonflowed_next_word_width=0;
    }

    /*
    ** Check for another excessively long line: nothing is collected in
    ** the line, and the word by itself is about to exceed the target.
    */
    if (handle->nonflowed_line_width == 0 &&
        handle->nonflowed_next_word_width + ch_width > target_width)
    {
        emit_rewrapped_line(handle);
    }

    unicode_buf_append(word, &ch, 1);
    handle->nonflowed_next_word_width += ch_width;
}
1000
/*
** End of line handler for a line that was being rewrapped. Outputs what's
** left: the collected line, then the word that was still growing. This is
** the same logic as for a line that was never rewrapped, for now.
*/
static void forced_rewrap_end(rfc3676_parser_t handle)
{
    emit_line_contents(handle,
                       unicode_buf_ptr(&handle->nonflowed_line),
                       unicode_buf_len(&handle->nonflowed_line));

    emit_line_contents(handle,
                       unicode_buf_ptr(&handle->nonflowed_next_word),
                       unicode_buf_len(&handle->nonflowed_next_word));
}
1005