1 /*
2 ** Copyright 2011 Double Precision, Inc.  See COPYING for
3 ** distribution information.
4 */
5 
6 #include "rfc2045_config.h"
7 #include	"rfc3676parser.h"
8 #include	<stdlib.h>
9 #include	<string.h>
10 
/*
** Baseline from which the target width of a rewrapped non-flowed line is
** derived: the line's quote level gets subtracted from it.
*/
#define NONFLOWED_WRAP_REDUCE	74

/*
** How far a non-flowed line may grow past its target width before it gets
** forcibly rewrapped.
*/
#define NONFLOWED_THRESHOLD_EXCEEDED	30
14 
15 
/*
** Default line handlers: these invoke the user-supplied callbacks found
** in the parser's info structure.
*/

static void emit_line_begin(rfc3676_parser_t handle);

static void emit_line_contents(rfc3676_parser_t handle,
			       const char32_t *uc,
			       size_t cnt);

static void emit_line_flowed_wrap(rfc3676_parser_t handle);

static void emit_line_end(rfc3676_parser_t handle);


/*
** Line handlers installed for non-flowed content: the line's contents are
** passed through the linebreaking API before reaching the user callbacks.
*/

static void nonflowed_line_begin(rfc3676_parser_t handle);

static void nonflowed_line_contents(rfc3676_parser_t handle,
				    const char32_t *uc,
				    size_t cnt);

static void nonflowed_line_end(rfc3676_parser_t handle);

/* Callback registered with unicode_lbc_init() */
static int nonflowed_line_process(int linebreak_opportunity,
				  char32_t ch, void *dummy);
37 
/*
** Dispatch a line event through the handler currently installed in the
** parser handle (either the emit_line_* or the nonflowed_line_* set).
*/

/* A logical line begins */
#define EMIT_LINE_BEGIN(h) do {			\
		(*(h)->line_begin_handler)(h);	\
	} while (0)

/* cnt characters, starting at uc, belong to the current logical line */
#define EMIT_LINE_CONTENTS(h, uc, cnt) do {			\
		(*(h)->line_content_handler)((h),(uc),(cnt));	\
	} while (0)

/* The current logical line ends */
#define EMIT_LINE_END(h) do {			\
		(*(h)->line_end_handler)(h);	\
	} while (0)
49 
/*
** Parser state. Allocated by rfc3676parser_init(), released by
** rfc3676parser_deinit().
*/
struct rfc3676_parser_struct {

	/* Copy of the caller's settings and callbacks, taken at init time */
	struct rfc3676_parser_info info;

	/* Conversion handle returned by unicode_convert_init() */
	unicode_convert_handle_t uhandle;

	/*
	** Non-zero after a callback or a library call reported an error.
	** Parsing and callback invocations are skipped while it is set.
	*/
	int errflag;

	/* Receive raw text stream, converted to unicode */
	size_t (*line_handler)(rfc3676_parser_t,
			       const char32_t *ptr, size_t cnt);

	/*
	** Receive mostly raw text stream: CRs that precede an LF
	** are removed from the stream received by content_handler.
	*/
	size_t (*content_handler)(rfc3676_parser_t,
				  const char32_t *ptr, size_t cnt);

	/* Number of leading '>' characters counted on the current line */
	size_t quote_level;

	/* How many characters of the "-- " marker were matched so far */
	size_t sig_block_index;

	/*
	** Flag: previous line ended in a flowed space, and the previous
	** line's quoting level was this.
	*/
	int has_previous_quote_level;
	size_t previous_quote_level;

	/*
	** Flag: current line was flowed into from a previous line with the
	** same quoting level.
	*/
	int was_previous_quote_level;

	/* A line has begun */
	void (*line_begin_handler)(rfc3676_parser_t handle);

	/* Content of this line */
	void (*line_content_handler)(rfc3676_parser_t handle,
				     const char32_t *uc,
				     size_t cnt);

	/* End of this line */
	void (*line_end_handler)(rfc3676_parser_t handle);


	/*
	** When non-flowed text is getting rewrapped, we utilize the services
	** of the unicode_lbc_info API.
	*/

	unicode_lbc_info_t lb;

	struct unicode_buf nonflowed_line;
	/* Collect unflowed line until it reaches the given size */

	struct unicode_buf nonflowed_next_word;
	/* Collects unicode stream until a linebreaking opportunity */

	size_t nonflowed_line_target_width;
	/* Targeted width of nonflowed lines */

	size_t nonflowed_line_width; /* Width of nonflowed_line */

	size_t nonflowed_next_word_width; /* Width of nonflowed_next_word */

	/* Current handler of non-flowed content. */
	void (*nonflowed_line_process)(struct rfc3676_parser_struct *handle,
				       int linebreak_opportunity,
				       char32_t ch,
				       size_t ch_width);

	/* Current end-of-line handler of non-flowed content. */
	void (*nonflowed_line_end)(struct rfc3676_parser_struct *handle);
};
124 
/* Callback registered with unicode_convert_init() */
static int parse_unicode(const char *, size_t, void *);

/*
** Parsing state handlers. Each one receives cnt characters starting at
** ptr and returns how many of them it consumed. A NULL ptr indicates the
** end of the input.
*/

static size_t scan_crlf(rfc3676_parser_t handle,
			const char32_t *ptr, size_t cnt);

static size_t scan_crlf_seen_cr(rfc3676_parser_t handle,
				const char32_t *ptr, size_t cnt);

static size_t start_of_line(rfc3676_parser_t handle,
			    const char32_t *ptr, size_t cnt);

static size_t count_quote_level(rfc3676_parser_t handle,
				const char32_t *ptr, size_t cnt);

static size_t counted_quote_level(rfc3676_parser_t handle,
				  const char32_t *ptr, size_t cnt);

static size_t check_signature_block(rfc3676_parser_t handle,
				    const char32_t *ptr, size_t cnt);

static size_t start_content_line(rfc3676_parser_t handle,
				const char32_t *ptr, size_t cnt);

static size_t scan_content_line(rfc3676_parser_t handle,
				const char32_t *ptr, size_t cnt);

static size_t seen_sig_block(rfc3676_parser_t handle,
			     const char32_t *ptr, size_t cnt);

static size_t seen_notsig_block(rfc3676_parser_t handle,
				const char32_t *ptr, size_t cnt);

static size_t seen_content_sp(rfc3676_parser_t handle,
			      const char32_t *ptr, size_t cnt);
159 
160 
161 /*
162 ** The top layer initializes the conversion to unicode.
163 */
164 
rfc3676parser_init(const struct rfc3676_parser_info * info)165 rfc3676_parser_t rfc3676parser_init(const struct rfc3676_parser_info *info)
166 {
167 	rfc3676_parser_t handle=
168 		(rfc3676_parser_t)calloc(1,
169 					 sizeof(struct rfc3676_parser_struct));
170 
171 	if (!handle)
172 		return NULL;
173 
174 	handle->info=*info;
175 	if ((handle->uhandle=unicode_convert_init(info->charset,
176 						    unicode_u_ucs4_native,
177 						    parse_unicode,
178 						    handle)) == NULL)
179 	{
180 		free(handle);
181 		return NULL;
182 	}
183 
184 	if (!handle->info.isflowed)
185 		handle->info.isdelsp=0; /* Sanity check */
186 
187 	handle->line_handler=scan_crlf;
188 	handle->content_handler=start_of_line;
189 	handle->has_previous_quote_level=0;
190 	handle->previous_quote_level=0;
191 
192 	handle->line_begin_handler=emit_line_begin;
193 	handle->line_content_handler=emit_line_contents;
194 	handle->line_end_handler=emit_line_end;
195 
196 	unicode_buf_init(&handle->nonflowed_line, (size_t)-1);
197 	unicode_buf_init(&handle->nonflowed_next_word, (size_t)-1);
198 
199 	if (!handle->info.isflowed)
200 	{
201 		handle->line_begin_handler=nonflowed_line_begin;
202 		handle->line_content_handler=nonflowed_line_contents;
203 		handle->line_end_handler=nonflowed_line_end;
204 	}
205 	return handle;
206 }
207 
/*
** Parse the next chunk of text, txt_cnt bytes starting at txt.
**
** Returns the error recorded by an earlier call, if there is one.
** Otherwise returns the result of unicode_convert(), which feeds the
** converted text to parse_unicode().
*/

int rfc3676parser(rfc3676_parser_t handle,
		  const char *txt,
		  size_t txt_cnt)
{
	if (handle->errflag == 0)
		return unicode_convert(handle->uhandle, txt, txt_cnt);

	return handle->errflag; /* Error occurred previously */
}
219 
220 /*
221 ** Convert char stream from iconv into char32_ts, then pass them to the
222 ** current handler, until all converted char32_ts are consumed.
223 */
224 
parse_unicode(const char * ucs4,size_t nbytes,void * arg)225 static int parse_unicode(const char *ucs4, size_t nbytes, void *arg)
226 {
227 	rfc3676_parser_t handle=(rfc3676_parser_t)arg;
228 	char32_t ucs4buf[128];
229 	const char32_t *p;
230 
231 	/* Keep going until there's an error, or everything is consumed. */
232 
233 	while (handle->errflag == 0 && nbytes)
234 	{
235 		/* Do it in pieces, using the temporary char32_t buffer */
236 
237 		size_t cnt=nbytes;
238 
239 		if (cnt > sizeof(ucs4buf))
240 			cnt=sizeof(ucs4buf);
241 
242 		memcpy(ucs4buf, ucs4, cnt);
243 
244 		ucs4 += cnt;
245 		nbytes -= cnt;
246 
247 		cnt /= sizeof(char32_t);
248 		p=ucs4buf;
249 
250 		/* Keep feeding it to the current handler */
251 
252 		while (handle->errflag == 0 && cnt)
253 		{
254 			size_t n=(*handle->line_handler)(handle, p, cnt);
255 
256 			if (handle->errflag == 0)
257 			{
258 				cnt -= n;
259 				p += n;
260 			}
261 		}
262 	}
263 
264 	return handle->errflag;
265 }
266 
/*
** Finish parsing and destroy the parser. errptr is passed through to
** unicode_convert_deinit().
**
** Returns the first non-zero status from: the unicode conversion, the
** parser's error flag, or the linebreaking module. Returns 0 otherwise.
*/

int rfc3676parser_deinit(rfc3676_parser_t handle, int *errptr)
{
	/* Finish unicode conversion */

	int status=unicode_convert_deinit(handle->uhandle, errptr);

	if (!status)
		status=handle->errflag;

	if (!status)
	{
		/* Tell the current handler that there's no more input */

		(*handle->line_handler)(handle, NULL, 0);
		status=handle->errflag;
	}

	if (handle->lb != NULL)
	{
		int lb_status=unicode_lbc_end(handle->lb);

		if (status == 0 && lb_status)
			status=lb_status;
	}

	unicode_buf_deinit(&handle->nonflowed_line);
	unicode_buf_deinit(&handle->nonflowed_next_word);

	free(handle);
	return status;
}
296 
297 /*
298 ** Look for a CR that might precede an LF.
299 */
300 
scan_crlf(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)301 static size_t scan_crlf(rfc3676_parser_t handle,
302 			const char32_t *ptr, size_t cnt)
303 {
304 	size_t i;
305 
306 	if (ptr == NULL)
307 	{
308 		if (handle->errflag == 0)
309 			(*handle->content_handler)(handle, NULL, 0);
310 		return 0;
311 	}
312 
313 	for (i=0; ptr && i<cnt; ++i)
314 	{
315 		if (ptr[i] == '\r')
316 			break;
317 	}
318 
319 	if (i)
320 	{
321 		size_t consumed=0;
322 
323 		while (i && handle->errflag == 0)
324 		{
325 			size_t n=(*handle->content_handler)(handle, ptr, i);
326 
327 			ptr += n;
328 			consumed += n;
329 			i -= n;
330 		}
331 		return consumed;
332 	}
333 
334 	/* Consume the first character, the CR */
335 
336 	handle->line_handler=scan_crlf_seen_cr;
337 	return 1;
338 }
339 
340 /*
341 ** Check the first character after a CR.
342 */
343 
scan_crlf_seen_cr(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)344 static size_t scan_crlf_seen_cr(rfc3676_parser_t handle,
345 				const char32_t *ptr, size_t cnt)
346 {
347 	char32_t cr='\r';
348 
349 	handle->line_handler=scan_crlf;
350 
351 	if (ptr == NULL || *ptr != '\n')
352 	{
353 		/*
354 		** CR was not followed by a NL.
355 		** Restore it in the char stream.
356 		*/
357 
358 		while (handle->errflag == 0)
359 			if ((*handle->content_handler)(handle, &cr, 1))
360 				break;
361 	}
362 
363 	return scan_crlf(handle, ptr, cnt);
364 }
365 
366 /*
367 ** From this point on, CRLF are collapsed into NLs, so don't need to worry
368 ** about them.
369 */
370 
371 
372 /*
373 ** Check for an EOF indication at the start of the line.
374 */
375 
start_of_line(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)376 static size_t start_of_line(rfc3676_parser_t handle,
377 			    const char32_t *ptr, size_t cnt)
378 {
379 	if (ptr == NULL)
380 	{
381 		if (handle->has_previous_quote_level)
382 			EMIT_LINE_END(handle); /* Last line was flowed */
383 
384 		return cnt; /* EOF */
385 	}
386 
387 	/* Begin counting the quote level */
388 
389 	handle->content_handler=count_quote_level;
390 	handle->quote_level=0;
391 	return count_quote_level(handle, ptr, cnt);
392 }
393 
394 /*
395 ** Count leading > in flowed content.
396 */
397 
count_quote_level(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)398 static size_t count_quote_level(rfc3676_parser_t handle,
399 				const char32_t *ptr, size_t cnt)
400 {
401 	size_t i;
402 
403 	if (ptr == NULL) /* EOF, pretend that the quote level was counted */
404 		return (handle->content_handler=counted_quote_level)
405 			(handle, ptr, cnt);
406 
407 	for (i=0; i<cnt; ++i)
408 	{
409 		if (ptr[i] != '>' || !handle->info.isflowed)
410 		{
411 			handle->content_handler=counted_quote_level;
412 
413 			if (i == 0)
414 				return counted_quote_level(handle, ptr, cnt);
415 			break;
416 		}
417 		++handle->quote_level;
418 	}
419 
420 	return i;
421 }
422 
423 /*
424 ** This line's quote level has now been counted.
425 */
426 
counted_quote_level(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)427 static size_t counted_quote_level(rfc3676_parser_t handle,
428 				  const char32_t *ptr, size_t cnt)
429 {
430 	handle->was_previous_quote_level=0;
431 
432 	/*
433 	** If the previous line was flowed and this line has the same
434 	** quote level, make the flow official.
435 	*/
436 
437 	if (handle->has_previous_quote_level &&
438 	    handle->quote_level == handle->previous_quote_level)
439 	{
440 		/* Remember that this line was flowed into */
441 		handle->was_previous_quote_level=1;
442 	}
443 	else
444 	{
445 		/*
446 		** If the previous line was flowed, but this line carries
447 		** a different quote level, force-terminate the previous
448 		** line, before beginning this line.
449 		*/
450 		if (handle->has_previous_quote_level)
451 			EMIT_LINE_END(handle);
452 
453 		EMIT_LINE_BEGIN(handle);
454 	}
455 
456 	handle->has_previous_quote_level=0;
457 	/* Assume this line won't be flowed, until shown otherwise */
458 
459 
460 	if (!handle->info.isflowed)
461 	{
462 		/*
463 		** No space-stuffing, or sig block checking, if this is not
464 		** flowed content.
465 		*/
466 		handle->content_handler=scan_content_line;
467 		return scan_content_line(handle, ptr, cnt);
468 	}
469 
470 
471 	handle->content_handler=start_content_line;
472 
473 	if (ptr != NULL && *ptr == ' ')
474 		return 1; /* Remove stuffed space */
475 
476 	return start_content_line(handle, ptr, cnt);
477 }
478 
479 /*
480 ** Minor deviation from RFC3676, but this fixes a lot of broken text.
481 **
482 ** If the previous line was flowed, but this is an empty line (optionally
483 ** space-stuffed), unflow the last line (make it fixed), and this becomes
484 ** a fixed line too. Example:
485 **
486 ** this is the last end of a paragraph[SPACE]
487 ** [SPACE]
488 ** This is the first line of the next paragraph.
489 **
490 ** Strict RFC3676 rules will parse this as a flowed line, then a fixed line,
491 ** resulting in no paragraph breaks.
492 */
493 
static size_t start_content_line(rfc3676_parser_t handle,
				const char32_t *ptr, size_t cnt)
{
	/* An empty line that was flowed into from the previous line */
	int blank_after_flowed=
		ptr != NULL && *ptr == '\n' &&
		handle->was_previous_quote_level;

	/* The signature block check comes next, starting from scratch */
	handle->sig_block_index=0;
	handle->content_handler=check_signature_block;

	if (blank_after_flowed)
	{
		/* Unflow: end the previous line, this one stands alone */

		EMIT_LINE_END(handle);
		EMIT_LINE_BEGIN(handle);
		handle->was_previous_quote_level=0;
	}

	return check_signature_block(handle, ptr, cnt);
}
513 
514 
515 static const char32_t sig_block[]={'-', '-', ' '};
516 
517 /* Checking for a magical sig block */
518 
check_signature_block(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)519 static size_t check_signature_block(rfc3676_parser_t handle,
520 				    const char32_t *ptr, size_t cnt)
521 {
522 	if (ptr && *ptr == sig_block[handle->sig_block_index])
523 	{
524 		if (++handle->sig_block_index == sizeof(sig_block)
525 		    /sizeof(sig_block[0]))
526 
527 			/* Well, it's there, but does a NL follow? */
528 			handle->content_handler=seen_sig_block;
529 		return 1;
530 	}
531 
532 	return seen_notsig_block(handle, ptr, cnt);
533 }
534 
/*
** The whole signature block marker was matched. It counts as a signature
** block only if the line ends here (NL or end of input).
*/

static size_t seen_sig_block(rfc3676_parser_t handle,
			     const char32_t *ptr, size_t cnt)
{
	if (ptr != NULL && *ptr != '\n')
		return seen_notsig_block(handle, ptr, cnt);

	/*
	** If the previous line was flowed, the sig block is not
	** considered to be flowable-into content, so terminate
	** the previous line before emitting the sig block.
	*/

	if (handle->was_previous_quote_level)
	{
		EMIT_LINE_END(handle);
		EMIT_LINE_BEGIN(handle);
		handle->was_previous_quote_level=0;
	}

	/* Pass through the sig block as a complete line */

	handle->content_handler=start_of_line;

	EMIT_LINE_CONTENTS(handle, sig_block,
			   sizeof(sig_block)/sizeof(sig_block[0]));
	EMIT_LINE_END(handle);

	return ptr != NULL ? 1:0;
}
565 
566 /* This is not a sig block line */
567 
seen_notsig_block(rfc3676_parser_t handle,const char32_t * newptr,size_t newcnt)568 static size_t seen_notsig_block(rfc3676_parser_t handle,
569 				 const char32_t *newptr, size_t newcnt)
570 {
571 	const char32_t *ptr;
572 	size_t i;
573 
574 	if (handle->was_previous_quote_level)
575 		emit_line_flowed_wrap(handle);
576 
577 	handle->content_handler=scan_content_line;
578 
579 	ptr=sig_block;
580 	i=handle->sig_block_index;
581 
582 	while (i && handle->errflag == 0)
583 	{
584 		size_t n=(*handle->content_handler)(handle, ptr, i);
585 
586 		ptr += n;
587 		i -= n;
588 	}
589 
590 	return (*handle->content_handler)(handle, newptr, newcnt);
591 }
592 
593 /*
594 ** Pass through the line, until encountering an NL, or a space in flowable
595 ** content.
596 */
597 
scan_content_line(rfc3676_parser_t handle,const char32_t * ptr,size_t cnt)598 static size_t scan_content_line(rfc3676_parser_t handle,
599 				const char32_t *ptr, size_t cnt)
600 {
601 	size_t i;
602 
603 	for (i=0; ptr && i<cnt && ptr[i] != '\n' &&
604 		     (ptr[i] != ' ' || !handle->info.isflowed); ++i)
605 		;
606 
607 	/* Pass through anything before the NL or potentially flowable SP */
608 
609  	if (i)
610 		EMIT_LINE_CONTENTS(handle, ptr, i);
611 
612 	if (i)
613 		return i;
614 
615 	if (ptr && ptr[i] == ' ')
616 	{
617 		handle->content_handler=seen_content_sp;
618 		return 1;
619 	}
620 
621 	/* NL. This line does not flow */
622 	EMIT_LINE_END(handle);
623 
624 	handle->content_handler=start_of_line;
625 
626 	return ptr ? 1:0;
627 }
628 
/*
** A space was seen in flowed content. If an NL follows it, the line is
** flowed. Otherwise it's an ordinary space.
*/

static size_t seen_content_sp(rfc3676_parser_t handle,
			      const char32_t *ptr, size_t cnt)
{
	char32_t sp=' ';

	handle->content_handler=scan_content_line;

	if (ptr != NULL && *ptr == '\n')
	{
		/* NL after a SP -- flowed line */

		if (!handle->info.isdelsp)
			EMIT_LINE_CONTENTS(handle, &sp, 1);

		/* Remember the quote level for the next line's benefit */

		handle->has_previous_quote_level=1;
		handle->previous_quote_level=handle->quote_level;
		handle->content_handler=start_of_line;
		return 1;
	}

	/*
	** SP was not followed by the NL. Pass through the space,
	** then resume scanning.
	*/
	EMIT_LINE_CONTENTS(handle, &sp, 1);
	return scan_content_line(handle, ptr, cnt);
}
656 
657 /**************************************************************************/
658 
659 /*
660 ** At this point, the processing has reduced to the following API:
661 **
662 ** + begin logical line
663 **
664 ** + contents of the logical line (multiple consecutive invocations)
665 **
666 ** + the logical line has flowed onto the next physical line
667 **
668 ** + end of logical line
669 **
** The third one, logical line flowed, is normally used for flowed text,
** by definition. But it may also get used if non-flowed text gets
** rewrapped when broken formatting is detected.
673 **
674 ** Provide default implementations of the other three API calls that
675 ** simply invoke the corresponding user callback.
676 */
677 
/*
** Invoke the user's line_begin callback with the current quote level.
** The callback's return value becomes the parser's error flag. Nothing
** happens if an error is already recorded.
*/

static void emit_line_begin(rfc3676_parser_t handle)
{
	if (handle->errflag)
		return;

	handle->errflag=(*handle->info.line_begin)(handle->quote_level,
						   handle->info.arg);
}
684 
/*
** Invoke the user's line_flowed_notify callback, if one was provided.
** The callback's return value becomes the parser's error flag. Nothing
** happens if an error is already recorded.
*/

static void emit_line_flowed_wrap(rfc3676_parser_t handle)
{
	if (handle->errflag)
		return;

	if (!handle->info.line_flowed_notify)
		return; /* This callback is optional */

	handle->errflag=(*handle->info.line_flowed_notify)(handle->info.arg);
}
691 
/*
** Invoke the user's line_contents callback with cnt characters starting
** at uc. The callback's return value becomes the parser's error flag.
** Nothing happens if cnt is zero, or if an error is already recorded.
*/

static void emit_line_contents(rfc3676_parser_t handle,
			       const char32_t *uc,
			       size_t cnt)
{
	if (handle->errflag || cnt == 0)
		return;

	handle->errflag=(*handle->info.line_contents)(uc, cnt,
						      handle->info.arg);
}
700 
/*
** Invoke the user's line_end callback. The callback's return value
** becomes the parser's error flag. Nothing happens if an error is already
** recorded.
*/

static void emit_line_end(rfc3676_parser_t handle)
{
	if (handle->errflag)
		return;

	handle->errflag=(*handle->info.line_end)(handle->info.arg);
}
706 
707 /*
708 ** When processing a non-flowed text, handle broken mail formatters (I'm
709 ** looking at you, Apple Mail) that spew out quoted-printable content with
710 ** each decoded line forming a single paragraph. This is heuristically
711 ** detected by looking for lines that exceed a wrapping threshold, then
712 ** rewrapping them.
713 **
714 ** Redefine the three line API calls to launder the logical line via
715 ** the linebreak API.
716 */
717 
/*
** Handlers installed at the start of each non-flowed line, in effect
** until the line is found to be excessively long.
*/

static void initial_nonflowed_line(rfc3676_parser_t handle,
				   int linebreak_opportunity,
				   char32_t ch,
				   size_t ch_width);

static void initial_nonflowed_end(rfc3676_parser_t handle);

/* Switches the line over to the forced rewrap handlers */
static void begin_forced_rewrap(rfc3676_parser_t handle);
726 
727 /*
728 ** A non-flowed line begins. Initialize the linebreaking module.
729 */
nonflowed_line_begin(rfc3676_parser_t handle)730 static void nonflowed_line_begin(rfc3676_parser_t handle)
731 {
732 	if (handle->lb)
733 	{
734 		/* Just in case */
735 
736 		int rc=unicode_lbc_end(handle->lb);
737 
738 		if (rc && handle->errflag == 0)
739 			handle->errflag=rc;
740 	}
741 
742 	if ((handle->lb=unicode_lbc_init(nonflowed_line_process, handle))
743 	    == NULL)
744 	{
745 		if (handle->errflag == 0)
746 			handle->errflag=-1;
747 	}
748 
749 	if (handle->lb)
750 		unicode_lbc_set_opts(handle->lb,
751 				     UNICODE_LB_OPT_PRBREAK
752 				     | UNICODE_LB_OPT_SYBREAK);
753 
754 	unicode_buf_clear(&handle->nonflowed_line);
755 	unicode_buf_clear(&handle->nonflowed_next_word);
756 
757 	handle->nonflowed_line_width=0;
758 	handle->nonflowed_next_word_width=0;
759 
760 	handle->nonflowed_line_process=initial_nonflowed_line;
761 	handle->nonflowed_line_end=initial_nonflowed_end;
762 	emit_line_begin(handle); /* Fallthru - user callback */
763 
764 	handle->nonflowed_line_target_width=
765 		handle->quote_level < NONFLOWED_WRAP_REDUCE - 20 ?
766 		NONFLOWED_WRAP_REDUCE - handle->quote_level:20;
767 }
768 
769 /*
770 ** Process contents of non-flowed lines. The contents are submitted to the
771 ** linebreaking API.
772 */
773 
nonflowed_line_contents(rfc3676_parser_t handle,const char32_t * uc,size_t cnt)774 static void nonflowed_line_contents(rfc3676_parser_t handle,
775 				    const char32_t *uc,
776 				    size_t cnt)
777 {
778 	if (!handle->lb)
779 		return;
780 
781 	while (cnt)
782 	{
783 		if (handle->errflag == 0)
784 			handle->errflag=unicode_lbc_next(handle->lb, *uc);
785 
786 		++uc;
787 		--cnt;
788 	}
789 }
790 
791 /*
792 ** End of non-flowed content. Terminate the linebreaking API, then invoke
793 ** the current end-of-line handler.
794 */
nonflowed_line_end(rfc3676_parser_t handle)795 static void nonflowed_line_end(rfc3676_parser_t handle)
796 {
797 	if (handle->lb)
798 	{
799 		int rc=unicode_lbc_end(handle->lb);
800 
801 		if (rc && handle->errflag == 0)
802 			handle->errflag=rc;
803 
804 		handle->lb=NULL;
805 	}
806 
807 	(*handle->nonflowed_line_end)(handle);
808 	emit_line_end(handle); /* FALLTHRU */
809 }
810 
811 /*
812 ** Callback from the linebreaking API, gives us the next unicode character
813 ** and its linebreak property. Look up the unicode character's width, then
814 ** invoke the current handler.
815 */
nonflowed_line_process(int linebreak_opportunity,char32_t ch,void * dummy)816 static int nonflowed_line_process(int linebreak_opportunity,
817 				  char32_t ch, void *dummy)
818 {
819 	rfc3676_parser_t handle=(rfc3676_parser_t)dummy;
820 
821 	(*handle->nonflowed_line_process)(handle, linebreak_opportunity, ch,
822 					  unicode_wcwidth(ch));
823 
824 	return 0;
825 }
826 
827 /*
828 ** Collecting initial nonflowed line.
829 */
830 
initial_nonflowed_line(rfc3676_parser_t handle,int linebreak_opportunity,char32_t ch,size_t ch_width)831 static void initial_nonflowed_line(rfc3676_parser_t handle,
832 				   int linebreak_opportunity,
833 				   char32_t ch,
834 				   size_t ch_width)
835 {
836 	/*
837 	** Collect words into nonflowed_line as long as it fits within the
838 	** targeted width.
839 	*/
840 	if (linebreak_opportunity != UNICODE_LB_NONE &&
841 	    handle->nonflowed_line_width + handle->nonflowed_next_word_width
842 	    <= handle->nonflowed_line_target_width)
843 	{
844 		unicode_buf_append_buf(&handle->nonflowed_line,
845 				       &handle->nonflowed_next_word);
846 		handle->nonflowed_line_width +=
847 			handle->nonflowed_next_word_width;
848 
849 		unicode_buf_clear(&handle->nonflowed_next_word);
850 		handle->nonflowed_next_word_width=0;
851 	}
852 
853 	/*
854 	** Add the character to the growing word.
855 	**
856 	** If the line's size now exceeds the target width by quite a bit,
857 	** we've had enough!
858 	*/
859 
860 	unicode_buf_append(&handle->nonflowed_next_word, &ch, 1);
861 	handle->nonflowed_next_word_width += ch_width;
862 
863 	if (handle->nonflowed_line_width + handle->nonflowed_next_word_width
864 	    > handle->nonflowed_line_target_width
865 	    + NONFLOWED_THRESHOLD_EXCEEDED)
866 		begin_forced_rewrap(handle);
867 }
868 
869 /*
870 ** End of line handler. The line did not reach its threshold, so output it.
871 */
initial_nonflowed_end(rfc3676_parser_t handle)872 static void initial_nonflowed_end(rfc3676_parser_t handle)
873 {
874 	emit_line_contents(handle,
875 			   unicode_buf_ptr(&handle->nonflowed_line),
876 			   unicode_buf_len(&handle->nonflowed_line));
877 
878 	emit_line_contents(handle,
879 			   unicode_buf_ptr(&handle->nonflowed_next_word),
880 			   unicode_buf_len(&handle->nonflowed_next_word));
881 }
882 
883 /*
884 ** Check for the abnormal situation where we're ready to wrap something but
885 ** nonflowed_line is empty because all this text did not have a linebreaking
886 ** opportunity.
887 */
888 
check_abnormal_line(rfc3676_parser_t handle)889 static void check_abnormal_line(rfc3676_parser_t handle)
890 {
891 	size_t n, i;
892 	const char32_t *p;
893 
894 	if (unicode_buf_len(&handle->nonflowed_line) > 0)
895 		return;
896 
897 	/* Extreme times call for extreme measures */
898 
899 	n=unicode_buf_len(&handle->nonflowed_next_word);
900 	p=unicode_buf_ptr(&handle->nonflowed_next_word);
901 
902 	for (i=n; i>0; --i)
903 	{
904 		if (i < n && unicode_grapheme_break(p[i-1], p[i]))
905 		{
906 			n=i;
907 			break;
908 		}
909 	}
910 
911 	unicode_buf_append(&handle->nonflowed_line, p, n);
912 	unicode_buf_remove(&handle->nonflowed_next_word, 0, n);
913 
914 	/*
915 	** Recalculate the width of the growing word, now.
916 	*/
917 
918 	handle->nonflowed_next_word_width=0;
919 	p=unicode_buf_ptr(&handle->nonflowed_next_word);
920 
921 	for (i=0; i<unicode_buf_len(&handle->nonflowed_next_word); ++i)
922 		handle->nonflowed_next_word_width +=
923 			unicode_wcwidth(p[i]);
924 }
925 
926 /*
927 ** We've decided that the line is too long, so begin rewrapping it.
928 */
929 
/* Handlers installed by begin_forced_rewrap() */

static void forced_rewrap_line(rfc3676_parser_t handle,
			       int linebreak_opportunity,
			       char32_t ch,
			       size_t ch_width);

static void forced_rewrap_end(rfc3676_parser_t handle);
936 
937 /*
938 ** Emit nonflowed_line as the rewrapped line. Clear the buffer.
939 */
emit_rewrapped_line(rfc3676_parser_t handle)940 static void emit_rewrapped_line(rfc3676_parser_t handle)
941 {
942 	check_abnormal_line(handle);
943 	emit_line_contents(handle, unicode_buf_ptr(&handle->nonflowed_line),
944 			   unicode_buf_len(&handle->nonflowed_line));
945 
946 	emit_line_flowed_wrap(handle);
947 
948 	/* nonflowed_line is now empty */
949 	unicode_buf_clear(&handle->nonflowed_line);
950 	handle->nonflowed_line_width=0;
951 }
952 
/*
** Install the forced rewrap handlers for the rest of this line, then emit
** what's been collected so far as the first rewrapped line.
*/
static void begin_forced_rewrap(rfc3676_parser_t handle)
{
	handle->nonflowed_line_end=forced_rewrap_end;
	handle->nonflowed_line_process=forced_rewrap_line;

	emit_rewrapped_line(handle);
}
959 
/*
** Process the next character of a line that's being rewrapped.
**
** At each linebreaking opportunity the collected word is moved into the
** line, after emitting the line first if the word no longer fits. A word
** that grows too long without a linebreaking opportunity also forces a
** line to be emitted.
*/
static void forced_rewrap_line(rfc3676_parser_t handle,
			       int linebreak_opportunity,
			       char32_t ch,
			       size_t ch_width)
{
	if (linebreak_opportunity != UNICODE_LB_NONE)
	{
		/* Found a linebreaking opportunity */

		size_t combined=handle->nonflowed_line_width
			+ handle->nonflowed_next_word_width;

		if (combined > handle->nonflowed_line_target_width)
		{
			/* Accumulated word is too long */
			emit_rewrapped_line(handle);
		}

		/* The word's width is read only now, it may have changed */

		unicode_buf_append_buf(&handle->nonflowed_line,
				       &handle->nonflowed_next_word);
		handle->nonflowed_line_width +=
			handle->nonflowed_next_word_width;

		unicode_buf_clear(&handle->nonflowed_next_word);
		handle->nonflowed_next_word_width=0;
	}

	/*
	** Check for another excessively long line.
	*/

	if (handle->nonflowed_line_width == 0)
	{
		size_t grown=handle->nonflowed_next_word_width + ch_width;

		if (grown > handle->nonflowed_line_target_width)
			emit_rewrapped_line(handle);
	}

	unicode_buf_append(&handle->nonflowed_next_word, &ch, 1);
	handle->nonflowed_next_word_width += ch_width;
}
1000 
/*
** End of line handler for a line that's being rewrapped. Outputs whatever
** is left in the collected line and the growing word.
*/
static void forced_rewrap_end(rfc3676_parser_t handle)
{
	initial_nonflowed_end(handle); /* Same logic, for now */
}
1005