1 // Copyright (c) 2012 The WebM project authors. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the LICENSE file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS.  All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 
9 #include "webvttparser.h"
10 
11 #include <ctype.h>
12 
13 #include <climits>
14 #include <cstddef>
15 
16 namespace libwebvtt {
17 
// Character constants the parser tests for while scanning the stream.
// The enum carries a NOLINT marker because clang-format would otherwise
// collapse it onto a single, hard-to-read line.
enum {
  kNUL = '\0',
  kSPACE = ' ',
  kTAB = '\t',
  kLF = '\n',
  kCR = '\r'
};  // NOLINT
27 
~Reader()28 Reader::~Reader() {}
29 
~LineReader()30 LineReader::~LineReader() {}
31 
GetLine(std::string * line_ptr)32 int LineReader::GetLine(std::string* line_ptr) {
33   if (line_ptr == NULL)
34     return -1;
35 
36   std::string& ln = *line_ptr;
37   ln.clear();
38 
39   // Consume characters from the stream, until we
40   // reach end-of-line (or end-of-stream).
41 
42   // The WebVTT spec states that lines may be
43   // terminated in any of these three ways:
44   //  LF
45   //  CR
46   //  CR LF
47 
48   // We interrogate each character as we read it from the stream.
49   // If we detect an end-of-line character, we consume the full
50   // end-of-line indication, and we're done; otherwise, accumulate
51   // the character and repeat.
52 
53   for (;;) {
54     char c;
55     const int e = GetChar(&c);
56 
57     if (e < 0)  // error
58       return e;
59 
60     if (e > 0)  // EOF
61       return (ln.empty()) ? 1 : 0;
62 
63     // We have a character, so we must first determine
64     // whether we have reached end-of-line.
65 
66     if (c == kLF)
67       return 0;  // handle the easy end-of-line case immediately
68 
69     if (c == kCR)
70       break;  // handle the hard end-of-line case outside of loop
71 
72     if (c == '\xFE' || c == '\xFF')  // not UTF-8
73       return -1;
74 
75     // To defend against pathological or malicious streams, we
76     // cap the line length at some arbitrarily-large value:
77     enum { kMaxLineLength = 10000 };  // arbitrary
78 
79     if (ln.length() >= kMaxLineLength)
80       return -1;
81 
82     // We don't have an end-of-line character, so accumulate
83     // the character in our line buffer.
84     ln.push_back(c);
85   }
86 
87   // We detected a CR.  We must interrogate the next character
88   // in the stream, to determine whether we have a LF (which
89   // would make it part of this same line).
90 
91   char c;
92   const int e = GetChar(&c);
93 
94   if (e < 0)  // error
95     return e;
96 
97   if (e > 0)  // EOF
98     return 0;
99 
100   // If next character in the stream is not a LF, return it
101   // to the stream (because it's part of the next line).
102   if (c != kLF)
103     UngetChar(c);
104 
105   return 0;
106 }
107 
Parser(Reader * r)108 Parser::Parser(Reader* r) : reader_(r), unget_(-1) {}
109 
~Parser()110 Parser::~Parser() {}
111 
Init()112 int Parser::Init() {
113   int e = ParseBOM();
114 
115   if (e < 0)  // error
116     return e;
117 
118   if (e > 0)  // EOF
119     return -1;
120 
121   // Parse "WEBVTT".  We read from the stream one character at-a-time, in
122   // order to defend against non-WebVTT streams (e.g. binary files) that don't
123   // happen to comprise lines of text demarcated with line terminators.
124 
125   const char kId[] = "WEBVTT";
126 
127   for (const char* p = kId; *p; ++p) {
128     char c;
129     e = GetChar(&c);
130 
131     if (e < 0)  // error
132       return e;
133 
134     if (e > 0)  // EOF
135       return -1;
136 
137     if (c != *p)
138       return -1;
139   }
140 
141   std::string line;
142 
143   e = GetLine(&line);
144 
145   if (e < 0)  // error
146     return e;
147 
148   if (e > 0)  // EOF
149     return 0;  // weird but valid
150 
151   if (!line.empty()) {
152     // Parse optional characters that follow "WEBVTT"
153 
154     const char c = line[0];
155 
156     if (c != kSPACE && c != kTAB)
157       return -1;
158   }
159 
160   // The WebVTT spec requires that the "WEBVTT" line
161   // be followed by an empty line (to separate it from
162   // first cue).
163 
164   e = GetLine(&line);
165 
166   if (e < 0)  // error
167     return e;
168 
169   if (e > 0)  // EOF
170     return 0;  // weird but we allow it
171 
172   if (!line.empty())
173     return -1;
174 
175   return 0;  // success
176 }
177 
Parse(Cue * cue)178 int Parser::Parse(Cue* cue) {
179   if (cue == NULL)
180     return -1;
181 
182   // Parse first non-blank line
183 
184   std::string line;
185   int e;
186 
187   for (;;) {
188     e = GetLine(&line);
189 
190     if (e)  // EOF is OK here
191       return e;
192 
193     if (!line.empty())
194       break;
195   }
196 
197   // A WebVTT cue comprises an optional cue identifier line followed
198   // by a (non-optional) timings line.  You determine whether you have
199   // a timings line by scanning for the arrow token, the lexeme of which
200   // may not appear in the cue identifier line.
201 
202   const char kArrow[] = "-->";
203   std::string::size_type arrow_pos = line.find(kArrow);
204 
205   if (arrow_pos != std::string::npos) {
206     // We found a timings line, which implies that we don't have a cue
207     // identifier.
208 
209     cue->identifier.clear();
210   } else {
211     // We did not find a timings line, so we assume that we have a cue
212     // identifier line, and then try again to find the cue timings on
213     // the next line.
214 
215     cue->identifier.swap(line);
216 
217     e = GetLine(&line);
218 
219     if (e < 0)  // error
220       return e;
221 
222     if (e > 0)  // EOF
223       return -1;
224 
225     arrow_pos = line.find(kArrow);
226 
227     if (arrow_pos == std::string::npos)  // not a timings line
228       return -1;
229   }
230 
231   e = ParseTimingsLine(&line, arrow_pos, &cue->start_time, &cue->stop_time,
232                        &cue->settings);
233 
234   if (e)  // error
235     return e;
236 
237   // The cue payload comprises all the non-empty
238   // lines that follow the timings line.
239 
240   Cue::payload_t& p = cue->payload;
241   p.clear();
242 
243   for (;;) {
244     e = GetLine(&line);
245 
246     if (e < 0)  // error
247       return e;
248 
249     if (line.empty())
250       break;
251 
252     p.push_back(line);
253   }
254 
255   if (p.empty())
256     return -1;
257 
258   return 0;  // success
259 }
260 
GetChar(char * c)261 int Parser::GetChar(char* c) {
262   if (unget_ >= 0) {
263     *c = static_cast<char>(unget_);
264     unget_ = -1;
265     return 0;
266   }
267 
268   return reader_->GetChar(c);
269 }
270 
UngetChar(char c)271 void Parser::UngetChar(char c) { unget_ = static_cast<unsigned char>(c); }
272 
ParseBOM()273 int Parser::ParseBOM() {
274   // Explanation of UTF-8 BOM:
275   // http://en.wikipedia.org/wiki/Byte_order_mark
276 
277   static const char BOM[] = "\xEF\xBB\xBF";  // UTF-8 BOM
278 
279   for (int i = 0; i < 3; ++i) {
280     char c;
281     int e = GetChar(&c);
282 
283     if (e < 0)  // error
284       return e;
285 
286     if (e > 0)  // EOF
287       return 1;
288 
289     if (c != BOM[i]) {
290       if (i == 0) {  // we don't have a BOM
291         UngetChar(c);
292         return 0;  // success
293       }
294 
295       // We started a BOM, so we must finish the BOM.
296       return -1;  // error
297     }
298   }
299 
300   return 0;  // success
301 }
302 
ParseTimingsLine(std::string * line_ptr,std::string::size_type arrow_pos,Time * start_time,Time * stop_time,Cue::settings_t * settings)303 int Parser::ParseTimingsLine(std::string* line_ptr,
304                              std::string::size_type arrow_pos, Time* start_time,
305                              Time* stop_time, Cue::settings_t* settings) {
306   if (line_ptr == NULL)
307     return -1;
308 
309   std::string& line = *line_ptr;
310 
311   if (arrow_pos == std::string::npos || arrow_pos >= line.length())
312     return -1;
313 
314   // Place a NUL character at the start of the arrow token, in
315   // order to demarcate the start time from remainder of line.
316   line[arrow_pos] = kNUL;
317   std::string::size_type idx = 0;
318 
319   int e = ParseTime(line, &idx, start_time);
320   if (e)  // error
321     return e;
322 
323   // Detect any junk that follows the start time,
324   // but precedes the arrow symbol.
325 
326   while (char c = line[idx]) {
327     if (c != kSPACE && c != kTAB)
328       return -1;
329     ++idx;
330   }
331 
332   // Place a NUL character at the end of the line,
333   // so the scanner has a place to stop, and begin
334   // the scan just beyond the arrow token.
335 
336   line.push_back(kNUL);
337   idx = arrow_pos + 3;
338 
339   e = ParseTime(line, &idx, stop_time);
340   if (e)  // error
341     return e;
342 
343   e = ParseSettings(line, idx, settings);
344   if (e)  // error
345     return e;
346 
347   return 0;  // success
348 }
349 
ParseTime(const std::string & line,std::string::size_type * idx_ptr,Time * time)350 int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr,
351                       Time* time) {
352   if (idx_ptr == NULL)
353     return -1;
354 
355   std::string::size_type& idx = *idx_ptr;
356 
357   if (idx == std::string::npos || idx >= line.length())
358     return -1;
359 
360   if (time == NULL)
361     return -1;
362 
363   // Consume any whitespace that precedes the timestamp.
364 
365   while (char c = line[idx]) {
366     if (c != kSPACE && c != kTAB)
367       break;
368     ++idx;
369   }
370 
371   // WebVTT timestamp syntax comes in three flavors:
372   //  SS[.sss]
373   //  MM:SS[.sss]
374   //  HH:MM:SS[.sss]
375 
376   // Parse a generic number value.  We don't know which component
377   // of the time we have yet, until we do more parsing.
378 
379   int val = ParseNumber(line, &idx);
380 
381   if (val < 0)  // error
382     return val;
383 
384   Time& t = *time;
385 
386   // The presence of a colon character indicates that we have
387   // an [HH:]MM:SS style syntax.
388 
389   if (line[idx] == ':') {
390     // We have either HH:MM:SS or MM:SS
391 
392     // The value we just parsed is either the hours or minutes.
393     // It must be followed by another number value (that is
394     // either minutes or seconds).
395 
396     const int first_val = val;
397 
398     ++idx;  // consume colon
399 
400     // Parse second value
401 
402     val = ParseNumber(line, &idx);
403 
404     if (val < 0)
405       return val;
406 
407     if (val >= 60)  // either MM or SS
408       return -1;
409 
410     if (line[idx] == ':') {
411       // We have HH:MM:SS
412 
413       t.hours = first_val;
414       t.minutes = val;  // vetted above
415 
416       ++idx;  // consume MM:SS colon
417 
418       // We have parsed the hours and minutes.
419       // We must now parse the seconds.
420 
421       val = ParseNumber(line, &idx);
422 
423       if (val < 0)
424         return val;
425 
426       if (val >= 60)  // SS part of HH:MM:SS
427         return -1;
428 
429       t.seconds = val;
430     } else {
431       // We have MM:SS
432 
433       // The implication here is that the hour value was omitted
434       // from the timestamp (because it was 0).
435 
436       if (first_val >= 60)  // minutes
437         return -1;
438 
439       t.hours = 0;
440       t.minutes = first_val;
441       t.seconds = val;  // vetted above
442     }
443   } else {
444     // We have SS (only)
445 
446     // The time is expressed as total number of seconds,
447     // so the seconds value has no upper bound.
448 
449     t.seconds = val;
450 
451     // Convert SS to HH:MM:SS
452 
453     t.minutes = t.seconds / 60;
454     t.seconds -= t.minutes * 60;
455 
456     t.hours = t.minutes / 60;
457     t.minutes -= t.hours * 60;
458   }
459 
460   // We have parsed the hours, minutes, and seconds.
461   // We must now parse the milliseconds.
462 
463   char c = line[idx];
464 
465   // TODO(matthewjheaney): one option here is to slightly relax the
466   // syntax rules for WebVTT timestamps, to permit the comma character
467   // to also be used as the seconds/milliseconds separator.  This
468   // would handle streams that use localization conventions for
469   // countries in Western Europe.  For now we obey the rules specified
470   // in the WebVTT spec (allow "full stop" only).
471 
472   const bool have_milliseconds = (c == '.');
473 
474   if (!have_milliseconds) {
475     t.milliseconds = 0;
476   } else {
477     ++idx;  // consume FULL STOP
478 
479     val = ParseNumber(line, &idx);
480 
481     if (val < 0)
482       return val;
483 
484     if (val >= 1000)
485       return -1;
486 
487     if (val < 10)
488       t.milliseconds = val * 100;
489     else if (val < 100)
490       t.milliseconds = val * 10;
491     else
492       t.milliseconds = val;
493   }
494 
495   // We have parsed the time proper.  We must check for any
496   // junk that immediately follows the time specifier.
497 
498   c = line[idx];
499 
500   if (c != kNUL && c != kSPACE && c != kTAB)
501     return -1;
502 
503   return 0;  // success
504 }
505 
ParseSettings(const std::string & line,std::string::size_type idx,Cue::settings_t * settings)506 int Parser::ParseSettings(const std::string& line, std::string::size_type idx,
507                           Cue::settings_t* settings) {
508   settings->clear();
509 
510   if (idx == std::string::npos || idx >= line.length())
511     return -1;
512 
513   for (;;) {
514     // We must parse a line comprising a sequence of 0 or more
515     // NAME:VALUE pairs, separated by whitespace.  The line iself is
516     // terminated with a NUL char (indicating end-of-line).
517 
518     for (;;) {
519       const char c = line[idx];
520 
521       if (c == kNUL)  // end-of-line
522         return 0;  // success
523 
524       if (c != kSPACE && c != kTAB)
525         break;
526 
527       ++idx;  // consume whitespace
528     }
529 
530     // We have consumed the whitespace, and have not yet reached
531     // end-of-line, so there is something on the line for us to parse.
532 
533     settings->push_back(Setting());
534     Setting& s = settings->back();
535 
536     // Parse the NAME part of the settings pair.
537 
538     for (;;) {
539       const char c = line[idx];
540 
541       if (c == ':')  // we have reached end of NAME part
542         break;
543 
544       if (c == kNUL || c == kSPACE || c == kTAB)
545         return -1;
546 
547       s.name.push_back(c);
548 
549       ++idx;
550     }
551 
552     if (s.name.empty())
553       return -1;
554 
555     ++idx;  // consume colon
556 
557     // Parse the VALUE part of the settings pair.
558 
559     for (;;) {
560       const char c = line[idx];
561 
562       if (c == kNUL || c == kSPACE || c == kTAB)
563         break;
564 
565       if (c == ':')  // suspicious when part of VALUE
566         return -1;  // TODO(matthewjheaney): verify this behavior
567 
568       s.value.push_back(c);
569 
570       ++idx;
571     }
572 
573     if (s.value.empty())
574       return -1;
575   }
576 }
577 
ParseNumber(const std::string & line,std::string::size_type * idx_ptr)578 int Parser::ParseNumber(const std::string& line,
579                         std::string::size_type* idx_ptr) {
580   if (idx_ptr == NULL)
581     return -1;
582 
583   std::string::size_type& idx = *idx_ptr;
584 
585   if (idx == std::string::npos || idx >= line.length())
586     return -1;
587 
588   if (!isdigit(line[idx]))
589     return -1;
590 
591   int result = 0;
592 
593   while (isdigit(line[idx])) {
594     const char c = line[idx];
595     const int i = c - '0';
596 
597     if (result > INT_MAX / 10)
598       return -1;
599 
600     result *= 10;
601 
602     if (result > INT_MAX - i)
603       return -1;
604 
605     result += i;
606 
607     ++idx;
608   }
609 
610   return result;
611 }
612 
operator ==(const Time & rhs) const613 bool Time::operator==(const Time& rhs) const {
614   if (hours != rhs.hours)
615     return false;
616 
617   if (minutes != rhs.minutes)
618     return false;
619 
620   if (seconds != rhs.seconds)
621     return false;
622 
623   return (milliseconds == rhs.milliseconds);
624 }
625 
operator <(const Time & rhs) const626 bool Time::operator<(const Time& rhs) const {
627   if (hours < rhs.hours)
628     return true;
629 
630   if (hours > rhs.hours)
631     return false;
632 
633   if (minutes < rhs.minutes)
634     return true;
635 
636   if (minutes > rhs.minutes)
637     return false;
638 
639   if (seconds < rhs.seconds)
640     return true;
641 
642   if (seconds > rhs.seconds)
643     return false;
644 
645   return (milliseconds < rhs.milliseconds);
646 }
647 
operator >(const Time & rhs) const648 bool Time::operator>(const Time& rhs) const { return rhs.operator<(*this); }
649 
operator <=(const Time & rhs) const650 bool Time::operator<=(const Time& rhs) const { return !this->operator>(rhs); }
651 
operator >=(const Time & rhs) const652 bool Time::operator>=(const Time& rhs) const { return !this->operator<(rhs); }
653 
presentation() const654 presentation_t Time::presentation() const {
655   const presentation_t h = 1000LL * 3600LL * presentation_t(hours);
656   const presentation_t m = 1000LL * 60LL * presentation_t(minutes);
657   const presentation_t s = 1000LL * presentation_t(seconds);
658   const presentation_t result = h + m + s + milliseconds;
659   return result;
660 }
661 
presentation(presentation_t d)662 Time& Time::presentation(presentation_t d) {
663   if (d < 0) {  // error
664     hours = 0;
665     minutes = 0;
666     seconds = 0;
667     milliseconds = 0;
668 
669     return *this;
670   }
671 
672   seconds = static_cast<int>(d / 1000);
673   milliseconds = static_cast<int>(d - 1000 * seconds);
674 
675   minutes = seconds / 60;
676   seconds -= 60 * minutes;
677 
678   hours = minutes / 60;
679   minutes -= 60 * hours;
680 
681   return *this;
682 }
683 
operator +=(presentation_t rhs)684 Time& Time::operator+=(presentation_t rhs) {
685   const presentation_t d = this->presentation();
686   const presentation_t dd = d + rhs;
687   this->presentation(dd);
688   return *this;
689 }
690 
operator +(presentation_t d) const691 Time Time::operator+(presentation_t d) const {
692   Time t(*this);
693   t += d;
694   return t;
695 }
696 
operator -=(presentation_t d)697 Time& Time::operator-=(presentation_t d) { return this->operator+=(-d); }
698 
operator -(const Time & t) const699 presentation_t Time::operator-(const Time& t) const {
700   const presentation_t rhs = t.presentation();
701   const presentation_t lhs = this->presentation();
702   const presentation_t result = lhs - rhs;
703   return result;
704 }
705 
706 }  // namespace libwebvtt
707