// source/webvtt/webvttparser.cc

// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS.  All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.

#include "webvttparser.h"

#include <ctype.h>

#include <climits>
#include <cstddef>

namespace libwebvtt {

// Character constants recognised by the line reader and the cue parser.
// The enum carries a NOLINT marker because clang-format would otherwise
// collapse it onto a single line, which is hard to read.
enum {
  kNUL = '\0',
  kSPACE = ' ',
  kTAB = '\t',
  kLF = '\n',
  kCR = '\r'
};  // NOLINT

// Out-of-line destructor definition.  The body is empty, so nothing is
// released here.
Reader::~Reader() {}

// Out-of-line destructor definition.  The body is empty, so nothing is
// released here.
LineReader::~LineReader() {}

// Reads the next line of text from the stream into |*line_ptr|, without its
// line terminator.  A line ends at LF, CR, or CR LF (the three terminators
// the WebVTT spec permits), or at end-of-stream.
//
// Returns 0 when a line was read (possibly an empty one), a positive value
// when end-of-stream was reached before any character was accumulated, and
// a negative value on error (NULL argument, read failure, a byte that can
// never appear in UTF-8, or an over-long line).
int LineReader::GetLine(std::string* line_ptr) {
  if (line_ptr == NULL)
    return -1;

  // To defend against pathological or malicious streams, the length of a
  // single line is capped at an arbitrarily-large value.
  enum { kMaxLineLength = 10000 };  // arbitrary

  std::string& text = *line_ptr;
  text.clear();

  // Accumulate characters until a terminator (or end-of-stream) shows up.
  for (;;) {
    char ch;
    const int status = GetChar(&ch);

    if (status < 0)  // error
      return status;

    if (status > 0)  // EOF
      return text.empty() ? 1 : 0;

    if (ch == kLF)  // a bare LF terminates the line outright
      return 0;

    if (ch == kCR)  // a CR may be followed by a LF; resolved below
      break;

    // 0xFE and 0xFF never occur in UTF-8 text.
    const bool valid_byte = (ch != '\xFE') && (ch != '\xFF');

    if (!valid_byte || text.length() >= kMaxLineLength)
      return -1;

    text.push_back(ch);
  }

  // A CR was consumed.  Peek at the following character: a LF belongs to
  // this same terminator and is swallowed, anything else is the first
  // character of the next line and is handed back to the stream.
  char next;
  const int status = GetChar(&next);

  if (status < 0)  // error
    return status;

  if (status == 0 && next != kLF)
    UngetChar(next);

  return 0;
}

// Binds the parser to the reader |r| and starts with no pushed-back
// character pending (unget_ == -1).
Parser::Parser(Reader* r)
    : reader_(r),
      unget_(-1) {
}

// Out-of-line destructor definition.  The body is empty, so nothing is
// released here.
Parser::~Parser() {}

// Consumes the WebVTT file header: an optional UTF-8 BOM, the "WEBVTT"
// signature, the remainder of the signature line, and the blank line that
// separates the header from the first cue.
//
// Returns 0 on success and a negative value on error.
int Parser::Init() {
  int status = ParseBOM();

  if (status < 0)  // error
    return status;

  if (status > 0)  // EOF before the signature
    return -1;

  // The signature is matched one character at a time rather than by reading
  // a whole line, as a defense against non-WebVTT streams (e.g. binary
  // files) that don't comprise lines of text demarcated with terminators.
  const char kSignature[] = "WEBVTT";
  const char* expected = kSignature;

  while (*expected != '\0') {
    char ch;
    status = GetChar(&ch);

    if (status < 0)  // error
      return status;

    if (status > 0 || ch != *expected)  // EOF or mismatch
      return -1;

    ++expected;
  }

  // Whatever follows the signature on the same line is optional, but when
  // present it must begin with a space or a tab.
  std::string text;
  status = GetLine(&text);

  if (status < 0)  // error
    return status;

  if (status > 0)  // EOF
    return 0;  // weird but valid

  if (!text.empty() && text[0] != kSPACE && text[0] != kTAB)
    return -1;

  // The "WEBVTT" line must be followed by an empty line, which separates
  // it from the first cue.
  status = GetLine(&text);

  if (status < 0)  // error
    return status;

  if (status > 0)  // EOF
    return 0;  // weird but we allow it

  return text.empty() ? 0 : -1;
}

// Parses the next cue from the stream into |*cue|: an optional identifier
// line, the timings line, and one or more payload lines.
//
// Returns 0 on success, a positive value when end-of-stream is reached
// before a cue starts, and a negative value on error.
int Parser::Parse(Cue* cue) {
  if (cue == NULL)
    return -1;

  const char kArrow[] = "-->";

  std::string text;
  int status;

  // Skip any blank lines that precede the cue.  Both error and EOF are
  // passed straight back to the caller (EOF is OK here).
  do {
    status = GetLine(&text);

    if (status != 0)
      return status;
  } while (text.empty());

  // A cue comprises an optional identifier line followed by a mandatory
  // timings line.  The arrow token may not appear in an identifier, so its
  // presence tells the two kinds of line apart.
  std::string::size_type arrow = text.find(kArrow);

  if (arrow == std::string::npos) {
    // No arrow: treat this line as the identifier and require the timings
    // on the line that follows.
    cue->identifier.swap(text);

    status = GetLine(&text);

    if (status < 0)  // error
      return status;

    if (status > 0)  // EOF
      return -1;

    arrow = text.find(kArrow);

    if (arrow == std::string::npos)  // not a timings line
      return -1;
  } else {
    // The first line already is the timings line, so there is no identifier.
    cue->identifier.clear();
  }

  status = ParseTimingsLine(&text, arrow, &cue->start_time, &cue->stop_time,
                            &cue->settings);

  if (status != 0)  // error
    return status;

  // The payload is every non-empty line that follows the timings line.
  Cue::payload_t& payload = cue->payload;
  payload.clear();

  while ((status = GetLine(&text)) >= 0 && !text.empty())
    payload.push_back(text);

  if (status < 0)  // error
    return status;

  // A cue without any payload is rejected.
  return payload.empty() ? -1 : 0;
}

// Delivers the next character to |*c|.  A character previously handed back
// via UngetChar takes precedence; otherwise the request is forwarded to the
// underlying reader and its result is returned unchanged.
int Parser::GetChar(char* c) {
  const bool have_pending = (unget_ >= 0);

  if (!have_pending)
    return reader_->GetChar(c);

  *c = static_cast<char>(unget_);
  unget_ = -1;  // the pushed-back slot is empty again

  return 0;
}

// Remembers |c| so that the next GetChar call returns it.  The value is
// stored through unsigned char so it is never negative.
void Parser::UngetChar(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  unget_ = byte;
}

// Consumes an optional UTF-8 byte order mark at the current position.
// Explanation of the UTF-8 BOM:
// http://en.wikipedia.org/wiki/Byte_order_mark
//
// Returns 0 when a complete BOM was consumed or when no BOM is present (the
// character read is pushed back), 1 on end-of-stream, and a negative value
// on a read error or when a BOM starts but is not completed.
int Parser::ParseBOM() {
  static const char kBom[] = "\xEF\xBB\xBF";  // UTF-8 BOM
  const int kBomLength = 3;

  for (int i = 0; i < kBomLength; ++i) {
    char ch;
    const int status = GetChar(&ch);

    if (status < 0)  // error
      return status;

    if (status > 0)  // EOF
      return 1;

    if (ch == kBom[i])
      continue;

    // Mismatch.  Once the first BOM byte has been seen, the rest of the
    // BOM is mandatory.
    if (i > 0)
      return -1;  // error

    // The very first byte is not a BOM byte: there is no BOM at all.
    UngetChar(ch);
    return 0;  // success
  }

  return 0;  // success
}

// Parses a cue timings line of the form
//   START --> STOP [settings...]
// where |arrow_pos| is the offset of the arrow token within |*line_ptr|.
// The line is modified in place: the first character of the arrow is
// overwritten with NUL and a NUL is appended at the end, giving the scanners
// a sentinel to stop at.
//
// Returns 0 on success and a non-zero value on error.
int Parser::ParseTimingsLine(std::string* line_ptr,
                             std::string::size_type arrow_pos, Time* start_time,
                             Time* stop_time, Cue::settings_t* settings) {
  if (line_ptr == NULL)
    return -1;

  std::string& text = *line_ptr;

  if (arrow_pos == std::string::npos || arrow_pos >= text.length())
    return -1;

  // Cut the line at the arrow so that the start time is delimited by NUL.
  text[arrow_pos] = kNUL;

  std::string::size_type pos = 0;
  int status = ParseTime(text, &pos, start_time);

  if (status != 0)  // error
    return status;

  // Only whitespace may sit between the start time and the arrow.
  for (; text[pos] != kNUL; ++pos) {
    if (text[pos] != kSPACE && text[pos] != kTAB)
      return -1;
  }

  // Terminate the whole line with NUL, then resume scanning just beyond
  // the three-character arrow token.
  text.push_back(kNUL);
  pos = arrow_pos + 3;

  status = ParseTime(text, &pos, stop_time);

  if (status != 0)  // error
    return status;

  // Whatever remains on the line is the (possibly empty) settings list.
  return ParseSettings(text, pos, settings);
}

// Parses a timestamp from |line| starting at |*idx_ptr| and stores the
// result in |*time|.  Leading spaces and tabs are skipped.  Three flavors
// are accepted:
//   SS[.sss]
//   MM:SS[.sss]
//   HH:MM:SS[.sss]
// The fractional part may have one to three digits and is interpreted as a
// decimal fraction of a second (".5" is 500 ms, ".05" is 50 ms, ".005" is
// 5 ms).  The timestamp must be followed by NUL, a space, or a tab.
//
// On return |*idx_ptr| has been advanced past the characters consumed.
// Returns 0 on success and a negative value on error.
int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr,
                      Time* time) {
  if (idx_ptr == NULL)
    return -1;

  std::string::size_type& idx = *idx_ptr;

  if (idx == std::string::npos || idx >= line.length())
    return -1;

  if (time == NULL)
    return -1;

  // Consume any whitespace that precedes the timestamp.

  while (char c = line[idx]) {
    if (c != kSPACE && c != kTAB)
      break;
    ++idx;
  }

  // Parse a generic number value.  We don't know which component
  // of the time we have yet, until we do more parsing.

  int val = ParseNumber(line, &idx);

  if (val < 0)  // error
    return val;

  Time& t = *time;

  // The presence of a colon character indicates that we have
  // an [HH:]MM:SS style syntax.

  if (line[idx] == ':') {
    // We have either HH:MM:SS or MM:SS

    // The value we just parsed is either the hours or minutes.
    // It must be followed by another number value (that is
    // either minutes or seconds).

    const int first_val = val;

    ++idx;  // consume colon

    // Parse second value

    val = ParseNumber(line, &idx);

    if (val < 0)
      return val;

    if (val >= 60)  // either MM or SS
      return -1;

    if (line[idx] == ':') {
      // We have HH:MM:SS

      t.hours = first_val;
      t.minutes = val;  // vetted above

      ++idx;  // consume MM:SS colon

      // We have parsed the hours and minutes.
      // We must now parse the seconds.

      val = ParseNumber(line, &idx);

      if (val < 0)
        return val;

      if (val >= 60)  // SS part of HH:MM:SS
        return -1;

      t.seconds = val;
    } else {
      // We have MM:SS

      // The implication here is that the hour value was omitted
      // from the timestamp (because it was 0).

      if (first_val >= 60)  // minutes
        return -1;

      t.hours = 0;
      t.minutes = first_val;
      t.seconds = val;  // vetted above
    }
  } else {
    // We have SS (only)

    // The time is expressed as total number of seconds,
    // so the seconds value has no upper bound.

    t.seconds = val;

    // Convert SS to HH:MM:SS

    t.minutes = t.seconds / 60;
    t.seconds -= t.minutes * 60;

    t.hours = t.minutes / 60;
    t.minutes -= t.hours * 60;
  }

  // We have parsed the hours, minutes, and seconds.
  // We must now parse the milliseconds.

  char c = line[idx];

  // TODO(matthewjheaney): one option here is to slightly relax the
  // syntax rules for WebVTT timestamps, to permit the comma character
  // to also be used as the seconds/milliseconds separator.  This
  // would handle streams that use localization conventions for
  // countries in Western Europe.  For now we obey the rules specified
  // in the WebVTT spec (allow "full stop" only).

  const bool have_milliseconds = (c == '.');

  if (!have_milliseconds) {
    t.milliseconds = 0;
  } else {
    ++idx;  // consume FULL STOP

    const std::string::size_type fraction_begin = idx;

    val = ParseNumber(line, &idx);

    if (val < 0)
      return val;

    // The scale factor depends on how many digits were written, not on the
    // magnitude of the parsed value: leading zeros are significant in a
    // decimal fraction (".050" is 50 ms, not 500 ms).
    const std::string::size_type fraction_digits = idx - fraction_begin;

    // Milliseconds have at most three digits; anything longer is malformed.
    // With three digits or fewer, val cannot exceed 999.
    if (fraction_digits > 3)
      return -1;

    if (fraction_digits == 1)
      t.milliseconds = val * 100;
    else if (fraction_digits == 2)
      t.milliseconds = val * 10;
    else
      t.milliseconds = val;
  }

  // We have parsed the time proper.  We must check for any
  // junk that immediately follows the time specifier.

  c = line[idx];

  if (c != kNUL && c != kSPACE && c != kTAB)
    return -1;

  return 0;  // success
}

// Parses the cue settings found in |line| starting at offset |idx| and
// stores them in |*settings| (which is cleared first).  The settings are a
// sequence of zero or more NAME:VALUE pairs separated by spaces or tabs;
// scanning stops at the first NUL character.  NAME and VALUE must both be
// non-empty, and VALUE may not contain a colon.
//
// Returns 0 on success and -1 on error.  On error |*settings| may hold the
// pairs parsed so far, including a partially-filled last entry.
int Parser::ParseSettings(const std::string& line, std::string::size_type idx,
                          Cue::settings_t* settings) {
  // Validate the output pointer before touching it, as the sibling parsing
  // routines do for their pointer arguments.
  if (settings == NULL)
    return -1;

  settings->clear();

  if (idx == std::string::npos || idx >= line.length())
    return -1;

  for (;;) {
    // We must parse a line comprising a sequence of 0 or more
    // NAME:VALUE pairs, separated by whitespace.  The line itself is
    // terminated with a NUL char (indicating end-of-line).

    for (;;) {
      const char c = line[idx];

      if (c == kNUL)  // end-of-line
        return 0;  // success

      if (c != kSPACE && c != kTAB)
        break;

      ++idx;  // consume whitespace
    }

    // We have consumed the whitespace, and have not yet reached
    // end-of-line, so there is something on the line for us to parse.

    settings->push_back(Setting());
    Setting& s = settings->back();

    // Parse the NAME part of the settings pair.

    for (;;) {
      const char c = line[idx];

      if (c == ':')  // we have reached end of NAME part
        break;

      // A NAME that ends without a colon is malformed.
      if (c == kNUL || c == kSPACE || c == kTAB)
        return -1;

      s.name.push_back(c);

      ++idx;
    }

    if (s.name.empty())
      return -1;

    ++idx;  // consume colon

    // Parse the VALUE part of the settings pair.

    for (;;) {
      const char c = line[idx];

      if (c == kNUL || c == kSPACE || c == kTAB)
        break;

      if (c == ':')  // suspicious when part of VALUE
        return -1;  // TODO(matthewjheaney): verify this behavior

      s.value.push_back(c);

      ++idx;
    }

    if (s.value.empty())
      return -1;
  }
}

// Parses a run of one or more decimal digits from |line| starting at
// |*idx_ptr| and returns its value.  On return |*idx_ptr| has been advanced
// past the digits consumed.
//
// Returns the non-negative value on success, or -1 when an argument is
// invalid, when no digit is present at the starting offset, or when the
// value does not fit in an int.
int Parser::ParseNumber(const std::string& line,
                        std::string::size_type* idx_ptr) {
  if (idx_ptr == NULL)
    return -1;

  std::string::size_type& idx = *idx_ptr;

  if (idx == std::string::npos || idx >= line.length())
    return -1;

  // isdigit() has undefined behavior for negative arguments other than EOF,
  // and a plain char holding a UTF-8 byte >= 0x80 is negative where char is
  // signed, so every character is converted through unsigned char first.
  if (!isdigit(static_cast<unsigned char>(line[idx])))
    return -1;

  int result = 0;

  while (idx < line.length() &&
         isdigit(static_cast<unsigned char>(line[idx]))) {
    const int digit = line[idx] - '0';

    // Reject the value before either step below could overflow an int.
    if (result > INT_MAX / 10)
      return -1;

    result *= 10;

    if (result > INT_MAX - digit)
      return -1;

    result += digit;

    ++idx;
  }

  return result;
}

// Two times are equal when all four components match.
bool Time::operator==(const Time& rhs) const {
  return hours == rhs.hours &&
         minutes == rhs.minutes &&
         seconds == rhs.seconds &&
         milliseconds == rhs.milliseconds;
}

// Lexicographic ordering on (hours, minutes, seconds, milliseconds): the
// first component that differs decides the result.
bool Time::operator<(const Time& rhs) const {
  if (hours != rhs.hours)
    return hours < rhs.hours;

  if (minutes != rhs.minutes)
    return minutes < rhs.minutes;

  if (seconds != rhs.seconds)
    return seconds < rhs.seconds;

  return milliseconds < rhs.milliseconds;
}

// Greater-than, expressed as less-than with the operands swapped.
bool Time::operator>(const Time& rhs) const {
  return rhs < *this;
}

// Less-than-or-equal is the negation of greater-than.
bool Time::operator<=(const Time& rhs) const {
  return !(*this > rhs);
}

// Greater-than-or-equal is the negation of less-than.
bool Time::operator>=(const Time& rhs) const {
  return !(*this < rhs);
}

// Converts this time to a single millisecond count:
//   ((hours * 60 + minutes) * 60 + seconds) * 1000 + milliseconds
// The arithmetic is carried out in a wide integer type, not in int.
presentation_t Time::presentation() const {
  presentation_t total = presentation_t(hours);
  total = total * 60LL + presentation_t(minutes);
  total = total * 60LL + presentation_t(seconds);
  total = total * 1000LL + milliseconds;
  return total;
}

// Sets this time from a millisecond count |d| and returns *this.  A negative
// |d| is treated as an error and resets every component to zero.
//
// The decomposition is done in presentation_t arithmetic and narrowed to
// int only at the end.  Multiplying the int seconds count back by 1000 (to
// derive the millisecond remainder) would overflow for durations beyond
// INT_MAX milliseconds.
Time& Time::presentation(presentation_t d) {
  if (d < 0) {  // error
    hours = 0;
    minutes = 0;
    seconds = 0;
    milliseconds = 0;

    return *this;
  }

  const presentation_t total_seconds = d / 1000;
  const presentation_t total_minutes = total_seconds / 60;
  const presentation_t total_hours = total_minutes / 60;

  milliseconds = static_cast<int>(d % 1000);        // 0..999
  seconds = static_cast<int>(total_seconds % 60);   // 0..59
  minutes = static_cast<int>(total_minutes % 60);   // 0..59
  hours = static_cast<int>(total_hours);

  return *this;
}

// Advances this time by |rhs| milliseconds: converts to a millisecond
// count, adds, and converts back.  Returns *this.
Time& Time::operator+=(presentation_t rhs) {
  const presentation_t shifted = presentation() + rhs;
  presentation(shifted);
  return *this;
}

// Returns a copy of this time advanced by |d| milliseconds; *this is left
// untouched.
Time Time::operator+(presentation_t d) const {
  Time sum = *this;
  sum.operator+=(d);
  return sum;
}

// Moves this time back by |d| milliseconds, implemented as adding -d.
Time& Time::operator-=(presentation_t d) {
  return *this += -d;
}

// Returns the signed distance, in milliseconds, from |t| to this time.
presentation_t Time::operator-(const Time& t) const {
  return presentation() - t.presentation();
}

}  // namespace libwebvtt