class CaptionCrunch::Adapters::VTT

Constants

ARROW_REGEX
COMMENT_REGEX
NEWLINE_REGEX
SIGNATURE_REGEX
TIME_REGEX

Format: hour:minute:second.milliseconds

hour: is optional.
11:22.333
00:11:22.333
102:01:43.204

dev.w3.org/html5/webvtt/#dfn-collect-a-webvtt-timestamp

Public Class Methods

parse(file) click to toggle source

Reads a file (or string) and returns a CaptionCrunch::Track instance. Raises CaptionCrunch::ParseError if the input is malformed.

# File lib/caption_crunch/adapters/vtt.rb, line 19
# Parses WebVTT input into a Track.
#
# file - an object responding to #read, or any other object, which is then
#        converted with #to_s (both cases are handled by read_file).
#
# The input is read, stripped of a leading byte order mark, normalised to
# LF line endings and split into segments. The first segment must pass
# ensure_signature; every remaining segment that is not a comment becomes
# one cue, in input order.
#
# Returns the populated Track.
# Raises ParseError when a helper rejects the input (bad encoding, missing
# signature, malformed cue).
def parse(file)
  text = normalize_linefeeds(remove_bom(read_file(file)))
  header, *bodies = split_segments(text)
  ensure_signature(header)

  track = Track.new
  bodies.each do |body|
    track.cues << parse_cue(body) unless comment?(body)
  end
  track
end

Protected Class Methods

comment?(segment) click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 72
# Tells whether a segment matches COMMENT_REGEX and should therefore be
# skipped by the parser.
#
# Returns the offset of the match (truthy) or nil when there is no match.
def comment?(segment)
  COMMENT_REGEX =~ segment
end
ensure_signature(segment) click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 66
# Verifies that the first segment of the input matches SIGNATURE_REGEX.
#
# segment - the first segment of the input, or nil when there is none.
#
# Returns nil when the signature is present.
# Raises ParseError otherwise (including for a nil segment).
def ensure_signature(segment)
  return if segment =~ SIGNATURE_REGEX

  raise ParseError, 'File must start with WEBVTT'
end
normalize_linefeeds(string) click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 57
# Re-encodes the string into its own encoding with the universal_newline
# option, which turns CRLF and lone CR line endings into LF.
#
# Returns a new String in the same encoding as the argument.
def normalize_linefeeds(string)
  own_encoding = string.encoding
  string.encode(own_encoding, universal_newline: true)
end
parse_cue(segment) click to toggle source

Turns a segment into a new CaptionCrunch::Cue instance.

# File lib/caption_crunch/adapters/vtt.rb, line 77
# Turns a segment into a new Cue instance.
#
# segment - String holding one cue: an optional identifier line, a timing
#           line of the form "start --> end [settings]", then zero or more
#           payload lines.
#
# Returns a Cue whose start_time and end_time are the values produced by
# parse_time and whose payload is the remaining lines, each stripped and
# joined with "\n".
# Raises ParseError when the timing line is missing or lacks either
# timestamp.
def parse_cue(segment)
  parts = segment.split(NEWLINE_REGEX)
  # ignore optional identifier for now
  parts.shift unless parts[0] =~ ARROW_REGEX

  # parse time and cue settings
  times = parts.shift.to_s.split(ARROW_REGEX)
  raise ParseError, "Cue timings missing: #{segment}" if times.size != 2
  start_time = times.first.strip
  # Cue settings that follow the end timestamp are split off but not used.
  end_time, _settings = times.last.strip.split(/\s+/, 2)
  # A whitespace-only right-hand side ("00:01.000 -->   ") survives the size
  # check above but leaves end_time nil; report it as a parse error instead
  # of letting nil reach parse_time.
  raise ParseError, "Cue timings missing: #{segment}" if end_time.nil?

  # parse payload
  payload = parts.map(&:strip).join("\n")

  Cue.new.tap do |cue|
    cue.start_time = parse_time(start_time)
    cue.end_time = parse_time(end_time)
    cue.payload = payload
  end
end
parse_time(timestamp) click to toggle source

Converts a timestamp into an integer representing the milliseconds.

# File lib/caption_crunch/adapters/vtt.rb, line 99
# Converts a timestamp into an integer representing the milliseconds.
#
# timestamp - String matched against TIME_REGEX after surrounding
#             whitespace is removed. nil is treated as an empty string.
#
# Returns the Integer number of milliseconds.
# Raises ParseError when the timestamp does not match TIME_REGEX.
def parse_time(timestamp)
  # to_s keeps a missing (nil) timestamp on the ParseError path rather than
  # failing with NoMethodError on nil.strip.
  match = TIME_REGEX.match(timestamp.to_s.strip)
  raise ParseError, "Invalid timestamp: #{timestamp}" unless match

  # Captures are consumed from the end (msecs first), so an unmatched or
  # absent leading hours group is nil and contributes 0 via to_i.
  msecs, secs, mins, hours = match.captures.reverse
  msecs.to_i +
    secs.to_i * 1000 +
    mins.to_i * 1000 * 60 +
    hours.to_i * 1000 * 60 * 60
end
read_file(file) click to toggle source

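Returns the contents of the argument as a string, with leading and trailing whitespace stripped. If the argument responds to `read` (for example, a File instance), the result of `read` is used; otherwise the argument is converted with `.to_s`. Raises ParseError if the contents are not validly encoded.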

# File lib/caption_crunch/adapters/vtt.rb, line 37
# Obtains the raw text to parse.
#
# file - an object responding to #read (its #read result is used), or any
#        other object (converted with #to_s).
#
# Returns the text with leading and trailing whitespace stripped.
# Raises ParseError when the text is not valid in its encoding.
def read_file(file)
  contents = file.respond_to?(:read) ? file.read : file.to_s
  raise ParseError, "Invalid encoding" unless contents.valid_encoding?

  contents.strip
end
remove_bom(string) click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 49
# Drops a leading U+FEFF byte order mark, if there is one.
#
# Returns the argument itself when it does not start with a BOM, otherwise
# a new String holding everything after the first character.
def remove_bom(string)
  return string unless string[0] == "\uFEFF"

  string[1..-1]
end
split_segments(string) click to toggle source

The WebVTT spec separates segments by two newlines or more.

# File lib/caption_crunch/adapters/vtt.rb, line 62
# Splits the input into segments at every run of two or more consecutive
# newlines (as defined by NEWLINE_REGEX).
#
# Returns an Array of segment Strings.
def split_segments(string)
  blank_line_run = /#{NEWLINE_REGEX}{2,}/
  string.split(blank_line_run)
end