class CaptionCrunch::Adapters::VTT
Constants
- ARROW_REGEX
- COMMENT_REGEX
- NEWLINE_REGEX
- SIGNATURE_REGEX
- TIME_REGEX
Format: hour:minute:second.milliseconds
hour: is optional. Examples: 11:22.333, 00:11:22.333, 102:01:43.204
Public Class Methods
parse(file)
click to toggle source
Reads a file (or string) and returns a CaptionCrunch::Track
instance. Raises CaptionCrunch::ParseError
if the input is malformed.
# File lib/caption_crunch/adapters/vtt.rb, line 19
#
# Parses WebVTT input into a Track.
#
# +file+ is handed to read_file, so it may be anything read_file accepts.
# The text has a leading BOM removed and its line endings normalized, and is
# then cut into segments. The first segment is checked by ensure_signature
# (which raises ParseError when it does not match); every remaining segment
# that is not a comment is parsed into a cue and appended to the track's
# cues. Returns the new Track.
def parse(file)
  text = normalize_linefeeds(remove_bom(read_file(file)))
  signature, *remaining = split_segments(text)
  ensure_signature(signature)

  track = Track.new
  remaining.each do |segment|
    track.cues << parse_cue(segment) unless comment?(segment)
  end
  track
end
Protected Class Methods
comment?(segment)
click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 72
#
# Tells whether +segment+ matches COMMENT_REGEX.
#
# Returns the raw result of the regex match: the offset of the match when the
# segment matches, nil otherwise. Callers should only rely on truthiness.
def comment?(segment)
  COMMENT_REGEX =~ segment
end
ensure_signature(segment)
click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 66
#
# Verifies that +segment+ (the first segment of the input) matches
# SIGNATURE_REGEX.
#
# Returns nil when it matches. Raises ParseError otherwise, including when
# +segment+ is nil because the input had no segments at all.
def ensure_signature(segment)
  return if segment =~ SIGNATURE_REGEX

  raise ParseError, 'File must start with WEBVTT'
end
normalize_linefeeds(string)
click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 57
#
# Returns a copy of +string+ with CRLF and CR line endings converted to LF.
#
# The string is re-encoded to its own encoding, so only the newline
# conversion takes effect and the encoding of the result is unchanged.
def normalize_linefeeds(string)
  same_encoding = string.encoding
  string.encode(same_encoding, universal_newline: true)
end
parse_cue(segment)
click to toggle source
Turns a segment into a new CaptionCrunch::Cue
instance.
# File lib/caption_crunch/adapters/vtt.rb, line 77
#
# Turns one segment of the input into a new Cue.
#
# The segment is split into lines on NEWLINE_REGEX. A first line that does
# not match ARROW_REGEX is treated as the optional cue identifier and
# dropped. The next line must split on ARROW_REGEX into exactly a start part
# and an end part; anything after the end timestamp (the cue settings) is
# currently discarded. The remaining lines are stripped and joined with "\n"
# to form the payload.
#
# Raises ParseError when the timing line is missing, does not split into
# exactly two parts, or has nothing after the arrow. Both timestamps are
# converted with parse_time.
def parse_cue(segment)
  parts = segment.split(NEWLINE_REGEX)

  # ignore optional identifier for now
  parts.shift unless parts[0] =~ ARROW_REGEX

  # parse time and cue settings
  times = parts.shift.to_s.split(ARROW_REGEX)
  raise ParseError, "Cue timings missing: #{segment}" if times.size != 2

  start_time = times.first.strip
  # Cue settings are separated from the end timestamp but not used yet.
  end_time, _settings = times.last.strip.split(/\s+/, 2)
  # A blank right-hand side (e.g. "00:01.000 --> ") leaves end_time nil;
  # report that as malformed input instead of passing nil to parse_time.
  raise ParseError, "Cue timings missing: #{segment}" if end_time.nil?

  # parse payload
  payload = parts.map(&:strip).join("\n")

  Cue.new.tap do |cue|
    cue.start_time = parse_time(start_time)
    cue.end_time = parse_time(end_time)
    cue.payload = payload
  end
end
parse_time(timestamp)
click to toggle source
Converts a timestamp into an integer representing the milliseconds.
# File lib/caption_crunch/adapters/vtt.rb, line 99
#
# Converts a timestamp into an integer number of milliseconds.
#
# +timestamp+ is coerced with to_s and stripped before it is matched against
# TIME_REGEX, so nil or other non-string input is reported as a ParseError
# rather than failing with NoMethodError.
#
# The captures of TIME_REGEX are read from the end: the last is taken as
# milliseconds, then seconds, minutes and hours. Each is converted with to_i,
# so a capture that did not participate in the match counts as 0.
#
# Raises ParseError when the timestamp does not match TIME_REGEX.
def parse_time(timestamp)
  match = TIME_REGEX.match(timestamp.to_s.strip)
  raise ParseError, "Invalid timestamp: #{timestamp}" unless match

  # Reverse so the fields can be named starting from the smallest unit,
  # whatever the number of leading captures.
  msecs, secs, mins, hours = match.captures.reverse

  msecs.to_i +
    secs.to_i * 1000 +
    mins.to_i * 1000 * 60 +
    hours.to_i * 1000 * 60 * 60
end
read_file(file)
click to toggle source
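Returns a string with the contents of an object that responds to `read`, such as a File instance. If the argument does not respond to `read`, calls `.to_s` on it instead. The result is stripped of leading and trailing whitespace, and a ParseError is raised if its encoding is invalid.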
# File lib/caption_crunch/adapters/vtt.rb, line 37
#
# Returns the text to parse as a stripped string.
#
# When +file+ responds to +read+ its +read+ result is used; otherwise the
# argument is converted with +to_s+. Raises ParseError if the resulting
# string does not have a valid encoding.
def read_file(file)
  text = file.respond_to?(:read) ? file.read : file.to_s
  raise ParseError, "Invalid encoding" unless text.valid_encoding?

  text.strip
end
remove_bom(string)
click to toggle source
# File lib/caption_crunch/adapters/vtt.rb, line 49
#
# Returns +string+ without a leading byte order mark.
#
# Two forms are recognized:
# * the character U+FEFF at the start of a string whose first character
#   compares equal to "\uFEFF";
# * the raw bytes EF BB BF at the start of a binary (ASCII-8BIT) string,
#   where the mark is three separate one-byte characters and the character
#   comparison can never succeed.
#
# A string with no mark, including the empty string, is returned as is.
def remove_bom(string)
  return string[1..-1] if string[0] == "\uFEFF"

  if string.encoding == Encoding::ASCII_8BIT && string.start_with?("\xEF\xBB\xBF".b)
    # Slice by bytes: in a binary string the mark occupies three characters.
    return string.byteslice(3..-1)
  end

  string
end
split_segments(string)
click to toggle source
The WebVTT spec separates segments by two newlines or more.
# File lib/caption_crunch/adapters/vtt.rb, line 62
#
# Splits +string+ into segments wherever NEWLINE_REGEX matches two or more
# times in a row, and returns the resulting array of strings.
def split_segments(string)
  blank_line_separator = Regexp.new("#{NEWLINE_REGEX}{2,}")
  string.split(blank_line_separator)
end