class PROIEL::Commands::Tokenize
Constants
- VALID_METADATA_FIELDS
Public Class Methods
init_with_program(prog)
click to toggle source
# File lib/proiel/cli/commands/tokenize.rb, line 5
#
# Registers the +tokenize+ subcommand on +prog+ and wires its action to
# +process+.
#
# The usage string is set with a single +syntax+ call. Setting it twice
# ('tokenize' and then '[options] filename') leaves only the last value in
# effect when +syntax+ behaves as a plain setter, which drops the command
# name from the usage line.
# NOTE(review): assumes +prog+ follows the Mercenary command DSL
# (+command+, +syntax+, +description+, +action+) -- confirm against the CLI
# entry point.
#
# @param prog [#command] the program object commands are registered on
# @return [Object] whatever +prog.command+ returns
def init_with_program(prog)
  prog.command(:tokenize) do |c|
    c.syntax 'tokenize [options] filename'
    c.description 'Tokenize raw text'

    c.action { |args, options| process(args, options) }
  end
end
process(args, options)
click to toggle source
# File lib/proiel/cli/commands/tokenize.rb, line 15
#
# Entry point for the +tokenize+ command: reads one raw text file and writes
# a PROIEL XML document to STDOUT.
#
# Exactly one filename is required. With zero or more than one, a message is
# printed to STDERR and the process exits with status 1.
#
# @param args [Array<String>] positional command-line arguments
# @param options [Object] parsed command-line options (not read here)
def process(args, options)
  usage_error =
    if args.empty?
      'Missing filename. Use --help for more information.'
    elsif args.length > 1
      'Too many filenames. Use --help for more information.'
    end

  if usage_error
    STDERR.puts usage_error
    exit 1
  end

  # The XML declaration is written before the input file is opened.
  xml = Builder::XmlMarkup.new(target: STDOUT, indent: 2)
  xml.instruct! :xml, version: '1.0', encoding: 'UTF-8'

  File.open(args.first, 'r') do |input|
    # Both readers rewind the handle themselves, so their order is free.
    header = read_header(input)
    divisions = read_body(input)

    xml.proiel('export-time' => header.export_time, 'schema-version' => '2.0') do
      xml.source(id: header.id, language: header.language) do
        xml.title header.title
        xml.author header.author
        xml.tag!('citation-part', header.citation_part)

        tokenize(xml, divisions)
      end
    end
  end
end
read_body(f)
click to toggle source
# File lib/proiel/cli/commands/tokenize.rb, line 143
#
# Reads the body of a raw text file and groups it into source divisions.
#
# The stream is rewound first, then every line is classified:
# * lines starting with +%+ are header lines and are skipped,
# * blank (whitespace-only) lines are skipped,
# * lines starting with +#+ open a new division whose title is the rest of
#   the line, stripped,
# * any other line is appended verbatim (including its line terminator) to
#   the contents of the current division. If no division has been opened
#   yet, one with an empty title is created.
#
# Contents are accumulated in place with +<<+ on a mutable string so that a
# long division is not re-copied for every line.
#
# @param f [#rewind, #each_line] readable stream positioned anywhere
# @return [Array<Hash>] divisions as +{ title: String, contents: String }+
def read_body(f)
  f.rewind

  f.each_line.each_with_object([]) do |line, divisions|
    case line
    when /^%/, /^\s*$/
      # Header line or empty line: nothing to collect.
      next
    when /^#/
      divisions << { title: line.sub(/^#/, '').strip, contents: +'' }
    else
      divisions << { title: '', contents: +'' } if divisions.empty?
      divisions.last[:contents] << line
    end
  end
end
read_header(f)
click to toggle source
# File lib/proiel/cli/commands/tokenize.rb, line 116
#
# Reads the metadata header of a raw text file.
#
# The stream is rewound first. The header is the leading run of lines that
# start with +%+; it ends at the first line that does not. Each header line
# has the form <tt>% field = value</tt>. Recognised fields are +id+,
# +export_time+ and everything listed in VALID_METADATA_FIELDS; their
# stripped values are stored on the returned struct.
#
# Unknown fields are reported on STDERR and ignored. A recognised field with
# no <tt>=</tt> part (for example <tt>% title</tt>) is reported and ignored
# as well, instead of failing on a nil value.
#
# @param f [#rewind, #each_line] readable stream positioned anywhere
# @return [OpenStruct] header fields keyed by field name
def read_header(f)
  f.rewind

  OpenStruct.new.tap do |hdr|
    f.each_line do |raw|
      line = raw.chomp

      # First non-% line terminates the header.
      break unless line.start_with?('%')

      # Split on the first '=' only so that values may contain '='.
      field, value = line.sub(/^%\s*/, '').split(/\s*=\s*/, 2)

      case field
      when 'id', 'export_time', *VALID_METADATA_FIELDS
        if value.nil?
          STDERR.puts "Missing value for header field #{field}. Ignoring.".yellow
        else
          hdr[field] = value.strip
        end
      else
        STDERR.puts "Invalid header field #{field}. Ignoring.".yellow
      end
    end
  end
end
tokenize(builder, body)
click to toggle source
# File lib/proiel/cli/commands/tokenize.rb, line 47
#
# Emits one +div+ per source division on +builder+ and splits each
# division's contents into +sentence+ and +token+ elements.
#
# Inline markup recognised in the contents:
# * <tt>@text</tt> (up to the next space) is not tokenized. The text after
#   the +@+, together with the punctuation/whitespace around it, is held back
#   and prepended to the +presentation-before+ of the next token in the same
#   sentence.
# * <tt>§ref </tt> (terminated by a space) sets the citation part used for
#   all following tokens, across sentences and divisions, until the next
#   such marker.
# * <tt>|</tt> is the internal sentence separator, so a literal pipe in the
#   input also ends a sentence.
#
# Line breaks end a sentence and are kept in the presentation text as
# U+2028.
#
# NOTE(review): held-back +@+ text that is not followed by a token before
# the sentence ends is not emitted anywhere.
#
# @param builder [#div, #title, #sentence, #token] XML builder
# @param body [Array<Hash>] divisions with +:title+ and +:contents+
# @return [Array<Hash>] +body+
def tokenize(builder, body)
  citation_part = nil

  body.each do |sd_body|
    builder.div do
      builder.title sd_body[:title]

      # Mark sentence boundaries with '|' everywhere except inside markup
      # chunks, which are passed through untouched.
      marked = sd_body[:contents].split(/(@[^ ]+|§[^ ]+ )/).map do |chunk|
        if chunk.start_with?('§', '@')
          chunk
        else
          # It's sensible to place the break not immediately after probable
          # sentence-breaking punctuation like periods and question marks, but
          # after the punctuation mark and characters typically used in pairs,
          # like brackets and apostrophes.
          chunk.gsub(/([\.:;\?!]+[\s†\]\)"']*|\s*[\n\r]+)/, '\1|')
        end
      end.join

      marked.split('|').each do |s_body|
        builder.sentence(status: 'unannotated') do
          pending = +''

          # Preserve linebreaks in the text.
          text = s_body.gsub(/\s*[\n\r]+/, "\u2028")

          # Every capture group can match the empty string, so none of the
          # three values is ever nil.
          text.scan(/([^@§\p{Word}]*)([\p{Word}]+|@[^ ]+|§[^ ]+ )([^@§\p{Word}]*)/).each do |before, form, after|
            if form.start_with?('@')
              pending << before << form[1..-1] << after
            elsif form.start_with?('§')
              pending << before << after
              citation_part = form[1..-1].strip
            else
              presentation_before = pending + before
              pending = +''

              attrs = { :"citation-part" => citation_part, form: form }
              attrs[:"presentation-before"] = presentation_before unless presentation_before.empty?
              attrs[:"presentation-after"] = after unless after.empty?

              builder.token(attrs)
            end
          end
        end
      end
    end
  end
end