class Tml::Tokenizers::Dom

Attributes

context[RW]
options[RW]
tokens[RW]

Public Class Methods

new(context = {}, options = {}) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 41
# Stores the caller-supplied token context and option overrides, then seeds
# the working token table from the context via reset_context.
#
# context - Hash of initial token values (defaults to an empty Hash).
# options - Hash of per-tokenizer options (defaults to an empty Hash).
def initialize(context = {}, options = {})
  self.options = options
  self.context = context
  reset_context
end

Public Instance Methods

adjust_name(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 319
# Returns the token name for a node: its lower-cased tag name, unless the
# 'name_mapping' option holds a replacement under that name (as a Symbol key).
def adjust_name(node)
  tag = node.name.downcase
  option('name_mapping')[tag.to_sym] || tag
end
between_separators?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 132
# Truthy when one neighbour of the node is a separator element and the
# neighbour on the other side is not a valid (non-empty) text node.
def between_separators?(node)
  before = node.previous_sibling
  after = node.next_sibling

  (separator_node?(before) && !valid_text_node?(after)) ||
    (separator_node?(after) && !valid_text_node?(before))
end
container_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 208
# True for a type-1 (element) node that inline_node? does not classify as inline.
def container_node?(node)
  return false unless node.type == 1

  !inline_node?(node)
end
contextualize(name, context) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 325
# Registers +context+ in the token table under +name+ and returns the name
# actually used.
#
# If +name+ is already bound to a different value, the trailing number of the
# name is incremented (or "1" is appended when there is none) until a name is
# found that is free or already bound to the same value.
#
# name    - String token name to try first.
# context - value to store for the token.
#
# Returns the String name the value was stored under.
def contextualize(name, context)
  while tokens[name] && tokens[name] != context
    suffix = name[/\d+\z/]
    index = suffix ? suffix.to_i : 0
    # Drop only the trailing counter; digits elsewhere in the name are part of
    # the name itself and must survive (e.g. "v1_1" becomes "v1_2").
    name = name.chomp(index.to_s) if suffix
    name += (index + 1).to_s
  end

  tokens[name] = context
  name
end
debug(doc) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 341
# Logs the structure of +doc+ by walking it with debug_tree from depth 0.
#
# The class only lists context, options and tokens as attributes, so an
# unconditional `self.doc = doc` raises NoMethodError; the document is only
# stored when a `doc=` writer is actually available.
def debug(doc)
  self.doc = doc if respond_to?(:doc=, true)
  debug_tree(doc, 0)
end
debug_translation(translation) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 162
# Wraps a string in the 'debug_format' option by substituting it for every
# '{$0}' placeholder in that format.
def debug_translation(translation)
  template = option('debug_format')
  template.gsub('{$0}', translation)
end
debug_tree(node, depth) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 346
# Logs one line for +node+ and then recurses into its children, prefixing
# each line with one more '=' per level of depth.
#
# node  - node to describe; must respond to #children (nil is treated as none).
# depth - Integer nesting level of +node+ (0 for the root).
def debug_tree(node, depth)
  padding = '=' * (depth + 1)

  # The node itself cannot be concatenated into a String (TypeError), so the
  # line shows its class name followed by the node_info summary.
  Tml.logger.log("#{padding}=> #{node.class}: #{node_info(node)}")

  (node.children || []).each { |child| debug_tree(child, depth + 1) }
end
empty_string?(tml) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 166
# Decides whether a piece of text carries nothing worth translating.
#
# Whitespace and characters in the U+0080..U+00FF range are ignored. What is
# left counts as "empty" when it is blank, a single ${variable} placeholder,
# or a plain number / price such as "7", "42", "19.99" or "$5".
#
# tml - String to inspect (nil is treated as an empty string).
#
# Returns true or false.
def empty_string?(tml)
  stripped = tml.to_s.gsub(/\s/, '').gsub(/[\u0080-\u00ff]/, '')
  return true if stripped.empty?
  return true if stripped =~ /\A\$\{[^\}]+\}\z/   # ignore variables ${var_name}
  # The fractional part is optional as a whole, so single-digit numbers are
  # recognised too.
  return true if stripped =~ /\A\$?\d+(\.\d+)?\z/ # ignore prices and numbers

  false
end
generate_data_tokens(text) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 235
# Replaces data found in +text+ with {token} placeholders and records the
# original values, so the surrounding sentence can be translated once.
#
# Three passes run in order, each driven by options:
# * 'data_tokens.special.*' - every match of the regex (e.g. "&nbsp;") is
#   stored in +context+ under the match minus its first and last character
#   ("nbsp") and replaced by "{nbsp}".
# * 'data_tokens.date.*'    - every match of each format's regex (format[0])
#   is registered through contextualize under the configured name.
# * 'data_tokens.rules'     - for each enabled rule, the match is registered
#   through contextualize as an Integer built from the match with '.', ',',
#   ';' and whitespace removed.
#
# String#scan yields Strings for a regex without groups and Arrays of captures
# otherwise; both shapes are accepted and the first element is used.
#
# Returns the rewritten String.
def generate_data_tokens(text)
  if option('data_tokens.special.enabled')
    text.scan(option('data_tokens.special.regex')).each do |match|
      entity = Array(match).first
      # Needs at least one character between the delimiters to form a name.
      next if entity.nil? || entity.length < 3

      token = entity[1..-2]
      context[token] = entity
      text = text.gsub(entity, "{#{token}}")
    end
  end

  if option('data_tokens.date.enabled')
    token_name = option('data_tokens.date.name')
    option('data_tokens.date.formats').each do |format|
      text.scan(format[0]).each do |match|
        date = Array(match).first
        next if date.nil? || date == ''

        token = contextualize(token_name, date)
        text = text.gsub(date, "{#{token}}")
      end
    end
  end

  (option('data_tokens.rules') || []).each do |rule|
    next unless rule[:enabled]

    text.scan(rule[:regex]).each do |match|
      found = Array(match).first
      next if found.nil? || found == ''

      value = found.strip
      next if value == ''

      token = contextualize(rule[:name], value.gsub(/[.,;\s]/, '').to_i)
      text = text.gsub(value, "{#{token}}")
    end
  end

  text
end
generate_html_token(node, value = nil) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 288
# Builds the HTML wrapper for a node: its lower-cased tag with attributes
# sorted by name, wrapped around +value+.
#
# node  - node responding to #name and #attributes (pairs of name and an
#         object responding to #value).
# value - String placed between the tags; defaults to the '{$0}' placeholder.
#
# Nodes that self_closing_node? reports as childless get no value: br and hr
# are written as "<br/>" - with or without attributes - and every other tag
# as an empty open/close pair. An attribute value is wrapped in single
# quotes, or in double quotes when it contains a single quote.
#
# Returns the markup String.
def generate_html_token(node, value = nil)
  name = node.name.downcase
  value ||= '{$0}'

  values = {}
  node.attributes.each { |attr_name, attribute| values[attr_name] = attribute.value }

  attrs = values.keys.sort.map do |key|
    quote = values[key].index("'") ? '"' : "'"
    " #{key}=#{quote}#{values[key]}#{quote}"
  end.join

  open_tag = "<#{name}#{attrs}"

  if self_closing_node?(node)
    return "#{open_tag}/>" if %w(br hr).include?(name)
    return "#{open_tag}></#{name}>"
  end

  "#{open_tag}>#{value}</#{name}>"
end
generate_tml_tags(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 137
# Converts a node and its descendants into TML markup.
#
# Text children (type 3) contribute their text; other children are converted
# recursively. The node's HTML wrapper is registered via contextualize under
# its adjusted name. A childless node becomes "{token}"; any other node
# becomes "<token>…</token>" around its sanitized inner content.
def generate_tml_tags(node)
  inner = node.children.map do |child|
    child.type == 3 ? child.inner_text : generate_tml_tags(child)
  end.join

  wrapper = generate_html_token(node)
  token = contextualize(adjust_name(node), wrapper)
  content = sanitize_value(inner)

  return "{#{token}}" if self_closing_node?(node)
  # return '[' + token + ': ' + value + ']' if short_token?(token, value)

  "<#{token}>#{content}</#{token}>"
end
has_child_nodes?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 128
# Truthy when the node has a children collection holding at least one entry.
def has_child_nodes?(node)
  kids = node.children
  kids && kids.length > 0
end
has_inline_or_text_siblings?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 188
# True when the node's parent has some other child that is either an inline
# node or a valid text node. A node without a parent has no siblings.
def has_inline_or_text_siblings?(node)
  parent = node.parent
  return false unless parent

  parent.children.any? do |sibling|
    sibling != node && (inline_node?(sibling) || valid_text_node?(sibling))
  end
end
ignored_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 216
# Truthy when the node should not get its own HTML wrapper: any node that is
# not type 1, or one whose lower-cased name is listed in the 'nodes.ignored'
# option. For listed names the result is the index in that list.
def ignored_node?(node)
  return true unless node.type == 1

  ignored_tags = option('nodes.ignored') || []
  ignored_tags.index(node.name.downcase)
end
inline_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 200
# Truthy when the node is a type-1 node whose lower-cased name is listed in
# the 'nodes.inline' option and which is not the only child of its parent.
#
# Returns false for other node types, nil for names that are not listed, and
# otherwise the negation of only_child?.
def inline_node?(node)
  return false unless node.type == 1

  inline_tags = option('nodes.inline') || []
  return nil if inline_tags.index(node.name.downcase).nil?

  !only_child?(node)
end
no_translate_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 88
# True when a type-1 node opts out of translation: it has an attribute named
# 'notranslate', or any attribute whose value contains 'notranslate'.
#
# Returns nil when the node is missing, is not type 1, or has no attributes
# collection; otherwise true or false.
def no_translate_node?(node)
  return unless node && node.type == 1 && node.attributes

  node.attributes.any? do |attr_name, attribute|
    attr_name == 'notranslate' || attribute.value.index('notranslate')
  end
end
node_info(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 356
# Builds the bracketed, comma-separated summary of a node used by debug_tree.
#
# The summary lists the node type, the tag name for type-1 nodes, and the
# labels 'inline' (followed by 'sentence' or 'only translatable'),
# 'self closing' and 'only child' where the matching predicates hold.
# For type-3 nodes the node's text is appended after the brackets.
#
# The tag name is read through #name, the accessor every other method here
# uses.
#
# Returns the summary String.
def node_info(node)
  info = [node.type]
  info << node.name if node.type == 1

  if inline_node?(node)
    info << 'inline'
    info << (has_inline_or_text_siblings?(node) ? 'sentence' : 'only translatable')
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  summary = "[#{info.join(', ')}]"
  node.type == 3 ? "#{summary}: #{node.inner_text}" : summary
end
non_translatable_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 96
# True when a node's content must be passed through untouched: a type-1 node
# listed in the 'nodes.scripts' option, a type-1 node with no children and no
# text, or any node that no_translate_node? flags. False for a missing node.
def non_translatable_node?(node)
  return false unless node

  if node.type == 1
    script_tags = option('nodes.scripts') || []
    return true if script_tags.index(node.name.downcase)
    return true if node.children.length == 0 && node.inner_text == ''
  end

  no_translate_node?(node) ? true : false
end
only_child?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 183
# True when the node has a parent and is that parent's sole child.
def only_child?(node)
  parent = node.parent
  parent ? parent.children.count == 1 : false
end
option(name) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 157
# Looks up an option by name: the tokenizer's own options win when they hold
# a truthy value, otherwise the translator option from Tml.config is returned.
def option(name)
  Tml::Utils.hash_value(options, name) || Tml.config.translator_option(name)
end
reset_context() click to toggle source
# File lib/tml/tokenizers/dom.rb, line 175
# Replaces the working token table with a fresh Hash holding a copy of the
# entries currently in the context.
def reset_context
  fresh = {}
  fresh.update(context)
  self.tokens = fresh
end
sanitize_value(value) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 231
# Removes whitespace from the very beginning of +value+.
#
# The pattern is anchored with \A rather than ^: in Ruby ^ matches at the
# start of every line, which also stripped the indentation of interior lines.
# translate_tml later deletes newlines, so "Hello\n  World" would end up as
# "HelloWorld" once that indentation was gone.
def sanitize_value(value)
  value.sub(/\A\s+/, '')
end
self_closing_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 212
# True when the node has no children collection or no first child.
def self_closing_node?(node)
  kids = node.children
  !(kids && kids.first)
end
separator_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 226
# Truthy when the node is a type-1 node whose lower-cased name is listed in
# the 'nodes.splitters' option (the result is its index in that list).
# False for a missing node or another node type; nil for unlisted names.
def separator_node?(node)
  return false unless node
  return false unless node.type == 1

  splitters = option('nodes.splitters') || []
  splitters.index(node.name.downcase)
end
short_token?(token, value) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 179
# Truthy when a token qualifies for the short form: its lower-cased name is
# listed in the 'nodes.short' option, or its value is under 20 characters.
#
# A missing 'nodes.short' option is treated as an empty list, as the other
# 'nodes.*' lookups do, instead of raising NoMethodError on nil.
def short_token?(token, value)
  (option('nodes.short') || []).index(token.downcase) || value.length < 20
end
translate(doc) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 47
# Translates a document. A String is first parsed into an HTML fragment with
# Nokogiri; anything else is passed to translate_tree as is.
def translate(doc)
  doc = Nokogiri::HTML.fragment(doc) if doc.is_a?(String)
  translate_tree(doc)
end
translate_tml(tml) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 104
# Translates one run of TML text.
#
# Text that empty_string? rejects is returned unchanged. Otherwise data
# tokens are substituted first, and then either:
# * with the 'split_sentences' option, each sentence from
#   Tml::Utils.split_sentences is translated and swapped into the text, or
# * the text is stripped of newlines, has whitespace runs collapsed, is
#   trimmed and translated as a whole.
# With the 'debug' option the translation is replaced by debug_translation.
# The token table is reset after translating.
#
# NOTE(review): the sentence path translates through
# Tml.session.current_language while the whole-text path uses
# Tml.session.target_language - confirm whether the difference is intended.
def translate_tml(tml)
  return tml if empty_string?(tml)
  # pp tml

  tml = generate_data_tokens(tml)

  unless option('split_sentences')
    tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip
    translation =
      if option('debug')
        debug_translation(tml)
      else
        Tml.session.target_language.translate(tml, tokens, options.dup)
      end
    reset_context
    return translation
  end

  translation = Tml::Utils.split_sentences(tml).inject(tml) do |result, sentence|
    translated =
      if option('debug')
        debug_translation(sentence)
      else
        Tml.session.current_language.translate(sentence, tokens, options.dup)
      end
    result.gsub(sentence, translated)
  end
  reset_context
  translation
end
translate_tree(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 51
# Translates a node and everything below it, returning the resulting markup.
#
# Non-translatable nodes return their inner HTML untouched and type-3 nodes
# are translated directly. For other nodes, consecutive text and inline
# children are collected into one pending TML string; a separator or any
# other child first flushes (translates) that pending text and then emits its
# own markup. Ignored children contribute only their translated content, the
# rest are wrapped by generate_html_token.
def translate_tree(node)
  return node.inner_html if non_translatable_node?(node)
  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  pending = ''

  # Translates whatever text has been collected so far and starts afresh.
  flush = lambda do
    html += translate_tml(pending) unless pending == ''
    pending = ''
  end

  node.children.each do |child|
    if child.type == 3
      pending += child.inner_text
    elsif inline_node?(child) && has_inline_or_text_siblings?(child) && !between_separators?(child)
      pending += generate_tml_tags(child)
    elsif separator_node?(child)
      flush.call
      html += generate_html_token(child)
    else
      flush.call
      inner = translate_tree(child)
      html += ignored_node?(child) ? inner : generate_html_token(child, inner)
    end
  end

  flush.call
  html
end
valid_text_node?(node) click to toggle source
# File lib/tml/tokenizers/dom.rb, line 221
# True when the node is a type-3 node whose text is not considered empty by
# empty_string?. False for a missing node or any other node type.
def valid_text_node?(node)
  return false unless node
  return false unless node.type == 3

  !empty_string?(node.inner_text)
end