class Tr8n::Tokenizers::Dom

Constants

HTML_SPECIAL_CHAR_REGEX
INDEPENDENT_NUMBER_REGEX
VERBOSE_DATE_REGEX

Attributes

context[RW]
options[RW]
tokens[RW]

Public Class Methods

new(context = {}, options = {}) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 45
# Builds a tokenizer around a translation context and an options hash.
#
# @param context [Hash] initial token values; reset_context copies them into +tokens+
# @param options [Hash] per-instance settings, read back through #option
def initialize(context = {}, options = {})
  self.options = options
  self.context = context
  # Seed the working token table from the context stored above.
  reset_context
end

Public Instance Methods

adjust_name(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 285
# Picks the token name for a node.
#
# The lower-cased node name is looked up, as a symbol, in the 'name_mapping'
# option. A truthy mapped value is returned; otherwise the lower-cased node
# name itself is returned.
#
# @param node [#name] node whose name is being mapped
# @return [Object] the mapped name, or the lower-cased node name
def adjust_name(node)
  tag = node.name.downcase
  option('name_mapping')[tag.to_sym] || tag
end
between_separators?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 126
# Tells whether a node sits next to a separator with no meaningful text on
# its other side.
#
# True when the previous sibling is a separator and the next sibling is not a
# valid text node, or the mirror-image situation.
#
# @param node [#previous_sibling, #next_sibling]
# @return [Boolean, nil] truthy when the node is isolated by a separator
def between_separators?(node)
  before = node.previous_sibling
  after = node.next_sibling

  return true if separator_node?(before) && !valid_text_node?(after)

  separator_node?(after) && !valid_text_node?(before)
end
container_node?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 198
# A container is an element node (type 1) that is not treated as inline.
#
# @param node [#type]
# @return [Boolean]
def container_node?(node)
  return false unless node.type == 1

  !inline_node?(node)
end
contextualize(name, context) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 291
# Registers +context+ under a token name, picking a fresh name on conflict.
#
# If +name+ is free, or already holds an equal context, the context is stored
# under it. Otherwise the trailing number of the name is incremented (a name
# without one gets "1") until a usable name is found.
#
# Only the trailing digits are rewritten. The previous implementation removed
# every occurrence of those digits from the name, so "a1b1" became "ab2"
# instead of "a1b2".
#
# @param name [String] preferred token name
# @param context [Object] value to associate with the token
# @return [String] the name the context was stored under
def contextualize(name, context)
  loop do
    existing = self.tokens[name]
    # A missing (or falsy) entry, or an identical context, means the name is usable.
    break unless existing && existing != context

    suffix = name[/\d+\z/]
    base = suffix ? name.sub(/\d+\z/, '') : name
    # nil.to_i is 0, so a name without a numeric suffix gets "1".
    name = "#{base}#{suffix.to_i + 1}"
  end

  self.tokens[name] = context
  name
end
debug(doc) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 307
# Logs the structure of a document tree, starting at depth 0.
#
# The class documents only the context, options and tokens accessors, so the
# previous `self.doc = doc` would raise NoMethodError unless a `doc=` writer is
# defined somewhere not shown here. The document is kept in @doc instead,
# which is what such an accessor would write to.
#
# @param doc [Object] root node handed to debug_tree
# @return [Object] whatever debug_tree returns
def debug(doc)
  @doc = doc
  debug_tree(doc, 0)
end
debug_translation(translation) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 156
# Wraps a string in the 'debug_format' option by substituting it for the
# literal placeholder "{$0}".
#
# The block form of gsub inserts the text verbatim. With a plain replacement
# string, backslash sequences inside the text (such as "\0" or "\\") would be
# interpreted as back-references and corrupt the output.
#
# @param translation [String] text to embed in the debug format
# @return [String]
def debug_translation(translation)
  option('debug_format').gsub('{$0}') { translation }
end
debug_tree(node, depth) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 312
# Recursively logs one line per node, indented with "=" according to depth.
#
# The previous code concatenated the node object itself into the message
# (`'=> ' + (node) + ...`), which raises TypeError for anything that is not
# implicitly convertible to String. The node's class is interpolated instead.
# NOTE(review): the class name is a stand-in for a "node type" label; switch
# to node.name if that is the preferred output.
#
# @param node [#children] node to log
# @param depth [Integer] nesting level, 0 for the root
def debug_tree(node, depth)
  padding = '=' * (depth + 1)

  Tr8n.logger.log("#{padding}=> #{node.class}: #{node_info(node)}")

  # children may be nil for some nodes, so fall back to an empty list.
  (node.children || []).each do |child|
    debug_tree(child, depth + 1)
  end
end
empty_string?(tml) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 160
# True when the string contains nothing but whitespace (or nothing at all).
#
# @param tml [String]
# @return [Boolean]
def empty_string?(tml)
  tml.gsub(/[\s\n\r\t]/, '').empty?
end
generate_data_tokens(text) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 238
# Replaces stand-alone numbers in the text with data tokens.
#
# Nothing happens unless the 'data_tokens.numeric' option is truthy. Each
# match of INDEPENDENT_NUMBER_REGEX has its digits registered through
# contextualize (as an Integer, under the 'data_tokens.numeric_name' option)
# and replaced by "{token}". Any whitespace or punctuation captured around
# the digits is kept.
#
# The previous body could not run. String#match returns a MatchData, which
# has no #each, and String#replace takes a single argument. gsub with a block
# visits every full match regardless of the regex's capture groups.
#
# @param text [String]
# @return [String] the text with numbers replaced by tokens
def generate_data_tokens(text)
  return text unless option('data_tokens.numeric')

  token_name = option('data_tokens.numeric_name')

  text.gsub(INDEPENDENT_NUMBER_REGEX) do |match|
    value = match.gsub(/[.,;\s]/, '')
    # Nothing left once separators are stripped: leave the match untouched.
    next match if value.empty?

    token = contextualize(token_name, value.to_i)
    match.sub(value) { "{#{token}}" }
  end
end
generate_html_token(node, value = nil) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 254
# Renders the HTML for a node, with +value+ (default "{$0}") as its content.
#
# Attributes are emitted sorted by name. Each value is wrapped in single
# quotes unless it contains a single quote, in which case double quotes are
# used. Nodes reported as self closing are rendered without content; an
# attribute-less br/hr collapses to "<br/>" / "<hr/>".
#
# @param node [#name, #attributes] element to render
# @param value [String, nil] inner content; nil/false selects the "{$0}" placeholder
# @return [String]
def generate_html_token(node, value = nil)
  tag = node.name.downcase
  attributes = node.attributes
  content = value || '{$0}'
  bare = attributes.length.zero?

  opening =
    if bare
      "<#{tag}>"
    else
      values = {}
      attributes.each { |attr_name, attribute| values[attr_name] = attribute.value }

      rendered = values.keys.sort.map do |key|
        quote = values[key].index("'") ? '"' : "'"
        "#{key}=#{quote}#{values[key]}#{quote}"
      end

      "<#{tag} #{rendered.join(' ')}>"
    end

  if self_closing_node?(node)
    return "<#{tag}/>" if bare && %w(br hr).index(tag)

    return "#{opening}</#{tag}>"
  end

  "#{opening}#{content}</#{tag}>"
end
generate_tml_tags(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 131
# Converts an inline node (and, recursively, its children) into TML markup.
#
# Text children (type 3) contribute their text and other children their own
# TML. The node's HTML token is registered through contextualize under its
# adjusted name. The result is one of:
#   "{token}"                  for a self-closing node
#   "[token: value]"           when short_token? accepts it
#   "[token]value[/token]"     otherwise
#
# @param node [#children, #name]
# @return [String]
def generate_tml_tags(node)
  inner = node.children.map do |child|
    child.type == 3 ? child.inner_text : generate_tml_tags(child)
  end.join

  html_token = generate_html_token(node)
  token = contextualize(adjust_name(node), html_token)
  value = sanitize_value(inner)

  if self_closing_node?(node)
    "{#{token}}"
  elsif short_token?(token, value)
    "[#{token}: #{value}]"
  else
    "[#{token}]#{value}[/#{token}]"
  end
end
has_child_nodes?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 122
# Truthy when the node has at least one child.
#
# @param node [#children]
# @return [Boolean, nil] nil/false when children is nil/false, otherwise whether it is non-empty
def has_child_nodes?(node)
  kids = node.children
  return kids unless kids

  kids.length > 0
end
has_inline_or_text_siblings?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 178
# Checks whether any sibling of the node is an inline node or a valid text node.
#
# @param node [#parent]
# @return [Boolean] false when the node has no parent or no such sibling
def has_inline_or_text_siblings?(node)
  parent = node.parent
  return false unless parent

  parent.children.each do |sibling|
    # The node itself does not count as its own sibling.
    next if sibling == node

    return true if inline_node?(sibling) || valid_text_node?(sibling)
  end

  false
end
ignored_node?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 206
# Decides whether a node's own markup should be skipped.
#
# Every non-element node (type other than 1) is ignored. Elements are ignored
# when their lower-cased name appears in the 'nodes.ignored' option.
#
# @param node [#type, #name]
# @return [true, Integer, nil] true for non-elements; otherwise the index of
#   the name in the ignored list, or nil when it is not listed
def ignored_node?(node)
  return true unless node.type == 1

  ignored = option('nodes.ignored') || []
  ignored.index(node.name.downcase)
end
inline_node?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 190
# An inline node is an element (type 1) whose lower-cased name is listed in
# the 'nodes.inline' option and which is not the only child of its parent.
#
# @param node [#type, #name]
# @return [Boolean, nil] false for non-elements, nil for unlisted names,
#   otherwise the negation of only_child?
def inline_node?(node)
  return false unless node.type == 1

  listed = (option('nodes.inline') || []).index(node.name.downcase)
  return listed unless listed

  !only_child?(node)
end
node_info(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 322
# Builds a short bracketed description of a node for debug logging.
#
# The description lists the node type, the element name (elements only) and
# the flags "inline" (followed by "sentence" or "only translatable"),
# "self closing" and "only child". Text nodes (type 3) also get their text
# appended.
#
# Every other method in this class reads the element name through
# node.name. The previous node.tagName call is the DOM/JavaScript spelling
# and would raise NoMethodError on nodes that expose only #name.
#
# @param node [#type, #name, #inner_text]
# @return [String]
def node_info(node)
  info = []
  info << node.type

  info << node.name if node.type == 1

  if inline_node?(node)
    info << 'inline'
    info << (has_inline_or_text_siblings?(node) ? 'sentence' : 'only translatable')
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  summary = "[#{info.join(', ')}]"
  return "#{summary}: #{node.inner_text}" if node.type == 3

  summary
end
non_translatable_node?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 93
# Identifies element nodes whose content must not be translated.
#
# An element (type 1) qualifies when its lower-cased name is in the
# 'nodes.scripts' option, or when it has no children and empty inner text.
# A missing node and non-element nodes never qualify.
#
# @param node [#type, #name, #children, #inner_text, nil]
# @return [Boolean]
def non_translatable_node?(node)
  return false unless node && node.type == 1

  scripts = option('nodes.scripts') || []
  return true if scripts.index(node.name.downcase)

  node.children.length == 0 && node.inner_text == ''
end
only_child?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 173
# True when the node's parent has exactly one child.
#
# @param node [#parent]
# @return [Boolean] false when the node has no parent
def only_child?(node)
  parent = node.parent
  parent ? parent.children.count == 1 : false
end
option(name) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 151
# Looks up an option, preferring this instance's options over the global
# translator configuration.
#
# The fallback applies only when the instance lookup returns nil. With the
# previous `value || default`, an explicit `false` (for example to switch
# off 'debug' or 'data_tokens.numeric') was silently replaced by the global
# default.
#
# @param name [String] option key, passed as-is to Tr8n::Utils.hash_value
# @return [Object] the instance value, or the configured default when unset
def option(name)
  value = Tr8n::Utils.hash_value(self.options, name)
  value.nil? ? Tr8n.config.translator_option(name) : value
end
replace_special_characters(text) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 225
# Replaces HTML special-character entities with data tokens.
#
# Each match of HTML_SPECIAL_CHAR_REGEX is stored in +context+ under the
# match minus its first and last character (e.g. "&nbsp;" -> "nbsp") and
# replaced by "{token}" in the text.
#
# Fixes over the previous body:
# * The guard returned early when 'data_tokens.special' was enabled, the
#   opposite of generate_data_tokens' `return text unless option(...)`.
#   NOTE(review): confirm that 'data_tokens.special' is meant as an enable flag.
# * String#match returns nil or a MatchData, neither of which responds to
#   #each, so the replacement path always raised.
# * `match[1, - 2]` asks for a negative length and yields nil; the inner
#   part of the match is `match[1..-2]`.
#
# @param text [String]
# @return [String] text with entities replaced by tokens
def replace_special_characters(text)
  return text unless option('data_tokens.special')

  text.gsub(HTML_SPECIAL_CHAR_REGEX) do |match|
    token = match[1..-2]
    # Too short to yield a token name: keep the original text.
    next match if token.nil? || token.empty?

    self.context[token] = match
    "{#{token}}"
  end
end
reset_context() click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 165
# Resets the working token table to a fresh Hash holding a copy of +context+.
#
# @return [Hash] the new token table
def reset_context
  fresh = {}
  fresh.update(self.context)
  self.tokens = fresh
end
sanitize_value(value) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 221
# Strips leading whitespace from the start of the value.
#
# In Ruby, `^` anchors at the start of every line, so the previous
# `gsub(/^\s+/, '')` also removed indentation and blank lines inside
# multi-line values. When newlines were dropped later, "hello\n  world"
# ended up as "helloworld". `\A` anchors at the start of the string only.
#
# @param value [String]
# @return [String] the value without leading whitespace
def sanitize_value(value)
  value.sub(/\A\s+/, '')
end
self_closing_node?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 202
# A node counts as self closing when it has no children collection or the
# collection has no first element.
#
# @param node [#children]
# @return [Boolean]
def self_closing_node?(node)
  !(node.children && node.children.first)
end
separator_node?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 216
# Checks whether a node is an element (type 1) whose lower-cased name is
# listed in the 'nodes.splitters' option.
#
# @param node [#type, #name, nil]
# @return [false, Integer, nil] false for a missing or non-element node;
#   otherwise the index of the name in the splitter list, or nil if unlisted
def separator_node?(node)
  return false unless node && node.type == 1

  splitters = option('nodes.splitters') || []
  splitters.index(node.name.downcase)
end
short_token?(token, value) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 169
# Decides whether a token should use the short "[token: value]" form.
#
# True when the lower-cased token is listed in the 'nodes.short' option or
# the value is shorter than 20 characters.
#
# The list lookup falls back to an empty list, as the other 'nodes.*'
# lookups in this class do. Previously an unset 'nodes.short' option raised
# NoMethodError on nil.
#
# @param token [String]
# @param value [String]
# @return [Integer, Boolean] list index when listed, otherwise the length check
def short_token?(token, value)
  (option('nodes.short') || []).index(token.downcase) || value.length < 20
end
translate(doc) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 51
# Translates a document.
#
# A String is first parsed with Nokogiri::HTML.fragment; anything else is
# passed to translate_tree unchanged.
#
# @param doc [String, Object] HTML source or an already-parsed node
# @return [Object] the result of translate_tree
def translate(doc)
  root = doc
  root = Nokogiri::HTML.fragment(doc) if doc.is_a?(String)
  translate_tree(root)
end
translate_tml(tml) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 100
# Translates a TML fragment and resets the token context afterwards.
#
# Whitespace-only input is returned untouched. Otherwise numeric data tokens
# are generated first, then:
# * with the 'split_sentences' option, each sentence from
#   Tr8n::Utils.split_sentences is translated and substituted back into the
#   fragment;
# * without it, newlines are removed, runs of whitespace are collapsed, the
#   text is stripped, and the whole fragment is translated at once.
# With the 'debug' option, debug_translation stands in for the real
# translation.
#
# Sentence substitution uses the block form of gsub so a translation
# containing backslash sequences ("\0", "\\1", ...) is inserted verbatim
# instead of being read as a back-reference.
#
# @param tml [String]
# @return [String] the translated fragment
def translate_tml(tml)
  return tml if empty_string?(tml)
  tml = generate_data_tokens(tml)

  if option('split_sentences')
    translation = tml
    Tr8n::Utils.split_sentences(tml).each do |sentence|
      # NOTE(review): this path uses session.current_language while the
      # whole-fragment path below uses session.target_language. Confirm
      # which one is intended.
      sentence_translation = option('debug') ? debug_translation(sentence) : Tr8n.session.current_language.translate(sentence, tokens, options)
      translation = translation.gsub(sentence) { sentence_translation }
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip

  translation = option('debug') ? debug_translation(tml) : Tr8n.session.target_language.translate(tml, tokens, options)
  reset_context
  translation
end
translate_tree(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 55
# Walks a node tree and returns its translated HTML.
#
# * Non-translatable nodes yield the inner text of their single child, or ''
#   when they do not have exactly one child.
# * Text nodes (type 3) are translated directly.
# * Otherwise children are scanned in order. Text and qualifying inline
#   children accumulate into a pending TML buffer. A separator or any other
#   child first flushes (translates) the buffer, then contributes its own
#   markup. Separators contribute their HTML token. Other children are
#   translated recursively and wrapped in their HTML token unless ignored.
#
# @param node [#type, #children, #inner_text]
# @return [String]
def translate_tree(node)
  if non_translatable_node?(node)
    return node.children.count == 1 ? node.children.first.inner_text : ''
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  pending = ''

  node.children.each do |child|
    if child.type == 3
      pending += child.inner_text
      next
    end

    if inline_node?(child) && has_inline_or_text_siblings?(child) && !between_separators?(child)
      pending += generate_tml_tags(child)
      next
    end

    # Anything else ends the current run of translatable text.
    separator = separator_node?(child)
    html += translate_tml(pending) unless pending == ''
    pending = ''

    if separator
      html += generate_html_token(child)
    else
      container_value = translate_tree(child)
      html += ignored_node?(child) ? container_value : generate_html_token(child, container_value)
    end
  end

  html += translate_tml(pending) unless pending == ''
  html
end
valid_text_node?(node) click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 211
# True for a text node (type 3) whose text is not blank.
#
# @param node [#type, #inner_text, nil]
# @return [Boolean] false for a missing node or any non-text node
def valid_text_node?(node)
  return false unless node && node.type == 3

  !empty_string?(node.inner_text)
end