class Tr8n::Tokenizers::Dom
Constants
- HTML_SPECIAL_CHAR_REGEX
- INDEPENDENT_NUMBER_REGEX
- VERBOSE_DATE_REGEX
Attributes
context[RW]
options[RW]
tokens[RW]
Public Class Methods
new(context = {}, options = {})
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 45
# Creates a tokenizer. Stores the supplied context and options through the
# attribute writers, then calls reset_context to rebuild the token table.
def initialize(context = {}, options = {})
  self.context, self.options = context, options
  reset_context
end
Public Instance Methods
adjust_name(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 285
# Returns the name to use for +node+: its lower-cased tag name, or the
# replacement registered for that name (as a symbol key) in the
# 'name_mapping' option.
def adjust_name(node)
  name = node.name.downcase
  # A missing mapping is treated as empty instead of raising on nil.
  mapping = option('name_mapping') || {}
  mapping[name.to_sym] || name
end
between_separators?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 126
# True when +node+ has a separator on one side and no valid text node
# on the other side.
def between_separators?(node)
  before = node.previous_sibling
  after = node.next_sibling
  return true if separator_node?(before) && !valid_text_node?(after)
  separator_node?(after) && !valid_text_node?(before)
end
container_node?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 198
# A container is a node of type 1 that is not classified as inline.
def container_node?(node)
  return false unless node.type == 1
  !inline_node?(node)
end
contextualize(name, context)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 291
# Registers +context+ under +name+ in the token table and returns the name
# actually used. When +name+ is already bound to a different value, the
# trailing numeric counter is incremented (or "1" is appended) until a free
# name, or one already bound to the same value, is found.
def contextualize(name, context)
  while tokens[name] && tokens[name] != context
    suffix = name[/\d+\z/]
    index = suffix ? suffix.to_i : 0
    # Strip only the trailing counter; digits elsewhere in the name are kept.
    name = name.sub(/\d+\z/, '') + (index + 1).to_s
  end
  tokens[name] = context
  name
end
debug(doc)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 307
# Logs the structure of +doc+ by walking it with debug_tree from depth 0.
# The document is passed straight through: the class lists only context,
# options and tokens as attributes, so there is no doc writer to assign to.
def debug(doc)
  debug_tree(doc, 0)
end
debug_translation(translation)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 156
# Wraps +translation+ in the 'debug_format' option by substituting it for
# every '{$0}' placeholder.
def debug_translation(translation)
  # Block form inserts the text verbatim; a string replacement would
  # interpret backslash sequences such as \0 or \1 inside the translation.
  option('debug_format').gsub('{$0}') { translation }
end
debug_tree(node, depth)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 312
# Logs one line for +node+ (padding grows with +depth+) and then recurses
# into each child with depth + 1.
def debug_tree(node, depth)
  indent = '=' * (depth + 1)
  # String#+ needs +node+ to convert implicitly to a String (to_str).
  Tr8n.logger.log(indent + '=> ' + (node) + ': ' + node_info(node))
  kids = node.children || []
  kids.each { |kid| debug_tree(kid, depth + 1) }
end
empty_string?(tml)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 160
# True when +tml+ contains nothing but whitespace (or is empty).
def empty_string?(tml)
  tml.gsub(/[\s\n\r\t]/, '').empty?
end
generate_data_tokens(text)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 238
# Replaces each number matched by INDEPENDENT_NUMBER_REGEX in +text+ with a
# "{token}" placeholder and registers its integer value via contextualize.
# Returns +text+ unchanged when the 'data_tokens.numeric' option is off.
def generate_data_tokens(text)
  return text unless option('data_tokens.numeric')

  token_name = option('data_tokens.numeric_name')
  # gsub with a block visits every match in place. The previous code called
  # each on a MatchData and String#replace with two arguments, both of
  # which raise.
  text.gsub(INDEPENDENT_NUMBER_REGEX) do |match|
    value = match.gsub(/[.,;\s]/, '')
    # Leave the match alone when the cleaned digits cannot be located in it,
    # so no token is registered without a placeholder.
    next match if value.empty? || !match.include?(value)

    token = contextualize(token_name, value.to_i)
    match.sub(value) { "{#{token}}" }
  end
end
generate_html_token(node, value = nil)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 254
# Renders +node+ as an HTML tag string wrapped around +value+ (default: the
# '{$0}' placeholder). Attributes are emitted sorted by name; a value
# containing a single quote is wrapped in double quotes, otherwise in
# single quotes. Nodes without children are rendered with no content.
def generate_html_token(node, value = nil)
  name = node.name.downcase
  value ||= '{$0}'

  # Distinct block variable names: the tag name must not be shadowed.
  pairs = node.attributes.map { |attr_name, attribute| [attr_name, attribute.value] }
  rendered = pairs.sort_by(&:first).map do |attr_name, attr_value|
    quote = attr_value.include?("'") ? '"' : "'"
    "#{attr_name}=#{quote}#{attr_value}#{quote}"
  end
  open_tag = ([name] + rendered).join(' ')

  if self_closing_node?(node)
    # br/hr are void elements: emit the short form with or without
    # attributes instead of an invalid "</br>" / "</hr>" end tag.
    return "<#{open_tag}/>" if %w(br hr).include?(name)
    return "<#{open_tag}></#{name}>"
  end

  "<#{open_tag}>#{value}</#{name}>"
end
has_child_nodes?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 122
# Truthy when +node+ has a children collection with at least one entry.
def has_child_nodes?(node)
  kids = node.children
  kids && kids.length > 0
end
has_inline_or_text_siblings?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 178
# True when some other child of +node+'s parent is an inline node or a
# valid text node. False when +node+ has no parent.
def has_inline_or_text_siblings?(node)
  parent = node.parent
  return false unless parent

  parent.children.any? do |sibling|
    sibling != node && (inline_node?(sibling) || valid_text_node?(sibling))
  end
end
ignored_node?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 206
# True for any node whose type is not 1; otherwise truthy (the list index)
# when the lower-cased tag name appears in the 'nodes.ignored' option.
def ignored_node?(node)
  return true unless node.type == 1

  ignored = option('nodes.ignored') || []
  ignored.index(node.name.downcase)
end
inline_node?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 190
# Truthy when +node+ has type 1, its lower-cased tag name is listed in the
# 'nodes.inline' option, and it is not the only child of its parent.
def inline_node?(node)
  return false unless node.type == 1

  inline_tags = option('nodes.inline') || []
  inline_tags.index(node.name.downcase) && !only_child?(node)
end
node_info(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 322
# Builds a one-line description of +node+ for debug output: its type, its
# name (type 1 only) and the inline / self closing / only child flags.
# For type 3 nodes the node's inner text is appended after the flags.
def node_info(node)
  info = [node.type]
  # Nodes expose their tag through +name+, as everywhere else in this
  # class; +tagName+ is a JavaScript DOM property.
  info << node.name if node.type == 1

  if inline_node?(node)
    info << 'inline'
    info << (has_inline_or_text_siblings?(node) ? 'sentence' : 'only translatable')
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  summary = "[#{info.join(', ')}]"
  node.type == 3 ? "#{summary}: " + node.inner_text : summary
end
non_translatable_node?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 93
# True when +node+ is a type 1 node that is either listed in the
# 'nodes.scripts' option or has no children and empty inner text.
# False for nil and for every other node.
def non_translatable_node?(node)
  return false unless node && node.type == 1

  script_tags = option('nodes.scripts') || []
  return true if script_tags.index(node.name.downcase)

  node.children.length == 0 && node.inner_text == ''
end
only_child?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 173
# True when +node+ has a parent and that parent has exactly one child.
def only_child?(node)
  parent = node.parent
  parent ? parent.children.count == 1 : false
end
option(name)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 151
# Looks +name+ up in this tokenizer's options via Tr8n::Utils.hash_value;
# when that yields nil or false, falls back to
# Tr8n.config.translator_option(name).
def option(name)
  local = Tr8n::Utils.hash_value(self.options, name)
  return local if local

  Tr8n.config.translator_option(name)
end
replace_special_characters(text)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 225
# Replaces every match of HTML_SPECIAL_CHAR_REGEX in +text+ with a
# "{token}" placeholder and records the original text in the context under
# that token. Returns +text+ unchanged when 'data_tokens.special' is off,
# mirroring the guard in generate_data_tokens.
def replace_special_characters(text)
  return text unless option('data_tokens.special')

  # gsub with a block handles zero matches and visits each match in place;
  # the previous match/each combination raised in both cases.
  text.gsub(HTML_SPECIAL_CHAR_REGEX) do |match|
    # Drop the first and last character to form the token name.
    # NOTE(review): assumes matches look like "&name;" - confirm against
    # HTML_SPECIAL_CHAR_REGEX.
    token = match[1..-2]
    context[token] = match
    "{#{token}}"
  end
end
reset_context()
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 165
# Rebuilds the token table as a fresh hash holding a copy of the context.
def reset_context
  fresh = {}
  fresh.update(self.context)
  self.tokens = fresh
end
sanitize_value(value)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 221
# Removes the whitespace found at the beginning of each line of +value+.
def sanitize_value(value)
  leading_whitespace = /^\s+/
  value.gsub(leading_whitespace, '')
end
self_closing_node?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 202
# True when +node+ has no children collection or no first child.
def self_closing_node?(node)
  kids = node.children
  return true unless kids

  !kids.first
end
separator_node?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 216
# Truthy (the list index) when +node+ has type 1 and its lower-cased tag
# name is listed in the 'nodes.splitters' option. False for nil and for
# nodes of any other type.
def separator_node?(node)
  return false unless node
  return false unless node.type == 1

  splitters = option('nodes.splitters') || []
  splitters.index(node.name.downcase)
end
short_token?(token, value)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 169
# Truthy when the lower-cased +token+ is listed in the 'nodes.short' option
# (returns its index) or when +value+ is shorter than 20 characters.
def short_token?(token, value)
  # Treat a missing list as empty, like the other option lookups here.
  short_tags = option('nodes.short') || []
  short_tags.index(token.downcase) || value.length < 20
end
translate(doc)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 51
# Translates +doc+. A String is first parsed with Nokogiri::HTML.fragment;
# anything else is handed to translate_tree as is.
def translate(doc)
  root = doc.is_a?(String) ? Nokogiri::HTML.fragment(doc) : doc
  translate_tree(root)
end
translate_tml(tml)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 100
# Translates a TML string and resets the token table afterwards.
# Whitespace-only input is returned untouched. With the 'split_sentences'
# option each sentence is translated separately and substituted back into
# the text; otherwise whitespace is collapsed and the whole string is
# translated at once. With the 'debug' option, debug_translation is used
# instead of the language translator.
def translate_tml(tml)
  return tml if empty_string?(tml)

  tml = generate_data_tokens(tml)

  if option('split_sentences')
    translation = tml
    Tr8n::Utils.split_sentences(tml).each do |sentence|
      # NOTE(review): this branch uses current_language while the branch
      # below uses target_language - confirm which one is intended.
      translated = option('debug') ? debug_translation(sentence) : Tr8n.session.current_language.translate(sentence, tokens, options)
      # Block form inserts the translation verbatim; a string replacement
      # would interpret backslash sequences (\0, \1, ...) inside it.
      translation = translation.gsub(sentence) { translated }
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip
  translation = option('debug') ? debug_translation(tml) : Tr8n.session.target_language.translate(tml, tokens, options)
  reset_context
  translation
end
translate_tree(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 55
# Recursively translates +node+ and returns the resulting markup.
# Non-translatable nodes yield their single child's inner text (or '').
# Type 3 nodes are translated directly. For other nodes, runs of text and
# inline children are accumulated as TML and translated as one unit;
# separators and container children flush the run and are emitted as
# HTML tokens.
def translate_tree(node)
  if non_translatable_node?(node)
    return node.children.count == 1 ? node.children.first.inner_text : ''
  end
  return translate_tml(node.inner_text) if node.type == 3

  output = ''
  pending = ''
  # Translates the accumulated run (if any) and starts a new one.
  flush = lambda do
    output += translate_tml(pending) unless pending == ''
    pending = ''
  end

  node.children.each do |child|
    if child.type == 3
      pending += child.inner_text
    elsif inline_node?(child) && has_inline_or_text_siblings?(child) && !between_separators?(child)
      pending += generate_tml_tags(child)
    elsif separator_node?(child)
      flush.call
      output += generate_html_token(child)
    else
      flush.call
      inner = translate_tree(child)
      # Ignored nodes contribute only their translated content, no wrapper.
      output += ignored_node?(child) ? inner : generate_html_token(child, inner)
    end
  end

  flush.call
  output
end
valid_text_node?(node)
click to toggle source
# File lib/tr8n/tokenizers/dom.rb, line 211
# True when +node+ is present, has type 3, and its inner text is not blank.
def valid_text_node?(node)
  return false unless node
  return false unless node.type == 3

  !empty_string?(node.inner_text)
end