class Tml::Tokenizers::Dom
Attributes
context[RW]
options[RW]
tokens[RW]
Public Class Methods
new(context = {}, options = {})
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 41
# Builds a tokenizer and primes its token table.
#
# @param context [Hash] initial token values; reset_context copies them into +tokens+
# @param options [Hash] per-instance options, read through #option
def initialize(context = {}, options = {})
  self.context, self.options = context, options
  reset_context
end
Public Instance Methods
adjust_name(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 319
# Returns the token name to use for a node: the lower-cased node name,
# replaced by its entry in the 'name_mapping' option when one exists.
#
# @param node [#name] node whose name is looked up
# @return [Object] the mapped value, or the lower-cased name when unmapped
def adjust_name(node)
  name = node.name.downcase
  # Fall back to an empty mapping so an unset option does not raise.
  mapping = option('name_mapping') || {}
  mapping[name.to_sym] || name
end
between_separators?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 132
# Tells whether the node sits next to a separator on one side while the
# other side is not a valid text node.
#
# @param node [#previous_sibling, #next_sibling]
# @return [Boolean, nil] truthy when either side/opposite-side pairing holds
def between_separators?(node)
  before = node.previous_sibling
  after = node.next_sibling

  return true if separator_node?(before) && !valid_text_node?(after)

  separator_node?(after) && !valid_text_node?(before)
end
container_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 208
# A container is a type-1 node that inline_node? does not classify as inline.
#
# @param node [#type]
# @return [Boolean]
def container_node?(node)
  return false unless node.type == 1

  !inline_node?(node)
end
contextualize(name, context)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 325
# Registers +context+ under +name+ in the token table and returns the name
# actually used. When the name is already bound to a different value, a
# numeric suffix is appended or incremented until a free (or identical)
# slot is found.
#
# @param name [String] preferred token name
# @param context [Object] value to store for the token
# @return [String] the token name the value was stored under
def contextualize(name, context)
  while tokens[name] && tokens[name] != context
    counter = name[/\d+\z/]
    if counter
      # Strip only the trailing counter; digits elsewhere in the name
      # (e.g. "h1_1") belong to the name itself and must be kept.
      name = name[0, name.length - counter.length] + (counter.to_i + 1).to_s
    else
      name += '1'
    end
  end

  tokens[name] = context
  name
end
debug(doc)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 341
# Logs a description of every node in +doc+, starting at depth 0.
#
# The class lists only context, options and tokens as attributes, so the
# document is passed straight to debug_tree instead of being stored through
# a +doc=+ writer.
#
# @param doc [Object] root node to dump
# @return [Object] whatever debug_tree returns for the root
def debug(doc)
  debug_tree(doc, 0)
end
debug_translation(translation)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 162
# Wraps a string in the 'debug_format' option by substituting it for every
# literal "{$0}" placeholder.
#
# @param translation [String] text to embed
# @return [String] the debug format with the placeholder filled in
def debug_translation(translation)
  # Block form: the text is inserted verbatim. A replacement *string* would
  # have its backslash sequences (\0, \1, \\) interpreted by gsub.
  option('debug_format').gsub('{$0}') { translation }
end
debug_tree(node, depth)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 346
# Logs one line for +node+ and then recurses into its children, one level
# deeper each time. The line is a run of '=' (depth + 1 of them), '=> ',
# the node's class, and the node_info summary.
#
# @param node [#children] node to describe
# @param depth [Integer] nesting level of +node+
# @return [Object] the children collection that was iterated
def debug_tree(node, depth)
  padding = '=' * (depth + 1)
  # Interpolate the node's class: concatenating the node object itself onto
  # a String raises TypeError unless the node converts implicitly.
  Tml.logger.log("#{padding}=> #{node.class}: #{node_info(node)}")
  (node.children || []).each { |child| debug_tree(child, depth + 1) }
end
empty_string?(tml)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 166
# Decides whether a piece of text has nothing worth translating.
#
# Whitespace and characters in U+0080..U+00FF are discarded first; what is
# left counts as empty when it is blank, a single ${variable} placeholder,
# or a bare number/price such as "7", "$9" or "12.50".
#
# @param tml [String]
# @return [Boolean]
def empty_string?(tml)
  stripped = tml.gsub(/\s/, '').gsub(/[\u0080-\u00ff]/, '')

  return true if stripped == ''
  return true if stripped =~ /\A\$\{[^\}]+\}\z/ # ignore variables ${var_name}
  # The fractional part is optional so single-digit values are also ignored.
  return true if stripped =~ /\A\$?\d+(\.\d+)?\z/ # ignore prices and numbers

  false
end
generate_data_tokens(text)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 235
# Replaces data fragments inside +text+ with {token} placeholders, in three
# passes driven by options:
#
# 1. 'data_tokens.special.*' - each regex match is stored in +context+ under
#    the match minus its first and last character, and replaced by that token.
# 2. 'data_tokens.date.*'    - the first capture of each format regex is
#    registered through contextualize under the configured date token name.
# 3. 'data_tokens.rules'     - each enabled rule registers the first capture,
#    reduced to an integer, under the rule's name.
#
# @param text [String] source text
# @return [String] the text with placeholders substituted
def generate_data_tokens(text)
  if option('data_tokens.special.enabled')
    text.scan(option('data_tokens.special.regex')).each do |match|
      # scan yields arrays when the regex has a capture group, strings otherwise.
      entity = match.is_a?(Array) ? match.first : match
      next if entity.nil? || entity == ''

      # Drop the first and last character (a negative *length* argument to
      # String#[] would return nil instead).
      token = entity[1..-2]
      context[token] = entity
      text = text.gsub(entity) { "{#{token}}" }
    end
  end

  if option('data_tokens.date.enabled')
    token_name = option('data_tokens.date.name')
    option('data_tokens.date.formats').each do |format|
      # Only the first element of each format entry (the regex) is used here.
      text.scan(format[0]).each do |match|
        date = match.first
        next if date.nil? || date == ''

        token = contextualize(token_name, date)
        text = text.gsub(date, "{#{token}}")
      end
    end
  end

  (option('data_tokens.rules') || []).each do |rule|
    next unless rule[:enabled]

    text.scan(rule[:regex]).each do |match|
      next if match.first.nil? || match.first == ''

      value = match.first.strip
      next if value == ''

      # Punctuation and whitespace are removed before the integer conversion.
      token = contextualize(rule[:name], value.gsub(/[.,;\s]/, '').to_i)
      text = text.gsub(value, "{#{token}}")
    end
  end

  text
end
generate_html_token(node, value = nil)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 288
# Renders the HTML wrapper for a node: an opening tag with its attributes
# sorted by name, the given value (or the "{$0}" placeholder), and the
# closing tag. Nodes without children render with an empty body; br and hr
# render as a single self-closed tag, with or without attributes.
#
# @param node [#name, #attributes] node to render
# @param value [String, nil] inner content; defaults to "{$0}"
# @return [String] the HTML token
def generate_html_token(node, value = nil)
  name = node.name.downcase
  value ||= '{$0}'
  childless = self_closing_node?(node)
  void_tag = %w(br hr).include?(name)
  attributes = node.attributes

  if attributes.length == 0
    return "<#{name}/>" if childless && void_tag
    return "<#{name}></#{name}>" if childless
    return "<#{name}>#{value}</#{name}>"
  end

  pairs = attributes.map { |attr_name, attribute| [attr_name, attribute.value] }
  attr = pairs.sort_by(&:first).map do |key, attr_value|
    # Use double quotes only when the value itself contains a single quote.
    quote = attr_value.index("'") ? '"' : "'"
    "#{key}=#{quote}#{attr_value}#{quote}"
  end.join(' ')

  # Keep br/hr as one tag here too: "<br ...></br>" is not equivalent markup.
  return "<#{name} #{attr}/>" if childless && void_tag
  return "<#{name} #{attr}></#{name}>" if childless

  "<#{name} #{attr}>#{value}</#{name}>"
end
has_child_nodes?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 128
# Reports whether the node has at least one child.
#
# @param node [#children]
# @return [Boolean, nil] falsy when children is nil or empty
def has_child_nodes?(node)
  kids = node.children
  kids && kids.length > 0
end
has_inline_or_text_siblings?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 188
# Checks whether any other child of the node's parent is an inline node or
# a valid text node.
#
# @param node [#parent]
# @return [Boolean] false when the node has no parent or no such sibling
def has_inline_or_text_siblings?(node)
  parent = node.parent
  return false unless parent

  parent.children.any? do |sibling|
    sibling != node && (inline_node?(sibling) || valid_text_node?(sibling))
  end
end
ignored_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 216
# A node is ignored when it is not of type 1, or when its lower-cased name
# appears in the 'nodes.ignored' option.
#
# @param node [#type, #name]
# @return [true, Integer, nil] true for non type-1 nodes; otherwise the
#   position of the name in the ignored list, or nil when absent
def ignored_node?(node)
  return true unless node.type == 1

  ignored = option('nodes.ignored') || []
  ignored.index(node.name.downcase)
end
inline_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 200
# A node is inline when it is of type 1, its lower-cased name is listed in
# the 'nodes.inline' option, and it is not its parent's only child.
#
# @param node [#type, #name]
# @return [Boolean, nil] nil when the name is not in the inline list
def inline_node?(node)
  return false unless node.type == 1

  inline_tags = option('nodes.inline') || []
  inline_tags.index(node.name.downcase) && !only_child?(node)
end
no_translate_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 88
# Detects a type-1 node marked as "notranslate": either an attribute with
# that exact name, or any attribute whose value contains the word.
#
# @param node [#type, #attributes, nil]
# @return [Boolean, nil] nil when the node is missing, not type 1, or has
#   no attributes collection
def no_translate_node?(node)
  return unless node && node.type == 1 && node.attributes

  node.attributes.any? do |attr_name, attribute|
    attr_name == 'notranslate' || attribute.value.index('notranslate')
  end
end
node_info(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 356
# Builds a bracketed, comma-separated summary of a node for debug output:
# its type, its name (type-1 nodes only), inline/sentence classification,
# and the "self closing" / "only child" flags. For type-3 nodes the node's
# inner text is appended after the summary.
#
# @param node [#type, #name, #inner_text]
# @return [String]
def node_info(node)
  info = [node.type]
  # Read the tag through #name, as every other method of this class does.
  info << node.name if node.type == 1

  if inline_node?(node)
    info << 'inline'
    info << (has_inline_or_text_siblings?(node) ? 'sentence' : 'only translatable')
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  summary = "[#{info.join(', ')}]"
  return summary + ': ' + node.inner_text if node.type == 3

  summary
end
non_translatable_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 96
# Identifies nodes whose content must be passed through untouched: type-1
# nodes named in the 'nodes.scripts' option, type-1 nodes with no children
# and empty inner text, and anything no_translate_node? accepts.
#
# @param node [#type, #name, #children, #inner_text, nil]
# @return [Boolean] false for a missing node
def non_translatable_node?(node)
  return false unless node

  if node.type == 1
    return true if (option('nodes.scripts') || []).index(node.name.downcase)
    return true if node.children.length == 0 && node.inner_text == ''
  end

  no_translate_node?(node) ? true : false
end
only_child?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 183
# True when the node's parent has exactly one child.
#
# @param node [#parent]
# @return [Boolean] false when the node has no parent
def only_child?(node)
  parent = node.parent
  parent ? parent.children.count == 1 : false
end
option(name)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 157
# Looks an option up in this tokenizer's own options first and falls back
# to the global translator configuration when that lookup is falsy.
#
# @param name [String] option key passed to both lookups
# @return [Object] the instance value when truthy, else the configured value
def option(name)
  local = Tml::Utils.hash_value(options, name)
  return local if local

  Tml.config.translator_option(name)
end
reset_context()
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 175
# Rebuilds the token table as a fresh Hash holding the entries of +context+,
# so later token additions do not touch the context itself.
#
# @return [Hash] the new token table
def reset_context
  fresh = {}
  fresh.update(context)
  self.tokens = fresh
end
sanitize_value(value)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 231
# Removes the whitespace run found at the start of every line of +value+.
#
# @param value [String]
# @return [String] a new string; the argument is not modified
def sanitize_value(value)
  line_leading_whitespace = /^\s+/
  value.gsub(line_leading_whitespace, '')
end
self_closing_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 212
# A node is treated as self closing when it has no children collection or
# the collection has no first element.
#
# @param node [#children]
# @return [Boolean]
def self_closing_node?(node)
  kids = node.children
  return true unless kids

  !kids.first
end
separator_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 226
# A separator is a type-1 node whose lower-cased name is listed in the
# 'nodes.splitters' option.
#
# @param node [#type, #name, nil]
# @return [false, Integer, nil] false for a missing or non type-1 node;
#   otherwise the position of the name in the splitter list, or nil
def separator_node?(node)
  return false if !node
  return false unless node.type == 1

  splitters = option('nodes.splitters') || []
  splitters.index(node.name.downcase)
end
short_token?(token, value)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 179
# A token is short when its lower-cased name is listed in the 'nodes.short'
# option, or when its value is fewer than 20 characters long.
#
# @param token [String] token name
# @param value [#length] token value
# @return [Integer, Boolean] list position when the name is listed,
#   otherwise the result of the length comparison
def short_token?(token, value)
  # Default to an empty list so an unset option falls through to the length
  # check instead of raising, matching the other node-list lookups.
  (option('nodes.short') || []).index(token.downcase) || value.length < 20
end
translate(doc)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 47
# Translates a document. A String is first parsed into an HTML fragment;
# any other argument is handed to translate_tree as given.
#
# @param doc [String, Object] markup or an already parsed tree
# @return [Object] the result of translate_tree
def translate(doc)
  doc = Nokogiri::HTML.fragment(doc) if doc.is_a?(String)
  translate_tree(doc)
end
translate_tml(tml)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 104
# Translates one chunk of TML text.
#
# Text that empty_string? rejects is returned unchanged. Otherwise data
# tokens are substituted first. Then either each sentence is translated and
# spliced back in place (when the 'split_sentences' option is on), or the
# whitespace-normalised text is translated as a whole. With the 'debug'
# option on, debug_translation is used instead of the language's translate.
# The token table is reset after translating.
#
# @param tml [String]
# @return [String] translated text
def translate_tml(tml)
  return tml if empty_string?(tml)

  tml = generate_data_tokens(tml)

  if option('split_sentences')
    translation = tml
    Tml::Utils.split_sentences(tml).each do |sentence|
      # NOTE(review): this branch asks the session for current_language while
      # the whole-text branch below uses target_language - confirm intended.
      sentence_translation = option('debug') ? debug_translation(sentence) : Tml.session.current_language.translate(sentence, tokens, options.dup)
      # Block form inserts the translation verbatim; a replacement string
      # would have its backslash sequences interpreted by gsub.
      translation = translation.gsub(sentence) { sentence_translation }
    end
    reset_context
    return translation
  end

  # Drop newlines, collapse whitespace runs to one space, trim the ends.
  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip
  translation = option('debug') ? debug_translation(tml) : Tml.session.target_language.translate(tml, tokens, options.dup)
  reset_context
  translation
end
translate_tree(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 51
# Translates a node and everything below it, returning the resulting markup.
#
# Non-translatable nodes return their inner HTML as is and type-3 nodes
# translate their text directly. For any other node the children are walked
# in order: text and qualifying inline children accumulate in a pending
# buffer; a separator or any other child first flushes that buffer through
# translate_tml and is then rendered itself (other children recursively).
#
# @param node [#type, #children, #inner_text, #inner_html]
# @return [String]
def translate_tree(node)
  return node.inner_html if non_translatable_node?(node)
  return translate_tml(node.inner_text) if node.type == 3

  output = ''
  pending = ''

  node.children.each do |child|
    if child.type == 3
      pending += child.inner_text
      next
    end

    if inline_node?(child) && has_inline_or_text_siblings?(child) && !between_separators?(child)
      pending += generate_tml_tags(child)
      next
    end

    # Anything else ends the current run of text: translate and emit it first.
    output += translate_tml(pending) unless pending == ''
    pending = ''

    if separator_node?(child)
      output += generate_html_token(child)
    else
      inner = translate_tree(child)
      # Ignored nodes contribute their translated content without a wrapper.
      output += ignored_node?(child) ? inner : generate_html_token(child, inner)
    end
  end

  output += translate_tml(pending) unless pending == ''
  output
end
valid_text_node?(node)
click to toggle source
# File lib/tml/tokenizers/dom.rb, line 221
# A valid text node is a type-3 node whose inner text empty_string? does
# not reject.
#
# @param node [#type, #inner_text, nil]
# @return [Boolean] false for a missing node
def valid_text_node?(node)
  return false unless node
  return false unless node.type == 3

  !empty_string?(node.inner_text)
end