class Keynote::Extractor::TextExtractor
Public Class Methods
extract_iwa(file)
click to toggle source
# File lib/keynote/extractor/text_extractor.rb, line 36
#
# Pulls candidate words out of a raw string.
#
# The input is read as UTF-8 with invalid byte sequences replaced, every
# character other than an ASCII letter, digit or space is removed, and the
# remainder is split on whitespace. A word is discarded when it is longer
# than 20 characters, 2 characters or shorter, or contains "Transition".
# Duplicates are kept.
#
# The argument itself is never modified (a transcoded copy is used), so
# frozen strings are accepted.
#
# @param file [String] raw text; presumably the contents of an .iwa file,
#   going by the method name -- TODO confirm against callers
# @return [Array<String>] the surviving words, in order of appearance
def self.extract_iwa(file)
  # Non-destructive encode: the original used encode!, which rewrote the
  # caller's string in place and raised on frozen input.
  cleaned = file.encode('UTF-8', 'UTF-8', invalid: :replace)
                .gsub(/[^0-9a-z ]/i, '')

  cleaned.split(' ').reject do |word|
    word.length > 20 || word.length <= 2 || word.include?('Transition')
  end
end
get_from_bodies(body_text)
click to toggle source
# File lib/keynote/extractor/text_extractor.rb, line 32
#
# Returns the content of the first child of +body_text+.
#
# When +body_text+ has no child (its +child+ is nil) an empty string is
# returned instead of raising NoMethodError.
#
# @param body_text [#child] a node-like object; presumably a Nokogiri
#   element for an <sf:text-body> tag -- TODO confirm against callers
# @return [String] the first child's content, or "" when there is no child
def self.get_from_bodies(body_text)
  first_child = body_text.child
  first_child ? first_child.content : ''
end
get_text(content)
click to toggle source
# File lib/keynote/extractor/text_extractor.rb, line 7
#
# Collects the distinct pieces of text found in +content+.
#
# * When +content+ is an Array, each element is passed to extract_iwa and
#   all resulting words are gathered.
# * Otherwise +content+ is handed to search_for_bodies, each returned node
#   is passed to get_from_bodies, and empty results are skipped.
#
# In both cases duplicates are removed, keeping the first occurrence.
#
# @param content [Array, Object] an Array of inputs for extract_iwa, or a
#   single value accepted by search_for_bodies
# @return [Array<String>] unique text entries
def self.get_text(content)
  collected =
    if content.kind_of?(Array)
      content.flat_map { |chunk| extract_iwa(chunk) }
    else
      search_for_bodies(content)
        .map { |node| get_from_bodies(node) }
        .reject(&:empty?)
    end

  collected.uniq
end
search_for_bodies(content)
click to toggle source
# File lib/keynote/extractor/text_extractor.rb, line 27
#
# Parses +content+ with Nokogiri::XML and returns the result of running the
# XPath query "//sf:text-body" against the parsed document.
#
# @param content [Object] XML input accepted by Nokogiri::XML
# @return [Object] whatever the XPath query yields (a collection of
#   matching nodes)
def self.search_for_bodies(content)
  Nokogiri::XML(content).xpath("//sf:text-body")
end