class Deba::Extractor
Constants
- BLOCK_INITIATING_TAGS
- ENHANCERS
- HEADING_TAGS
- SKIP_TAGS
Attributes
blocks[R]
Public Class Methods
new(doc, options = {})
click to toggle source
# File lib/deba/extractor.rb, line 35
# Builds an extractor for a parsed document.
#
# doc     - object responding to #root; the returned root node is where
#           #extract starts walking (presumably a Nokogiri document — the
#           rest of the class calls Nokogiri-style node methods on it).
# options - Hash of settings. The only key read by the visible code is
#           :exclude (a selector or list of selectors consulted by #process).
#           An explicit nil is treated like "no options" so that #process can
#           always call @options.key? safely.
def initialize(doc, options = {})
  @node = doc.root
  @options = options || {}
end
Public Instance Methods
extract()
click to toggle source
# File lib/deba/extractor.rb, line 40
# Runs the extraction and returns the resulting text.
#
# Resets the per-run traversal state (@just_appended_br, @in_blockquote),
# creates a fresh Deba::Document bound to this extractor, feeds the node tree
# through #process, and returns the document's content with a single trailing
# newline removed.
#
# A document without a root node (doc.root was nil, e.g. an empty input) is
# not traversed; the content of the untouched Deba::Document is returned
# instead of raising NoMethodError from inside #process.
def extract
  @just_appended_br = false
  @in_blockquote = false
  @document = Deba::Document.new(self)

  process(@node) if @node

  @document.content.chomp("\n")
end
in_blockquote?()
click to toggle source
# File lib/deba/extractor.rb, line 155 def in_blockquote? @in_blockquote end
process(node)
click to toggle source
# File lib/deba/extractor.rb, line 50
# Recursively walks +node+ and appends its textual representation to
# @document.
#
# node - a Nokogiri-style node (must respond to #name, #text?, #inner_text,
#        #children, #xpath, #ancestors and, when the :exclude option is set,
#        #matches?).
#
# Handling, in order:
# * nodes matching any :exclude selector, and tags in SKIP_TAGS, are dropped
#   together with their subtree;
# * two consecutive <br> nodes become a paragraph break, a single <br>
#   followed by anything else becomes "\n";
# * text nodes are appended as Deba::Span when Deba::Utils.present? accepts
#   their text;
# * tags listed in ENHANCERS have their children wrapped in the mapped marker;
# * blockquote, li, dt, dd, BLOCK_INITIATING_TAGS and HEADING_TAGS start the
#   corresponding block type and end with a paragraph break;
# * any other element is transparent: its children are processed as if they
#   were its siblings.
#
# Returns nothing meaningful; the method is called for its side effects on
# @document, @just_appended_br and @in_blockquote.
def process(node)
  if @options.key?(:exclude)
    return if Array(@options[:exclude]).any? { |selector| node.matches?(selector) }
  end

  node_name = node.name.downcase
  return if SKIP_TAGS.include?(node_name)

  # Handle repeated brs by making a paragraph break
  if node_name == 'br'
    if @just_appended_br
      @just_appended_br = false
      @document.break(Deba::Paragraph)
      return
    end
    @just_appended_br = true
  elsif @just_appended_br
    # A lone <br> followed by other content is just a line break.
    @just_appended_br = false
    @document << "\n"
  end

  if node.text?
    @document << Deba::Span.new(node.inner_text) if Deba::Utils.present?(node.inner_text)
    return
  end

  if ENHANCERS.keys.flatten.include?(node_name)
    ENHANCERS.each_pair do |tags, marker|
      next unless tags.include?(node_name)

      @document << marker
      node.children.each { |child| process(child) }
      @document << marker
    end
    return
  end

  if node_name == 'blockquote'
    # Remember the enclosing state so that leaving a nested blockquote does
    # not mark the remainder of the outer blockquote as unquoted.
    enclosing_blockquote = @in_blockquote
    @in_blockquote = true
    @document.break(Deba::Paragraph)
    node.children.each { |child| process(child) }
    @document.break(Deba::Paragraph)
    @in_blockquote = enclosing_blockquote
    return
  end

  if node_name == 'li'
    last_item = node.xpath('count(following-sibling::li)').to_i.zero?
    # Number the item only when the *nearest* enclosing list is ordered; a
    # <ul> nested somewhere inside an <ol> must stay unnumbered.
    nearest_list = node.ancestors.find { |ancestor| %w[ol ul].include?(ancestor.name.downcase) }
    index = if nearest_list && nearest_list.name.downcase == 'ol'
              node.xpath('count(preceding-sibling::li)').to_i + 1
            end
    @document.break(Deba::ListItem, last_item, index)
    node.children.each { |child| process(child) }
    @document.break(Deba::Paragraph)
    return
  end

  if node_name == 'dt'
    @document.break(Deba::DefinitionTerm)
    node.children.each { |child| process(child) }
    @document.break(Deba::Paragraph)
    return
  end

  if node_name == 'dd'
    last_item = node.xpath('count(following-sibling::dd)').to_i.zero?
    @document.break(Deba::DefinitionDescription, last_item)
    node.children.each { |child| process(child) }
    @document.break(Deba::Paragraph)
    return
  end

  # These tags terminate the current paragraph, if present, and start a new paragraph
  if BLOCK_INITIATING_TAGS.include?(node_name)
    @document.break(Deba::Paragraph)
    node.children.each { |child| process(child) }
    @document.break(Deba::Paragraph)
    return
  end

  if HEADING_TAGS.include?(node_name)
    # "h2" -> heading level 2
    @document.break(Deba::Heading, node_name[1..-1].to_i)
    node.children.each { |child| process(child) }
    @document.break(Deba::Paragraph)
    return
  end

  # Pretend that the children of this node were siblings of this node (move them one level up the tree)
  node.children.each { |child| process(child) }
end