class HTMLHierarchyExtractor
Extract content from an HTML page in the form of items with associated hierarchy data
Public Class Methods
new(input, options: {})
click to toggle source
# File lib/html-hierarchy-extractor.rb, line 7
# Builds an extractor for the given HTML.
#
# input   - HTML source, handed as-is to Nokogiri::HTML.
# options - Hash of settings merged over the defaults. The only default is
#           css_selector: 'p'.
#
# A deprecation notice is written through Kernel#warn on every call.
def initialize(input, options: {})
  @dom = Nokogiri::HTML(input)
  @options = { css_selector: 'p' }.merge(options)

  deprecation_notice = [
    '[DEPRECATION] The gem html-hierarchy-extractor has been renamed ',
    'to algolia_html_extractor and will no longer be supported. ',
    'Please switch to algolia_html_extractor as soon as possible.'
  ].join
  warn deprecation_notice
end
Public Instance Methods
extract()
click to toggle source
# File lib/html-hierarchy-extractor.rb, line 87
# Walks the parsed document in DOM order and returns an Array of items, one
# per non-empty node matching @options[:css_selector].
#
# Each item is a Hash with the keys :html, :text, :tag_name, :hierarchy,
# :anchor, :node, :weight (a Hash with :position and :heading) and :uuid.
# :hierarchy is a snapshot of the closest headings above the node and always
# holds exactly the keys :lvl0 to :lvl5 (h1 maps to :lvl0, h6 to :lvl5).
def extract
  heading_selector = 'h1,h2,h3,h4,h5,h6'
  # We select all nodes that match either the headings or the elements to
  # extract. This will allow us to loop over them in the order they appear
  # in the DOM
  all_selector = "#{heading_selector},#{@options[:css_selector]}"

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  current_position = 0 # Position of the extracted node among the items
  current_lvl = nil # Current closest hierarchy level
  current_anchor = nil # Current closest anchor

  @dom.css(all_selector).each do |node|
    # If it's a heading, we update our current hierarchy
    if node.matches?(heading_selector)
      # Which level heading is it? (h1 => 0, ..., h6 => 5)
      current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
      # Update this level, and set all the deeper ones to nil. The range
      # stops at 5 because lvl5 is the deepest level of the hierarchy; going
      # up to 6 would add a stray :lvl6 key to the hash.
      current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
      (current_lvl + 1..5).each do |lvl|
        current_hierarchy["lvl#{lvl}".to_sym] = nil
      end
      # Update the anchor, if the new heading has one. A heading without an
      # anchor keeps the previous one.
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Stop if node is not to be extracted
    next unless node.matches?(@options[:css_selector])

    # Stop if node is empty
    text = extract_text(node)
    next if text.empty?

    item = {
      html: extract_html(node),
      text: text,
      tag_name: extract_tag_name(node),
      # Copy the hash so later headings do not alter items already built
      hierarchy: current_hierarchy.clone,
      anchor: current_anchor,
      node: node,
      weight: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    item[:uuid] = uuid(item)
    items << item

    current_position += 1
  end

  items
end
extract_anchor(node)
click to toggle source
Returns the anchor of the node, read from its name or id attribute, or from the first descendant that has one
eg.
<h1 name="anchor">Foo</h1> => anchor
<h1 id="anchor">Foo</h1> => anchor
<h1><a name="anchor">Foo</a></h1> => anchor
# File lib/html-hierarchy-extractor.rb, line 49
# Looks up the anchor of a node.
#
# The node's own name attribute wins, then its id attribute. When the node
# has neither, the first descendant matching '[name],[id]' is inspected the
# same way. Returns nil when no anchor can be found.
def extract_anchor(node)
  own_anchor = node.attr('name') || node.attr('id')
  return own_anchor if own_anchor

  # Nothing on the node itself, so look at its descendants
  candidates = node.css('[name],[id]')
  candidates.empty? ? nil : extract_anchor(candidates[0])
end
extract_html(node)
click to toggle source
Returns the outer HTML of a given node
eg. <p>foo</p> => <p>foo</p>
# File lib/html-hierarchy-extractor.rb, line 23
# Serializes the node with to_s and returns the result with leading and
# trailing whitespace removed.
def extract_html(node)
  markup = node.to_s
  markup.strip
end
extract_tag_name(node)
click to toggle source
Returns the tag name of a given node
eg. <p>foo</p> => p
# File lib/html-hierarchy-extractor.rb, line 39
# Returns the node's name converted to lowercase.
def extract_tag_name(node)
  tag = node.name
  tag.downcase
end
extract_text(node)
click to toggle source
Returns the text content of a given node
eg. <p>foo</p> => foo
# File lib/html-hierarchy-extractor.rb, line 31
# Returns whatever the node reports as its content, unmodified.
def extract_text(node)
  text = node.content
  text
end
heading_weight(heading_level)
click to toggle source
Get a relative numeric value of the importance of the heading: 100 when there is no heading, then 10 less per heading level (90 for h1, 80 for h2, and so on)
# File lib/html-hierarchy-extractor.rb, line 81
# Converts a heading level into a weight.
#
# heading_level - nil, or a number (extract passes the heading number minus
#                 one, so h1 arrives as 0).
#
# Returns 100 for nil, otherwise 100 minus 10 for each level counted from 1
# (0 gives 90, 1 gives 80, and so on).
def heading_weight(heading_level)
  top_weight = 100
  if heading_level.nil?
    top_weight
  else
    top_weight - ((heading_level + 1) * 10)
  end
end
uuid(item)
click to toggle source
Generate a unique identifier for the item
# File lib/html-hierarchy-extractor.rb, line 62
# Computes an MD5 hex digest identifying the item.
#
# The entries are ordered by key, rendered as "key=value" and joined with
# commas before hashing. A value that is itself a Hash is first replaced by
# its own digest, computed the same way.
def uuid(item)
  pairs = item.sort_by { |key, _value| key }.map do |key, value|
    # Nested hashes are reduced to their own digest
    rendered = value.is_a?(Hash) ? uuid(value) : value
    "#{key}=#{rendered}"
  end

  Digest::MD5.hexdigest(pairs.join(','))
end