module AlgoliaHTMLExtractor
Extract content from an HTML page in the form of items with associated headings data
Public Class Methods
css(input, selector)
click to toggle source
Getting a list of HTML nodes from an input and a CSS selector
# File lib/algolia_html_extractor.rb, line 20
# Parses the input as HTML with Nokogiri and returns whatever the parsed
# document yields for the given CSS selector.
#
# input    - HTML source handed to Nokogiri::HTML
# selector - CSS selector handed to the parsed document
def self.css(input, selector)
  document = Nokogiri::HTML(input)
  document.css(selector)
end
default_options(options)
click to toggle source
Extractor options, applying default options when none set
# File lib/algolia_html_extractor.rb, line 10
# Builds the effective extractor options: the given options win, and any
# key they do not set falls back to its default value.
#
# options - Hash of overrides (may be empty)
#
# Returns a new Hash; the argument is not modified.
def self.default_options(options)
  {
    css_selector: 'p',
    heading_selector: 'h1,h2,h3,h4,h5,h6',
    tags_to_exclude: ''
  }.merge(options)
end
extract_anchor(node)
click to toggle source
Returns the anchor to the node
eg. <h1 name="anchor">Foo</h1> => anchor; <h1 id="anchor">Foo</h1> => anchor; <h1><a name="anchor">Foo</a></h1> => anchor
# File lib/algolia_html_extractor.rb, line 120
# Finds the anchor attached to a node. The node's own `name` attribute is
# preferred, then its `id`. When the node has neither, the lookup is
# repeated on the first descendant that carries a `name` or an `id`.
#
# Returns the anchor, or nil when none can be found.
def self.extract_anchor(node)
  own_anchor = node.attr('name') || node.attr('id')
  return own_anchor if own_anchor

  # Nothing on the node itself: fall back to its descendants
  candidates = node.css('[name],[id]')
  candidates.empty? ? nil : extract_anchor(candidates[0])
end
extract_html(node)
click to toggle source
Returns the outer HTML of a given node
eg. <p>foo</p> => <p>foo</p>
# File lib/algolia_html_extractor.rb, line 94
# Serializes the node with to_s and strips the surrounding whitespace.
# eg. <p>foo</p> => <p>foo</p>
def self.extract_html(node)
  markup = node.to_s
  markup.strip
end
extract_tag_name(node)
click to toggle source
Returns the tag name of a given node
eg <p>foo</p> => p
# File lib/algolia_html_extractor.rb, line 110
# Gives the name of the node, lowercased.
# eg. <p>foo</p> => p
def self.extract_tag_name(node)
  tag = node.name
  tag.downcase
end
extract_text(node)
click to toggle source
Returns the text content of a given node
eg. <p>foo</p> => foo
# File lib/algolia_html_extractor.rb, line 102
# Gives back the node's `content`, unmodified.
# eg. <p>foo</p> => foo
def self.extract_text(node)
  text = node.content
  text
end
heading_weight(heading_level)
click to toggle source
Get a relative numeric value of the importance of the heading: 100 when no heading level is given, then 10 less for each level (level 0 => 90, level 1 => 80, ...)
# File lib/algolia_html_extractor.rb, line 155
# Converts a heading level into a ranking weight.
#
# heading_level - Integer level, or nil when there is no heading
#
# Returns 100 for nil, otherwise 100 minus 10 for each level counted from
# level 0 (level 0 => 90, level 1 => 80, ...).
def self.heading_weight(heading_level)
  base = 100
  heading_level.nil? ? base : base - ((heading_level + 1) * 10)
end
run(input, options: {})
click to toggle source
# File lib/algolia_html_extractor.rb, line 24
# Extracts a list of items from an HTML input. Every node matching
# options[:css_selector] whose text is not empty becomes one item, annotated
# with the headings and the anchor that were met before it.
#
# input   - HTML source, handed to css
# options - Hash overriding the values of default_options
#           (:css_selector, :heading_selector, :tags_to_exclude)
#
# Returns an Array of Hashes with the keys :html, :content, :headings,
# :anchor, :node, :custom_ranking and :objectID.
def self.run(input, options: {})
  options = default_options(options)
  heading_selector = options[:heading_selector]
  css_selector = options[:css_selector]
  tags_to_exclude = options[:tags_to_exclude]

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  # Deepest level the hierarchy holds. Resetting up to this bound (instead of
  # a hard-coded 6) avoids adding a stray :lvl6 key to the hierarchy.
  deepest_lvl = current_hierarchy.size - 1
  current_position = 0 # Position of the item among the extracted ones
  current_lvl = nil # Current closest headings level
  current_anchor = nil # Current closest anchor

  # We select all nodes that match either the headings or the elements to
  # extract. This will allow us to loop over it in order it appears in the DOM
  css(input, "#{heading_selector},#{css_selector}").each do |node|
    # If it's a heading, we update our current hierarchy
    if node.matches?(heading_selector)
      # Which level heading is it? The level is read from the tag name
      # (h1 => 0, h2 => 1, ...).
      # NOTE(review): assumes heading_selector only matches h1..h6 tags.
      current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
      # Update this level, and set all the following ones to nil
      current_hierarchy[:"lvl#{current_lvl}"] = extract_text(node)
      ((current_lvl + 1)..deepest_lvl).each do |lvl|
        current_hierarchy[:"lvl#{lvl}"] = nil
      end
      # Update the anchor, if the new heading has one; otherwise the
      # previous anchor is kept
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Stop if node is not to be extracted
    next unless node.matches?(css_selector)

    # Removing excluded child from the node
    node.search(tags_to_exclude).each(&:remove) unless tags_to_exclude.empty?

    # Stop if node is empty
    content = extract_text(node)
    next if content.empty?

    item = {
      html: extract_html(node),
      content: content,
      headings: current_hierarchy.values.compact,
      anchor: current_anchor,
      node: node,
      custom_ranking: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    item[:objectID] = uuid(item)
    items << item

    current_position += 1
  end

  items
end
uuid(item)
click to toggle source
Generate a unique identifier for the item
# File lib/algolia_html_extractor.rb, line 133
# Generates an identifier for the item: the MD5 hex digest of its
# "key=value" pairs, taken in sorted key order and joined with commas.
# A value that is itself a Hash is replaced by its own uuid. The :objectID
# key is left out of the computation.
#
# item - Hash to fingerprint; it is read but never modified
#
# Returns the digest as a String.
def self.uuid(item)
  # The objectID is not part of the hash algorithm. It is skipped rather than
  # deleted, so the caller's hash is left untouched.
  hashed_keys = item.keys.reject { |key| key == :objectID }

  # We build a list of "key=value" pairs, ordered by key...
  pairs = hashed_keys.sort.map do |key|
    value = item[key]
    # We apply the method recursively on other hashes
    value = uuid(value) if value.is_a?(Hash)
    "#{key}=#{value}"
  end

  # ...then we build a md5 hash of it
  Digest::MD5.hexdigest(pairs.join(','))
end