class AlgoliaSearchRecordExtractor
Given an HTML file as input, returns an array of records to index.
Attributes
Public Class Methods
# File lib/record_extractor.rb, line 9
# Builds an extractor for a single file.
#
# Keeps a reference to the file and computes the effective configuration:
# the built-in defaults, overridden by whatever is set under the `algolia`
# key of the site configuration.
#
# file - object exposing `site.config`, the site configuration hash.
def initialize(file)
  @file = file
  default_config = { 'record_css_selector' => 'p' }
  # The `algolia` section may be missing from the site configuration; fall
  # back to the defaults alone instead of letting Hash#merge raise on nil.
  @config = default_config.merge(file.site.config['algolia'] || {})
end
Public Instance Methods
Hook to modify all records after extracting
# File lib/record_extractor.rb, line 24
# Extension point that receives the complete list of extracted records.
#
# items - the array of records built by the extraction.
#
# This default implementation hands the list back untouched.
def custom_hook_all(items)
  items
end
Hook to modify a record after extracting
# File lib/record_extractor.rb, line 19
# Extension point that receives one record right after it has been built.
#
# item  - the record for the node.
# _node - the node the record was built from (ignored here).
#
# This default implementation returns the record unchanged.
def custom_hook_each(item, _node)
  item
end
# File lib/record_extractor.rb, line 190
# Builds the list of records for the current file.
#
# Every node returned by html_nodes that has non-empty text becomes one
# record: the file metadata merged with the node's heading hierarchy, plus
# its tag name, raw HTML, text, unique hierarchy string, CSS selectors and
# weight. Each record goes through custom_hook_each (a nil result drops the
# record), and the final list goes through custom_hook_all.
def extract
  records = html_nodes.each_with_index.each_with_object([]) do |(node, position), collected|
    next if node.text.empty?

    record = metadata.clone
    record.merge!(node_hierarchy(node))
    record[:tag_name] = node.name
    record[:raw_html] = node_raw_html(node)
    record[:text] = node_text(node)
    record[:unique_hierarchy] = unique_hierarchy(record)
    record[:css_selector] = node_css_selector(node)
    record[:css_selector_parent] = node_css_selector(node_heading_parent(node))
    # The position is the index among all matched nodes, skipped ones included
    record[:weight] = weight(record, position)

    # Give the user-defined hook a chance to alter or discard the record
    record = custom_hook_each(record, node)
    collected << record unless record.nil?
  end

  custom_hook_all(records)
end
Get the list of all HTML nodes to index
# File lib/record_extractor.rb, line 75
# Parses the file content as HTML and returns every node matching the
# configured `record_css_selector`.
def html_nodes
  Nokogiri::HTML(@file.content).css(@config['record_css_selector'])
end
Returns metadata from the current file
# File lib/record_extractor.rb, line 29
# Builds the metadata hash shared by every record of the current file.
#
# Starts from the file's data with keys converted to symbols, then sets
# :type (second segment of the file's class name, downcased), :url, :slug,
# :posted_at (Unix timestamp, only when the file responds to `date`) and
# :tags.
def metadata
  record = @file.data.each_with_object({}) do |(key, value), collected|
    collected[key.to_sym] = value
  end

  record[:type] = @file.class.name.split('::')[1].downcase
  record[:url] = @file.url
  record[:slug] = slug
  record[:posted_at] = @file.date.to_time.to_i if @file.respond_to?(:date)
  record[:tags] = tags
  record
end
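Returns a CSS selector for the given node: its `#id` when an id is set, otherwise its Nokogiri CSS path relative to `body`. Returns nil when the node is nil (for instance when there is no closest heading parent).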
# File lib/record_extractor.rb, line 146
# Returns a CSS selector string for the node, or nil when no node is given.
#
# An `id` attribute, when present, gives the selector `#<id>`. Otherwise the
# node's css_path is used with the `html > body > ` prefix removed.
def node_css_selector(node)
  if node.nil?
    nil
  elsif (id = node['id'])
    "##{id}"
  else
    node.css_path.gsub('html > body > ', '')
  end
end
Check if node is a heading
# File lib/record_extractor.rb, line 81
# Tells whether the node's tag name is one of h1 to h6.
def node_heading?(node)
  case node.name
  when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' then true
  else false
  end
end
Get the closest heading parent
# File lib/record_extractor.rb, line 86
# Finds the closest heading that comes before the node in the document.
#
# Walks backwards through the previous sibling elements; when an element has
# no previous sibling the walk moves up to its parent. The first heading
# whose tag name sorts before `level` is returned.
#
# The walk is a loop rather than a recursion so that a long run of siblings
# cannot exhaust the call stack.
#
# node  - element to start from.
# level - tag name the result must be stronger than. With the default 'h7'
#         any heading qualifies, unless the walk starts from (or climbs
#         into) a heading, in which case only stronger headings do.
#
# Returns the heading node, or nil when the walk reaches a child of `body`
# that has no previous sibling.
def node_heading_parent(node, level = 'h7')
  current = node

  loop do
    # Once we stand on a heading with no limit set, only accept stronger ones
    level = current.name if level == 'h7' && node_heading?(current)

    previous = current.previous_element
    if previous
      # This is a suitable heading, we return it
      return previous if node_heading?(previous) && previous.name < level

      current = previous
    else
      # No previous element, we go up to the parent
      parent = current.parent
      # No more parent, then no heading found
      return nil if parent.name == 'body'

      current = parent
    end
  end
end
Get all the parent headings of the specified node. If the node itself is a heading, it is included.
# File lib/record_extractor.rb, line 108
# Collects the headings the node sits under, the node itself included when
# it is a heading.
#
# node  - element to start from.
# state - accumulator hash; :level tracks the strongest heading level seen
#         so far and is removed before returning.
#
# Returns the state hash with one entry per heading level found, keyed by
# tag name as a symbol (e.g. :h2) and holding that heading's text.
def node_hierarchy(node, state = { level: 7 })
  current = node

  loop do
    name = current.name
    depth = name.delete('h').to_i

    # Only record headings stronger than the ones already collected
    if node_heading?(current) && depth < state[:level]
      state[name.to_sym] = node_text(current)
      state[:level] = depth
    end

    current = node_heading_parent(current)
    # No previous heading, the walk is over
    break unless current
  end

  state.delete(:level)
  state
end
Return the raw HTML of the element to index
# File lib/record_extractor.rb, line 129
# Gives the string form of the node (its `to_s`), stored as the record's
# raw HTML.
def node_raw_html(node)
  node.to_s
end
Return the text of the element, sanitized to be displayed
# File lib/record_extractor.rb, line 134
# Returns the text content of the node, made safe for display.
#
# Angle brackets are replaced with their HTML entities so that text such as
# `<div>` shows up literally instead of being interpreted as markup.
#
# node - element exposing `content`.
def node_text(node)
  node.content.gsub(/[<>]/, '<' => '&lt;', '>' => '&gt;')
end
Returns the slug of the document
# File lib/record_extractor.rb, line 45
# Returns the slug of the current file.
#
# Lookup order: the 'slug' entry of the file's data (Jekyll v3), then the
# file's own `slug` method (Jekyll v2), and finally the file name stripped
# of its directory and extension.
def slug
  if @file.data.key?('slug')
    @file.data['slug']
  elsif @file.respond_to?(:slug)
    @file.slug
  else
    filename = File.basename(@file.path)
    File.basename(filename, File.extname(filename))
  end
end
Returns a unique string of hierarchy from title to h6, used for distinct
# File lib/record_extractor.rb, line 139
# Builds a single string out of the record's hierarchy, from title down to
# h6, joining the levels that are present with ' > '.
#
# data - record hash, read through its :title and :h1..:h6 keys.
def unique_hierarchy(data)
  parts = []
  %i(title h1 h2 h3 h4 h5 h6).each do |level|
    label = data[level]
    parts << label unless label.nil?
  end
  parts.join(' > ')
end
Returns an object of all weights
# File lib/record_extractor.rb, line 182
# Gathers the ranking weights of a record into one hash.
#
# item  - the record being built.
# index - position of the node among the matched nodes.
#
# Returns a hash with :tag_name, :heading_relevance and :position.
def weight(item, index)
  tag_score = weight_tag_name(item)
  relevance_score = weight_heading_relevance(item)

  { tag_name: tag_score, heading_relevance: relevance_score, position: index }
end
The more words are in common between this node and its parent heading, the higher the score
# File lib/record_extractor.rb, line 158
# Scores how much the record's text has in common with its headings: the
# result is the number of distinct words found both in the headings
# (:title, :h1..:h6) and in :text, compared case-insensitively.
#
# data - record hash; :text must be a string.
#
# Returns an Integer, 0 when nothing is shared.
def weight_heading_relevance(data)
  # Get list of unique words in headings
  title_words = %i(title h1 h2 h3 h4 h5 h6)
                .select { |title| data.key?(title) }
                .flat_map { |title| data[title].to_s.split(/\W+/) }
                .reject(&:empty?)
                .map(&:downcase)
                .uniq

  # Splitting a string that starts with a non-word character yields a
  # leading empty token; drop it so it never counts as a shared word.
  text_words = data[:text].downcase.split(/\W+/).reject(&:empty?)

  # Intersect words in headings with words in text
  (title_words & text_words).size
end
Returns a weight based on the tag_name
# File lib/record_extractor.rb, line 173
# Gives a weight to the record according to its :tag_name.
#
# Headings go from 100 (h1) down to 50 (h6) in steps of 10; any other tag
# weighs 0.
def weight_tag_name(item)
  heading_weights = {
    'h1' => 100, 'h2' => 90, 'h3' => 80,
    'h4' => 70, 'h5' => 60, 'h6' => 50
  }
  # Not a heading, no weight
  heading_weights.fetch(item[:tag_name], 0)
end