class ManBook::Parser

Public Class Methods

parse(html_file)
# File lib/manbook/parser.rb, line 4
# Parses a man page rendered as HTML and extracts its metadata.
#
# The title is taken from the paragraph following the NAME heading, falling
# back to the document's <title> element when no NAME text is found. The
# author is taken from the paragraph following the AUTHORS heading.
#
# @param html_file [String] path of the HTML file to read
# @return [Page] a page carrying the file's base name plus title and author,
#   with line breaks in title and author replaced by single spaces
def parse(html_file)
  doc = Nokogiri::HTML(File.read(html_file))

  #
  # The way we extract a section is highly dependent on the concrete HTML. Yet, I found no other way
  # to extract the title of a man page in a reliable way.
  #
  # Two layouts are tried in order: the heading as a <b> element whose parent
  # is followed by the paragraph, then the heading as an <h2> element followed
  # directly by the paragraph. A blank (empty or whitespace-only) first result
  # triggers the second lookup, so title and author are treated the same way.
  #
  section_text = lambda do |heading|
    text = doc.xpath("//b[text() = '#{heading}']/../following-sibling::p[1]/descendant-or-self::text()").to_s

    if text.blank?
      doc.xpath("//h2[text() = '#{heading}']/following-sibling::p[1]/descendant-or-self::text()").to_s
    else
      text
    end
  end

  title = section_text.call('NAME')

  # fall back to document title
  title = doc.xpath("//html/head/title/text()").to_s if title.blank?

  author = section_text.call('AUTHORS')

  Page.new.tap do |page|
    page.file_name = File.basename(html_file)
    page.title     = title.split("\n").join(' ')
    page.author    = author.split("\n").join(' ')
  end
end