module Markitdown

Constants

VERSION

Public Class Methods

from_html(html, language_classifier=nil) click to toggle source
# File lib/markitdown.rb, line 7
# Converts a markup string to markdown.
#
# html                - String of markup; it is parsed with Nokogiri::XML.
# language_classifier - optional object forwarded unchanged to from_nokogiri.
#
# Returns whatever from_nokogiri returns for the parsed document's root node.
def self.from_html(html, language_classifier=nil)
  document = Nokogiri::XML(html)
  from_nokogiri(document.root, language_classifier)
end
from_nokogiri(node, language_classifier=nil) click to toggle source
# File lib/markitdown.rb, line 11
# Converts a parsed node tree to a markdown String.
#
# node                - root node to convert; it is handed to parse_node.
#                       A nil node (such as the missing root of an empty
#                       document) yields an empty String instead of raising.
# language_classifier - optional object forwarded unchanged to parse_node.
#
# Returns the markdown String after the whitespace/marker cleanups below.
def self.from_nokogiri(node, language_classifier=nil)
  return "" unless node

  markdown = parse_node(node, [], language_classifier).flatten.compact.join

  # The substitutions are order-dependent and are applied top to bottom.
  cleanups = [
    # Remove lines that contain nothing but whitespace characters.
    [/\n\s+\n/, "\n\n"],
    # Collapse any run of two or more newlines down to exactly two.
    [/\n{2,}/, "\n\n"],
    # Collapse two or more consecutive " > \n" sequences (empty blockquote
    # lines) into a single pair of blockquote prefixes.
    [/( > \n){2,}/, "\n > \n > "],
    # Collapse consecutive tabs down to a single space. Tabs pad divs and
    # spans, so nested divs/spans end up surrounded by a single space.
    [/\t+/, " "],
    # Remove a space before a period or question mark. Links and similar
    # elements are surrounded by spaces; this keeps the punctuation attached
    # when they end a sentence.
    [/ ([\.\?])/, '\1'],
    # Replace each END_TAG(x) placeholder with its closing marker x,
    # dropping any whitespace that precedes the placeholder.
    [/\s*END_TAG\((.{1,3})\)/, "\\1"],
    # Replace non-breaking spaces (U+00A0) with regular spaces.
    [/\u00a0/, " "]
  ]

  cleanups.each do |pattern, replacement|
    markdown = markdown.gsub(pattern, replacement)
  end
  markdown
end

Private Class Methods

find_parent(node, tag_name) click to toggle source
# File lib/markitdown.rb, line 245
# Walks from +node+ up through its +parent+ chain and returns the first node
# (including +node+ itself) whose +name+ equals +tag_name+.
#
# Returns nil when the chain ends without a match or when +node+ is nil.
def self.find_parent(node, tag_name)
  candidate = node
  while candidate
    return candidate if candidate.name == tag_name
    candidate = candidate.parent
  end
  nil
end
nested_list?(states) click to toggle source
# File lib/markitdown.rb, line 199
# Reports whether any entry of +states+ other than the first one is a
# "ul", "ol" or "blockquote".
#
# states - Array of tag names; the first entry is ignored.
#
# Returns true or false.
def self.nested_list?(states)
  states.drop(1).any? { |tag| ["ul", "ol", "blockquote"].include?(tag) }
end
newline(pre, line, count=1) click to toggle source
# File lib/markitdown.rb, line 208
# Builds the fragments for +count+ output lines.
#
# pre   - value emitted at the start of every line (kept as-is, not flattened).
# line  - value emitted after +pre+ on every line (may be nil).
# count - number of lines to produce (default 1).
#
# Returns a flat Array of the form [pre, line, "\n", pre, line, "\n", ...].
def self.newline(pre, line, count=1)
  # flat_map only unwraps the per-line triple, so an Array +pre+ stays nested.
  count.times.flat_map { [pre, line, "\n"] }
end
parse_node(node, states=[], language_classifier=nil) click to toggle source
# File lib/markitdown.rb, line 20
# Recursively converts +node+ and its descendants into markdown fragments.
#
# node                - node to convert. The method reads +name+, +children+,
#                       +text+, +attributes+, +parent+ and +xpath+ from it.
# states              - stack of lower-cased tag names, innermost first. The
#                       current tag is pushed for the duration of the call and
#                       popped before returning, so the Array is left as it
#                       was received.
# language_classifier - optional object; when given, +classify(text)+ is
#                       called for multi-line code and its result is written
#                       after the opening code fence.
#
# Tag names are matched case-insensitively, using the same lower-cased name
# that is pushed onto +states+.
#
# Returns a (possibly nested) Array of Strings and nils.
def self.parse_node(node, states=[], language_classifier=nil)
  tag = node.name.downcase

  # Skipped elements return before the stack is touched, so no stale entry is
  # left behind for the siblings that are parsed next.
  return [] if ["head", "title", "style"].include?(tag)

  results = []
  after = nil
  states.unshift tag
  pre = prefix(states)
  recurse = true
  strip_content = false
  flatten_content = false

  case tag
  when "div", "span"
    # Tabs act as padding; from_nokogiri later collapses them to one space.
    results << "\t"
    after = "\t"
  when "p"
    results << newline(pre, nil, 2)
    after = newline(pre, nil, 2)
  when "h1", "h2", "h3", "h4", "h5", "h6"
    # The digit in the tag name is the number of leading '#' characters.
    heading_marker = "#" * tag[1].to_i
    results << newline(pre, nil, 2)
    results << "#{heading_marker} "
    after = newline(pre, nil, 2)
  when "hr"
    results << newline(pre, nil, 2)
    results << "***"
    results << newline(pre, nil, 2)
  when "br"
    results << newline(pre, nil, 2)
  when "em", "i"
    # END_TAG(...) is a placeholder that from_nokogiri replaces with the
    # closing marker while removing whitespace in front of it.
    results << " *"
    after = "END_TAG(*) "
  when "strong", "b"
    results << " __"
    after = "END_TAG(__) "
  when "u"
    results << " _"
    after = "END_TAG(_) "
  when "strike", "del"
    results << " ~~"
    after = "END_TAG(~~) "
  when "mark"
    results << " =="
    after = "END_TAG(==) "
  when "sup"
    results << "^("
    after = "END_TAG()) "
  when "blockquote"
    results << "\n\n"
    results << pre
    after = "\n\n"
  when "ol", "ul"
    unless nested_list?(states)
      results << newline(pre, nil)
      after = "\n"
    end
  when "li", "dt"
    results << "\n"
    results << pre
  when "dl"
    unless nested_list?(states)
      results << newline(pre, nil)
      after = "\n\n"
    end
  when "dd"
    results << "\n"
    results << pre
    results << " : "
  when "a"
    href = node.attributes["href"]
    results << " ["
    after = "](#{href.value if href}) "
    strip_content = true
  when "img"
    results << " !["
    results << node.attributes["alt"].value if node.attributes["alt"]
    results << "]("
    results << node.attributes["src"].value if node.attributes["src"]
    results << ") "
  when "text"
    results << node.text.strip.gsub("\n","").gsub(/ {2,}/," ")
  when "code"
    if node.text.include?("\n")
      # Multi-line code becomes a fenced block: drop the leading newline and
      # trailing blank space, and turn non-breaking spaces into spaces.
      text = node.text.gsub(/^\n/,"").gsub(/\n\s*$/,"").gsub(/\u00a0/, " ")
      if language_classifier
        language = language_classifier.classify(text)
        results << "\n\n```#{language}\n#{text}\n```\n\n"
      else
        results << "\n\n```\n#{text}\n```\n\n"
      end
    else
      results << " `#{node.text}` "
    end
    # The node's text was emitted verbatim, so children are not visited.
    recurse = false
  when "table"
    results << "\n\n"
    after = "\n\n"
  when "th", "td"
    results << "|"
    strip_content = true
    flatten_content = true
  when "tr"
    after = "|\n"
    table = find_parent(node.parent, "table")
    if table && table.xpath(".//tr").first == node
      # The first row of a table is followed by the header separator row.
      cell_count = node.xpath(".//th|td").count
      # Build a new String rather than appending to the literal in place.
      after += ("|---" * cell_count) + "|\n"
    end
  end

  if recurse
    node.children.each do |child|
      contents = parse_node(child, states, language_classifier)
      contents = contents.flatten.compact.join.strip if strip_content
      contents = [contents].flatten.compact.join.gsub("\n", " ") if flatten_content
      results << contents
    end
  end

  if strip_content
    # Glue the closing text directly onto the last fragment so that no
    # separate element sits between the content and its terminator.
    last_fragment = results.pop
    results << "#{last_fragment}#{after}"
  else
    results << after
  end

  states.shift
  results
end
prefix(states) click to toggle source
# File lib/markitdown.rb, line 218
# Builds the line prefix for the current position in the document.
#
# states - stack of tag names, innermost first.
#
# Each entry contributes at most one fragment:
# * "blockquote" at any depth contributes " > ";
# * the direct parent (index 1) contributes a list marker (" 1. " for "ol",
#   " * " for "ul"), but only when the innermost entry is "li";
# * "ol"/"ul" entries further out contribute indentation (three and two
#   spaces respectively).
#
# Returns an Array of fragments ordered outermost first.
def self.prefix(states)
  inside_item = states.first == "li"

  fragments = states.each_with_index.map do |tag, depth|
    if tag == "blockquote"
      " > "
    elsif depth == 1
      if inside_item
        case tag
        when "ol" then " 1. "
        when "ul" then " * "
        end
      end
    elsif depth > 1
      case tag
      when "ol" then "   "
      when "ul" then "  "
      end
    end
  end

  # states is innermost-first, while the prefix reads outermost-first.
  fragments.compact.reverse
end