class WordToMarkdown::Converter

Constants

HEADING_DEPTH

Number of headings to guess, e.g., h6

HEADING_STEP

Percentile step for each heading

MIN_HEADING_SIZE

Minimum heading size

UNICODE_BULLETS

Unicode bullets to strip when processing

Attributes

document[R]

Public Class Methods

new(document) click to toggle source

@param document [WordToMarkdown::Document] The document to convert

# File lib/word-to-markdown/converter.rb, line 20
# Remember the document that the conversion passes will read and rewrite.
#
# @param document [WordToMarkdown::Document] The document to convert
def initialize(document)
  @document = document
end

Public Instance Methods

convert!() click to toggle source

Convert the document

Note: this action is destructive!

# File lib/word-to-markdown/converter.rb, line 27
# Run every conversion pass, in a fixed order: fonts and headings first,
# then tables, then list items.
#
# Note: this action is destructive!
#
# @return [Object] whatever the final pass returns
def convert!
  passes = %i[
    semanticize_font_styles!
    semanticize_headings!
    remove_paragraphs_from_tables!
    semanticize_table_headers!
    remove_paragraphs_from_list_items!
    remove_unicode_bullets_from_list_items!
    remove_whitespace_from_list_items!
    remove_numbering_from_list_items!
  ]

  # Each pass is invoked for its side effects; the last result is handed back.
  passes.map { |pass| send(pass) }.last
end
font_sizes() click to toggle source

@return [Array<Integer>] An array of font-sizes for implicit headings in the document

# File lib/word-to-markdown/converter.rb, line 55
# Distinct font sizes of the document's styled elements, each rounded to the
# nearest ten and sorted ascending. Elements without a font size are skipped.
# The list is computed once and then reused.
#
# @return [Array<Integer>] An array of font-sizes for implicit headings in the document
def font_sizes
  @font_sizes ||= begin
    raw_sizes = @document.tree.css('[style]').map(&:font_size)
    raw_sizes.compact.map { |size| size.round(-1) }.uniq.sort
  end
end
guess_heading(node) click to toggle source

Given a Nokogiri node, guess what heading it represents, if any

@param node [Nokogiri::Node] the Nokogiri node

@return [String, nil] the heading tag (e.g., h1), or nil

# File lib/word-to-markdown/converter.rb, line 69
# Given a Nokogiri node, guess what heading it represents, if any.
#
# @param node [#font_size] the node to inspect
# @return [String, nil] the heading tag (e.g., "h1"), or nil when the node
#   has no font size or meets none of the heading thresholds
def guess_heading(node)
  size = node.font_size
  return if size.nil?

  # Levels are tried from 1 upwards; the first threshold the size meets wins.
  level = (1...HEADING_DEPTH).find { |candidate| size >= h(candidate) }
  "h#{level}" if level
end
h(num) click to toggle source

Minimum font size required for a given heading; e.g., `h(2)` represents the minimum font size of an implicit h2.

@param num [Integer] the heading number, e.g., 1, 2

@return [Integer] the minimum font size

# File lib/word-to-markdown/converter.rb, line 83
# Minimum font size required for a given heading level, looked up as a
# percentile of the document's font sizes. The requested percentile drops by
# HEADING_STEP for every increment of +num+.
#
# @param num [Integer] the heading number, e.g., 1, 2
# @return [Integer] the minimum font size
def h(num)
  steps_above_lowest = HEADING_DEPTH - 1 - num
  font_sizes.percentile(steps_above_lowest * HEADING_STEP)
end
implicit_headings() click to toggle source

@return [Array<Nokogiri::Node>] Return an array of Nokogiri Nodes that are implicit headings

# File lib/word-to-markdown/converter.rb, line 44
# Styled elements whose font size is at least MIN_HEADING_SIZE; elements
# without a font size are excluded. The list is computed once and then reused.
#
# @return [Array<Nokogiri::Node>] Return an array of Nokogiri Nodes that are implicit headings
def implicit_headings
  @implicit_headings ||= begin
    styled = @document.tree.css('[style]')
    styled.reject do |element|
      size = element.font_size
      size.nil? || size < MIN_HEADING_SIZE
    end
  end
end
remove_numbering_from_list_items!() click to toggle source

Remove prepended numbers from list items

# File lib/word-to-markdown/converter.rb, line 117
# Remove prepended numbering from list items: a run of letters or digits
# followed by a period at the start of any line of a span's markup.
def remove_numbering_from_list_items!
  # Major version '5' of soffice is queried one span level deeper.
  nested = WordToMarkdown.soffice.major_version == '5'
  selector = nested ? 'li span span' : 'li span'

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(/^[a-zA-Z0-9]+\./m, '')
  end
end
remove_paragraphs_from_list_items!() click to toggle source

Remove top-level paragraphs from list items

# File lib/word-to-markdown/converter.rb, line 104
# Rename every paragraph found inside a list item to a span.
def remove_paragraphs_from_list_items!
  paragraphs = @document.tree.search('li p')
  paragraphs.each do |paragraph|
    paragraph.node_name = 'span'
  end
end
remove_paragraphs_from_tables!() click to toggle source

Remove top-level paragraphs from table cells

# File lib/word-to-markdown/converter.rb, line 99
# Rename every paragraph found inside a table cell to a span.
def remove_paragraphs_from_tables!
  paragraphs = @document.tree.search('td p')
  paragraphs.each do |paragraph|
    paragraph.node_name = 'span'
  end
end
remove_unicode_bullets_from_list_items!() click to toggle source

Remove prepended unicode bullets from list items

# File lib/word-to-markdown/converter.rb, line 109
# Remove prepended unicode bullets from list items: any run of characters
# from UNICODE_BULLETS at the start of a line of a span's markup.
def remove_unicode_bullets_from_list_items!
  # Major version '5' of soffice is queried one span level deeper.
  nested = WordToMarkdown.soffice.major_version == '5'
  selector = nested ? 'li span span' : 'li span'

  # The pattern does not depend on the span, so it is built once up front.
  leading_bullets = /^([#{UNICODE_BULLETS.join("")}]+)/

  @document.tree.search(selector).each do |item|
    item.inner_html = item.inner_html.gsub(leading_bullets, '')
  end
end
remove_whitespace_from_list_items!() click to toggle source

Remove whitespace from list items

# File lib/word-to-markdown/converter.rb, line 125
# Strip leading and trailing whitespace from the markup of every list-item
# span.
#
# The stripped markup is assigned back through `inner_html=`. Nokogiri's
# `inner_html` reader serializes the node's children into a new String, so
# an in-place `strip!` on that value never reaches the document.
def remove_whitespace_from_list_items!
  @document.tree.search('li span').each do |span|
    span.inner_html = span.inner_html.strip
  end
end
semanticize_font_styles!() click to toggle source

Convert span-based font styles to `strong`s and `em`s

# File lib/word-to-markdown/converter.rb, line 88
# Convert span-based font styles to `strong`s and `em`s. Bold takes
# precedence: a span that is both bold and italic becomes `strong`. Spans
# that are neither keep their name.
def semanticize_font_styles!
  @document.tree.css('span').each do |span|
    tag = if span.bold?
            'strong'
          elsif span.italic?
            'em'
          end
    span.node_name = tag unless tag.nil?
  end
end
semanticize_headings!() click to toggle source

Try to guess headings, where they are only implicit, based on font size

# File lib/word-to-markdown/converter.rb, line 135
# Rename each implicit heading to the heading tag guessed for it; elements
# for which no heading can be guessed are left untouched.
def semanticize_headings!
  implicit_headings.each do |candidate|
    tag = guess_heading(candidate)
    next if tag.nil?

    candidate.node_name = tag
  end
end
semanticize_table_headers!() click to toggle source

Convert table headers to `th`s

# File lib/word-to-markdown/converter.rb, line 130
# Rename the cells matched by 'table tr:first td' to `th`.
def semanticize_table_headers!
  header_cells = @document.tree.search('table tr:first td')
  header_cells.each do |cell|
    cell.node_name = 'th'
  end
end