class WordToMarkdown::Document

Attributes

path[R]
tmpdir[R]

Public Class Methods

new(path, tmpdir = nil) click to toggle source

@param path [String] Path to the Word document
@param tmpdir [String] Path to a working directory to use

# File lib/word-to-markdown/document.rb, line 12
# Set up a document for the Word file at +path+.
#
# @param path [String] path to the Word document; expanded relative to Dir.pwd
# @param tmpdir [String, nil] working directory to use; when nil a new one is
#   created with Dir.mktmpdir
# @raise [NotFoundError] if nothing exists at the expanded path
def initialize(path, tmpdir = nil)
  @path = File.expand_path(path, Dir.pwd)

  # Check the input before allocating anything: creating the temp dir first
  # would leave an orphaned directory on disk whenever the file is missing.
  raise NotFoundError, "File #{@path} does not exist" unless File.exist?(@path)

  @tmpdir = tmpdir || Dir.mktmpdir
end

Public Instance Methods

encoding() click to toggle source

Determine the document encoding

@return [String] the encoding, defaulting to “UTF-8”

# File lib/word-to-markdown/document.rb, line 46
# Determine the document encoding from the first "charset=" declaration
# found in the raw HTML.
#
# @return [String] the declared charset (with "macintosh" rewritten to
#   "MacRoman"), or "UTF-8" when no charset declaration is present
def encoding
  sanitized = raw_html.encode('UTF-8', invalid: :replace, replace: '')
  charset = sanitized[/charset=([^\"]+)/, 1]
  return 'UTF-8' unless charset

  # NOTE(review): presumably "MacRoman" is the name Ruby knows this encoding by — confirm
  charset.sub('macintosh', 'MacRoman')
end
extension() click to toggle source

@return [String] the document's extension

# File lib/word-to-markdown/document.rb, line 19
# File extension of the source document, as reported by File.extname.
#
# @return [String] the document's extension, including the leading dot
#   (empty when the path has no extension)
def extension
  File.extname(path)
end
html() click to toggle source

@return [String] the html representation of the document

# File lib/word-to-markdown/document.rb, line 33
# Serialize the parsed tree back to HTML.
#
# @return [String] the html representation of the document, with the newline
#   that directly follows each closing </li> tag removed
def html
  serialized = tree.to_html
  serialized.gsub("</li>\n", '</li>')
end
markdown() click to toggle source

@return [String] the markdown representation of the document

# File lib/word-to-markdown/document.rb, line 38
# Convert the document's HTML to markdown with ReverseMarkdown, then pass the
# result through scrub_whitespace. The result is memoized in @markdown.
#
# @return [String] the markdown representation of the document
def markdown
  return @markdown if @markdown

  converted = ReverseMarkdown.convert(html, WordToMarkdown::REVERSE_MARKDOWN_OPTIONS)
  @markdown = scrub_whitespace(converted)
end
Also aliased as: to_s
to_s()
Alias for: markdown
tree() click to toggle source

@return [Nokogiri::HTML::Document] the parsed document, with any title elements removed

# File lib/word-to-markdown/document.rb, line 24
# Parse the normalized HTML with Nokogiri and strip its <title> elements.
# The parsed document is memoized in @tree.
#
# @return [Object] whatever Nokogiri::HTML returns for the normalized HTML
def tree
  return @tree if @tree

  document = Nokogiri::HTML(normalized_html)
  document.css('title').remove
  @tree = document
end

Private Instance Methods

dest_path() click to toggle source

@return [String] the path to the intermediary HTML document

# File lib/word-to-markdown/document.rb, line 88
# Location of the intermediary HTML file: the source file's basename with its
# extension swapped for ".html", resolved inside tmpdir.
#
# @return [String] the path to the intermediary HTML document
def dest_path
  # Anchor with \z (end of string) and use sub: `$` matches at every line end,
  # so gsub would also rewrite an extension-like suffix in the middle of a
  # multi-line filename. An empty extension still yields "<name>.html".
  dest_filename = File.basename(path).sub(/#{Regexp.escape(extension)}\z/, '.html')
  File.expand_path(dest_filename, tmpdir)
end
filter() click to toggle source

@return [String] the LibreOffice filter to use for conversion

# File lib/word-to-markdown/document.rb, line 105
# Pick the conversion filter based on the major version that
# WordToMarkdown.soffice reports.
#
# @return [String] 'html:XHTML Writer File:UTF8' when the major version is
#   the string '5', otherwise 'html'
def filter
  return 'html:XHTML Writer File:UTF8' if WordToMarkdown.soffice.major_version == '5'

  'html'
end
normalized_html() click to toggle source

Perform pre-processing normalization

@return [String] the normalized html

# File lib/word-to-markdown/document.rb, line 60
# Perform pre-processing normalization: reinterpret the raw HTML in the
# detected encoding, transcode it to UTF-8 (dropping invalid bytes), run it
# through Premailer#to_inline_css, then apply a fixed series of textual
# substitutions.
#
# @return [String] the normalized html
def normalized_html
  decoded = raw_html.dup.force_encoding(encoding)
  utf8 = decoded.encode('UTF-8', invalid: :replace, replace: '')
  inlined = Premailer.new(utf8, with_html_string: true, input_encoding: 'UTF-8').to_inline_css

  # Applied in order; whitespace between tags is collapsed only after line
  # breaks have been turned into spaces.
  [
    [/\n|\r/, ' '],   # line breaks become spaces
    [/“|”/, '"'],     # straighten curly double quotes
    [/‘|’/, "'"],     # straighten curly single quotes
    [/>\s+</, '><']   # drop whitespace between adjacent tags
  ].each { |pattern, replacement| inlined.gsub!(pattern, replacement) }

  inlined
end
raw_html() click to toggle source

@return [String] the unnormalized HTML representation

# File lib/word-to-markdown/document.rb, line 94
# Run the conversion command, read the HTML file it is expected to leave at
# dest_path, delete that file, and memoize the contents in @raw_html.
#
# @return [String] the unnormalized HTML representation
# @raise [ConversionError] when no file exists at dest_path after the command ran
def raw_html
  return @raw_html if @raw_html

  WordToMarkdown.run_command('--headless', '--convert-to', filter, path, '--outdir', tmpdir)
  raise ConversionError, "Failed to convert #{path}" unless File.exist?(dest_path)

  contents = File.read(dest_path)
  # The intermediary file is no longer needed once its contents are in memory
  File.delete(dest_path)
  @raw_html = contents
end
scrub_whitespace(string) click to toggle source

Perform post-processing normalization of certain Word quirks

@param string [String] the markdown representation of the document

@return [String] the normalized markdown

# File lib/word-to-markdown/document.rb, line 76
# Perform post-processing normalization of certain Word quirks. Works on a
# copy, so the argument itself is never modified.
#
# @param string [String] the markdown representation of the document
# @return [String] the normalized markdown
def scrub_whitespace(string)
  scrubbed = string.dup
  scrubbed.gsub!('&nbsp;', ' ')       # HTML encoded spaces
  scrubbed.sub!(/\A[[:space:]]+/, '') # document leading whitespace
  scrubbed.sub!(/[[:space:]]+\z/, '') # document trailing whitespace
  scrubbed.gsub!(/([ ]+)$/, '')       # line trailing whitespace
  scrubbed.gsub!(/\n\n\n\n/, "\n\n")  # Quadruple line breaks
  # Unicode non-breaking spaces, injected as tabs. Written as an explicit
  # escape: a literal U+00A0 looks exactly like an ordinary space in source,
  # and deleting ordinary spaces would glue every word together.
  scrubbed.delete!("\u00A0")
  scrubbed
end