class Slaw::Extract::Extractor

Routines for extracting and cleaning up content from other formats, such as HTML.

Public Instance Methods

extract_from_file(filename) click to toggle source

Extract text from a file.

@param filename [String] filename to extract from

@return [String] extracted text

# File lib/slaw/extract/extractor.rb, line 13
# Extract text from a file, choosing the extraction routine by extension.
#
# Names ending in +.html+ or +.htm+ (in any letter case) are handed to
# extract_from_html; every other name is handed to extract_from_text.
#
# @param filename [String] filename to extract from
#
# @return [String] extracted text
def extract_from_file(filename)
  # Downcase first so that names such as "ACT.HTML" are still treated as HTML.
  if filename.downcase.end_with?('.html', '.htm')
    extract_from_html(filename)
  else
    extract_from_text(filename)
  end
end
extract_from_html(filename) click to toggle source
# File lib/slaw/extract/extractor.rb, line 25
# Read an HTML file from disk and convert its contents with html_to_text.
#
# @param filename [String] path of the HTML file to read
#
# @return whatever html_to_text returns for the file's contents
def extract_from_html(filename)
  markup = File.read(filename)
  html_to_text(markup)
end
extract_from_text(filename) click to toggle source
# File lib/slaw/extract/extractor.rb, line 21
# Return the entire contents of a file, unmodified.
#
# @param filename [String] path of the file to read
#
# @return [String] the file's contents
def extract_from_text(filename)
  File.open(filename, &:read)
end
get_mimetype(filename) click to toggle source
# File lib/slaw/extract/extractor.rb, line 38
# Determine the MIME type of a file.
#
# The open file is first passed to MimeMagic.by_magic; when that yields
# nothing, MimeMagic.by_path is asked using the filename instead.
#
# @param filename [String] path of the file to inspect
#
# @return the result of MimeMagic.by_magic, or of MimeMagic.by_path as a fallback
def get_mimetype(filename)
  sniffed = File.open(filename) { |io| MimeMagic.by_magic(io) }
  return sniffed if sniffed

  MimeMagic.by_path(filename)
end
html_to_text(html) click to toggle source
# File lib/slaw/extract/extractor.rb, line 29
# Convert an HTML string to text by running it through the
# +html_to_akn_text.xsl+ stylesheet that sits next to this source file.
#
# @param html [String] HTML markup to convert
#
# @return [String] the transformed output with its leading XML declaration removed
def html_to_text(html)
  stylesheet_path = File.join(File.dirname(__FILE__), 'html_to_akn_text.xsl')

  # Block form closes the stylesheet handle once it has been parsed; the
  # IO (rather than a String) is still what gets handed to Nokogiri.
  xslt = File.open(stylesheet_path) { |io| Nokogiri::XSLT(io) }

  text = xslt.transform(Nokogiri::HTML(html)).to_s
  # remove XML encoding at top
  text.sub(/^<\?xml [^>]*>/, '')
end