class Boilerpipe::SAX::BoilerpipeHTMLParser

Public Class Methods

parse(text)
# File lib/boilerpipe/sax/boilerpipe_html_parser.rb, line 3
# Runs +text+ through the full extraction pipeline and returns the
# document collected by the SAX content handler.
#
# Steps, in order:
# 1. Preprocessor.strip removes tags that cause issues downstream.
# 2. The result is parsed by Nokogiri's DOM parser and serialized back
#    to HTML, to fix bad tags and errors (still experimental).
# 3. The serialized HTML is streamed through a Nokogiri SAX parser that
#    drives an HTMLContentHandler.
#
# @param text the HTML markup to process (presumably a String; confirm
#   against Preprocessor.strip)
# @return whatever HTMLContentHandler#text_document yields once parsing
#   has finished
def self.parse(text)
  cleaned = Preprocessor.strip(text)

  # DOM round-trip first: the SAX pass below then sees the re-serialized
  # markup rather than the raw input.
  repaired = Nokogiri::HTML(cleaned).to_html

  content_handler = HTMLContentHandler.new
  Nokogiri::HTML::SAX::Parser.new(content_handler).parse(repaired)
  content_handler.text_document
end