class Boilerpipe::SAX::BoilerpipeHTMLParser
Public Class Methods
parse(text)
click to toggle source
# File lib/boilerpipe/sax/boilerpipe_html_parser.rb, line 3
#
# Runs +text+ through the preprocessor and a Nokogiri clean-up pass, feeds the
# result to a SAX parser driven by an HTMLContentHandler, and returns that
# handler's +text_document+.
def self.parse(text)
  # Remove the tags that are known to cause issues before any parsing happens.
  stripped = Preprocessor.strip(text)

  # Round-trip through Nokogiri's HTML parser so bad tags and errors get
  # fixed up (this step is still being experimented with).
  repaired_html = Nokogiri::HTML(stripped).to_html

  # The SAX parser drives the handler; the handler holds the result.
  content_handler = HTMLContentHandler.new
  Nokogiri::HTML::SAX::Parser.new(content_handler).parse(repaired_html)
  content_handler.text_document
end