module Wovnrb::Helpers::NokogumboHelper
Public Class Methods
parse_fragment(html_string, encoding = 'UTF-8')
click to toggle source
www.rubydoc.info/gems/nokogumbo/Nokogiri/HTML5#fragment-class_method
Nokogumbo does not properly support parsing fragments, and the current implementation of Nokogiri::HTML5.fragment does not handle encoding (see the second line of the code below, which sets it explicitly).
# File lib/wovnrb/helpers/nokogumbo_helper.rb, line 19
#
# Parses +html_string+ with Nokogiri::HTML5.parse, sets the document encoding
# and flattens the result into a fragment: the children of <head>, the
# children of <body> and any other children of <html>, in document order.
#
# @param html_string [String] markup to parse
# @param encoding [String] value assigned to the parsed document's encoding
# @return a Nokogiri::HTML::DocumentFragment, or the parsed document itself
#   when its only top-level child is not a single 'html' element
def parse_fragment(html_string, encoding = 'UTF-8')
  doc = Nokogiri::HTML5.parse(html_string)
  doc.encoding = encoding

  # no single HTML root? Return document as is
  return doc if doc.children.length != 1 || doc.children.first.name != 'html'

  fragment = Nokogiri::HTML::DocumentFragment.new(doc)

  # examine children of HTML element
  children = doc.children.first.children

  # NOTE: the name checks below must compare (==) the names of the html
  # element's *children*. Assigning (=) to the html element's name is always
  # truthy and renames the root element as a side effect.

  # head is always first. If present, take children but otherwise
  # ignore the head element
  fragment << children.shift.children if !children.empty? && children.first.name == 'head'

  # body may be next, or last. If found, take children but otherwise
  # ignore the body element. Also take any remaining elements, taking
  # care to preserve order.
  if !children.empty? && children.first.name == 'body'
    fragment << children.shift.children
    fragment << children
  elsif !children.empty? && children.last.name == 'body'
    body = children.pop
    fragment << children
    fragment << body.children
  else
    fragment << children
  end

  # return result
  fragment
end
parse_html(html_string, encoding = 'UTF-8')
click to toggle source
# File lib/wovnrb/helpers/nokogumbo_helper.rb, line 4
#
# Parses +html_string+ as a full document when an "<html" tag (any case)
# occurs within the first 1000 characters of the stripped input; otherwise
# hands the input to parse_fragment.
#
# @param html_string [String] markup to parse
# @param encoding [String] value assigned to the parsed document's encoding
# @return the Nokogiri::HTML5 document, or whatever parse_fragment returns
def parse_html(html_string, encoding = 'UTF-8')
  leading_markup = html_string.strip[0, 1000]
  return parse_fragment(html_string, encoding) unless leading_markup.match?(/<html/i)

  Nokogiri::HTML5(html_string).tap { |document| document.encoding = encoding }
end
Private Instance Methods
parse_fragment(html_string, encoding = 'UTF-8')
click to toggle source
www.rubydoc.info/gems/nokogumbo/Nokogiri/HTML5#fragment-class_method
Nokogumbo does not properly support parsing fragments, and the current implementation of Nokogiri::HTML5.fragment does not handle encoding (see the second line of the code below, which sets it explicitly).
# File lib/wovnrb/helpers/nokogumbo_helper.rb, line 19
#
# Parses +html_string+ with Nokogiri::HTML5.parse, sets the document encoding
# and flattens the result into a fragment: the children of <head>, the
# children of <body> and any other children of <html>, in document order.
#
# @param html_string [String] markup to parse
# @param encoding [String] value assigned to the parsed document's encoding
# @return a Nokogiri::HTML::DocumentFragment, or the parsed document itself
#   when its only top-level child is not a single 'html' element
def parse_fragment(html_string, encoding = 'UTF-8')
  doc = Nokogiri::HTML5.parse(html_string)
  doc.encoding = encoding

  # no single HTML root? Return document as is
  return doc if doc.children.length != 1 || doc.children.first.name != 'html'

  fragment = Nokogiri::HTML::DocumentFragment.new(doc)

  # examine children of HTML element
  children = doc.children.first.children

  # NOTE: the name checks below must compare (==) the names of the html
  # element's *children*. Assigning (=) to the html element's name is always
  # truthy and renames the root element as a side effect.

  # head is always first. If present, take children but otherwise
  # ignore the head element
  fragment << children.shift.children if !children.empty? && children.first.name == 'head'

  # body may be next, or last. If found, take children but otherwise
  # ignore the body element. Also take any remaining elements, taking
  # care to preserve order.
  if !children.empty? && children.first.name == 'body'
    fragment << children.shift.children
    fragment << children
  elsif !children.empty? && children.last.name == 'body'
    body = children.pop
    fragment << children
    fragment << body.children
  else
    fragment << children
  end

  # return result
  fragment
end
parse_html(html_string, encoding = 'UTF-8')
click to toggle source
# File lib/wovnrb/helpers/nokogumbo_helper.rb, line 4
#
# Parses +html_string+ as a full document when an "<html" tag (any case)
# occurs within the first 1000 characters of the stripped input; otherwise
# hands the input to parse_fragment.
#
# @param html_string [String] markup to parse
# @param encoding [String] value assigned to the parsed document's encoding
# @return the Nokogiri::HTML5 document, or whatever parse_fragment returns
def parse_html(html_string, encoding = 'UTF-8')
  leading_markup = html_string.strip[0, 1000]
  return parse_fragment(html_string, encoding) unless leading_markup.match?(/<html/i)

  Nokogiri::HTML5(html_string).tap { |document| document.encoding = encoding }
end