class WebPageParser::GuardianPageParserV2
GuardianPageParserV2
Parses Guardian web pages using HTML parsing. It can parse both old and new articles, but its results sometimes differ slightly from the V1 parser's because it strips most HTML tags (such as <strong>), which the V1 parser did not do.
Public Instance Methods
content()
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 77
# Returns the article body as an Array of Strings, one entry per <p>, <h2>
# or <h3> element found beneath div#article-body-blocks or
# div[itemprop=articleBody]. Each entry is the element's text with
# surrounding whitespace stripped; elements whose text is empty or only
# whitespace are left out.
#
# The result is cached in @content, so the document is only walked once.
def content
  return @content if @content

  wanted_tags = %w[p h2 h3]
  story_body = html_doc.css('div#article-body-blocks *, div[itemprop=articleBody] *').select do |e|
    wanted_tags.include?(e.name)
  end
  # Strip before testing for emptiness so whitespace-only elements are dropped
  # rather than surviving as "" entries.
  @content = story_body.map { |e| e.text.strip }.reject(&:empty?)
end
css_first_text(top_e, *selectors)
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 59
# Tries each CSS selector in the order given against top_e and returns the
# stripped text of the first matched element whose text is not blank.
# Returns nil when no selector produces such an element (or when no
# selectors are given).
def css_first_text(top_e, *selectors)
  selectors.each do |selector|
    hit = top_e.css(selector).find do |node|
      !node.nil? && !node.text.strip.empty?
    end
    return hit.text.strip if hit
  end
  nil
end
date()
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 85
# Returns the publication date as a DateTime, parsed from the content
# attribute of the page's <meta property="article:published_time"> element.
# Returns nil when that element is absent or its content cannot be parsed
# as a date. A successfully parsed value is cached in @date.
def date
  return @date if @date

  date_meta = html_doc.at_css('meta[property="article:published_time"]')
  return @date unless date_meta

  @date = begin
    DateTime.parse(date_meta['content'])
  rescue ArgumentError, TypeError
    # ArgumentError: the string is not a recognisable date.
    # TypeError: the content attribute is missing (nil).
    nil
  end
end
filter_url(url)
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 93
# Returns url (converted to a String) with every occurrence of the host
# "www.guprod.gnl" replaced by "www.guardian.co.uk".
def filter_url(url)
  # Some weird Guardian problem with some older articles.
  stale_host = "www.guprod.gnl"
  public_host = "www.guardian.co.uk"
  url.to_s.gsub(stale_host, public_host)
end
html_doc()
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 55
# Returns the result of Nokogiri::HTML(page), building it on first use and
# caching it in @html_document for later calls.
# NOTE(review): `page` is defined outside this view; presumably the raw
# HTML of the article - confirm against the base parser.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title()
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 70
# Returns the article title, caching it in @title.
#
# The headline is looked up via css_first_text using 'h1[itemprop=headline]'
# and then 'div#main-article-info h1:first'. If that gives nil, the text of
# the document's <title> element is used instead, keeping only the part
# before the first '|' with surrounding whitespace stripped.
#
# Returns nil when neither source provides any text.
def title
  return @title if @title

  @title = css_first_text(html_doc, 'h1[itemprop=headline]', 'div#main-article-info h1:first')
  if @title.nil?
    # An empty or missing <title> makes split return [], so first is nil;
    # guard against that instead of calling strip on nil.
    leading_part = html_doc.css('title').text.split('|').first
    @title = leading_part.strip if leading_part
  end
  @title
end