class WebPageParser::WashingtonPostPageParserV2
WashingtonPostPageParserV2
Parses Washington Post web pages using HTML parsing. Works since June 2018.
Public Instance Methods
content()
click to toggle source
# File lib/web-page-parser/parsers/washingtonpost_page_parser.rb, line 79
# Returns the story body as an array of strings: the stripped text of every
# node matched by the CSS selector below, with empty strings removed.
# The result is memoized in @content, so the document is queried only once.
def content
  @content ||= html_doc
               .css('article:first p,article:first div.subhead')
               .map { |node| node.text.strip }
               .reject(&:empty?)
end
date()
click to toggle source
# File lib/web-page-parser/parsers/washingtonpost_page_parser.rb, line 85
# Returns the publication date as a DateTime shifted to a zero (UTC) offset,
# read from the `content` attribute of the first element carrying
# itemprop="datePublished". Returns nil when no such element exists or when
# the attribute is missing or cannot be parsed as a date.
# A successfully parsed value is memoized in @date.
def date
  return @date if @date

  date_meta = html_doc.at_css('*[itemprop="datePublished"]')
  return unless date_meta

  @date = begin
    DateTime.parse(date_meta['content']).new_offset(0)
  rescue ArgumentError, TypeError
    # ArgumentError (incl. Date::Error): the text is not a recognisable date.
    # TypeError: the element has no `content` attribute (nil was passed).
    # Anything else is unexpected and is allowed to propagate.
    nil
  end
end
guid_from_url()
click to toggle source
Washington Post articles have a UUID in the URL.
# File lib/web-page-parser/parsers/washingtonpost_page_parser.rb, line 66
# Extracts an identifier from the page URL: the last run of 30 to 40
# characters drawn from lowercase hex digits and hyphens (the shape of a
# UUID). Returns nil when the URL contains no such run.
def guid_from_url
  candidates = url.to_s.scan(/[a-f0-9-]{30,40}/)
  candidates.last
end
html_doc()
click to toggle source
# File lib/web-page-parser/parsers/washingtonpost_page_parser.rb, line 71
# Returns the Nokogiri HTML document built from `page`. The document is
# built on first use and cached in @html_document for later calls.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title()
click to toggle source
# File lib/web-page-parser/parsers/washingtonpost_page_parser.rb, line 75
# Returns the stripped text of the h1 element(s) marked
# itemprop="headline". The value is cached in @title.
def title
  return @title if @title

  headline = html_doc.css('h1[itemprop="headline"]')
  @title = headline.text.strip
end