class WebPageParser::NewYorkTimesPageParserV2

NewYorkTimesPageParserV2 parses New York Times web pages, including the new page format introduced in January 2014

Public Instance Methods

content() click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 84
def content
  return @content if @content
  @content = []
  # 2018
  story_body = html_doc.css('article#story div.StoryBodyCompanionColumn p')
  if story_body.empty?
    # 2017
    story_body = html_doc.css('p.story-content')
  end
  if story_body.empty?
    # older style
    story_body = html_doc.css('p[itemprop=articleBody]')
  end
  story_body.each do |p|
    @content << p.text.strip
  end
  @content
end
date() click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 103
# Returns the publication date of the article as a DateTime, or nil when the
# page has no matching date meta tag, the tag has no "content" attribute, or
# the attribute value cannot be parsed as a date.
#
# The value is read from the "content" attribute of the element that
# html_doc.at_css returns for meta[name=dat] or meta[itemprop=datePublished].
# A successfully parsed date is cached in @date.
def date
  return @date if @date

  date_meta = html_doc.at_css('meta[name=dat],meta[itemprop=datePublished]')
  raw_date = date_meta && date_meta['content']
  return @date unless raw_date

  @date = begin
    DateTime.parse(raw_date)
  rescue ArgumentError, TypeError
    # Unparseable or non-string value (Date::Error is an ArgumentError):
    # treat it as "no date" rather than hiding every possible error.
    nil
  end
end
html_doc() click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 76
# Returns the page parsed with Nokogiri::HTML. The page is parsed on first use
# and the resulting document is cached in @html_document for later calls.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title() click to toggle source
# File lib/web-page-parser/parsers/new_york_times_page_parser.rb, line 80
# Returns the article headline: the text of the h1[itemprop=headline]
# selection with surrounding whitespace stripped. The value is cached in
# @title after the first call.
def title
  return @title if @title

  headline = html_doc.css('h1[itemprop=headline]')
  @title = headline.text.strip
end