class WebPageParser::BbcNewsPageParserV5

Public Instance Methods

content()
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 170
# Extracts the article body text as an array of stripped strings.
#
# The story container is looked up with a series of CSS selectors, tried in
# order; the first one whose matches have child nodes is used. Paragraphs and
# cross-headings among the container's children are kept in document order.
# The result is memoized in @content.
#
# @return [Array<String>] stripped text of each kept node; empty when no
#   selector produced any children
def content
  return @content if @content

  story_body = nil
  [
    'div.story-body > div.story-body__inner', # tried first
    'div.story-body',                         # pre April 2015
    'div#story-body',                         # older bbc articles
    'td.storybody'                            # very old bbc articles
  ].each do |selector|
    story_body = html_doc.css(selector)
    break unless story_body.children.empty?
  end

  text_nodes = story_body.children.select do |node|
    # A missing class attribute reads as nil; normalise it so a bare <span>
    # or <h2> is skipped instead of raising NoMethodError.
    css_class = node['class'].to_s
    case node.name
    when 'p' then true
    when 'span' then css_class.include?('cross-head') # pre April 2015 headings
    when 'h2' then css_class.include?('crosshead')    # post April 2015 headings
    else false
    end
  end

  # Assign only after extraction succeeded, so a failure part-way through is
  # never memoized as a partial result.
  @content = text_nodes.map { |node| node.text.strip }
end
date()
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 201
# Returns the publication date read from the page's
# OriginalPublicationDate meta tag, memoized in @date.
#
# @return [DateTime, nil] the parsed date, or nil when the meta tag is
#   absent or its content cannot be parsed
def date
  return @date if @date

  publication_meta = html_doc.at_css('meta[name=OriginalPublicationDate]')
  if publication_meta
    # Best-effort parse: any StandardError from parsing yields nil.
    @date = begin
      DateTime.parse(publication_meta['content'])
    rescue StandardError
      nil
    end
  end

  @date
end
html_doc()
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 144
# Returns the Nokogiri document built from +page+, parsing it on the first
# call and reusing the stored @html_document afterwards.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title()
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 148
# Returns the article headline, memoized in @title.
#
# Heading selectors are tried in order and the first non-empty stripped text
# wins. If none yields text, the content of the Headline meta tag is used
# when that tag exists.
#
# @return [String] the stripped headline; empty when nothing was found
def title
  return @title if @title

  heading_selectors = [
    'h1.story-body__h1',
    'h1.story-header',           # older bbc articles
    'div#meta-information h1'    # older bbc articles
  ]
  heading_selectors.each do |selector|
    @title = html_doc.css(selector).text.strip
    break unless @title.empty?
  end
  return @title unless @title.empty?

  # very old bbc articles carry the headline in a meta tag
  headline_meta = html_doc.at_css('meta[name=Headline]')
  @title = headline_meta['content'].to_s.strip if headline_meta

  @title
end