class WebPageParser::IndependentPageParserV1

IndependentPageParserV1 parses Independent news web pages.

Public Instance Methods

content() click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 34
# Returns the article body as an Array of paragraph strings.
#
# Paragraphs are selected from html_doc with a CSS query. Any nested
# <script>/<object> elements are removed from each paragraph before its
# text is read. Each text is stripped, runs of newlines are collapsed to a
# single space, and paragraphs that end up empty are dropped. The result
# is cached in @content, so later calls return the same Array.
def content
  return @content if @content

  raw_texts = html_doc.css('div.articleContent p,div[itemprop="articleBody"] > p').map do |node|
    # Remove embedded scripts/objects first so they do not leak into the text.
    node.search('script,object').remove
    node.text
  end

  cleaned = raw_texts.compact.map { |text| text.strip.gsub(/\n+/,' ') }
  @content = cleaned.reject { |text| text.empty? }
end
date() click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 46
# Returns the publication date as a DateTime, or nil when unavailable.
#
# The value is read from the "content" attribute of the
# article:published_time <meta> tag and parsed with DateTime.parse.
# Any StandardError raised while parsing results in nil. A truthy result
# is cached in @date; a nil result is not, so the lookup is retried on
# the next call.
def date
  return @date if @date

  published_meta = html_doc.at_css('meta[property="article:published_time"]')
  return @date unless published_meta

  @date = begin
    DateTime.parse(published_meta['content'])
  rescue StandardError
    # Best effort: a missing or malformed timestamp simply yields no date.
    nil
  end
end
guid_from_url() click to toggle source

Independent articles have a GUID in the URL (as of Jan 2014, a seven-digit integer at the end of the URL, before the HTML extension).

# File lib/web-page-parser/parsers/independent_page_parser.rb, line 21
# Extracts the article GUID from the page URL.
#
# Returns the last run of 6 to 12 consecutive digits found in the string
# form of url, or nil when there is no such run.
def guid_from_url
  digit_runs = url.to_s.scan(/[0-9]{6,12}/)
  # When several long numbers appear, the final one is taken.
  digit_runs.last
end
html_doc() click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 26
# Returns the page parsed by Nokogiri::HTML.
#
# page is parsed on first use and the result is cached in @html_document,
# so later calls reuse the same object.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title() click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 30
# Returns the article headline with surrounding whitespace stripped.
#
# The headline is the text of the nodes in html_doc that match the CSS
# query below. The result is cached in @title after the first lookup.
def title
  return @title if @title

  headline_nodes = html_doc.css('div#main h1.title,article h1[itemprop="headline"]')
  @title = headline_nodes.text.strip
end