class WebPageParser::IndependentPageParserV1
IndependentPageParserV1
Parses Independent news web pages.
Public Instance Methods
content()
click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 34
# Returns the article body as an array of non-empty paragraph strings.
#
# Paragraphs are the <p> elements matched by the CSS selector below. Any
# script/object elements nested inside a paragraph are removed from the parsed
# document before its text is read. Each paragraph is stripped and runs of
# newlines are collapsed to a single space. The result is memoized in @content.
def content
  @content ||= begin
    paragraphs = html_doc.css('div.articleContent p,div[itemprop="articleBody"] > p').map do |node|
      node.search('script,object').remove
      text = node.text
      text.strip.gsub(/\n+/, ' ') if text
    end
    # Drop paragraphs that produced no text at all, then the blank ones.
    paragraphs.compact.reject(&:empty?)
  end
end
date()
click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 46
# Returns the publication date read from the page's
# <meta property="article:published_time"> tag as a DateTime, or nil when the
# tag is absent or its content attribute is missing or cannot be parsed.
#
# A successfully parsed value is memoized in @date. A nil result is not cached,
# so the lookup is repeated on the next call.
def date
  return @date if @date

  date_meta = html_doc.at_css('meta[property="article:published_time"]')
  return unless date_meta

  @date = begin
    DateTime.parse(date_meta['content'])
  rescue ArgumentError, TypeError
    # ArgumentError (Date::Error on newer Rubies): the text is not a date.
    # TypeError: the content attribute is missing, so nil was passed to parse.
    # Anything else is unexpected and is deliberately allowed to propagate.
    nil
  end
end
guid_from_url()
click to toggle source
Independent articles have a GUID in the URL (as of Jan 2014, a seven-digit integer at the end of the URL, before the .html extension).
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 21
# Extracts the article identifier from the page URL: the last run of 6 to 12
# consecutive digits in the URL string. Returns nil when the URL contains no
# such run.
def guid_from_url
  # collect every large number in the url; the guid is the final one, if any
  digit_runs = url.to_s.scan(/[0-9]{6,12}/)
  digit_runs.last
end
html_doc()
click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 26
# Returns the page parsed with Nokogiri::HTML. The document is built on first
# use and cached in @html_document for subsequent calls.
def html_doc
  return @html_document if @html_document

  @html_document = Nokogiri::HTML(page)
end
title()
click to toggle source
# File lib/web-page-parser/parsers/independent_page_parser.rb, line 30
# Returns the article headline: the stripped text of the h1 element(s) matched
# by the CSS selector below. The value is cached in @title.
def title
  return @title if @title

  headline_nodes = html_doc.css('div#main h1.title,article h1[itemprop="headline"]')
  @title = headline_nodes.text.strip
end