class WebPageParser::BbcNewsPageParserV1

BbcNewsPageParserV1 parses BBC News web pages exactly like the old News Sniffer BbcNewsPage class did. It should only ever be used for backwards compatibility with News Sniffer and is never supplied for use by a factory.

Constants

CONTENT_RE
DATE_RE
PARA_RE
STRIP_TAGS_RE
TITLE_RE
WHITESPACE_RE

Public Instance Methods

hash() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 34
# Returns the MD5 hex digest of the page content only.
#
# The elements of +content+ are concatenated with no separator and the
# resulting string is digested. The title is deliberately left out, because
# the old News Sniffer hashed nothing but the content.
def hash
  body_text = content.join('')
  Digest::MD5.hexdigest(body_text)
end

Private Instance Methods

content_processor() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 50
# Cleans up the raw text held in @content and replaces it with the result.
#
# Steps, in order:
# 1. every match of STRIP_TAGS_RE is replaced with an empty string
# 2. every match of WHITESPACE_RE is replaced with an empty string
# 3. the text is passed through decode_entities
# 4. the text is split on PARA_RE
#
# NOTE(review): the constants are called as PATTERN.gsub(string, replacement),
# which is not core Regexp API - presumably they are Oniguruma ORegexp
# objects; confirm against where they are defined.
#
# Returns the value assigned to @content (the result of the split).
def content_processor
  without_tags = STRIP_TAGS_RE.gsub(@content, '')
  without_whitespace = WHITESPACE_RE.gsub(without_tags, '')
  decoded = decode_entities(without_whitespace)
  @content = decoded.split(PARA_RE)
end
date_processor() click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 41
# Replaces the raw date text held in @date with a parsed DateTime.
#
# If DateTime.parse rejects the text with an ArgumentError, @date is set to
# the current UTC time instead. Note that this fallback value is a Time,
# not a DateTime.
#
# Returns the value assigned to @date.
def date_processor
  # Original author's note: OPD is in GMT/UTC, which DateTime seems to use by default
  @date = DateTime.parse(@date)
rescue ArgumentError
  @date = Time.now.utc
end