class WebPageParser::BbcNewsPageParserV2
BbcNewsPageParserV2 parses BBC News web pages.
Constants
- CONTENT_RE
- DATE_RE
- PARA_RE
- STRIP_BLOCKS_RE
- STRIP_CAPTIONS_RE
- STRIP_COMMENTS_RE
- STRIP_TAGS_RE
- TITLE_RE
- WHITESPACE_RE
Private Instance Methods
content_processor()
click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 74
# Cleans the raw text held in @content and leaves it as an Array of pieces.
#
# Each substitution pattern is applied in a fixed order, feeding the result
# of one into the next; the cleaned text is then split on PARA_RE.
# NOTE(review): the patterns are invoked as pattern.gsub(text, replacement),
# so they are presumably Oniguruma ORegexp objects rather than core Regexp --
# confirm against the constant definitions.
def content_processor
  substitutions = [
    [STRIP_CAPTIONS_RE, ''],
    [STRIP_COMMENTS_RE, ''],
    [STRIP_BLOCKS_RE, ''],
    [STRIP_TAGS_RE, ''],
    [WHITESPACE_RE, ' ']
  ]

  # Order matters: every pattern sees the output of the previous one.
  substitutions.each do |pattern, replacement|
    @content = pattern.gsub(@content, replacement)
  end

  @content = @content.split(PARA_RE)
end
date_processor()
click to toggle source
# File lib/web-page-parser/parsers/bbc_news_page_parser.rb, line 83
# Replaces the raw value in @date with a parsed DateTime.
#
# If @date cannot be parsed -- because the text is not a recognisable date
# (ArgumentError) or because it is not a String at all, e.g. nil
# (TypeError) -- @date is set to the current time in UTC instead.
# Note that this fallback is a Time, not a DateTime.
#
# Returns the value assigned to @date.
def date_processor
  # OPD is in GMT/UTC, which DateTime seems to use by default
  @date = DateTime.parse(@date)
rescue ArgumentError, TypeError
  # TypeError: DateTime.parse refuses non-String input such as nil.
  @date = Time.now.utc
end