class WebPageParser::GuardianPageParserV1
GuardianPageParserV1
parses Guardian web pages using regexps
Constants
- CONTENT_RE
- DATE_RE
- PARA_RE
- STRIP_SCRIPTS_RE
- STRIP_TAGS_RE
- TITLE_RE
Private Instance Methods
content_processor()
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 36
#
# Reduces @content to a list of text fragments.
#
# First removes everything matched by STRIP_TAGS_RE, then everything matched
# by STRIP_SCRIPTS_RE, and finally replaces @content with an Array holding,
# for each PARA_RE match, the element at index 1 of that match's captures.
#
# NOTE(review): the strip constants are invoked as `pattern.gsub(string,
# replacement)`, which is not part of core Regexp; presumably they are
# Oniguruma ORegexp objects - confirm against the constant definitions.
#
# Returns the new value of @content.
def content_processor
  @content = STRIP_TAGS_RE.gsub(@content, '')
  @content = STRIP_SCRIPTS_RE.gsub(@content, '')
  # assumes PARA_RE has at least two capture groups, so that scan yields
  # arrays of captures and index 1 is the second group - TODO confirm
  @content = @content.scan(PARA_RE).map { |captures| captures[1] }
end
date_processor()
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 27
#
# Replaces the raw value in @date with a parsed date.
#
# @date is converted to a String and handed to DateTime.parse. When the text
# cannot be parsed (including nil or an empty string, which become ""), @date
# falls back to the current UTC time instead of raising.
#
# Returns the new value of @date: a DateTime on success, a Time on fallback.
def date_processor
  # OPD is in GMT/UTC, which DateTime seems to use by default
  # to_s keeps nil / non-String values from raising TypeError, which the
  # rescue below would not catch.
  @date = DateTime.parse(@date.to_s)
rescue ArgumentError
  @date = Time.now.utc
end
filter_url(url)
click to toggle source
# File lib/web-page-parser/parsers/guardian_page_parser.rb, line 42
#
# Returns +url+ as a String with every occurrence of the host name
# "www.guprod.gnl" replaced by "www.guardian.co.uk".
#
# +url+ may be any object; it is converted with to_s first, so nil yields "".
def filter_url(url)
  # some weird guardian problem with some older articles
  url.to_s.gsub("www.guprod.gnl", "www.guardian.co.uk")
end