class NewsScraper::Extractors::GoogleNewsRss

Constants

BASE_URL

Public Class Methods

new(query:)
# File lib/news_scraper/extractors/google_news_rss.rb, line 10
# Builds an extractor for a single search query.
#
# @param query the search term; stored as-is in @query and later
#   interpolated into the feed URL by #extract
def initialize(query:)
  @query = query
end

Public Instance Methods

extract()
# File lib/news_scraper/extractors/google_news_rss.rb, line 14
# Requests the feed for the configured query and pulls the article URLs out
# of the response body.
#
# The query is form-encoded before being appended to BASE_URL so that spaces,
# "&", "#" and non-ASCII characters cannot corrupt or truncate the request
# URL. Callers should therefore pass the raw, unencoded query to the
# constructor.
#
# @return whatever +http_request+ returns (it is defined outside this method;
#   presumably the value of the block, i.e. the article URL array -- confirm)
def extract
  # Idempotent; guarantees URI is loaded regardless of what the HTTP helper requires.
  require 'uri'

  encoded_query = URI.encode_www_form_component(@query)
  http_request "#{BASE_URL}&q=#{encoded_query}" do |response|
    google_urls = google_urls_from_resp(response.body)
    extract_article_urls(google_urls)
  end
end

Private Instance Methods

extract_article_urls(google_urls)
# File lib/news_scraper/extractors/google_news_rss.rb, line 33
# Pulls the destination article URL out of each Google redirect link.
#
# A link contributes a result only when it contains an "&url=" parameter
# whose value starts with http:// or https://; everything from that scheme to
# the end of the line is taken as the article URL. Links without such a
# parameter are dropped, as are nil entries (for example an anchor that had
# no href attribute), which previously raised NoMethodError.
#
# @param google_urls [Array<String, nil>] candidate redirect links
# @return [Array<String>] unique article URLs, in first-seen order
def extract_article_urls(google_urls)
  google_urls.map do |google_url|
    # to_s turns nil into "", which simply fails to match and yields nil.
    google_url.to_s[%r{&url=(?<url>https?://.*)}, 'url']
  end.compact.uniq
end
google_urls_from_resp(body)
# File lib/news_scraper/extractors/google_news_rss.rb, line 23
# Collects the href of every anchor found in the item descriptions of an RSS
# document.
#
# Each item's description is parsed as HTML and all of its <a> elements are
# visited; the results are flattened into a single array. An anchor without
# an href attribute contributes nil.
#
# @param body the RSS document handed to RSS::Parser.parse
# @return [Array] href values in document order, one per anchor
def google_urls_from_resp(body)
  feed = RSS::Parser.parse(body)

  feed.items.flat_map do |item|
    description_doc = Nokogiri::HTML(item.description)
    description_doc.xpath('//a').map { |link| link['href'] }
  end
end