class TopHeadlines::Source

Constants

SOURCES

Public Class Methods

all()
# File lib/top-headlines/source.rb, line 145
# Exposes the complete registry of configured news sources.
#
# @return [Object] the SOURCES constant itself (the same object, not a copy)
def self.all
  SOURCES
end
list_all_headlines()
# File lib/top-headlines/source.rb, line 149
# Prints, for every key of SOURCES in sorted order, a "*** key ***" banner
# followed by a numbered list (starting at 1) of the first five headlines
# returned by scrape_headlines for that key, then a blank line.
#
# @return [Array] the sorted list of SOURCES keys that was iterated
def self.list_all_headlines
  SOURCES.keys.sort.each do |name|
    puts "*** #{name} ***"

    top_five = scrape_headlines(name).first(5)
    top_five.each.with_index(1) do |title, rank|
      puts "#{rank}. #{title}"
    end

    puts "\n"
  end
end
scrape_headlines(source)
# File lib/top-headlines/source.rb, line 157
# Downloads a source's page and returns its cleaned-up headline texts.
#
# The entry for +source+ is read from SOURCES; its :url is fetched, parsed with
# Nokogiri, and every node matching the entry's :headlines_selector is mapped
# to its text after normalisation (see the inline comments).
#
# Requires open-uri to be loaded so that URI.open is available.
#
# @param source [Object] key into SOURCES
# @return [Array<String>] one cleaned headline per matched node, in match order
def self.scrape_headlines(source)
  config = SOURCES[source]

  # URI.open rather than Kernel#open: on Ruby 3.0+ Kernel#open no longer
  # fetches URLs, and Kernel#open would spawn a command for a string starting
  # with "|". The block form closes the downloaded stream once parsing is done.
  doc = URI.open(config[:url]) { |io| Nokogiri::HTML(io) }

  doc.css(config[:headlines_selector]).map do |headline|
    text = headline.text.strip
    # NOTE(review): this literal looks like a mis-decoded apostrophe sequence;
    # confirm the intended character against the original file's encoding.
    text = text.gsub("รข", "'")
    # Drop embedded newlines/tabs, then any digits left dangling at the end.
    text.delete("\n\t").gsub(/\d+$/, "")
  end
end
scrape_urls(source)
# File lib/top-headlines/source.rb, line 167
# Downloads a source's page and returns the link target of each headline.
#
# The entry for +source+ is read from SOURCES; its :url is fetched and parsed
# with Nokogiri. Links are found by selecting :urls_selector, taking the
# children of the matches, and selecting :child_selector among them. An href
# that starts with "h" is returned unchanged; any other href is appended to a
# base prefix derived from the page URL.
#
# Requires open-uri to be loaded so that URI.open is available.
#
# @param source [Object] key into SOURCES
# @return [Array<String>] one URL per matched link, in match order
def self.scrape_urls(source)
  config = SOURCES[source]
  page_url = config[:url]

  # URI.open rather than Kernel#open: on Ruby 3.0+ Kernel#open no longer
  # fetches URLs, and Kernel#open would spawn a command for a string starting
  # with "|". The block form closes the downloaded stream once parsing is done.
  doc = URI.open(page_url) { |io| Nokogiri::HTML(io) }

  # Edge cases (BBC & Yahoo): a page URL ending in "news" loses its last five
  # characters (presumably a trailing "/news" -- confirm against SOURCES).
  # end_with? also copes with URLs shorter than four characters, where the
  # previous slice-based comparison raised NoMethodError.
  base_url = page_url.end_with?("news") ? page_url[0...-5] : page_url
  # Edge case (CNN): always use the site root as the prefix.
  base_url = "http://www.cnn.com/" if base_url.include?("www.cnn.com")

  doc.css(config[:urls_selector]).children.css(config[:child_selector]).map do |link|
    href = link.attribute('href').value
    # NOTE(review): "starts with h" is the existing test for an absolute URL;
    # a relative href beginning with "h" is therefore returned without a prefix.
    href.start_with?('h') ? href : base_url + href
  end
end