class TopHeadlines::Source
Constants
- SOURCES
Public Class Methods
all()
click to toggle source
# File lib/top-headlines/source.rb, line 145
#
# Exposes the source registry.
#
# @return the SOURCES constant itself (the same object, not a copy)
def self.all
  SOURCES
end
list_all_headlines()
click to toggle source
# File lib/top-headlines/source.rb, line 149
#
# Prints a report to standard output: for every key of SOURCES, in sorted
# order, a "*** name ***" banner, up to five scraped headlines numbered from 1,
# and a blank separator line.
def self.list_all_headlines
  sorted_names = SOURCES.keys.sort

  sorted_names.each do |source|
    puts "*** #{source} ***"

    top_headlines = scrape_headlines(source).first(5)
    top_headlines.each.with_index(1) do |headline, position|
      puts "#{position}. #{headline}"
    end

    puts "\n"
  end
end
scrape_headlines(source)
click to toggle source
# File lib/top-headlines/source.rb, line 157
#
# Fetches a source's page and returns the cleaned text of every node matched
# by that source's headline selector.
#
# Cleaning, in order: strip surrounding whitespace, replace the "รข" sequence
# with an apostrophe, remove all newlines and tabs, then drop a trailing run
# of digits.
#
# @param source key into SOURCES; the entry must provide :url and
#   :headlines_selector
# @return [Array<String>] cleaned headline texts, in document order
def self.scrape_headlines(source)
  config = SOURCES[source]

  # URI.open (from open-uri) rather than Kernel#open: open-uri stopped
  # extending Kernel#open to URLs in Ruby 3.0, and Kernel#open would also
  # interpret a string starting with "|" as a command to run.
  doc = Nokogiri::HTML(URI.open(config[:url]))

  doc.css(config[:headlines_selector]).map do |headline|
    # NOTE(review): "รข" looks like a mis-decoded apostrophe sequence; confirm
    # the literal against the pages actually being scraped.
    headline.text.strip
            .gsub("รข", "'")
            .delete("\n\t")
            .sub(/\d+\z/, "")
  end
end
scrape_urls(source)
click to toggle source
# File lib/top-headlines/source.rb, line 167
#
# Fetches a source's page and returns the href of every link found under that
# source's URL selectors. Hrefs treated as relative are prefixed with a base
# URL derived from the source's page URL.
#
# @param source key into SOURCES; the entry must provide :url,
#   :urls_selector and :child_selector
# @return [Array<String>] one URL per matched link, in document order
def self.scrape_urls(source)
  config = SOURCES[source]
  page_url = config[:url]

  # URI.open (from open-uri) rather than Kernel#open: open-uri stopped
  # extending Kernel#open to URLs in Ruby 3.0, and Kernel#open would also
  # interpret a string starting with "|" as a command to run.
  doc = Nokogiri::HTML(URI.open(page_url))

  # The page is always fetched from the configured URL; only the prefix used
  # for relative links is adjusted below.
  base_url = page_url
  # Drop the last five characters when the URL ends in "news".
  base_url = base_url[0...-5] if base_url.end_with?("news") ### EDGE CASES (BBC & YAHOO)
  base_url = "http://www.cnn.com/" if base_url.include?("www.cnn.com") ### EDGE CASE (CNN)

  links = doc.css(config[:urls_selector]).children.css(config[:child_selector])
  links.map do |link|
    href = link.attribute('href').value
    # NOTE(review): any href whose first character is "h" is returned as-is,
    # so a relative path such as "health/..." would not get the base prefix.
    href[0] == 'h' ? href : base_url + href
  end
end