class Scraper

Public Class Methods

check_msnbc_urls(articles)
# File lib/CLI_Headline_Scraper/Scraper.rb, line 120
def self.check_msnbc_urls(articles)
  # corrects a common issue where MSNBC uses partial (site-relative) urls for internal links

  articles.each do |article|
    if !article[1].include?("www")
      article[1] = "http://www.msnbc.com" + article[1]
    end
  end
end
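
A minimal sketch of what this normalization does, using made-up headline and path values (both entries below are hypothetical):

articles = [
  ["A made-up headline with a relative link", "/made-up-story-path"],
  ["A made-up headline with an absolute link", "http://www.msnbc.com/another-path"]
]

Scraper.check_msnbc_urls(articles)
articles[0][1] #=> "http://www.msnbc.com/made-up-story-path"
articles[1][1] #=> "http://www.msnbc.com/another-path" (already contained "www", so left untouched)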
check_reuters_urls(articles)
# File lib/CLI_Headline_Scraper/Scraper.rb, line 36
def self.check_reuters_urls(articles)
  # corrects a common issue where Reuters uses partial (site-relative) urls for internal links

  articles.each do |article|
    if !article[1].include?("www")
      article[1] = "https://www.reuters.com" + article[1]
    end
  end
end
fox_article(article)
# File lib/CLI_Headline_Scraper/Scraper.rb, line 86
def self.fox_article(article)
  article.html = self.get_page(article.url)
  article.summary = article.html.css("meta[name='description']").attribute("content").value

  article.date = article.html.css("meta[name='dc.date']").attribute("content").value

end
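
The article methods all follow this pattern: fetch the article page, then read values out of its meta tags. A small sketch of the Nokogiri calls involved, run against a hand-written HTML fragment (the content strings are invented):

require 'nokogiri'

page = Nokogiri::HTML(<<~PAGE)
  <html><head>
    <meta name="description" content="An invented one-line summary.">
    <meta name="dc.date" content="2018-01-15">
  </head></html>
PAGE

page.css("meta[name='description']").attribute("content").value #=> "An invented one-line summary."
page.css("meta[name='dc.date']").attribute("content").value     #=> "2018-01-15"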
fox_homepage()

FOX SCRAPING METHODS

# File lib/CLI_Headline_Scraper/Scraper.rb, line 62
def self.fox_homepage
  puts "scraping Fox homepage"
  url = "http://www.foxnews.com"
  homepage = self.get_page(url)
  fox = Network.create_with_url("FOX NEWS", url)
  fox.home_html = homepage
  self.scrape_fox_articles.each { |article| Article.create_with_url(article[0], "FOX NEWS", article[1]) }
end
get_page(url)
# File lib/CLI_Headline_Scraper/Scraper.rb, line 3
def self.get_page(url)
  # open comes from open-uri; the fetched page is parsed into a Nokogiri document and returned
  Nokogiri::HTML(open(url))
end
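
get_page is the shared fetch-and-parse helper the other methods build on. It relies on open-uri patching Kernel#open to accept URLs, so both requires need to be loaded somewhere in the gem; a hedged usage sketch:

require 'nokogiri'
require 'open-uri'   # lets open(url) fetch over HTTP; on Ruby 3+ this would be URI.open

doc = Scraper.get_page("https://www.reuters.com")
doc.css("title").text   # the parsed page is a Nokogiri document, so any CSS query works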
msnbc_article(article)
# File lib/CLI_Headline_Scraper/Scraper.rb, line 130
def self.msnbc_article(article)

  article.html = self.get_page(article.url)
  article.summary = article.html.css("meta[name='description']").attribute("content").value

  # use nv:date when the page provides it, otherwise fall back to the older DC.date.issued tag
  if article.html.css("meta[property='nv:date']")[0]
    article.date = article.html.css("meta[property='nv:date']").attribute("content").value
  else
    article.date = article.html.css("meta[name='DC.date.issued']").attribute("content").value
  end

end
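
The guard on the first branch matters because css returns an empty node set when a tag is missing, and indexing it with [0] then yields nil, so the method only falls back to DC.date.issued in that case. A quick illustration against an invented page that lacks nv:date:

page = Nokogiri::HTML("<html><head><meta name='DC.date.issued' content='2018-02-01'></head></html>")

page.css("meta[property='nv:date']")[0]                            #=> nil, so the else branch runs
page.css("meta[name='DC.date.issued']").attribute("content").value #=> "2018-02-01"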
msnbc_homepage()

MSNBC SCRAPING METHODS

# File lib/CLI_Headline_Scraper/Scraper.rb, line 97
def self.msnbc_homepage
  puts "scraping MSNBC homepage"
  url = "http://www.msnbc.com"
  homepage = self.get_page(url)
  msnbc = Network.create_with_url("MSNBC", url)
  msnbc.home_html = homepage
  self.scrape_msnbc_articles.each { |article| Article.create_with_url(article[0], "MSNBC", article[1]) }
end
reuters_article(article)
# File lib/CLI_Headline_Scraper/Scraper.rb, line 46
def self.reuters_article(article)

  article.html = self.get_page(article.url)
  article.summary = article.html.css("meta[name='description']").attribute("content").value

  article.date = article.html.css("meta[name='REVISION_DATE']").attribute("content").value

  # article.authors = article.html.css("meta[name='Author']").attribute("content").value

end
reuters_homepage()

REUTERS SCRAPING METHODS

# File lib/CLI_Headline_Scraper/Scraper.rb, line 9
def self.reuters_homepage
  puts "scraping Reuters homepage"
  url = "https://www.reuters.com"
  homepage = self.get_page(url)
  reuters = Network.create_with_url("REUTERS", url)
  reuters.home_html = homepage
  self.scrape_reuters_articles.each { |article| Article.create_with_url(article[0], "REUTERS", article[1]) }
end
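
All three homepage methods follow the same flow: fetch the homepage, register a Network holding its HTML, scrape the top three headlines, and create an Article for each. A hedged end-to-end sketch; the Article attributes used below (title, network) follow from create_with_url's arguments, and Article.all is only an assumption about how the gem exposes its collection:

Scraper.reuters_homepage   # creates the REUTERS Network and three Article records

Article.all.each do |article|                  # Article.all is assumed for illustration
  next unless article.network == "REUTERS"
  Scraper.reuters_article(article)             # fills in summary and date from the article page
  puts "#{article.title}: #{article.summary}"
end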
scrape_fox_articles()
# File lib/CLI_Headline_Scraper/Scraper.rb, line 72
def self.scrape_fox_articles

  html = Network.find_by_name("FOX NEWS").home_html
    leader = [html.css("div.collection.collection-spotlight article.article.story-1 header a").text.strip, html.css("div.collection.collection-spotlight article.article.story-1 header a").attribute("href")]

    second = [html.css("div.main.main-secondary article.article.story-1 h2.title a").text, html.css("div.main.main-secondary article.article.story-1 h2.title a").attribute("href").value]

    third = [html.css("div.main.main-secondary article.article.story-2 h2.title a").text, html.css("div.main.main-secondary article.article.story-2 h2.title a").attribute("href").value]

  articles = [leader, second, third]

end
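
Each scrape_*_articles method returns the top three stories as [headline, url] pairs, which is the shape the homepage methods hand to Article.create_with_url. Illustrative output with invented headlines and truncated paths:

Scraper.scrape_fox_articles
#=> [["Invented lead headline",   "http://www.foxnews.com/..."],
#    ["Invented second headline", "http://www.foxnews.com/..."],
#    ["Invented third headline",  "http://www.foxnews.com/..."]]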
scrape_msnbc_articles()
# File lib/CLI_Headline_Scraper/Scraper.rb, line 107
def self.scrape_msnbc_articles

  html = Network.find_by_name("MSNBC").home_html
  leader = [html.css("a[data-fragment = '#homepage-item-1'] span.featured-slider-menu__item__link__title").text, html.css("a[data-fragment = '#homepage-item-1']").attribute("href").value]
  second = [html.css("a[data-fragment = '#homepage-item-2'] span.featured-slider-menu__item__link__title").text, html.css("a[data-fragment = '#homepage-item-2']").attribute("href").value]
  third = [html.css("a[data-fragment = '#homepage-item-3'] span.featured-slider-menu__item__link__title").text, html.css("a[data-fragment = '#homepage-item-3']").attribute("href").value]

  articles = [leader, second, third]
  self.check_msnbc_urls(articles)

  articles
end
scrape_reuters_articles()
# File lib/CLI_Headline_Scraper/Scraper.rb, line 21
def self.scrape_reuters_articles

  html = Network.find_by_name("REUTERS").home_html
  leader = [html.css("section.right-now-module h2.story-title a").text, html.css("section.right-now-module h2.story-title a").attribute("href").value]
  second = [html.css("section#hp-top-news-top article.story div.story-content a h3.story-title").first.text.strip, html.css("section#hp-top-news-top article.story div.story-content a").first.attribute("href").value]
  third = [html.css("section#hp-top-news-top article.story div.story-content a h3.story-title")[1].text.strip, html.css("section#hp-top-news-top article.story div.story-content a")[1].attribute("href").value]
  articles = [leader, second, third]

  self.check_reuters_urls(articles)

  articles

end