module UrlScraper

Constants

TYPES
VERSION

Public Class Methods

fetch(uri, strict = true)

Fetch Open Graph data from the specified URI. Makes an HTTP GET request and returns a UrlScraper::Object if there is data to be found, or false if there isn't (for example, when the request itself fails).

Pass false for the second argument if you want to see invalid (i.e. missing a required attribute) data.

# File lib/url_scraper.rb, line 28
# Download the page at +uri+ with an HTTP GET and hand its body to +parse+.
#
# uri    - Location of the page to retrieve.
# strict - Forwarded unchanged to +parse+ as its second argument.
#
# Returns whatever +parse+ returns, or false when the request raises
# RestClient::Exception or SocketError.
def self.fetch(uri, strict = true)
  response = RestClient.get(uri)
  parse(response.body, strict, uri)
rescue RestClient::Exception, SocketError
  false
end
parse(html, strict = true, uri)
# File lib/url_scraper.rb, line 34
# Build a UrlScraper::Object from an HTML document.
#
# Every <meta> tag whose property attribute matches "og:<name>" is copied onto
# the page object under <name>, with dashes turned into underscores. Values
# that are still missing afterwards fall back to the document itself:
# title from the <title> element, description from <meta name="description">,
# and image from the absolute URLs of the page's <img> elements.
#
# html   - HTML source handed to Nokogiri::HTML.parse.
# strict - Currently ignored: the validity checks that consumed it are
#          disabled (see the commented-out lines near the end of the method).
# uri    - Address of the page, in a form URI.parse accepts; used as the base
#          when resolving <img> sources.
#
# Returns the populated UrlScraper::Object. Its image value is always an Array.
def self.parse(html, strict = true, uri)
  doc = Nokogiri::HTML.parse(html)
  page = UrlScraper::Object.new

  doc.css('meta').each do |m|
    og = m.attribute('property').to_s.match(/^og:(.+)$/i)
    page[og[1].tr('-', '_')] = m.attribute('content').to_s if og
  end

  if page.title.nil?
    title_node = doc.at_css('title')
    # Still assigns nil when the document has no <title>, as before.
    page.title = title_node && title_node.text
  end

  if page.description.nil?
    description_node = doc.at_css("meta[name='description']")
    page.description = description_node['content'] if description_node
  end

  if page.image.nil?
    base = nil
    image_array = []
    doc.css('img').each do |img|
      # Strip before the emptiness check so a whitespace-only src is skipped
      # instead of resolving to the page URL itself.
      src = img['src'].to_s.strip
      next if src.empty?

      # URI.escape was removed in Ruby 3.0; the default parser's #escape
      # performs the same RFC 2396 escaping.
      image = URI::DEFAULT_PARSER.escape(src)
      image = image.gsub(/([{}|\^\[\]\@`])/) { |s| CGI.escape(s) } # escape characters that URI escaping doesn't get

      # Parsed lazily and only once; an unusable page URI still raises.
      base ||= URI.parse(uri)
      begin
        image_array << base.merge(URI.parse(image)).to_s
      rescue URI::InvalidURIError
        # One malformed <img src> must not abort the whole page; skip it.
        next
      end
    end
    page.image = image_array unless image_array.empty?
  end

  # Validity checks are currently disabled, which is why +strict+ is unused:
  # return false if page.keys.empty?
  # return false unless page.valid? if strict
  page.image = Array(page.image)
  page
end