module UrlScraper
Constants
- TYPES
- VERSION
Public Class Methods
fetch(uri, strict = true)
click to toggle source
Fetch Open Graph data from the specified URI. Makes an HTTP GET request and returns a UrlScraper::Object if there is data to be found, or false if there isn't.
Pass false for the second argument if you want to see invalid data (i.e. data missing a required attribute).
# File lib/url_scraper.rb, line 28
# Downloads the document at +uri+ with an HTTP GET and passes its body to
# parse, together with +strict+ and the +uri+ itself.
#
# uri    - Address of the page to retrieve.
# strict - Forwarded unchanged to parse.
#
# Returns whatever parse returns, or false when RestClient::Exception or
# SocketError is raised along the way.
def self.fetch(uri, strict = true)
  response = RestClient.get(uri)
  parse(response.body, strict, uri)
rescue RestClient::Exception, SocketError
  false
end
parse(html, strict = true, uri)
click to toggle source
# File lib/url_scraper.rb, line 34
# Builds a UrlScraper::Object from an HTML document.
#
# Every <meta> tag whose +property+ matches og:* is copied onto the result,
# with dashes in the property name turned into underscores. For values Open
# Graph left unset, +title+ falls back to the <title> element, +description+
# to <meta name="description">, and +image+ to the absolute URLs of every
# <img> that carries a non-blank +src+.
#
# html   - HTML source to parse.
# strict - Accepted for compatibility; it currently has no effect because the
#          validity checks near the end of the method are commented out.
# uri    - URL the HTML came from; relative image paths are resolved against
#          it. Only parsed when at least one <img> has to be resolved.
#
# Returns the populated UrlScraper::Object. Its +image+ is always passed
# through Array.wrap before returning.
def self.parse(html, strict = true, uri)
  doc = Nokogiri::HTML.parse(html)
  page = UrlScraper::Object.new

  doc.css('meta').each do |meta|
    property = meta.attribute('property')
    next unless property

    og_match = property.to_s.match(/^og:(.+)$/i)
    page[og_match[1].gsub('-', '_')] = meta.attribute('content').to_s if og_match
  end

  if page.title.nil?
    title_node = doc.at_css('title')
    # Assign even when the element is missing so the key is set exactly as before.
    page.title = title_node ? title_node.text : nil
  end

  if page.description.nil?
    description_node = doc.at_css("meta[name='description']")
    page.description = description_node['content'] if description_node
  end

  if page.image.nil?
    # URI.escape was removed in Ruby 3.0; RFC2396_Parser#escape is the
    # implementation it used to delegate to.
    escaper = URI::RFC2396_Parser.new
    base_uri = nil
    image_urls = []

    doc.css('img').each do |img|
      # Strip first so a whitespace-only src is skipped instead of resolving
      # to the page URL itself.
      src = img['src'].to_s.strip
      next if src.empty?

      escaped = escaper.escape(src)
      escaped = escaped.gsub(/([{}|\^\[\]\@`])/) { |char| CGI.escape(char) } # escape characters that the URI escaper doesn't get
      base_uri ||= URI.parse(uri)

      begin
        image_uri = URI.parse(escaped)
      rescue URI::InvalidURIError
        # One malformed src must not abort the whole scrape.
        next
      end

      image_urls << base_uri.merge(image_uri).to_s
    end

    page.image = image_urls unless image_urls.empty?
  end

  # return false if page.keys.empty?
  # return false unless page.valid? if strict
  page.image = Array.wrap(page.image)
  page
  # return doc
end