class Curation::Page
Constants
- BLACKLIST
Attributes
url[R]
Public Class Methods
new(url, html = nil)
click to toggle source
# File lib/curation.rb, line 22
# Stores the page address and, optionally, markup the caller already has.
#
# @param url [String] address of the page
# @param html [String, nil] markup for the page, when already available
def initialize(url, html = nil)
  @url, @html = url, html
end
Public Instance Methods
date()
click to toggle source
# File lib/curation.rb, line 43
# Returns the value of #find_date, cached in @date once it is truthy.
# A nil result is not cached, so the lookup runs again on the next call.
def date
  @date = find_date unless @date
  @date
end
image()
click to toggle source
# File lib/curation.rb, line 31
# Returns the image URL found by #find_image as a String, with a leading
# "http://" scheme upgraded to "https://". The result is memoized in @image.
#
# Only the scheme at the very start of the URL is rewritten: URLs embedded
# later in the string (for example a "?src=http://..." query parameter of an
# image proxy) are left untouched, since rewriting them changes which
# resource the proxy is asked for.
#
# @return [String] the image URL, or "" when #find_image yields nothing
def image
  @image ||= find_image.to_s.sub(%r{\Ahttp://}, 'https://')
end
text()
click to toggle source
# File lib/curation.rb, line 39
# Returns the value of #find_text, cached in @text once it is truthy.
def text
  @text = find_text unless @text
  @text
end
title()
click to toggle source
# File lib/curation.rb, line 27
# Returns the value of #find_title, cached in @title once it is truthy.
def title
  @title = find_title unless @title
  @title
end
Protected Instance Methods
find_date()
click to toggle source
# File lib/curation.rb, line 117
# Looks for a publication date, trying each source in order and returning
# the first value Date.parse accepts:
#
# 1. "datePublished" of a JSON-LD NewsArticle / ReportageNewsArticle entry
# 2. metatags['date'], then metatags['pubdate']
# 3. <meta property="article:published"> / "article:published_time"
# 4. the value following a "DisplayDate" marker in the raw markup
# 5. the text of the first ".postDate" element (with " — " stripped)
# 6. the text of the first ".gta_post_date" element
#
# A source that is missing, raises, or yields an unparseable value is
# skipped rather than aborting the whole lookup.
#
# NOTE(review): `metatags` is not defined in the visible part of this class;
# if it is undefined the resulting NameError is swallowed (as it was before)
# and step 2 never matches — confirm whether `metainspector.meta` was meant.
#
# @return [Date, nil] the first date found, or nil when every source fails
def find_date
  json_ld.each do |ld|
    next unless ld.is_a?(Hash)
    next unless ['NewsArticle', 'ReportageNewsArticle'].include?(ld['@type'])
    next unless ld.key?('datePublished')

    date = safe_parse_date { ld['datePublished'] }
    return date if date
  end

  # Lambdas keep each lookup lazy so a failure in one cannot affect the next.
  sources = [
    -> { metatags['date'] },
    -> { metatags['pubdate'] },
    -> { nokogiri.css('meta[property="article:published"]').first['content'] },
    -> { nokogiri.css('meta[property="article:published_time"]').first['content'] },
    -> { display_date_candidate },
    -> { nokogiri.css('.postDate').first.inner_text.gsub(' — ', '') },
    -> { nokogiri.css('.gta_post_date').first.inner_text }
  ]
  sources.each do |source|
    date = safe_parse_date(&source)
    return date if date
  end

  nil
end

# Parses the block's value with Date.parse; returns nil when the block or
# the parse raises any StandardError.
def safe_parse_date
  Date.parse(yield)
rescue StandardError
  nil
end

# Extracts the raw text following the first "DisplayDate" marker in the
# markup (up to the next comma, quotes and a leading ":" removed).
# Returns nil when the marker is absent.
def display_date_candidate
  chunks = html.split('DisplayDate')
  return nil unless chunks.count > 1

  value = chunks[1].split(',').first.gsub('"', '')
  value = value[1..-1] if value[0] == ':'
  value
end
find_image()
click to toggle source
# File lib/curation.rb, line 70
# Looks for the page's main image.
#
# JSON-LD entries are checked first: an "image" value may be a String, a
# Hash with a "url" key, or an Array whose first element is one of those.
# Otherwise the first non-empty value among MetaInspector's best image and
# the og:image meta tag's "content" attribute is returned.
#
# Each fallback is evaluated lazily and rescued on its own, so a missing
# og:image tag (or a MetaInspector failure) no longer discards a result the
# other source could provide.
#
# @return [String, nil] the image reference; "" when no source yields one
#   (nil is possible when a JSON-LD image Hash has no "url" key)
def find_image
  json_ld.each do |ld|
    next unless ld.is_a?(Hash) && ld.key?('image')

    image_data = ld['image']
    image_data = image_data.first if image_data.is_a?(Array)
    return image_data if image_data.is_a?(String)
    return image_data['url'] if image_data.is_a?(Hash)
  end

  candidates = [
    -> { metainspector.images.best },
    # Node#[] returns nil for a missing attribute; &. covers a missing tag.
    -> { nokogiri.css('[property="og:image"]').first&.[]('content') }
  ]
  candidates.each do |candidate|
    begin
      possibility = candidate.call
    rescue StandardError
      puts 'Curation::Page find_image error'
      next
    end
    return possibility unless possibility.to_s.empty?
  end

  ''
end
find_text()
click to toggle source
# File lib/curation.rb, line 98
# Returns the article body.
#
# A JSON-LD NewsArticle / ReportageNewsArticle entry wins when it carries a
# "text" or "articleBody" key. Otherwise the markup is used: every tag
# listed in BLACKLIST is removed from a copy of the parsed document, and the
# remaining <p> elements are serialized to HTML with doubled <br> collapsed.
def find_text
  article_types = ['NewsArticle', 'ReportageNewsArticle']
  json_ld.each do |ld|
    next unless article_types.include?(ld['@type'])

    %w[text articleBody].each do |key|
      return ld[key] if ld.has_key?(key)
    end
  end

  # Work on a copy so the cached document is not altered by the removals.
  document = nokogiri.dup
  BLACKLIST.each { |tag| document.css(tag).remove }

  paragraphs = document.css('p')
  paragraphs.xpath('//style').remove

  markup = paragraphs.to_html
  markup.gsub!('<br><br>', '<br>')
  markup
end
find_title()
click to toggle source
# File lib/curation.rb, line 49
# Returns the page title.
#
# The "headline" of the first JSON-LD entry that has one is preferred.
# Failing that, the first non-empty value among MetaInspector's best title,
# MetaInspector's title, the first [itemprop="headline"] element's text and
# the <title> element's text is used. Any error while collecting those
# values is reported with puts and "" is returned.
def find_title
  structured = json_ld.find { |ld| ld.has_key?('headline') }
  return structured['headline'] if structured

  begin
    candidates = [
      metainspector.best_title,
      metainspector.title,
      nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
      nokogiri.css('title')&.first&.inner_text
    ]
    chosen = candidates.index { |candidate| !candidate.to_s.empty? }
    return candidates[chosen] if chosen
  rescue
    puts 'Curation::Page find_title error'
  end

  ''
end
Private Instance Methods
file()
click to toggle source
# File lib/curation.rb, line 172
# Opens the page's url with URI.open (sending a Mozilla/5.0 User-Agent
# header) and caches the result in @file.
# On any StandardError a message is printed and nil is returned (the return
# value of puts); nothing is cached in that case.
def file
  begin
    @file ||= URI.open(url, 'User-Agent' => "Mozilla/5.0")
  rescue
    puts "Curation::Page file error with url #{url}"
  end
end
html()
click to toggle source
# File lib/curation.rb, line 178
# Returns the page markup: the value already held in @html when there is
# one, otherwise the full contents read from #file (cached in @html).
# The handle is rewound before and after reading so other readers of the
# same handle start from the beginning.
# On any StandardError a message is printed and nil is returned.
def html
  return @html if @html

  io = file
  io.rewind
  @html = io.read
  io.rewind
  @html
rescue
  puts "Curation::Page html error"
end
json_ld()
click to toggle source
# File lib/curation.rb, line 153
# Returns every JSON-LD object embedded in the page, as a flat Array of
# Hashes. The result is computed once and cached in @json_ld.
#
# Each [type="application/ld+json"] element is parsed on its own, so one
# malformed script no longer prevents the following ones from being read.
# Nested arrays are always flattened (some sites wrap their objects in
# arrays of arrays) and non-Hash values are dropped, because callers query
# the entries with Hash methods.
#
# @return [Array<Hash>] possibly empty, never nil
def json_ld
  unless defined?(@json_ld)
    @json_ld = []
    begin
      nokogiri.css('[type="application/ld+json"]').each do |script|
        begin
          @json_ld << JSON.parse(script.inner_text)
        rescue StandardError
          puts 'Curation::Page json_ld error'
        end
      end
    rescue StandardError
      # The document itself could not be queried.
      puts 'Curation::Page json_ld error'
    end
    @json_ld.flatten!
    @json_ld.select! { |entry| entry.is_a?(Hash) }
  end
  @json_ld
end
metainspector()
click to toggle source
# File lib/curation.rb, line 200
# Returns a MetaInspector instance for the page, cached in @metainspector.
# When #html yields markup it is handed over as the document; when #html is
# nil, MetaInspector is created from the url alone.
# On any StandardError a message is printed and nil is returned.
def metainspector
  @metainspector ||= begin
    document = html
    if document.nil?
      MetaInspector.new(url)
    else
      MetaInspector.new(url, document: document)
    end
  end
rescue
  puts 'Curation::Page metainspector error'
end
nokogiri()
click to toggle source
# File lib/curation.rb, line 189
# Returns the page parsed by Nokogiri::HTML, cached in @nokogiri.
#
# Markup already held in @html (given to the constructor, or read earlier)
# is parsed directly, so a page built from supplied markup is not fetched
# again over the network — matching #metainspector, which also works from
# that markup. Without it, the handle from #file is parsed and rewound
# before and after so other readers start from the beginning.
#
# On any StandardError a message is printed and nil is returned.
def nokogiri
  unless @nokogiri
    if @html
      @nokogiri = Nokogiri::HTML(@html)
    else
      io = file
      io.rewind
      @nokogiri = Nokogiri::HTML(io)
      io.rewind
    end
  end
  @nokogiri
rescue
  puts 'Curation::Page nokogiri error'
end