class Curation::Page

Constants

BLACKLIST

Attributes

url[R]

Public Class Methods

new(url, html = nil) click to toggle source
# File lib/curation.rb, line 22
# Stores the page address and, when given, its markup.
#
# url  - value kept in @url (exposed through the url reader).
# html - optional value kept in @html; nil when omitted.
def initialize(url, html = nil)
  @html = html
  @url = url
end

Public Instance Methods

date() click to toggle source
# File lib/curation.rb, line 43
# Returns the cached result of find_date, computing it on first use.
# A nil (or false) result is not cached, so the lookup runs again next time.
def date
  @date || (@date = find_date)
end
image() click to toggle source
# File lib/curation.rb, line 31
# Returns the image found by find_image as a String in which every
# 'http://' occurrence has been replaced by 'https://'.
# The converted string is cached in @image after the first call.
def image
  @image ||= find_image.to_s.gsub('http://', 'https://')
end
text() click to toggle source
# File lib/curation.rb, line 39
# Returns the cached result of find_text, computing it on first use.
# A nil (or false) result is not cached and is looked up again next time.
def text
  return @text if @text

  @text = find_text
end
title() click to toggle source
# File lib/curation.rb, line 27
# Returns the cached result of find_title, computing it on first use.
# A nil (or false) result is not cached and is looked up again next time.
def title
  @title = find_title unless @title
  @title
end

Protected Instance Methods

find_date() click to toggle source
# File lib/curation.rb, line 117
# Looks for the publication date of the page.
#
# Sources are tried in this order and the first value accepted by
# Date.parse wins:
#
# 1. 'datePublished' of each JSON-LD entry whose '@type' is NewsArticle or
#    ReportageNewsArticle
# 2. the 'date', then 'pubdate', entries of metatags
# 3. the content of the 'article:published', then 'article:published_time',
#    meta properties
# 4. the value that follows the first 'DisplayDate' marker in the raw HTML
# 5. the text of the first '.postDate', then '.gta_post_date', element
#
# A source that is absent, cannot be parsed, or raises is skipped, so a bad
# value in one place (including JSON-LD) no longer prevents the later
# sources from being tried.
#
# Returns a Date, or nil when no source yields a parsable date.
def find_date
  article_types = ['NewsArticle', 'ReportageNewsArticle']
  dated_articles = json_ld.select do |ld|
    ld.is_a?(Hash) && article_types.include?(ld['@type']) && ld.key?('datePublished')
  end

  # Extracts the raw value after the first 'DisplayDate' marker, e.g.
  # `"DisplayDate":"2019-07-08",...` gives `2019-07-08`. The to_s guards turn
  # a missing page, marker or value into '' (which Date.parse rejects).
  display_date = lambda do
    fragment = html.to_s.split('DisplayDate')[1].to_s
    value = fragment.split(',').first.to_s.delete('"')
    value.start_with?(':') ? value[1..-1] : value
  end

  # Every source is lazy so that a failure in one cannot affect the others.
  sources = dated_articles.map { |ld| -> { ld['datePublished'] } }
  sources += [
    -> { metatags['date'] },
    -> { metatags['pubdate'] },
    -> { nokogiri.css('meta[property="article:published"]').first['content'] },
    -> { nokogiri.css('meta[property="article:published_time"]').first['content'] },
    display_date,
    -> { nokogiri.css('.postDate').first.inner_text.gsub(' — ', '') },
    -> { nokogiri.css('.gta_post_date').first.inner_text }
  ]

  sources.each do |source|
    begin
      return Date.parse(source.call)
    rescue StandardError
      # Missing node, nil value or unparsable text: move on to the next source.
      next
    end
  end
  nil
end
find_image() click to toggle source
# File lib/curation.rb, line 70
# Looks for the main image of the page.
#
# Sources are tried in this order and the first non-empty value wins:
#
# 1. the 'image' of each JSON-LD entry: a String, a Hash with a 'url', or an
#    Array whose first element is one of those
# 2. metainspector.images.best
# 3. the 'content' attribute of the first [property="og:image"] element
#
# Each fallback is evaluated lazily and rescued on its own, so a page
# without an og:image element (or a failing metainspector) no longer discards
# a value the other fallback could provide. JSON-LD entries that are not
# hashes, or whose image URL is missing or empty, are skipped.
#
# Returns the image URL, or '' when nothing is found.
def find_image
  json_ld.each do |ld|
    next unless ld.is_a?(Hash) && ld.key?('image')

    image_data = ld['image']
    image_data = image_data.first if image_data.is_a?(Array)
    candidate = image_data.is_a?(Hash) ? image_data['url'] : image_data
    return candidate if candidate.is_a?(String) && !candidate.empty?
  end

  og_image = lambda do
    node = nokogiri.css('[property="og:image"]').first
    node && node['content']
  end

  [-> { metainspector.images.best }, og_image].each do |fallback|
    begin
      possibility = fallback.call
      return possibility unless possibility.to_s.empty?
    rescue StandardError
      puts 'Curation::Page find_image error'
    end
  end
  ''
end
find_text() click to toggle source
# File lib/curation.rb, line 98
# Looks for the body text of the page.
#
# When JSON-LD data is present, the first NewsArticle / ReportageNewsArticle
# entry carrying a 'text' key (checked first) or an 'articleBody' key
# supplies the result. Otherwise the text is built from a copy of the parsed
# document: every BLACKLIST selector is removed, the <p> elements are
# collected, <style> elements are removed, and the paragraphs are serialised
# to HTML with '<br><br>' collapsed to '<br>'.
def find_text
  if json_ld.any?
    json_ld.each do |entry|
      if ['NewsArticle', 'ReportageNewsArticle'].include?(entry['@type'])
        %w[text articleBody].each do |field|
          return entry[field] if entry.key?(field)
        end
      end
    end
  end

  # Work on a copy so the removals do not touch the memoized document.
  document = nokogiri.dup
  BLACKLIST.each { |selector| document.css(selector).remove }
  paragraphs = document.css('p')
  paragraphs.xpath('//style').remove
  paragraphs.to_html.tap { |markup| markup.gsub!('<br><br>', '<br>') }
end
find_title() click to toggle source
# File lib/curation.rb, line 49
def find_title
  if json_ld.any?
    json_ld.each do |ld|
      return ld['headline'] if ld.has_key? 'headline'
    end
  end
  begin
    [
      metainspector.best_title,
      metainspector.title,
      nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
      nokogiri.css('title')&.first&.inner_text
    ].each do |possibility|
      return possibility unless possibility.to_s.empty?
    end
  rescue
    puts 'Curation::Page find_title error'
  end
  return ''
end

Private Instance Methods

file() click to toggle source
# File lib/curation.rb, line 172
# Opens url with a browser-like User-Agent header and caches the handle in
# @file. When anything goes wrong, a message is printed and nil is returned
# (the return value of puts).
#
# NOTE(review): url is handed to URI.open without validation — confirm that
# callers only pass http(s) addresses.
def file
  return @file if @file

  @file = URI.open(url, 'User-Agent' => "Mozilla/5.0")
rescue StandardError
  puts "Curation::Page file error with url #{url}"
end
html() click to toggle source
# File lib/curation.rb, line 178
# Returns the page markup held in @html. When @html is not set yet, it is
# read from file, which is rewound before and after the read so other
# readers start from the beginning. On any error a message is printed and
# nil is returned (the return value of puts).
def html
  return @html if @html

  file.rewind
  @html = file.read
  file.rewind
  @html
rescue StandardError
  puts "Curation::Page html error"
end
json_ld() click to toggle source
# File lib/curation.rb, line 153
# Returns the JSON-LD objects embedded in the page as a flat Array of Hashes.
#
# Every [type="application/ld+json"] element is parsed on its own: a script
# whose content is not valid JSON is reported and skipped without affecting
# the other scripts. Nested arrays are then flattened (some sites wrap their
# objects in arrays of arrays) and anything that is not a Hash — null or
# scalar documents — is dropped, because callers query entries with Hash
# methods.
#
# The result is computed once, even when it is empty. If the document itself
# cannot be queried, a message is printed and [] is returned.
def json_ld
  unless defined?(@json_ld)
    @json_ld = []
    begin
      nokogiri.css('[type="application/ld+json"]').each do |script|
        begin
          @json_ld << JSON.parse(script.inner_text)
        rescue JSON::ParserError, TypeError
          puts 'Curation::Page json_ld error'
        end
      end
    rescue StandardError
      puts 'Curation::Page json_ld error'
    end
    # Normalise after the rescue so partial results are flattened too.
    @json_ld = @json_ld.flatten.grep(Hash)
  end
  @json_ld
end
metainspector() click to toggle source
# File lib/curation.rb, line 200
# Returns the MetaInspector instance for the page, built on first use and
# cached in @metainspector. The markup returned by html is passed as
# document: when available; otherwise only url is given. On any error a
# message is printed and nil is returned (the return value of puts).
def metainspector
  return @metainspector if @metainspector

  @metainspector = if html.nil?
                     MetaInspector.new(url)
                   else
                     MetaInspector.new(url, document: html)
                   end
rescue StandardError
  puts 'Curation::Page metainspector error'
end
metatags() click to toggle source
# File lib/curation.rb, line 210
# Returns the 'name' entry of metainspector.meta_tag, cached in @metatags
# once it is truthy. On any error a message is printed and nil is returned
# (the return value of puts).
def metatags
  return @metatags if @metatags

  @metatags = metainspector.meta_tag['name']
rescue StandardError
  puts 'Curation::Page metatags error'
end
nokogiri() click to toggle source
# File lib/curation.rb, line 189
# Returns the parsed document for the page, built on first use and cached
# in @nokogiri.
#
# When markup is already held in @html and nothing has been downloaded yet
# (@file is unset), that markup is parsed directly, so a page whose HTML was
# supplied up front does not trigger a download — matching metainspector,
# which is also given the supplied markup. In every other case the
# downloaded file is parsed, rewound before and after so other readers start
# from the beginning.
#
# On any error a message is printed and nil is returned (the return value
# of puts).
def nokogiri
  unless @nokogiri
    if @html && !@file
      @nokogiri = Nokogiri::HTML(@html)
    else
      file.rewind
      @nokogiri = Nokogiri::HTML(file)
      file.rewind
    end
  end
  @nokogiri
rescue StandardError
  puts 'Curation::Page nokogiri error'
end