class Wikipedia::Page

Attributes

json[R]

Public Class Methods

new(json) click to toggle source
# File lib/wikipedia/page.rb, line 5
# Builds a page object from the JSON text of an API response.
#
# The text is kept verbatim in @json and its parsed form in @data.
# Whatever JSON.parse raises for unparsable input propagates to the caller.
#
# @param json [String] JSON document
def initialize(json)
  require 'json' # required at call time, not at file load
  @json = json
  @data = JSON.parse(@json)
end
sanitize(s) click to toggle source

Converts wikitext markup in `s` into simplified HTML paragraphs. (RuboCop's Metrics/MethodLength and Metrics/AbcSize checks are disabled for this method.)

# File lib/wikipedia/page.rb, line 127
# Converts a chunk of wikitext into simplified HTML.
#
# Processing order matters and is as follows:
# 1. known punctuation, language and Old Style date templates are rewritten;
# 2. every remaining {{...}} template is removed (repeatedly, so nested
#    templates disappear from the inside out);
# 3. the first {| ... |} table that starts at the beginning of a line is removed;
# 4. wiki links are reduced to their label, then Image:/File: links are
#    removed (links first, so a caption containing links is dropped whole);
# 5. quote markup becomes <b>/<i>, <ref> tags and HTML comments are removed,
#    and some whitespace is tidied;
# 6. the text is wrapped in <p> elements, one per blank-line-separated section.
#
# The argument itself is never modified: all in-place substitutions run on a
# private copy, so frozen strings are accepted and the caller's text survives.
#
# @param s [String, nil] wikitext
# @return [String, nil] HTML, or nil when +s+ is nil/false
def self.sanitize(s)
  return unless s

  # Every step below mutates in place; detach from the caller's object first.
  s = s.dup

  # Transform punctuation templates
  # Em dash (https://en.wikipedia.org/wiki/Template:Em_dash)
  s.gsub!(/\{\{(em dash|emdash)\}\}/i, '—')
  # En dash (https://en.wikipedia.org/wiki/Template:En_dash)
  s.gsub!(/\{\{(en dash|ndash|nsndns)\}\}/i, '–')
  # Spaced en dashes (https://en.wikipedia.org/wiki/Template:Spaced_en_dash_space)
  s.gsub!(/\{\{(spaced e?n\s?dash( space)?|snds?|spndsp|sndashs|spndashsp)\}\}/i, ' – ')
  # Bold middot
  s.gsub!(/\{\{(·|dot|middot|\,)\}\}/i, '&nbsp;<b>&middot;</b>')
  # Bullets
  s.gsub!(/\{\{(•|bull(et)?)\}\}/i, '&nbsp;&bull;')
  # Forward Slashes (https://en.wikipedia.org/wiki/Template:%5C)
  s.gsub!(/\{\{\\\}\}/i, '&nbsp;/')

  # Transform language specific blocks
  s.gsub!(/\{\{lang[\-\|]([a-z]+)\|([^\|\{\}]+)(\|[^\{\}]+)?\}\}/i, '<span lang="\1">\2</span>')

  # Parse Old Style Date template blocks
  # Old Style Dates (https://en.wikipedia.org/wiki/Template:OldStyleDate)
  s.gsub!(/\{\{OldStyleDate\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i, '\1 [<abbr title="Old Style">O.S.</abbr> \3] \2')
  # Old Style Dates with different years (https://en.wikipedia.org/wiki/Template:OldStyleDateDY)
  s.gsub!(/\{\{OldStyleDateDY\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i, '\1 \2 [<abbr title="Old Style">O.S.</abbr> \3]')
  # Old Style Dates with no year (https://en.wikipedia.org/wiki/Template:OldStyleDateNY)
  s.gsub!(/\{\{OldStyleDateNY\|([^\|]*)\|([^\|]*)\}\}/i, '\1 [<abbr title="Old Style">O.S.</abbr> \2]')

  # Strip anything else inside curly braces. Only innermost templates match,
  # so keep going until a pass finds nothing left to remove.
  leftover_template = /\{\{[^\{\}]+?\}\}[\;\,]?/
  s.gsub!(leftover_template, '') while s =~ leftover_template

  # strip info box
  s.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

  # strip internal links (keep the label, or the target when there is no label)
  s.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
  s.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

  # strip images and file links
  s.gsub!(/\[\[Image:(.*?(?=\]\]))??\]\]/, '')
  s.gsub!(/\[\[File:(.*?(?=\]\]))??\]\]/, '')

  # convert bold/italic to html (longest quote runs first)
  s.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
  s.gsub!(/'''(.+?)'''/, '<b>\1</b>')
  s.gsub!(/''(.+?)''/, '<i>\1</i>')

  # misc
  # A <ref> sitting directly between two digits is replaced by an en dash.
  s.gsub!(/(\d)<ref[^<>]*>[\s\S]*?<\/ref>(\d)/, '\1&nbsp;&ndash;&nbsp;\2')
  s.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
  s.gsub!(/<ref(.*?(?=\/>))??\/>/, '')
  s.gsub!(/<!--[^>]+?-->/, '')
  s.gsub!(/\(\s+/, '(')
  s.gsub!('  ', ' ')
  s.strip!

  # create paragraphs
  sections = s.split("\n\n")
  return "<p>#{s}</p>" unless sections.size > 1

  sections.map { |paragraph| "<p>#{paragraph.strip}</p>" }.join("\n")
end

Public Instance Methods

categories() click to toggle source
# File lib/wikipedia/page.rb, line 51
# Titles of the page's categories.
#
# @return [Array, nil] the 'title' of each entry under page['categories'],
#   or nil when the page has no 'categories' entry
def categories
  return unless page['categories']

  page['categories'].map { |category| category['title'] }
end
content() click to toggle source
# File lib/wikipedia/page.rb, line 15
# Text of the first revision of the page.
#
# @return [Object, nil] the '*' entry of the first element of
#   page['revisions'], or nil when the page has no 'revisions' entry
def content
  return unless page['revisions']

  page['revisions'].first['*']
end
coordinates() click to toggle source
# File lib/wikipedia/page.rb, line 104
# Values of the first coordinates entry of the page.
#
# @return [Array, nil] +values+ of the first element of page['coordinates'],
#   or nil when the page has no 'coordinates' entry
def coordinates
  return unless page['coordinates']

  page['coordinates'].first.values
end
editurl() click to toggle source
# File lib/wikipedia/page.rb, line 39
# Returns the 'editurl' entry of the page hash.
def editurl
  info = page
  info['editurl']
end
fullurl() click to toggle source
# File lib/wikipedia/page.rb, line 35
# Returns the 'fullurl' entry of the page hash.
def fullurl
  info = page
  info['fullurl']
end
image_descriptionurl() click to toggle source
# File lib/wikipedia/page.rb, line 79
# Description URL of the first imageinfo entry of the page.
#
# @return [Object, nil] 'descriptionurl' of the first element of
#   page['imageinfo'], or nil when the page has no 'imageinfo' entry
def image_descriptionurl
  return unless page['imageinfo']

  page['imageinfo'].first['descriptionurl']
end
image_descriptionurls() click to toggle source
# File lib/wikipedia/page.rb, line 92
# Description URLs of all images returned by #image_metadata.
#
# The metadata is looked up once and reused for both the nil check and the
# mapping.
#
# @return [Array, nil] +image_descriptionurl+ of every metadata item, or nil
#   when #image_metadata returns nil
def image_descriptionurls
  metadata = image_metadata
  metadata.map(&:image_descriptionurl) unless metadata.nil?
end
image_metadata( options = {} ) click to toggle source
# File lib/wikipedia/page.rb, line 112
# Fetches metadata for every image of the page via Wikipedia.find_image.
#
# Only titles ending in jpg/jpeg/png/gif/svg (case-insensitive) are looked
# up, and titles containing 'LinkFA-star' are skipped.
#
# Results are memoised per distinct +options+ value, so a call with
# different options (e.g. another thumbnail width) triggers its own lookups
# instead of returning results fetched for earlier options.
#
# @param options [Hash] passed through to Wikipedia.find_image
# @return [Array, nil] one find_image result per selected title, or nil when
#   #images returns nil
def image_metadata( options = {} )
  return if images.nil?

  @cached_image_metadata_by_options ||= {}
  # Key on a copy so later mutation of the caller's hash cannot disturb the cache.
  cache_key = options.dup
  @cached_image_metadata_by_options[cache_key] ||= begin
    filtered = images.select { |i| i =~ /:.+\.(jpg|jpeg|png|gif|svg)$/i && !i.include?('LinkFA-star') }
    filtered.map { |title| Wikipedia.find_image(title, options) }
  end
end
image_thumburl() click to toggle source
# File lib/wikipedia/page.rb, line 75
# Thumbnail URL of the first imageinfo entry of the page.
#
# @return [Object, nil] 'thumburl' of the first element of
#   page['imageinfo'], or nil when the page has no 'imageinfo' entry
def image_thumburl
  return unless page['imageinfo']

  page['imageinfo'].first['thumburl']
end
image_thumburls( width = nil ) click to toggle source
# File lib/wikipedia/page.rb, line 87
# Thumbnail URLs of all images returned by #image_metadata.
#
# The metadata is looked up once and reused for both the nil check and the
# mapping.
#
# @param width [Object, nil] when given, forwarded to #image_metadata as
#   { iiurlwidth: width }; when nil, an empty options hash is used
# @return [Array, nil] +image_thumburl+ of every metadata item, or nil when
#   #image_metadata returns nil
def image_thumburls( width = nil )
  options = width.nil? ? {} : { iiurlwidth: width }
  metadata = image_metadata( options )
  metadata.map(&:image_thumburl) unless metadata.nil?
end
image_url() click to toggle source
# File lib/wikipedia/page.rb, line 71
# URL of the first imageinfo entry of the page.
#
# @return [Object, nil] 'url' of the first element of page['imageinfo'],
#   or nil when the page has no 'imageinfo' entry
def image_url
  return unless page['imageinfo']

  page['imageinfo'].first['url']
end
image_urls() click to toggle source
# File lib/wikipedia/page.rb, line 83
# URLs of all images returned by #image_metadata.
#
# The metadata is looked up once and reused for both the nil check and the
# mapping.
#
# @return [Array, nil] +image_url+ of every metadata item, or nil when
#   #image_metadata returns nil
def image_urls
  metadata = image_metadata
  metadata.map(&:image_url) unless metadata.nil?
end
images() click to toggle source
# File lib/wikipedia/page.rb, line 67
# Titles of the images listed for the page.
#
# @return [Array, nil] the 'title' of each entry under page['images'],
#   or nil when the page has no 'images' entry
def images
  return unless page['images']

  page['images'].map { |image| image['title'] }
end
main_image_thumburl() click to toggle source
# File lib/wikipedia/page.rb, line 100
# Source URL of the page's thumbnail.
#
# @return [Object, nil] page['thumbnail']['source'], or nil when the page has
#   no 'thumbnail' entry
def main_image_thumburl
  return unless page['thumbnail']

  page['thumbnail']['source']
end
main_image_url() click to toggle source
# File lib/wikipedia/page.rb, line 96
# URL derived from the page thumbnail's source URL.
#
# The first "/thumb" is removed from the thumbnail source, then the final
# "/segment" of the remaining string is removed.
#
# @return [String, nil] the derived URL, or nil when the page has no
#   'thumbnail' entry
def main_image_url
  return unless page['thumbnail']

  thumb_source = page['thumbnail']['source']
  without_thumb_dir = thumb_source.sub(/\/thumb/, '')
  without_thumb_dir.sub(/\/[^\/]*$/, '')
end
page() click to toggle source
# File lib/wikipedia/page.rb, line 11
# First page entry of the parsed response.
#
# Both levels of the lookup are guarded, so a response without a 'query'
# entry yields nil just like one without 'pages'.
#
# @return [Object, nil] the first value of @data['query']['pages'], or nil
#   when either 'query' or 'pages' is missing
def page
  query = @data['query']
  pages = query && query['pages']
  pages.values.first if pages
end
raw_data() click to toggle source
# File lib/wikipedia/page.rb, line 108
# Returns @data as-is (the same object, not a copy).
def raw_data
  @data
end
redirect?() click to toggle source
# File lib/wikipedia/page.rb, line 23
# Tests whether the page content is a redirect.
#
# @return [MatchData, nil, false] the match of the case-insensitive
#   "#REDIRECT [[target]]" pattern (target in capture group 1), nil when
#   content does not match, or content itself when it is nil/false
def redirect?
  redirect_pattern = /\#REDIRECT\s*\[\[(.*?)\]\]/i
  content && content.match(redirect_pattern)
end
redirect_title() click to toggle source
# File lib/wikipedia/page.rb, line 27
# Target title of a redirect page.
#
# @return [Object, nil] element 1 of the value returned by #redirect?, or nil
#   when that lookup raises any StandardError (for instance because
#   #redirect? returned nil)
def redirect_title
  redirect?[1]
rescue StandardError
  nil
end
sanitized_content() click to toggle source
# File lib/wikipedia/page.rb, line 19
# Passes #content through the class-level sanitize method and returns the result.
def sanitized_content
  wikitext = content
  self.class.sanitize(wikitext)
end
summary() click to toggle source
# File lib/wikipedia/page.rb, line 47
# Leading part of the page extract, up to the first "==" marker.
#
# Uses partition rather than split so that an extract made up solely of "=="
# markers yields an empty string instead of raising on a nil first element.
#
# @return [String, nil] the stripped text before the first "==", or nil when
#   the page has no 'extract' entry or the extract is empty
def summary
  extract = page['extract']
  return unless extract && extract != ''

  extract.partition('==').first.strip
end
templates() click to toggle source
# File lib/wikipedia/page.rb, line 121
# Titles of the templates listed for the page.
#
# @return [Array, nil] the 'title' of each entry under page['templates'],
#   or nil when the page has no 'templates' entry
def templates
  return unless page['templates']

  page['templates'].map { |template| template['title'] }
end
text() click to toggle source
# File lib/wikipedia/page.rb, line 43
# Returns the 'extract' entry of the page hash, unmodified.
def text
  info = page
  info['extract']
end
title() click to toggle source
# File lib/wikipedia/page.rb, line 31
# Returns the 'title' entry of the page hash.
def title
  info = page
  info['title']
end