class Wikipedia::Page
Attributes
json[R]
Public Class Methods
new(json)
click to toggle source
# File lib/wikipedia/page.rb, line 5
# Stores the raw JSON text and its parsed form for the accessors of this class.
#
# json - JSON text; it is handed to JSON.parse unchanged, so a parse error
#        propagates to the caller.
def initialize(json)
  require 'json' # required here, on first construction, not at file load
  @json = json
  @data = JSON.parse(@json)
end
sanitize(s)
click to toggle source
rubocop:disable Metrics/MethodLength rubocop:disable Metrics/AbcSize
# File lib/wikipedia/page.rb, line 127
# Converts wiki markup into simple HTML.
#
# Known punctuation, language and Old Style date templates are rewritten,
# every remaining {{...}} template is dropped, links are reduced to their
# label, quote runs become <b>/<i>, <ref> tags and HTML comments are removed,
# and the result is wrapped in <p> elements (one per blank-line separated
# paragraph).
#
# s - wiki markup String, or nil. The argument itself is never modified: all
#     substitutions run on a copy, so frozen strings are accepted and the
#     caller's text keeps its original markup.
#
# Returns the HTML String, or nil when s is nil/false.
def self.sanitize(s)
  return unless s

  # Every step below is an in-place gsub!/sub!/strip!; run them on a private
  # copy so the caller's string is left alone.
  text = s.dup

  # Transform punctuation templates
  # Em dash (https://en.wikipedia.org/wiki/Template:Em_dash)
  text.gsub!(/\{\{(em dash|emdash)\}\}/i, '—')

  # En dash (https://en.wikipedia.org/wiki/Template:En_dash)
  text.gsub!(/\{\{(en dash|ndash|nsndns)\}\}/i, '–')

  # Spaced en dashes (https://en.wikipedia.org/wiki/Template:Spaced_en_dash_space)
  text.gsub!(/\{\{(spaced e?n\s?dash( space)?|snds?|spndsp|sndashs|spndashsp)\}\}/i, ' – ')

  # Bold middot
  text.gsub!(/\{\{(·|dot|middot|\,)\}\}/i, ' <b>·</b>')

  # Bullets
  text.gsub!(/\{\{(•|bull(et)?)\}\}/i, ' •')

  # Forward Slashes (https://en.wikipedia.org/wiki/Template:%5C)
  text.gsub!(/\{\{\\\}\}/i, ' /')

  # Transform language specific blocks
  text.gsub!(/\{\{lang[\-\|]([a-z]+)\|([^\|\{\}]+)(\|[^\{\}]+)?\}\}/i, '<span lang="\1">\2</span>')

  # Parse Old Style Date template blocks
  # Old Style Dates (https://en.wikipedia.org/wiki/Template:OldStyleDate)
  text.gsub!(/\{\{OldStyleDate\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i,
             '\1 [<abbr title="Old Style">O.S.</abbr> \3] \2')

  # Old Style Dates with different years (https://en.wikipedia.org/wiki/Template:OldStyleDateDY)
  text.gsub!(/\{\{OldStyleDateDY\|([^\|]*)\|([^\|]*)\|([^\|]*)\}\}/i,
             '\1 \2 [<abbr title="Old Style">O.S.</abbr> \3]')

  # Old Style Dates with no year (https://en.wikipedia.org/wiki/Template:OldStyleDateNY)
  text.gsub!(/\{\{OldStyleDateNY\|([^\|]*)\|([^\|]*)\}\}/i,
             '\1 [<abbr title="Old Style">O.S.</abbr> \2]')

  # Strip anything else inside curly braces. The pattern only matches
  # innermost templates, so repeat until nested ones are peeled away too.
  leftover_template = /\{\{[^\{\}]+?\}\}[\;\,]?/
  text.gsub!(leftover_template, '') while text =~ leftover_template

  # strip info box
  text.sub!(/^\{\|[^\{\}]+?\n\|\}\n/, '')

  # strip internal links
  text.gsub!(/\[\[([^\]\|]+?)\|([^\]\|]+?)\]\]/, '\2')
  text.gsub!(/\[\[([^\]\|]+?)\]\]/, '\1')

  # strip images and file links
  text.gsub!(/\[\[Image:(.*?(?=\]\]))??\]\]/, '')
  text.gsub!(/\[\[File:(.*?(?=\]\]))??\]\]/, '')

  # convert bold/italic to html
  text.gsub!(/'''''(.+?)'''''/, '<b><i>\1</i></b>')
  text.gsub!(/'''(.+?)'''/, '<b>\1</b>')
  text.gsub!(/''(.+?)''/, '<i>\1</i>')

  # misc
  text.gsub!(/(\d)<ref[^<>]*>[\s\S]*?<\/ref>(\d)/, '\1 – \2')
  text.gsub!(/<ref[^<>]*>[\s\S]*?<\/ref>/, '')
  text.gsub!(/<ref(.*?(?=\/>))??\/>/, '')
  text.gsub!(/<!--[^>]+?-->/, '')
  text.gsub!(/\(\s+/, '(')
  # Collapse the doubled spaces left behind where markup was removed.
  text.gsub!('  ', ' ')
  text.strip!

  # create paragraphs
  sections = text.split("\n\n")
  if sections.size > 1
    sections.map { |paragraph| "<p>#{paragraph.strip}</p>" }.join("\n")
  else
    "<p>#{text}</p>"
  end
end
Public Instance Methods
categories()
click to toggle source
# File lib/wikipedia/page.rb, line 51
# Returns the 'title' of every entry under the page's 'categories' key,
# or nil when that key is absent.
def categories
  entries = page['categories']
  return unless entries

  entries.map { |entry| entry['title'] }
end
content()
click to toggle source
# File lib/wikipedia/page.rb, line 15
# Returns the '*' value of the first entry under the page's 'revisions' key,
# or nil when the page has no 'revisions'.
def content
  revisions = page['revisions']
  revisions.first['*'] if revisions
end
coordinates()
click to toggle source
# File lib/wikipedia/page.rb, line 104
# Returns the values of the first entry under the page's 'coordinates' key
# (in that entry's own key order), or nil when the key is absent.
def coordinates
  entries = page['coordinates']
  entries.first.values if entries
end
editurl()
click to toggle source
# File lib/wikipedia/page.rb, line 39
# Returns the page's 'editurl' value (nil when the key is missing).
def editurl
  page['editurl']
end
extlinks()
click to toggle source
# File lib/wikipedia/page.rb, line 59
# Returns the '*' value of every entry under the page's 'extlinks' key,
# or nil when that key is absent.
def extlinks
  entries = page['extlinks']
  return unless entries

  entries.map { |entry| entry['*'] }
end
fullurl()
click to toggle source
# File lib/wikipedia/page.rb, line 35
# Returns the page's 'fullurl' value (nil when the key is missing).
def fullurl
  page['fullurl']
end
image_descriptionurl()
click to toggle source
# File lib/wikipedia/page.rb, line 79
# Returns 'descriptionurl' from the first 'imageinfo' entry of the page,
# or nil when the page has no 'imageinfo'.
def image_descriptionurl
  info = page['imageinfo']
  info.first['descriptionurl'] if info
end
image_descriptionurls()
click to toggle source
# File lib/wikipedia/page.rb, line 92
# Returns image_descriptionurl of every object produced by image_metadata,
# or nil when image_metadata itself returns nil.
def image_descriptionurls
  return if image_metadata.nil?

  image_metadata.map { |image| image.image_descriptionurl }
end
image_metadata( options = {} )
click to toggle source
# File lib/wikipedia/page.rb, line 112
# Looks up metadata for the images listed on this page.
#
# Only titles that look like image files (jpg/jpeg/png/gif/svg after a ':')
# and do not contain 'LinkFA-star' are looked up, each through
# Wikipedia.find_image(title, options).
#
# Results are memoized per distinct options hash, so a call with different
# options (for example another :iiurlwidth) triggers its own lookup and does
# not reuse results fetched with earlier options.
#
# options - Hash passed through unchanged to Wikipedia.find_image.
#
# Returns an Array with one find_image result per matching title, or nil
# when the page lists no images and nothing is memoized for these options.
def image_metadata(options = {})
  cache = (@cached_image_metadata ||= {})
  return cache[options] if cache.key?(options)
  return if images.nil?

  titles = images.select { |i| i =~ /:.+\.(jpg|jpeg|png|gif|svg)$/i && !i.include?('LinkFA-star') }
  # Key on a copy so a caller mutating its options hash later cannot
  # disturb the memo table.
  cache[options.dup] = titles.map { |title| Wikipedia.find_image(title, options) }
end
image_thumburl()
click to toggle source
# File lib/wikipedia/page.rb, line 75
# Returns 'thumburl' from the first 'imageinfo' entry of the page,
# or nil when the page has no 'imageinfo'.
def image_thumburl
  info = page['imageinfo']
  info.first['thumburl'] if info
end
image_thumburls( width = nil )
click to toggle source
# File lib/wikipedia/page.rb, line 87
# Returns image_thumburl of every object produced by image_metadata,
# or nil when image_metadata returns nil.
#
# width - when not nil, forwarded to image_metadata as { iiurlwidth: width };
#         otherwise an empty options hash is used.
def image_thumburls(width = nil)
  options = {}
  options[:iiurlwidth] = width unless width.nil?

  return if image_metadata(options).nil?

  image_metadata(options).map { |image| image.image_thumburl }
end
image_url()
click to toggle source
# File lib/wikipedia/page.rb, line 71
# Returns 'url' from the first 'imageinfo' entry of the page,
# or nil when the page has no 'imageinfo'.
def image_url
  info = page['imageinfo']
  info.first['url'] if info
end
image_urls()
click to toggle source
# File lib/wikipedia/page.rb, line 83
# Returns image_url of every object produced by image_metadata,
# or nil when image_metadata itself returns nil.
def image_urls
  return if image_metadata.nil?

  image_metadata.map { |image| image.image_url }
end
images()
click to toggle source
# File lib/wikipedia/page.rb, line 67
# Returns the 'title' of every entry under the page's 'images' key,
# or nil when that key is absent.
def images
  entries = page['images']
  return unless entries

  entries.map { |entry| entry['title'] }
end
langlinks()
click to toggle source
# File lib/wikipedia/page.rb, line 63
# Returns a Hash mapping each 'langlinks' entry's 'lang' to its '*' value
# (a later entry overwrites an earlier one with the same 'lang'), or nil
# when the page has no 'langlinks'.
def langlinks
  entries = page['langlinks']
  return unless entries

  entries.each_with_object({}) do |entry, by_lang|
    by_lang[entry['lang']] = entry['*']
  end
end
links()
click to toggle source
# File lib/wikipedia/page.rb, line 55
# Returns the 'title' of every entry under the page's 'links' key,
# or nil when that key is absent.
def links
  entries = page['links']
  return unless entries

  entries.map { |entry| entry['title'] }
end
main_image_thumburl()
click to toggle source
# File lib/wikipedia/page.rb, line 100
# Returns the 'source' of the page's 'thumbnail' entry, or nil when the page
# has no 'thumbnail'.
def main_image_thumburl
  thumbnail = page['thumbnail']
  thumbnail['source'] if thumbnail
end
main_image_url()
click to toggle source
# File lib/wikipedia/page.rb, line 96
# Derives a URL from the thumbnail 'source': the first '/thumb' is removed,
# then the final '/'-separated segment is dropped.
#
# Returns nil when the page has no 'thumbnail'.
def main_image_url
  thumbnail = page['thumbnail']
  return unless thumbnail

  without_thumb_dir = thumbnail['source'].sub(/\/thumb/, '')
  without_thumb_dir.sub(/\/[^\/]*$/, '')
end
page()
click to toggle source
# File lib/wikipedia/page.rb, line 11
# Returns the first value stored under query -> pages in the parsed data,
# or nil when 'pages' is absent. A missing 'query' key is not guarded
# against and raises NoMethodError.
def page
  pages = @data['query']['pages']
  pages.values.first if pages
end
raw_data()
click to toggle source
# File lib/wikipedia/page.rb, line 108
# Returns the parsed data held in @data (the same object, not a copy).
def raw_data
  @data
end
redirect?()
click to toggle source
# File lib/wikipedia/page.rb, line 23
# Tests whether the content holds a "#REDIRECT [[...]]" directive
# (case-insensitive).
#
# Returns the match result (truthy, with the link target in group 1) rather
# than a strict boolean; nil when there is no content or no directive.
def redirect?
  wikitext = content
  wikitext && wikitext.match(/\#REDIRECT\s*\[\[(.*?)\]\]/i)
end
redirect_title()
click to toggle source
# File lib/wikipedia/page.rb, line 27
# Returns capture group 1 of the redirect? match (the text between the
# double brackets), or nil when redirect? is falsy.
#
# The no-match case is handled with an explicit check instead of a blanket
# `rescue nil`, so unrelated errors raised while reading the page are no
# longer silently turned into nil.
def redirect_title
  match = redirect?
  match[1] if match
end
sanitized_content()
click to toggle source
# File lib/wikipedia/page.rb, line 19
# Returns the result of passing content to the class-level sanitize.
def sanitized_content
  raw = content
  self.class.sanitize(raw)
end
summary()
click to toggle source
# File lib/wikipedia/page.rb, line 47
# Returns the part of the page's 'extract' that precedes the first '==',
# stripped of surrounding whitespace.
#
# Returns nil when 'extract' is missing or empty, and '' when nothing
# precedes the first '==' (including an extract made only of '=' runs,
# which previously raised NoMethodError).
def summary
  extract = page['extract']
  return unless extract && extract != ''

  # A limit of 2 always yields a leading field, even when the text is
  # nothing but delimiters, and avoids splitting the rest of the text.
  extract.split('==', 2).first.strip
end
templates()
click to toggle source
# File lib/wikipedia/page.rb, line 121
# Returns the 'title' of every entry under the page's 'templates' key,
# or nil when that key is absent.
def templates
  entries = page['templates']
  return unless entries

  entries.map { |entry| entry['title'] }
end
text()
click to toggle source
# File lib/wikipedia/page.rb, line 43
# Returns the page's 'extract' value unmodified (nil when the key is missing).
def text
  page['extract']
end
title()
click to toggle source
# File lib/wikipedia/page.rb, line 31
# Returns the page's 'title' value (nil when the key is missing).
def title
  page['title']
end