module EveryPolitician::Wikidata

Constants

WDQ_URL — endpoint of the now-retired WikidataQuery (WDQ) service, used by ::wdq
WIKIDATA_SPARQL_URL — endpoint of the Wikidata SPARQL query service, used by ::sparql

Public Class Methods

morph_wikinames(h)
# File lib/wikidata.rb, line 56
def self.morph_wikinames(h)
  morph_api_url = 'https://api.morph.io/%s/data.json' % h[:source]
  morph_api_key = ENV['MORPH_API_KEY']
  table = h[:table] || 'data'
  # Ask the morph.io API for every distinct value in the requested column
  result = RestClient.get morph_api_url, params: {
    key:   morph_api_key,
    query: "SELECT DISTINCT(#{h[:column]}) AS wikiname FROM #{table}",
  }
  # Return the names, dropping any blank entries
  JSON.parse(result, symbolize_names: true).map { |e| e[:wikiname] }.reject { |n| n.to_s.empty? }
end
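A sketch of a typical call, assuming a morph.io scraper with a 'wikiname' column; the scraper slug is illustrative, and MORPH_API_KEY must be set in the environment:

wikinames = EveryPolitician::Wikidata.morph_wikinames(
  source: 'everypolitician-scrapers/example-country',  # hypothetical scraper slug
  column: 'wikiname',
  table:  'data',  # the default, shown for clarity
)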
noko_for(url)
# File lib/wikidata.rb, line 92
def self.noko_for(url)
  # Unescape then re-escape so that both raw and already percent-encoded
  # URLs fetch cleanly. Note that URI.escape/URI.unescape are deprecated
  # and removed in Ruby 3, so this requires an older Ruby (or a shim).
  Nokogiri::HTML(open(URI.escape(URI.unescape(url))).read)
end
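On Rubies where URI.escape has been removed, an equivalent helper can be built with open-uri and the addressable gem. This is a sketch under those assumptions, not part of the gem:

require 'open-uri'
require 'addressable/uri'
require 'nokogiri'

# Hypothetical replacement: normalize the URL's percent-encoding,
# fetch it, and parse the response as HTML.
def modern_noko_for(url)
  normalized = Addressable::URI.parse(url).normalize.to_s
  Nokogiri::HTML(URI.open(normalized).read)
end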
notify_rebuilder()
# File lib/wikidata.rb, line 136
def self.notify_rebuilder
  # POST to the rebuilder webhook, if one is configured in the environment
  RestClient.post ENV['MORPH_REBUILDER_URL'], {} if ENV['MORPH_REBUILDER_URL']
end
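Typically called once a scrape has finished; it is a no-op unless MORPH_REBUILDER_URL is set:

EveryPolitician::Wikidata.notify_rebuilder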
scrape_wikidata(h)
# File lib/wikidata.rb, line 100
def self.scrape_wikidata(h)
  # Languages to fetch data in: any given explicitly via :lang, any we
  # have page names for, plus English
  langs = ((h[:lang] || (h[:names] ||= {}).keys) + [:en]).flatten.uniq
  # Resolve each language's page names to Wikidata IDs…
  langpairs = h[:names].map { |lang, names| WikiData.ids_from_pages(lang.to_s, names) }
  # …and invert into a single id => pagename lookup
  combined  = langpairs.reduce({}) { |a, e| a.merge(e.invert) }
  # IDs supplied directly via :ids have no source pagename
  (h[:ids] ||= []).each { |id| combined[id] ||= nil }
  # Clean out existing data (ignoring failure if the table doesn't exist yet)
  ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil

  # Fetch and store in randomly-ordered batches (10,000 IDs by default)
  Hash[combined.to_a.shuffle].each_slice(h[:batch_size] || 10_000) do |slice|
    sliced = Hash[slice]
    found = WikiData::Fetcher.find(sliced.keys)
    sliced.each do |id, name|
      unless found[id]
        warn "No data for #{id}"
        next
      end

      begin
        data = found[id].data(langs)
      rescue StandardError => e
        warn "Problem with #{id}: #{e}"
        next
      end
      next unless data

      # Record which page name the ID came from, then store the row
      data[:original_wikiname] = name
      puts data if h[:output] == true
      ScraperWiki.save_sqlite([:id], data)
    end
  end
end
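A hedged usage sketch; the page name and ID here are illustrative, but the option keys (:names, :ids, :batch_size, :output) are the ones the method reads:

EveryPolitician::Wikidata.scrape_wikidata(
  names: { en: ['Douglas Adams'] },  # wikiname(s) per language
  ids:   ['Q42'],                    # extra Wikidata IDs with no source page
  batch_size: 500,
  output: true,                      # echo each saved row
)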
sparql(query)
# File lib/wikidata.rb, line 46
def self.sparql(query)
  result = RestClient.get WIKIDATA_SPARQL_URL, params: { query: query, format: 'json' }
  json = JSON.parse(result, symbolize_names: true)
  # Expects the query to bind ?item to entity URLs; keep just the trailing Q-ids
  json[:results][:bindings].map { |res| res[:item][:value].split('/').last }
rescue RestClient::Exception => e
  raise "Wikidata query #{query} failed: #{e.message}"
end
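For example, a sketch using the standard Wikidata identifiers P31 ("instance of") and Q6256 ("country"); note that the query must bind ?item, as ::sparql expects:

query = 'SELECT ?item WHERE { ?item wdt:P31 wd:Q6256 }'
EveryPolitician::Wikidata.sparql(query)
# => ["Q145", "Q30", …]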
wdq(query)
# File lib/wikidata.rb, line 38
def self.wdq(query)
  # Query the (now-retired) WikidataQuery service, which returns bare
  # numeric ids; prefix each with "Q"
  result = RestClient.get WDQ_URL, params: { q: query }
  json = JSON.parse(result, symbolize_names: true)
  json[:items].map { |id| "Q#{id}" }
end
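A sketch in WDQ's own query language, assuming the service is still reachable; CLAIM[31:6256] is WDQ syntax for "instance of: country", equivalent to the ::sparql example above:

EveryPolitician::Wikidata.wdq('CLAIM[31:6256]')
# => ["Q145", "Q30", …]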
wikipedia_xpath(h)
# File lib/wikidata.rb, line 68
def self.wikipedia_xpath(h)
  # URI.decode (an alias of URI.unescape) is removed in Ruby 3; see ::noko_for
  noko = noko_for(URI.decode(h[:url]))

  # Optionally discard everything before the :after node…
  if h[:after]
    point = noko.xpath(h[:after])
    raise "Can't find #{h[:after]}" if point.empty?
    point.xpath('.//preceding::*').remove
  end

  # …and/or everything after the :before node
  if h[:before]
    point = noko.xpath(h[:before])
    raise "Can't find #{h[:before]}" if point.empty?
    point.xpath('.//following::*').remove
  end

  # Collect the unique text of every node matching the main XPath
  names = noko.xpath(h[:xpath]).map(&:text).uniq
  binding.pry if h[:debug] == true # drops into a REPL; requires pry
  raise "No names found in #{h[:url]}" if names.count.zero?
  names
end
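A hedged usage sketch; the URL and XPath expressions here are illustrative, not taken from any real scraper:

EveryPolitician::Wikidata.wikipedia_xpath(
  url:   'https://en.wikipedia.org/wiki/Example',                  # hypothetical page
  xpath: '//table[contains(@class,"wikitable")]//td[1]//a/@title', # one name per link
  after: '//h2[span[@id="Members"]]',                              # optional trim point
)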