module EveryPolitician::Wikidata
Constants
- WDQ_URL
- WIKIDATA_SPARQL_URL
Public Class Methods
morph_wikinames(h)
click to toggle source
# File lib/wikidata.rb, line 56
# Returns the distinct, non-blank values of one column of a morph.io
# scraper's dataset.
#
# h[:source] - scraper path interpolated into the morph.io API URL
# h[:column] - column selected (aliased as `wikiname`)
# h[:table]  - table queried; defaults to 'data'
#
# The API key is read from ENV['MORPH_API_KEY'].
def self.morph_wikinames(h)
  endpoint = 'https://api.morph.io/%s/data.json' % h[:source]
  api_key = ENV['MORPH_API_KEY']
  source_table = h[:table] || 'data'
  sql = "SELECT DISTINCT(#{h[:column]}) AS wikiname FROM #{source_table}"

  response = RestClient.get(endpoint, params: { key: api_key, query: sql })
  rows = JSON.parse(response, symbolize_names: true)
  wikinames = rows.map { |row| row[:wikiname] }
  # Drop nil and empty-string entries.
  wikinames.reject { |wikiname| wikiname.to_s.empty? }
end
noko_for(url)
click to toggle source
# File lib/wikidata.rb, line 92
# Fetches `url` and parses the response body with Nokogiri::HTML.
#
# The URL is unescaped and then re-escaped so that both already-encoded and
# raw (e.g. non-ASCII) URLs end up in one consistently escaped form.
#
# URI.escape / URI.unescape and URL support in Kernel#open were removed in
# Ruby 3.0, so the parser object and open-uri's URI#read are used instead.
# URI#read also closes the underlying handle, which the bare `open(...).read`
# never did.
#
# NOTE(review): only URLs that open-uri can read (http/https/ftp) are
# accepted; a non-URL argument no longer falls through to Kernel#open.
def self.noko_for(url)
  parser = URI::DEFAULT_PARSER
  normalised_url = parser.escape(parser.unescape(url))
  Nokogiri::HTML(URI.parse(normalised_url).read)
end
notify_rebuilder()
click to toggle source
# File lib/wikidata.rb, line 136
# POSTs an empty payload to the URL in ENV['MORPH_REBUILDER_URL'].
# Does nothing (and returns nil) when that variable is not set.
def self.notify_rebuilder
  rebuilder_url = ENV['MORPH_REBUILDER_URL']
  return unless rebuilder_url

  RestClient.post(rebuilder_url, {})
end
scrape_wikidata(h)
click to toggle source
# File lib/wikidata.rb, line 100
# Looks up Wikidata items and stores one row per item via
# ScraperWiki.save_sqlite, keyed on :id.
#
# h[:names]      - Hash of language => page names; each pair is passed to
#                  WikiData.ids_from_pages (presumably returning
#                  { page name => id }; the result is inverted here so ids
#                  become the keys). Defaults to {}.
# h[:ids]        - extra ids to fetch that have no page name. Defaults to [].
# h[:lang]       - language or list of languages passed to each item's
#                  `data` call; defaults to the keys of h[:names]. :en is
#                  always appended.
# h[:batch_size] - number of ids per WikiData::Fetcher.find call
#                  (default 10_000).
# h[:output]     - when exactly `true`, each row is also printed.
#
# The caller's hash is not modified.
def self.scrape_wikidata(h)
  # Resolve defaults up front. Previously h[:names] was only defaulted when
  # h[:lang] was absent, so passing :lang with only :ids raised
  # NoMethodError on nil.
  names = h[:names] || {}
  ids = h[:ids] || []
  # Array() lets a single language (e.g. lang: :fr) work as well as a list.
  langs = (Array(h[:lang] || names.keys) + [:en]).flatten.uniq

  langpairs = names.map { |lang, pages| WikiData.ids_from_pages(lang.to_s, pages) }
  combined = langpairs.reduce({}) { |acc, pairs| acc.merge(pairs.invert) }
  ids.each { |id| combined[id] ||= nil }

  # Clean out existing data. Best effort: a failure here (e.g. on a first
  # run) is deliberately ignored.
  begin
    ScraperWiki.sqliteexecute('DELETE FROM data')
  rescue StandardError
    nil
  end

  # Items are processed in random order, one batch at a time.
  combined.to_a.shuffle.each_slice(h[:batch_size] || 10_000) do |slice|
    sliced = Hash[slice]
    found = WikiData::Fetcher.find(sliced.keys)

    sliced.each do |id, name|
      unless found[id]
        warn "No data for #{id}"
        next
      end

      begin
        data = found[id].data(langs)
      rescue StandardError => e
        warn "Problem with #{id}: #{e}"
        next
      end
      next unless data

      data[:original_wikiname] = name
      puts data if h[:output] == true
      ScraperWiki.save_sqlite([:id], data)
    end
  end
end
sparql(query)
click to toggle source
# File lib/wikidata.rb, line 46
# Sends `query` to WIKIDATA_SPARQL_URL requesting JSON, and returns, for each
# result binding, the last path segment of its `item` value.
#
# Raises a RuntimeError naming the query when RestClient reports a failure.
def self.sparql(query)
  response = RestClient.get(WIKIDATA_SPARQL_URL, params: { query: query, format: 'json' })
  bindings = JSON.parse(response, symbolize_names: true)[:results][:bindings]
  bindings.map do |row|
    item_uri = row[:item][:value]
    item_uri.split('/').last
  end
rescue RestClient::Exception => e
  raise "Wikidata query #{query} failed: #{e.message}"
end
wdq(query)
click to toggle source
# File lib/wikidata.rb, line 38
# Sends `query` to WDQ_URL and returns every entry of the response's `items`
# list as a string prefixed with "Q".
def self.wdq(query)
  response = RestClient.get(WDQ_URL, params: { q: query })
  item_numbers = JSON.parse(response, symbolize_names: true)[:items]
  item_numbers.map { |number| "Q#{number}" }
end
wikipedia_xpath(h)
click to toggle source
# File lib/wikidata.rb, line 68
# Fetches the page at h[:url] and returns the unique text of every node
# matching h[:xpath].
#
# h[:url]    - page to fetch (percent-decoded before being handed to noko_for)
# h[:xpath]  - XPath expression selecting the nodes whose text is returned
# h[:after]  - optional XPath; everything preceding the match is removed first
# h[:before] - optional XPath; everything following the match is removed first
# h[:debug]  - when exactly `true`, drops into a pry session before returning
#
# Raises RuntimeError when :after / :before match nothing, or when no names
# are found.
def self.wikipedia_xpath(h)
  # URI.decode was removed in Ruby 3.0; the parser's unescape is the
  # equivalent call.
  noko = noko_for(URI::DEFAULT_PARSER.unescape(h[:url]))

  # Trim the document to the region of interest: :after first, then :before.
  { after: './/preceding::*', before: './/following::*' }.each do |option, axis|
    boundary = h[option]
    next unless boundary

    point = noko.xpath(boundary)
    raise "Can't find #{boundary}" if point.empty?

    point.xpath(axis).remove
  end

  names = noko.xpath(h[:xpath]).map(&:text).uniq
  binding.pry if h[:debug] == true
  raise "No names found in #{h[:url]}" if names.empty?

  names
end