class DwcaHunter::ResourceFreebase

Public Class Methods

new(opts = {}) click to toggle source
Calls superclass method DwcaHunter::Resource::new
# File lib/dwca_hunter/resources/freebase.rb, line 5
def initialize(opts = {})
  @command = "freebase"
  @title = "Freebase"
  @uuid = "bacd21f0-44e0-43e2-914c-70929916f257"
  @download_path = File.join(Dir.tmpdir,
                             "dwca_hunter",
                             "freebase",
                             "data.json")
  @data = []
  @all_taxa = {}
  @cleaned_taxa = {}
  @extensions = []
  super
end

Public Instance Methods

download() click to toggle source
# File lib/dwca_hunter/resources/freebase.rb, line 29
def download
  DwcaHunter.logger_write(object_id,
                          "Querying freebase for species information...")
  q = {
    query: [{
      type: "/biology/organism_classification",
      id: nil,
      guid: nil,
      name: nil,
      scientific_name: nil,
      synonym_scientific_name: [],
      higher_classification: {
        id: nil,
        guid: nil,
        scientific_name: nil,
        optional: true
      }
    }],
    cursor: true
  }

  run_query(q)

  data = JSON.pretty_generate @data
  f = open(@download_path, "w:utf-8")
  f.write(data)
  f.close
end
make_dwca() click to toggle source
# File lib/dwca_hunter/resources/freebase.rb, line 24
def make_dwca
  organize_data
  generate_dwca
end
needs_unpack?() click to toggle source
# File lib/dwca_hunter/resources/freebase.rb, line 20
def needs_unpack?
  false
end

Private Instance Methods

generate_dwca() click to toggle source
Calls superclass method DwcaHunter::Resource#generate_dwca
# File lib/dwca_hunter/resources/freebase.rb, line 110
def generate_dwca
  DwcaHunter.logger_write(object_id,
                          "Creating DarwinCore Archive file")
  @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/parentNameUsageID"]]

  @extensions << { data: [[
    "http://rs.tdwg.org/dwc/terms/TaxonID",
    "http://rs.tdwg.org/dwc/terms/scientificName"
  ]], file_name: "synonyms.txt" }
  DwcaHunter.logger_write(object_id,
                          "Creating synonyms extension for DarwinCore Archive file")
  count = 0
  @cleaned_taxa.each do |_key, taxon|
    count += 1
    @core << [taxon[:id], taxon[:scientific_name], taxon[:parent_id]]
    if count % BATCH_SIZE == 0
      DwcaHunter.logger_write(object_id,
                              "Traversing %s extension data record" % count)
    end
    taxon[:synonyms].each do |name|
      @extensions[-1][:data] << [taxon[:id], name]
    end
  end
  @eml = {
    id: @uuid,
    title: @title,
    license: "http://creativecommons.org/licenses/by-sa/3.0/",
    authors: [
      { url: "http://www.freebase.com/home" }
    ],
    abstract: "An entity graph of people, places and things, " \
              "built by a community that loves open data.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@mbl.edu" }
    ],
    url: "http://www.freebase.com/home"
  }
  super
end
organize_data() click to toggle source
# File lib/dwca_hunter/resources/freebase.rb, line 80
def organize_data
  @data = JSON.load(open(@download_path, "r:utf-8").read)
  @data.each do |d|
    scientific_name = d["scientific_name"].to_s
    id = d["id"]
    parent_id = d["higher_classification"] ?
                d["higher_classification"]["id"] :
                nil
    synonyms = d["synonym_scientific_name"]
    @all_taxa[id] = { id: id,
                      parent_id: parent_id,
                      scientific_name: scientific_name,
                      synonyms: synonyms }
  end

  @all_taxa.each do |k, v|
    next unless v[:scientific_name] && v[:scientific_name].strip != ""

    parent_id = v[:parent_id]
    until (@all_taxa[parent_id] &&
            @all_taxa[parent_id][:scientific_name]) || parent_id.nil?
      puts "did not find parent %s" % parent_id
      parent_id = @all_taxa[parent_id]
    end
    parent_id = nil if v[:id] == parent_id
    v[:parent_id] = parent_id
    @cleaned_taxa[k] = v
  end
end
run_query(q) click to toggle source
# File lib/dwca_hunter/resources/freebase.rb, line 60
def run_query(q)
  count = 0
  requests_num = 0
  loop do
    freebase_url = "http://api.freebase.com/api/service/mqlread?query=%s" %
                   URI.encode(q.to_json)
    res = JSON.load RestClient.get(freebase_url)
    requests_num += 1
    break if res["result"].nil? || res["result"].empty?

    if requests_num % 10 == 0
      DwcaHunter.logger_write(object_id,
                              "Received %s names" % count)
    end
    count += res["result"].size
    res["result"].each { |d| @data << d }
    q[:cursor] = res["cursor"]
  end
end