class DwcaHunter::ResourcePaleoBioDb
Constants
- OCCURANCE_URL
- REFS_URL
- TAXA_REFS_URL
- TAXA_URL
- URLS
Public Class Methods
new(opts = {})
click to toggle source
Calls superclass method
DwcaHunter::Resource::new
# File lib/dwca_hunter/resources/paleobiodb.rb, line 23
# Configures the Paleobiology Database resource: command name, title, UUID,
# download path and the empty accumulators used while building the archive,
# then passes +opts+ on to the superclass constructor.
#
# @param opts [Hash] options forwarded unchanged to DwcaHunter::Resource
def initialize(opts = {})
  # opts = {download: false}
  @command = "paleodb"
  @title = "The Paleobiology Database"
  # Stored in @uuid (lower case): generate_dwca reads @uuid for the EML id,
  # so the previous @UUID spelling was never picked up there.
  @uuid = "fad9970e-c358-4e1b-8cc3-f9ad2582751f"
  @download_path = File.join(Dir.tmpdir,
                             "dwca_hunter",
                             "paleobiodb",
                             "fake.csv")
  @synonyms = []
  @names = []
  @vernaculars = []
  @extensions = []
  @synonyms_hash = {}
  @vernaculars_hash = {}
  super(opts)
end
Public Instance Methods
download()
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 40
# Fetches every entry of URLS and stores each response as "<key>.txt" inside
# @download_dir, then calls remove_header_text.
#
# @return whatever remove_header_text returns
def download
  puts "Downloading from original."
  URLS.each do |k, v|
    path = File.join(@download_dir, "#{k}.txt")
    # Block form guarantees the handle is closed even when the request or the
    # write raises; previously the file stayed open on any failure.
    File.open(path, "w:utf-8") do |f|
      puts "Getting #{k}"
      data = RestClient::Request.execute(method: :get, url: v, timeout: 600)
      f.write(data)
    end
  end
  remove_header_text
end
make_dwca()
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 55
# Runs the PaleodbHarvester steps over @download_dir, loads the resulting
# "json/taxa.json" and "json/name_id.json" files (keys symbolized) into
# @taxa_json and @name_id_json, collects the names and builds the archive.
#
# @return whatever generate_dwca returns
def make_dwca
  DwcaHunter.logger_write(object_id, "Extracting data")

  extractor = PaleodbHarvester.new(@download_dir)
  extractor.taxa
  extractor.refs
  extractor.taxa_refs
  extractor.occurences

  # Reads one harvested JSON file from the "json" subdirectory.
  load_json = lambda do |file_name|
    raw = File.read(File.join(@download_dir, "json", file_name))
    JSON.parse(raw, symbolize_names: true)
  end

  @taxa_json = load_json.call("taxa.json")
  @name_id_json = load_json.call("name_id.json")

  get_names
  generate_dwca
end
unpack()
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 53
# No-op: the body is empty, so the call returns nil.
def unpack
end
Private Instance Methods
append_accepted_species(row)
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 102
# Appends an accepted-species record for +row+ to @names. The record points
# at itself as the accepted name (acc_id equals id) and carries the class,
# order, family and genus returned by classification.
#
# @param row [Hash] taxon record; :id, :name and :auth are read here
# @return [Array] @names after the append
def append_accepted_species(row)
  lineage = classification({}, row)
  taxon_id = row[:id]
  @names << {
    id: taxon_id,
    acc_id: taxon_id,
    klass: lineage[:class],
    order: lineage[:order],
    family: lineage[:family],
    genus: lineage[:genus],
    name: row[:name],
    auth: row[:auth]
  }
end
append_synonyms(row)
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 117
# Appends a synonym record for +row+ to @names, using the id pair resolved by
# synonymId.
#
# Rows whose ids cannot be resolved are skipped: synonymId returns nil in
# that case, and appending anyway would put a record with neither an id nor
# an accepted id into @names.
#
# @param row [Hash] taxon record; :name and :auth are read here
# @return [Array, nil] @names after the append, or nil when the row is skipped
def append_synonyms(row)
  ids = synonymId(row)
  return if ids.nil?

  id, acc_id = ids
  @names << { id: id, name: row[:name], auth: row[:auth], acc_id: acc_id }
end
classification(data, row)
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 137
# Walks the parent chain of +row+ through @taxa_json and records each
# ancestor's name under its rank (as a symbol key). When two ancestors share
# a rank, the one closest to +row+ wins.
#
# The walk stops when a taxon has no parent_id, is its own parent, points at
# a parent missing from @taxa_json, or points back at a taxon that was
# already visited (so a cyclic parent chain cannot loop forever).
#
# @param data [Hash] ignored; kept only so existing callers keep working
#   (the result is always built in a fresh hash)
# @param row [Hash] taxon record to start from; :id and :parent_id are read
# @return [Hash{Symbol => Object}] rank => ancestor name
def classification(data, row)
  lineage = {}
  # Keyed by stringified id; the starting taxon counts as visited so it is
  # never added to its own lineage.
  visited = { row[:id].to_s => true }
  current = row

  loop do
    parent_id = current[:parent_id]
    break if parent_id.nil? || parent_id == current[:id]

    parent = @taxa_json[parent_id.to_s.to_sym]
    break if parent.nil? || visited[parent_id.to_s]

    visited[parent_id.to_s] = true
    rank = parent[:rank]
    # Ancestors without a rank cannot be filed under a key; skip them but
    # keep climbing.
    lineage[rank.to_sym] ||= parent[:name] if rank
    current = parent
  end

  lineage
end
generate_dwca()
click to toggle source
Calls superclass method
DwcaHunter::Resource#generate_dwca
# File lib/dwca_hunter/resources/paleobiodb.rb, line 158
# Builds the core table (@core) and the EML metadata (@eml) from @names and
# then delegates to the superclass implementation.
#
# @core holds a header row of Darwin Core term URIs followed by one row per
# entry of @names. Each data row carries exactly one value per header column,
# in header order.
#
# @return whatever the superclass generate_dwca returns
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
  @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
            "http://rs.tdwg.org/dwc/terms/class",
            "http://rs.tdwg.org/dwc/terms/order",
            "http://rs.tdwg.org/dwc/terms/family",
            "http://rs.tdwg.org/dwc/terms/genus",
            "http://rs.tdwg.org/dwc/terms/nomenclaturalCode"]]
  @names.each do |n|
    name_string = "#{n[:name]} #{n[:auth]}".strip
    # Rows used to include n[:kingdom] and n[:phylum] ahead of n[:klass],
    # although the header has no such columns; that shifted every
    # classification value two columns to the right of its term.
    @core << [n[:id], name_string, n[:acc_id],
              n[:klass], n[:order], n[:family], n[:genus], n[:code]]
  end
  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      { email: "admin@paleobiodb.org" }
    ],
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    abstract: "The Paleobiology Database (PBDB) is a non-governmental, " \
              "non-profit public resource for paleontological data. " \
              "It has been organized and operated by a multi-disciplinary, " \
              "multi-institutional, international group of paleobiological " \
              "researchers. Its purpose is to provide global, " \
              "collection-based occurrence and taxonomic data for organisms " \
              "of all geological ages, as well data services to allow easy " \
              "access to data for independent development of analytical " \
              "tools, visualization software, and applications of all " \
              "types. The Database’s broader goal is to encourage and " \
              "enable data-driven collaborative efforts that address " \
              "large-scale paleobiological questions.",
    url: @url
  }
  super
end
get_names()
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 90
# Splits the species records into accepted names and synonyms (via species)
# and appends each group to @names through append_accepted_species and
# append_synonyms. A progress line is printed for every 5000th record of
# each group, starting with record 0.
#
# @return [Array] the synonym records that were iterated
def get_names
  accepted, synonyms = species

  accepted.each_with_index do |taxon, position|
    puts format("Processing %s species", position) if position.modulo(5000) == 0
    append_accepted_species(taxon)
  end

  synonyms.each_with_index do |taxon, position|
    puts format("Processing %s synonyms", position) if position.modulo(5000) == 0
    append_synonyms(taxon)
  end
end
remove_header_text()
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 74
# For every key of URLS, copies "<key>.txt" in @download_dir to "<key>.csv",
# dropping everything up to and including the first line that contains
# "Records:" (with the double quotes). Only the lines after that marker are
# written; if the marker never appears the .csv file is left empty.
#
# @return [Object] URLS (the receiver of each)
def remove_header_text
  URLS.each do |k, _v|
    source = File.join(@download_dir, "#{k}.txt")
    target = File.join(@download_dir, "#{k}.csv")
    # Block forms close both files when done; previously neither handle was
    # closed, so buffered output was not guaranteed to be on disk when the
    # method returned.
    File.open(target, "w:utf-8") do |fout|
      csv_started = false
      File.foreach(source) do |l|
        if csv_started
          fout.write(l)
        elsif l =~ /"Records:"/
          # The marker line itself is not copied.
          csv_started = true
        end
      end
    end
  end
end
species()
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 151
# Picks the records of @taxa_json whose :rank is "species" and splits them in
# two: first those whose :name equals :acc_name or whose :acc_id is nil, then
# all remaining ones.
#
# @return [Array(Array<Hash>, Array<Hash>)] the two groups, in that order
def species
  species_rows = @taxa_json.each_value.select { |taxon| taxon[:rank] == "species" }
  species_rows.partition do |taxon|
    taxon[:acc_id].nil? || taxon[:name] == taxon[:acc_name]
  end
end
synonymId(row)
click to toggle source
# File lib/dwca_hunter/resources/paleobiodb.rb, line 128
# Resolves the [id, accepted id] pair for a synonym +row+. When the row's own
# id equals its :acc_id, the accepted id is looked up in @name_id_json by the
# row's :acc_name instead.
#
# Any StandardError raised along the way (for example a missing entry in
# @name_id_json) is rescued: a message is printed and nil is returned.
#
# @param row [Hash] taxon record; :id, :acc_id and :acc_name are read
# @return [Array, nil] [id, accepted id], or nil when resolution failed
def synonymId(row)
  taxon_id = row[:id]
  accepted_id = row[:acc_id]
  return [taxon_id, accepted_id] unless taxon_id == accepted_id

  [taxon_id, @name_id_json[row[:acc_name].to_sym][:id]]
rescue StandardError
  puts "Unable to get synonymId"
end