class DwcaHunter::ResourceNCBI
Public Class Methods
new(opts = {})
click to toggle source
Calls superclass method
DwcaHunter::Resource::new
# File lib/dwca_hunter/resources/ncbi.rb, line 5
# Prepares the NCBI resource: records its identity, where the taxonomy dump
# is fetched from and stored, and the empty working containers, then passes
# control to the superclass initializer.
#
# @param opts [Hash] options handed unchanged to the superclass through the
#   bare +super+ call
def initialize(opts = {})
  # Identity of the resource.
  @command = "ncbi"
  @title = "National Center for Biotechnology Information"
  @uuid = "97d7633b-5f79-4307-a397-3c29402d9311"

  # Remote archive and the local file it is downloaded to.
  @url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
  tmp_root = Dir.tmpdir
  @download_path = File.join(tmp_root, "dwca_hunter", "ncbi", "data.tar.gz")

  # Name types that get_classification does not treat as synonyms; every
  # one of them except "valid" is reported as a vernacular name.
  @collected_names = ["genbank common name", "common name", "valid"]

  # Working state filled while the dump files are processed.
  @names = {}
  @data, @core, @extensions = [], [], []

  # All instance variables are in place before the superclass runs.
  super
end
Public Instance Methods
make_dwca()
click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 26
# Runs the whole conversion: resolves the dump file paths, reads the names,
# builds the classification and finally writes the archive.
#
# @return whatever +generate_dwca+ returns
def make_dwca
  # The preparatory steps run strictly in this order; each is a private
  # method of this class, hence +send+.
  %i[set_vars get_names get_classification].each { |step| send(step) }
  generate_dwca
end
unpack()
click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 22
# Unpacks the downloaded archive by delegating to +unpack_tar+ (defined
# outside this class body).
#
# @return whatever +unpack_tar+ returns
def unpack
  unpack_tar
end
Private Instance Methods
generate_dwca()
click to toggle source
Calls superclass method
DwcaHunter::Resource#generate_dwca
# File lib/dwca_hunter/resources/ncbi.rb, line 106
# Turns the records collected in @data into the tables of a DarwinCore
# Archive and then lets the superclass implementation finish the job.
#
# Fills three instance variables:
# * @core       - header row of term URIs followed by one row per record
# * @extensions - gains two entries, "vernacular_names.txt" and
#                 "synonyms.txt", each with a header row and its data rows
# * @eml        - metadata describing the resource
#
# Each record in @data is read through the keys :id, :scientificName,
# :parentNameUsageId, :taxonRank, :vernacularNames and :synonyms.
#
# @return whatever the superclass +generate_dwca+ returns
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
  # All four columns are Darwin Core terms, so they live in the TDWG
  # namespace and use the canonical "ID" capitalisation.
  @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
            "http://rs.tdwg.org/dwc/terms/taxonRank"]]
  DwcaHunter.logger_write(object_id, "Assembling Core Data")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} core data record")
    end
    @core << [d[:id], d[:scientificName], d[:parentNameUsageId],
              d[:taxonRank]]
  end

  # Keep direct references to the two new extensions so rows land in the
  # right table even when @extensions already holds other entries.
  vernaculars = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/vernacularName"]],
    file_name: "vernacular_names.txt"
  }
  synonyms = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
    file_name: "synonyms.txt"
  }
  @extensions << vernaculars << synonyms

  DwcaHunter.logger_write(object_id,
                          "Creating vernacular name " \
                          "extension for DarwinCore Archive file")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} extension data record")
    end
    d[:vernacularNames].each do |vn|
      vernaculars[:data] << [d[:id], vn]
    end
    d[:synonyms].each do |synonym|
      synonyms[:data] << [d[:id], synonym[:scientificName],
                          synonym[:taxonomicStatus]]
    end
  end

  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ url: "http://www.ncbi.org" }],
    abstract: "The National Center for Biotechnology Information " \
              "advances science and health by providing access to " \
              "biomedical and genomic information.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@mbl.edu" }
    ],
    url: @url
  }
  super
end
get_classification()
click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 65
# Reads the nodes dump (@nodes_file) line by line and appends one record to
# @data for every node that has a "valid" name in @names.
#
# Each line is split on "|" and every field is passed through +cleanup+
# (defined outside this view; presumably it trims the field - confirm).
# The first three fields are used as id, parent id and rank.
#
# The file is opened with the block form of File.open so the handle is
# closed when reading finishes or fails.
def get_classification
  DwcaHunter.logger_write(object_id, "Building classification...")
  File.open(@nodes_file, "r:utf-8") do |nodes|
    nodes.each_line.with_index do |line, i|
      if i > 0 && (i % BATCH_SIZE).zero?
        DwcaHunter.logger_write(object_id, "Collected %s nodes..." % i)
      end
      id, parent_tax_id, rank = line.split("|").map { |l| cleanup(l) }
      # NOTE(review): the comparisons with Integer 1 below only match if
      # cleanup turns numeric fields into Integers - confirm.
      next if id == 1

      names = @names[id]
      next unless names && names["valid"]

      rank = "" if rank == "no rank"
      parent_tax_id = nil if parent_tax_id == 1

      # Every collected name type except "valid" is a vernacular name;
      # order follows the order the types were stored in @names.
      vernacular_names = names.flat_map do |name_type, values|
        vernacular = name_type != "valid" &&
                     @collected_names.include?(name_type)
        vernacular ? values : []
      end

      # NOTE(review): names of all other types are not exported; records
      # are always emitted with an empty synonym list, as before.
      @data << { id: id,
                 scientificName: names["valid"][0],
                 parentNameUsageId: parent_tax_id,
                 taxonRank: rank,
                 taxonomicStatus: "valid",
                 vernacularNames: vernacular_names,
                 synonyms: [] }
    end
  end
end
get_names()
click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 40
# Reads the names dump (@names_file) line by line and fills @names, a hash
# of the form { id => { name_type => [name, ...] } }.
#
# Each line is split on "|" and every field is passed through +cleanup+
# (defined outside this view; presumably it trims the field - confirm).
# Field 0 is the id, field 1 the name and field 3 the name type; the type
# "scientific name" is stored under the key "valid".
#
# Names are normalised before storing: quotes wrapped around a word or
# phrase are removed (keeping the space or hyphen that follows) and runs of
# whitespace are collapsed to a single space.
#
# The file is read as UTF-8, like the nodes dump, and is opened with the
# block form of File.open so the handle is always closed.
def get_names
  DwcaHunter.logger_write(object_id, "Collecting names...")
  File.open(@names_file, "r:utf-8") do |names|
    names.each_line.with_index do |line, i|
      if i > 0 && (i % BATCH_SIZE).zero?
        DwcaHunter.logger_write(object_id, "Collected %s names..." % i)
      end
      fields = line.split("|").map { |l| cleanup(l) }
      id = fields[0]
      # NOTE(review): only matches if cleanup turns numeric fields into
      # Integers - confirm.
      next if id == 1

      name = fields[1]
      name_type = fields[3]
      name_type = "valid" if name_type == "scientific name"

      # A missing or non-string name cannot be normalised; report and skip
      # it instead of relying on a rescued NoMethodError.
      unless name.respond_to?(:gsub)
        puts "wrong name: %s" % name
        next
      end

      # Groups: 1 = leading boundary, 2 = quote char, 3 = quoted text,
      # 4 = trailing space/hyphen/end. The trailing separator (group 4)
      # must be kept, otherwise adjacent words are glued together.
      name = name.gsub(/(^|\s)('|")(.*?)\2(\s|-|$)/, '\1\3\4').
             gsub(/\s+/, " ")

      by_type = (@names[id] ||= {})
      (by_type[name_type] ||= []) << name
    end
  end
end
set_vars()
click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 35
# Records where the two dump files are expected inside @download_dir:
# "names.dmp" in @names_file and "nodes.dmp" in @nodes_file.
#
# @return [String] the nodes file path (value of the last assignment)
def set_vars
  in_download_dir = ->(file_name) { File.join(@download_dir, file_name) }
  @names_file = in_download_dir.call("names.dmp")
  @nodes_file = in_download_dir.call("nodes.dmp")
end