class DwcaHunter::ResourceNCBI

Public Class Methods

new(opts = {}) click to toggle source
Calls superclass method DwcaHunter::Resource::new
# File lib/dwca_hunter/resources/ncbi.rb, line 5
# Configures the NCBI resource: command name, title, UUID, source URL and
# local download path, plus the empty containers that the other methods of
# this class fill in, and then passes the options to the superclass
# constructor.
#
# @param opts [Hash] options forwarded unchanged to the superclass
#   initializer by the bare +super+ call
def initialize(opts = {})
  # Identity of the resource.
  @command = "ncbi"
  @title = "National Center for Biotechnology Information"
  @uuid = "97d7633b-5f79-4307-a397-3c29402d9311"

  # Remote location of the dump and the local path used for the download.
  @url = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
  @download_path = File.join(Dir.tmpdir, "dwca_hunter", "ncbi", "data.tar.gz")

  # Name types that get_classification does not treat as synonyms.
  @collected_names = ["genbank common name", "common name", "valid"]

  # Working containers, populated while the archive is assembled.
  @names = {}
  @data = []
  @core = []
  @extensions = []
  super
end

Public Instance Methods

make_dwca() click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 26
# Runs the conversion pipeline: set_vars, get_names and get_classification
# are executed in that order, then control is handed to generate_dwca.
#
# @return whatever generate_dwca returns
def make_dwca
  # Preparation steps; their order matters because each one relies on the
  # instance state set up by the previous one.
  %i[set_vars get_names get_classification].each { |step| send(step) }
  generate_dwca
end
unpack() click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 22
# Unpacks the downloaded archive by delegating to unpack_tar, which is not
# defined in this class (presumably inherited from DwcaHunter::Resource --
# confirm).
def unpack
  unpack_tar
end

Private Instance Methods

generate_dwca() click to toggle source
Calls superclass method DwcaHunter::Resource#generate_dwca
# File lib/dwca_hunter/resources/ncbi.rb, line 106
# Assembles the DarwinCore Archive content from @data and then hands over
# to the superclass implementation.
#
# Steps:
# * @core is replaced by a header row followed by one row per @data record;
# * a vernacular-name table and a synonym table are appended to
#   @extensions and filled from each record's :vernacularNames and
#   :synonyms;
# * @eml is set to the metadata of the resource.
#
# NOTE(review): several header URIs below do not look like canonical Darwin
# Core term URIs ("purl.org/dc/terms/scientificName", "taxonId" versus
# "TaxonID"). They are kept verbatim because consumers of the archive may
# depend on them -- confirm before normalizing.
#
# @return whatever the superclass generate_dwca returns
def generate_dwca
  DwcaHunter.logger_write(object_id, "Creating DarwinCore Archive file")
  @core = [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://purl.org/dc/terms/scientificName",
            "http://purl.org/dc/terms/parentNameUsageId",
            "http://purl.org/dc/terms/taxonRank"]]
  DwcaHunter.logger_write(object_id, "Assembling Core Data")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id, "Traversing #{count} core " \
                              "data record")
    end
    @core << [d[:id],
              d[:scientificName],
              d[:parentNameUsageId],
              d[:taxonRank]]
  end

  # Direct references to the two tables are kept so that rows land in the
  # right one even when @extensions already holds other entries (positional
  # access through @extensions[0] / @extensions[1] would not).
  vernaculars = {
    data: [["http://rs.tdwg.org/dwc/terms/TaxonID",
            "http://rs.tdwg.org/dwc/terms/vernacularName"]],
    file_name: "vernacular_names.txt"
  }
  synonyms = {
    data: [["http://rs.tdwg.org/dwc/terms/taxonId",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/taxonomicStatus"]],
    file_name: "synonyms.txt"
  }
  @extensions << vernaculars << synonyms

  DwcaHunter.logger_write(object_id, "Creating vernacular name " \
                          "extension for DarwinCore Archive file")
  @data.each.with_index(1) do |d, count|
    if (count % BATCH_SIZE).zero?
      DwcaHunter.logger_write(object_id,
                              "Traversing #{count} extension data record")
    end
    d[:vernacularNames].each do |vn|
      vernaculars[:data] << [d[:id], vn]
    end

    d[:synonyms].each do |synonym|
      synonyms[:data] << [d[:id],
                          synonym[:scientificName],
                          synonym[:taxonomicStatus]]
    end
  end
  @eml = {
    id: @uuid,
    title: @title,
    authors: [{ url: "http://www.ncbi.org" }],
    abstract: "The National Center for Biotechnology Information " \
              "advances science and health by providing access to " \
              "biomedical and genomic information.",
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@mbl.edu" }
    ],
    url: @url
  }
  super
end
get_classification() click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 65
# Reads @nodes_file line by line and appends one record to @data for every
# node that has a "valid" name in @names.
#
# Each line is split on "|" and every field is passed through cleanup (not
# defined in this class). The first three fields are used as id, parent id
# and rank. Records whose id equals Integer 1 are skipped and a parent id
# equal to Integer 1 is stored as nil -- this presumably relies on cleanup
# turning numeric fields into Integers; confirm. The rank "no rank" is
# stored as an empty string.
#
# NOTE(review): names whose type is not listed in @collected_names are not
# exported; every record gets an empty :synonyms list, exactly as before.
def get_classification
  DwcaHunter.logger_write(object_id, "Building classification...")
  # The block form guarantees the file is closed once it has been read.
  File.open(@nodes_file, "r:utf-8") do |nodes|
    nodes.each_with_index do |line, i|
      if i > 0 && (i % BATCH_SIZE).zero?
        DwcaHunter.logger_write(object_id, "Collected #{i} nodes...")
      end
      id, parent_tax_id, rank = line.split("|").map { |field| cleanup(field) }
      next if id == 1

      names = @names[id]
      next unless names && names["valid"]

      rank = "" if rank == "no rank"
      parent_tax_id = nil if parent_tax_id == 1

      # Vernacular names are all collected name types except the valid
      # scientific name itself, in the order they were read.
      vernacular_names = names.each_with_object([]) do |(name_type, values), found|
        next if name_type == "valid" || !@collected_names.include?(name_type)

        found.concat(values)
      end

      @data << {
        id: id,
        scientificName: names["valid"][0],
        parentNameUsageId: parent_tax_id,
        taxonRank: rank,
        taxonomicStatus: "valid",
        vernacularNames: vernacular_names,
        synonyms: []
      }
    end
  end
end
get_names() click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 40
# Reads @names_file line by line and groups the names by id and name type
# in @names, giving { id => { name_type => [name, ...] } }.
#
# Each line is split on "|" and every field is passed through cleanup (not
# defined in this class). Field 0 is the id, field 1 the name and field 3
# the name type. The type "scientific name" is stored as "valid". Records
# whose id equals Integer 1 are skipped -- this presumably relies on cleanup
# turning numeric fields into Integers; confirm.
#
# Quotes that wrap a whole token (a quote after the start of the name or
# whitespace, with its closing quote followed by whitespace, a hyphen or the
# end of the name) are removed, and runs of whitespace are squeezed into a
# single space. The delimiter after the closing quote is matched with a
# lookahead so it stays in the name.
#
# A name that cannot be processed as a string (for example a missing field)
# is reported on standard output and skipped.
def get_names
  DwcaHunter.logger_write(object_id, "Collecting names...")
  # The block form guarantees the file is closed once it has been read; the
  # encoding matches the one used for the nodes file.
  File.open(@names_file, "r:utf-8") do |names|
    names.each_with_index do |line, i|
      if i > 0 && (i % BATCH_SIZE).zero?
        DwcaHunter.logger_write(object_id, "Collected #{i} names...")
      end
      fields = line.split("|").map { |field| cleanup(field) }
      id = fields[0]
      next if id == 1

      name = fields[1]
      name_type = fields[3]
      name_type = "valid" if name_type == "scientific name"
      unless name.respond_to?(:gsub)
        puts "wrong name: #{name}"
        next
      end

      name = name.gsub(/(^|\s)(['"])(.*?)\2(?=\s|-|$)/, '\1\3').
             gsub(/\s+/, " ")
      by_type = (@names[id] ||= {})
      (by_type[name_type] ||= []) << name
    end
  end
end
set_vars() click to toggle source
# File lib/dwca_hunter/resources/ncbi.rb, line 35
# Derives the paths of the two dump files read by this class, names.dmp and
# nodes.dmp, both located directly inside @download_dir.
#
# @return [String] the path assigned to @nodes_file
def set_vars
  in_download_dir = ->(file_name) { File.join(@download_dir, file_name) }
  @names_file = in_download_dir.call("names.dmp")
  @nodes_file = in_download_dir.call("nodes.dmp")
end