class DwcaHunter::ResourceITIS

Public Class Methods

new(opts = {}) click to toggle source
Calls superclass method DwcaHunter::Resource::new
# File lib/dwca_hunter/resources/itis.rb, line 5
# Configures the ITIS resource and prepares empty lookup tables.
#
# All resource metadata and caches are assigned before delegating to the
# superclass initializer; @itis_dir is derived afterwards from
# @download_dir (presumably set by the superclass -- confirm there).
#
# @param opts [Hash] options forwarded unchanged to the superclass
def initialize(opts = {})
  # Resource identity and download location.
  @command = "itis"
  @title = "Integrated Taxonomic Information SystemITIS"
  @uuid = "5d066e84-e512-4a2f-875c-0a605d3d9f35"
  @url = "https://www.itis.gov/downloads/itisMySQLTables.tar.gz"
  @download_path = File.join(Dir.tmpdir, *%w[dwca_hunter itis data.tar.gz])

  # In-memory tables filled by the get_* loaders.
  @ranks = {}
  @kingdoms = {}
  @authors = {}
  @vernaculars = {}
  @synonyms = {}
  @synonym_of = {}
  @names = {}

  # Extension tables appended to by generate_dwca.
  @extensions = []

  super(opts)
  @itis_dir = File.join(@download_dir, "itis")
end

Public Instance Methods

make_dwca() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 35
# Loads every ITIS table into memory, then builds the archive.
#
# The loaders run in a fixed order: get_names looks up entries in the
# author and rank tables, so those must be populated first.
#
# @return whatever generate_dwca returns
def make_dwca
  DwcaHunter.logger_write(object_id, "Extracting data")

  loaders = %i[
    get_ranks
    get_kingdoms
    get_authors
    get_vernaculars
    get_synonyms
    get_names
  ]
  loaders.each { |loader| send(loader) }

  generate_dwca
end
unpack() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 26
# Extracts the downloaded tarball and moves the extracted directory
# (the first entry in @download_dir whose name contains "itisMySQL")
# to @itis_dir.
#
# unpack_tar is defined outside this block; it is assumed to extract the
# archive into @download_dir -- confirm in the superclass.
def unpack
  unpack_tar

  extracted = Dir.entries(@download_dir).find { |entry| entry.match(/itisMySQL/) }
  source = File.join(@download_dir, extracted)
  FileUtils.mv(source, @itis_dir)

  # Leave an empty marker file named after the directory we extracted.
  marker = File.join(@itis_dir, "version_" + extracted)
  FileUtils.touch(marker)
end

Private Instance Methods

generate_dwca() click to toggle source
Calls superclass method DwcaHunter::Resource#generate_dwca
# File lib/dwca_hunter/resources/itis.rb, line 215
# Builds the DarwinCore Archive tables from the in-memory ITIS data and
# hands over to the superclass implementation.
#
# Populates:
# * @core       -- header row plus one row per entry in @names
# * @extensions -- appends a vernacular-names extension with a header row
#                  plus one row per entry in @vernaculars
# * @eml        -- dataset metadata
#
# Vernacular rows are appended to the extension created here rather than
# to @extensions[0], so they cannot end up in an unrelated extension when
# @extensions already holds entries.
def generate_dwca
  DwcaHunter.logger_write(object_id,
                          "Creating DarwinCore Archive file")
  @core = [["http://rs.tdwg.org/dwc/terms/taxonID",
            "http://rs.tdwg.org/dwc/terms/parentNameUsageID",
            "http://rs.tdwg.org/dwc/terms/acceptedNameUsageID",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/ontology/voc/TaxonName#nameComplete",
            "http://rs.tdwg.org/dwc/terms/taxonomicStatus",
            "http://rs.tdwg.org/dwc/terms/taxonRank"]]

  vernacular_rows = [["http://rs.tdwg.org/dwc/terms/taxonID",
                      "http://rs.tdwg.org/dwc/terms/vernacularName",
                      "http://purl.org/dc/terms/language"]]
  @extensions << { data: vernacular_rows,
                   file_name: "vernacular_names.txt",
                   row_type: "http://rs.gbif.org/terms/1.0/VernacularName" }

  @names.each do |tsn, d|
    # A missing or zero parent TSN is emitted as an empty parent id.
    parent_id = d[:parent_tsn].to_i.zero? ? nil : d[:parent_tsn]
    # @synonyms maps a synonym TSN to its accepted TSN; nil when the name
    # has no entry there.
    @core << [tsn, parent_id, @synonyms[tsn], d[:name],
              d[:canonical_name], d[:status], d[:rank]]
  end

  @vernaculars.each do |tsn, d|
    vernacular_rows << [tsn, d[:name], d[:language]]
  end

  @eml = {
    id: @uuid,
    title: @title,
    authors: [
      { email: "itiswebmaster@itis.gov" }
    ],
    metadata_providers: [
      { first_name: "Dmitry",
        last_name: "Mozzherin",
        email: "dmozzherin@gmail.com" }
    ],
    abstract: "The White House Subcommittee on Biodiversity and " \
                "Ecosystem Dynamics has identified systematics as a " \
                "research priority that is fundamental to ecosystem " \
                "management and biodiversity conservation. This primary " \
                "need identified by the Subcommittee requires " \
                "improvements in the organization of, and access to, " \
                "standardized nomenclature. ITIS (originally referred " \
                "to as the Interagency Taxonomic Information System) " \
                "was designed to fulfill these requirements. In the " \
                "future, the ITIS will provide taxonomic data and a " \
                "directory of taxonomic expertise that will support " \
                "the system",
    url: "http://www.itis.gov"
  }
  super
end
get_authors() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 79
# Loads the pipe-delimited "taxon_authors_lkp" table from @itis_dir into
# @authors, mapping author id (column 0) to author string (column 1).
#
# The file is opened in block form so the handle is closed as soon as
# reading finishes, even if a row raises.
def get_authors
  # 0    taxon_author_id serial not null
  # 1    taxon_author varchar(100,30) not null
  # 2    update_date date not null
  # 3    kingdom_id smallint not null

  File.open(File.join(@itis_dir, "taxon_authors_lkp")) do |f|
    f.each_line do |l|
      # Reinterpret the raw bytes as ISO-8859-1 and transcode to UTF-8.
      l.encode!("UTF-8",
                "ISO-8859-1",
                invalid: :replace,
                replace: "?")
      author_id, author = l.strip.split("|")
      @authors[author_id.strip] = author.strip
    end
  end
end
get_kingdoms() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 67
# Loads the pipe-delimited "kingdoms" table from @itis_dir into
# @kingdoms, mapping kingdom id (column 0) to kingdom name (column 1).
#
# The file is opened in block form so the handle is closed as soon as
# reading finishes, even if a row raises.
def get_kingdoms
  # 0    kingdom_id serial not null
  # 1    kingdom_name char(10) not null
  # 2    update_date date not null

  File.open(File.join(@itis_dir, "kingdoms")) do |f|
    f.each_line do |l|
      kingdom_id, kingdom_name = l.strip.split("|")
      @kingdoms[kingdom_id.strip] = kingdom_name.strip
    end
  end
end
get_names() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 146
# Loads the pipe-delimited "taxonomic_units" table from @itis_dir into
# @names, keyed by TSN (column 0).
#
# Each entry holds:
# * :name           -- name parts joined with single spaces, followed by
#                      the author string when @authors has the author id
# * :canonical_name -- the same name parts without the author
# * :status         -- the usage column
# * :parent_tsn     -- parent TSN, or nil when the column is empty
# * :rank           -- looked up in @ranks by "kingdom_id/rank_id",
#                      "" when there is no match
#
# Progress is logged every BATCH_SIZE rows. The file is opened in block
# form so the handle is closed as soon as reading finishes, even if a
# row raises.
def get_names
  # 0    tsn serial not null
  # 1    unit_ind1 char(1)
  # 2    unit_name1 char(35) not null
  # 3    unit_ind2 char(1)
  # 4    unit_name2 varchar(35)
  # 5    unit_ind3 varchar(7)
  # 6    unit_name3 varchar(35)
  # 7    unit_ind4 varchar(7)
  # 8    unit_name4 varchar(35)
  # 9    unnamed_taxon_ind char(1)
  # 10   usage varchar(12,5) not null
  # 11   unaccept_reason varchar(50,9)
  # 12   credibility_rtng varchar(40,17) not null
  # 13   completeness_rtng char(10)
  # 14   currency_rating char(7)
  # 15   phylo_sort_seq smallint
  # 16   initial_time_stamp datetime year to second not null
  # 17   parent_tsn integer
  # 18   taxon_author_id integer
  # 19   hybrid_author_id integer
  # 20   kingdom_id smallint not null
  # 21   rank_id smallint not null
  # 22   update_date date not null
  # 23   uncertain_prnt_ind char(3)

  # Joins name parts and collapses runs of whitespace left by empty parts.
  normalize = ->(parts) { parts.join(" ").strip.gsub(/\s+/, " ") }

  File.open(File.join(@itis_dir, "taxonomic_units")) do |f|
    f.each_line.with_index do |l, i|
      if (i % BATCH_SIZE).zero?
        DwcaHunter.logger_write(object_id,
                                format("Extracted %s names", i))
      end
      # Reinterpret the raw bytes as ISO-8859-1 and transcode to UTF-8.
      l.encode!("UTF-8",
                "ISO-8859-1",
                invalid: :replace,
                replace: "?")
      data = l.split("|").map(&:strip)

      name_tsn = data[0]
      # Columns 1..8: alternating indicators/markers and name units.
      name_parts = data.values_at(1, 2, 3, 4, 5, 6, 7, 8)
      status, parent_tsn, author_id, kingdom_id, rank_id =
        data.values_at(10, 17, 18, 20, 21)

      parent_tsn = nil if parent_tsn == ""
      author = @authors[author_id]
      full_parts = author ? name_parts + [author] : name_parts

      @names[name_tsn] = { name: normalize.call(full_parts),
                           canonical_name: normalize.call(name_parts),
                           status: status,
                           parent_tsn: parent_tsn,
                           rank: @ranks[kingdom_id + "/" + rank_id] || "" }
    end
  end
end
get_ranks() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 48
# Loads the pipe-delimited "taxon_unit_types" table from @itis_dir into
# @ranks, mapping "kingdom_id/rank_id" (columns 0 and 1) to the rank
# name (column 2).
#
# The file is opened in block form so the handle is closed as soon as
# reading finishes, even if a row raises.
def get_ranks
  # 0    kingdom_id integer not null
  # 1    rank_id smallint not null
  # 2    rank_name char(15) not null
  # 3    dir_parent_rank_id smallint not null
  # 4    req_parent_rank_id smallint not null
  # 5    update_date date not null
  rank_file = File.join(@itis_dir, "taxon_unit_types")
  File.open(rank_file, "r:utf-8") do |f|
    f.each_line do |l|
      # Reinterpret the raw bytes as ISO-8859-1 and transcode to UTF-8.
      l.encode!("UTF-8",
                "ISO-8859-1",
                invalid: :replace,
                replace: "?")
      kingdom_id, rank_id, rank_name = l.strip.split("|")
      @ranks[kingdom_id.strip + "/" + rank_id.strip] = rank_name.strip
    end
  end
end
get_synonyms() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 124
# Loads the pipe-delimited "synonym_links" table from @itis_dir into
# @synonyms, mapping a synonym TSN (column 0) to its accepted TSN
# (column 1).
#
# Progress is logged every BATCH_SIZE rows. The file is opened in block
# form so the handle is closed as soon as reading finishes, even if a
# row raises.
def get_synonyms
  # 0    tsn integer not null
  # 1    tsn_accepted integer not null
  # 2    update_date date not null

  File.open(File.join(@itis_dir, "synonym_links")) do |f|
    f.each_line.with_index do |l, i|
      if (i % BATCH_SIZE).zero?
        DwcaHunter.logger_write(object_id,
                                format("Extracted %s synonyms", i))
      end
      # Reinterpret the raw bytes as ISO-8859-1 and transcode to UTF-8.
      l.encode!("UTF-8",
                "ISO-8859-1",
                invalid: :replace,
                replace: "?")
      synonym_name_tsn, accepted_name_tsn = l.split("|").map(&:strip)
      @synonyms[synonym_name_tsn] = accepted_name_tsn
    end
  end
end
get_vernaculars() click to toggle source
# File lib/dwca_hunter/resources/itis.rb, line 96
# Loads the pipe-delimited "vernaculars" table from @itis_dir into
# @vernaculars, mapping TSN (column 0) to a hash with :name (column 1)
# and :language (column 2). The language "unspecified" is recorded as
# "Common name".
#
# Progress is logged every BATCH_SIZE rows. The file is opened in block
# form so the handle is closed as soon as reading finishes, even if a
# row raises.
#
# NOTE(review): the table is keyed by TSN only, so when several rows
# share a TSN the last row wins. The schema's primary key is
# (tsn, vernacular_name, language), which suggests multiple names per
# TSN are expected -- confirm whether dropping them is intended.
def get_vernaculars
  # 0    tsn integer not null
  # 1    vernacular_name varchar(80,5) not null
  # 2    language varchar(15) not null
  # 3    approved_ind char(1)
  # 4    update_date date not null
  # 5    primary key (tsn,vernacular_name,language)
  #      constraint "itis".vernaculars_key

  File.open(File.join(@itis_dir, "vernaculars")) do |f|
    f.each_line.with_index do |l, i|
      if (i % BATCH_SIZE).zero?
        DwcaHunter.logger_write(object_id,
                                format("Extracted %s vernacular names", i))
      end
      # Reinterpret the raw bytes as ISO-8859-1 and transcode to UTF-8.
      l.encode!("UTF-8",
                "ISO-8859-1",
                invalid: :replace,
                replace: "?")
      name_tsn, string, language = l.split("|").map(&:strip)
      language = "Common name" if language == "unspecified"
      @vernaculars[name_tsn] = { name: string, language: language }
    end
  end
end