class EuPathDBSpeciesData

A class dedicated to recording ‘administrative’ data about the databases, answering questions such as “which species are recorded in ToxoDB?” for instance.

It is also meant for dealing with locally cached versions of the files, where all the data is stored under a base directory with a specified structure.

TODO: functions for the info and the local caching should probably be separated into separate classes, and the directory structure of the local versions shouldn’t be forced on the user.

Constants

DATABASES
SOURCE_VERSIONS

Public Class Methods

download(base_download_directory, database_name=nil) click to toggle source

Download all the data files from all the EuPathDB databases, or from just one database. Requires wget to be available on the command line.

# File lib/eupathdb_species_data.rb, line 370
# Download the data files for every EuPathDB database, or for a single one.
#
# base_download_directory:: root of the local cache; it is passed on to
#   species_data_from_database so that each species object can derive its
#   own local directory from it.
# database_name:: name of the one database to fetch. When nil, every entry
#   of EuPathDBSpeciesData::DATABASES is downloaded in turn.
#
# A file that already exists in the species' local download directory is not
# fetched again. Requires the wget executable to be available.
def self.download(base_download_directory, database_name=nil)
  # by default, download everything
  if database_name.nil?
    EuPathDBSpeciesData::DATABASES.each do |d|
      download base_download_directory, d
    end
  else
    # Download the new files from the relevant database
    EuPathDBSpeciesData.species_data_from_database(database_name, base_download_directory).each do |spd|
      # The list is ordered parent-first, so a plain mkdir on each entry is enough.
      spd.directories_for_mkdir.each do |directory|
        # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2
        Dir.mkdir(directory) unless File.exist?(directory)
      end

      # wget saves into the current directory, so work from inside the cache
      # directory; the bare filenames tested below are relative to it.
      Dir.chdir(spd.local_download_directory) do
        p spd.eu_path_db_fasta_download_directory

        # Every URL is single-quoted so that the shell does not interpret
        # characters in the file or folder names.

        # protein
        unless File.exist?(spd.protein_fasta_filename)
          `wget '#{spd.eu_path_db_fasta_download_directory}/#{spd.protein_fasta_filename}'`
        end
        # gff
        unless File.exist?(spd.gff_filename)
          `wget '#{spd.eu_path_db_gff_download_directory}/#{spd.gff_filename}'`
        end
        # transcripts
        unless File.exist?(spd.transcript_fasta_filename)
          `wget '#{spd.eu_path_db_fasta_download_directory}/#{spd.transcript_fasta_filename}'`
        end
        # gene information table
        unless File.exist?(spd.gene_information_filename)
          `wget '#{spd.eu_path_db_txt_download_directory}/#{spd.gene_information_filename}'`
        end
        # genomic
        unless File.exist?(spd.genomic_fasta_filename)
          `wget '#{spd.eu_path_db_fasta_download_directory}/#{spd.genomic_fasta_filename}'`
        end
      end
    end
  end
end
new(nickname, base_data_directory=nil, database_version=nil) click to toggle source

Create a new object describing one particular species. The species is specified by a nickname, which is either the full binomial name of the species, e.g. “Plasmodium falciparum”, or simply its second part (the species name without the genus name), e.g. ‘falciparum’.

base_data_directory is the directory where locally cached versions of the downloaded files are stored.

# File lib/eupathdb_species_data.rb, line 190
# Look up the species record for +nickname+ and remember where its files live.
#
# nickname:: key into the species table. Tried as given, then capitalised
#   with underscores turned into spaces, then (for a two-word nickname) by
#   its second word alone.
# base_data_directory:: root directory of the local file cache (may be nil).
# database_version:: release to use; when not given, the default recorded in
#   SOURCE_VERSIONS for the species' source database is used.
#
# Raises Exception when no lookup strategy finds a record.
def initialize(nickname, base_data_directory=nil, database_version=nil)
  # full name first, then the underscore-free capitalised spelling
  @species_data = @@data[nickname] || @@data[nickname.capitalize.gsub('_',' ')]

  if @species_data.nil?
    # last resort: the second word of a two-word name
    words = nickname.split(' ')
    @species_data = @@data[words[1]] if words.length == 2
  end

  unless @species_data
    raise Exception, "Couldn't find species data for #{nickname}"
  end

  @base_data_directory = base_data_directory

  # an explicit version wins; otherwise fall back to the default for this source
  @database_version = database_version || SOURCE_VERSIONS[@species_data[:source]]
end
species_data_from_database(database_name, base_download_directory=nil) click to toggle source

Return an array of EuPathDBSpeciesData objects, one for each fully named species that belongs to the given EuPathDB database (shortcut entries are skipped).

# File lib/eupathdb_species_data.rb, line 358
# Build one EuPathDBSpeciesData object per species recorded for a database.
#
# database_name:: database to select; compared case-insensitively with each
#   record's :source.
# base_download_directory:: passed through to each new object as its base
#   data directory.
def self.species_data_from_database(database_name, base_download_directory=nil)
  matching = @@data.select do |name, info|
    # only allow ones that are fully specified - not shortcut ones
    info[:source].downcase == database_name.downcase && name == info[:name]
  end

  matching.collect do |name, _info|
    EuPathDBSpeciesData.new(name, base_download_directory)
  end
end

Public Instance Methods

database() click to toggle source
# File lib/eupathdb_species_data.rb, line 296
# The :source entry of the species record, i.e. the database the species is
# recorded under.
def database
  source_name = @species_data[:source]
  source_name
end
directories_for_mkdir() click to toggle source

An array of directory names, ordered from the base directory down. mkdir is called on each of them in order, because mkdir raises an error when the parent folders do not exist yet.

# File lib/eupathdb_species_data.rb, line 338
# Every directory on the way down to the local download directory, from the
# base directory (first) to the version directory (last).
#
# Raises Exception when no base data directory has been set.
def directories_for_mkdir
  raise Exception, "Unable to generate directories when @base_data_directory is not set" if @base_data_directory.nil?

  path_parts = [
    @base_data_directory,
    @species_data[:name],
    'genome',
    @species_data[:source],
    @database_version,
  ]

  # one cumulative path per depth: parts[0], parts[0..1], parts[0..2], ...
  (1..path_parts.length).collect do |depth|
    path_parts.first(depth).join('/')
  end
end
eu_path_db_download_directory() click to toggle source
# File lib/eupathdb_species_data.rb, line 300
# Root URL of this species' download area for the selected release: the host
# is the lower-cased database name followed by ".org".
def eu_path_db_download_directory
  host = "#{database.downcase}.org"
  "http://#{host}/common/downloads/release-#{@database_version}/#{one_word_name}"
end
eu_path_db_fasta_download_directory() click to toggle source
# File lib/eupathdb_species_data.rb, line 304
# URL of the remote fasta folder; a trailing "/data" is appended when the
# species record has :behind_usage_policy set.
def eu_path_db_fasta_download_directory
  fasta_root = "#{eu_path_db_download_directory}/fasta"
  @species_data[:behind_usage_policy] ? "#{fasta_root}/data" : fasta_root
end
eu_path_db_gff_download_directory() click to toggle source
# File lib/eupathdb_species_data.rb, line 310
# URL of the remote gff folder; a trailing "/data" is appended when the
# species record has :behind_usage_policy set.
def eu_path_db_gff_download_directory
  gff_root = "#{eu_path_db_download_directory}/gff"
  return gff_root unless @species_data[:behind_usage_policy]
  "#{gff_root}/data"
end
eu_path_db_txt_download_directory() click to toggle source
# File lib/eupathdb_species_data.rb, line 316
# URL of the remote txt folder; a trailing "/data" is appended when the
# species record has :behind_usage_policy set.
def eu_path_db_txt_download_directory
  txt_root = "#{eu_path_db_download_directory}/txt"
  if @species_data[:behind_usage_policy]
    "#{txt_root}/data"
  else
    txt_root
  end
end
gene_information_filename() click to toggle source
# File lib/eupathdb_species_data.rb, line 233
# Filename of the gene information table, without any .gz suffix.
#
# A callable stored under :gene_information_filename in the species record
# takes precedence; it is called with the version. Otherwise the name follows
# the default pattern, e.g. PfalciparumGene_PlasmoDB-6.1.txt
def gene_information_filename
  custom_namer = @species_data[:gene_information_filename]
  return "#{custom_namer.call(version)}" if custom_namer

  "#{representative_strain_name}Gene_#{database}-#{version}.txt"
end
gene_information_gzfile_filename() click to toggle source

The filename (without directory) of the gzipped EuPathDB gene information table

# File lib/eupathdb_species_data.rb, line 220
# Name of the gzipped gene information table: the plain filename with ".gz"
# appended.
def gene_information_gzfile_filename
  format('%s.gz', gene_information_filename)
end
gene_information_gzfile_path() click to toggle source

The path to the EuPathDB gene information table (stored as a gzip)

# File lib/eupathdb_species_data.rb, line 215
# Location of the gzipped gene information table inside the local download
# directory.
def gene_information_gzfile_path
  directory = local_download_directory
  gz_name = gene_information_gzfile_filename
  "#{directory}/#{gz_name}"
end
gene_information_path() click to toggle source
# File lib/eupathdb_species_data.rb, line 224
# Location of the (uncompressed) gene information table inside the local
# download directory.
def gene_information_path
  directory = local_download_directory
  table_name = gene_information_filename
  "#{directory}/#{table_name}"
end
genomic_fasta_filename() click to toggle source
# File lib/eupathdb_species_data.rb, line 275
# Filename of the genomic sequence fasta file.
#
# A callable stored under :genomic_fasta_filename in the species record takes
# precedence (it is called with the version); otherwise the default
# "<strain>Genomic_<database>-<version>.fasta" pattern is used.
def genomic_fasta_filename
  custom_namer = @species_data[:genomic_fasta_filename]
  return "#{custom_namer.call(version)}" if custom_namer

  "#{representative_strain_name}Genomic_#{database}-#{version}.fasta"
end
gff_filename() click to toggle source
# File lib/eupathdb_species_data.rb, line 284
# Filename of the GFF annotation file.
#
# A callable stored under :gff_filename in the species record takes
# precedence; its result for the version is returned as-is. Otherwise the
# default "<strain>_<database>-<version>.gff" pattern is used.
def gff_filename
  custom_namer = @species_data[:gff_filename]
  if custom_namer
    custom_namer.call(version)
  else
    "#{representative_strain_name}_#{database}-#{version}.gff"
  end
end
gff_path() click to toggle source
# File lib/eupathdb_species_data.rb, line 292
# Location of the GFF file inside the local download directory.
def gff_path
  directory = local_download_directory
  File.join(directory, gff_filename)
end
local_download_directory() click to toggle source
# File lib/eupathdb_species_data.rb, line 330
# Directory in the local cache that holds this species' files:
# <base>/<species name>/genome/<source database>/<version>
def local_download_directory
  species_dir = @species_data[:name]
  source_dir = @species_data[:source]
  "#{@base_data_directory}/#{species_dir}/genome/#{source_dir}/#{@database_version}"
end
method_missing(symbol) click to toggle source
Calls superclass method
# File lib/eupathdb_species_data.rb, line 208
# Expose entries of the species record as reader-style methods: a call to an
# otherwise undefined method returns @species_data[symbol] when that entry is
# not nil.
#
# Extra arguments and a block are accepted so that an unknown method called
# with arguments reaches the superclass and fails with NoMethodError, rather
# than with an ArgumentError raised by this hook's own signature.
def method_missing(symbol, *args, &block)
  # only a plain, argument-less call can be a lookup in the species record
  if args.empty?
    answer = @species_data[symbol]
    return answer unless answer.nil?
  end
  super
end

# Keeps respond_to? in step with method_missing: true for every key of the
# species record that holds a non-nil value.
def respond_to_missing?(symbol, include_private = false)
  (!@species_data.nil? && !@species_data[symbol].nil?) || super
end
one_word_name() click to toggle source

Plasmodium chabaudi => Pchabaudi

# File lib/eupathdb_species_data.rb, line 323
# Short name used for the species' folder on the download server, e.g.
# "Plasmodium chabaudi" => "Pchabaudi".
#
# An explicit :database_download_folder entry in the species record takes
# precedence. Otherwise the name is built from the first letter of the first
# word followed by the second word.
#
# Raises RuntimeError when the species name does not consist of exactly two
# words.
def one_word_name
  folder = @species_data[:database_download_folder]
  return folder unless folder.nil?

  name = @species_data[:name]
  splits = name.split(' ')
  unless splits.length == 2
    # a bare raise would only report "unhandled exception"; say what went wrong
    raise "Expected a two-word species name (genus and species), but found #{name.inspect}"
  end
  "#{splits[0][0..0]}#{splits[1]}"
end
protein_blast_database_path() click to toggle source
# File lib/eupathdb_species_data.rb, line 259
# Path of the protein BLAST database: the protein fasta filename under the
# fixed /blastdb directory.
def protein_blast_database_path
  fasta_name = protein_fasta_filename
  "/blastdb/#{fasta_name}"
end
protein_fasta_file_iterator() click to toggle source
# File lib/eupathdb_species_data.rb, line 413
# A new Bio::EuPathDB::FastaParser built from fasta_file_species_name and the
# local protein fasta path.
def protein_fasta_file_iterator
  parser_class = Bio::EuPathDB::FastaParser
  parser_class.new(fasta_file_species_name, protein_fasta_path)
end
protein_fasta_filename() click to toggle source
# File lib/eupathdb_species_data.rb, line 247
# Filename of the annotated proteins fasta file.
#
# A callable stored under :proteins_fasta_filename in the species record
# takes precedence (it is called with the version); otherwise the default
# "<strain>AnnotatedProteins_<database>-<version>.fasta" pattern is used.
def protein_fasta_filename
  custom_namer = @species_data[:proteins_fasta_filename]
  return "#{custom_namer.call(version)}" if custom_namer

  "#{representative_strain_name}AnnotatedProteins_#{database}-#{version}.fasta"
end
protein_fasta_path() click to toggle source
# File lib/eupathdb_species_data.rb, line 255
# Location of the protein fasta file inside the local download directory.
def protein_fasta_path
  directory = local_download_directory
  File.join(directory, protein_fasta_filename)
end
representative_strain_name() click to toggle source
# File lib/eupathdb_species_data.rb, line 228
# Strain prefix used when building the default filenames: the
# :representative_strain_name entry of the species record when present,
# otherwise the one-word species name.
def representative_strain_name
  strain = @species_data[:representative_strain_name]
  strain.nil? ? one_word_name : strain
end
transcript_fasta_filename() click to toggle source
# File lib/eupathdb_species_data.rb, line 263
# Filename of the annotated transcripts fasta file.
#
# A callable stored under :transcripts_fasta_filename in the species record
# takes precedence (it is called with the version); otherwise the default
# "<strain>AnnotatedTranscripts_<database>-<version>.fasta" pattern is used.
def transcript_fasta_filename
  custom_namer = @species_data[:transcripts_fasta_filename]
  return "#{custom_namer.call(version)}" if custom_namer

  "#{representative_strain_name}AnnotatedTranscripts_#{database}-#{version}.fasta"
end
transcript_fasta_path() click to toggle source
# File lib/eupathdb_species_data.rb, line 271
# Location of the transcript fasta file inside the local download directory.
def transcript_fasta_path
  directory = local_download_directory
  File.join(directory, transcript_fasta_filename)
end
version() click to toggle source
# File lib/eupathdb_species_data.rb, line 243
# The database version held in @database_version.
def version
  return @database_version
end