class EuPathDBSpeciesData
A class dedicated to recording ‘administrative’ data about the databases, answering questions such as “which species are recorded in ToxoDB?” for instance.
It is also meant for dealing with locally cached versions of the files, where all the data is stored in a base directory with a specified structure.
TODO: functions for the info and the local caching should probably be separated into separate classes, and the directory structure of the local versions shouldn’t be forced on the user.
Constants
- DATABASES
- SOURCE_VERSIONS
Public Class Methods
Download all the data files from all the EuPathDB databases, or from just one single database. Requires wget to be available on the command line.
# File lib/eupathdb_species_data.rb, line 370
# Download the data files for one EuPathDB database, or for every database
# listed in EuPathDBSpeciesData::DATABASES when database_name is nil.
# Requires wget to be available on the command line.
#
# base_download_directory:: base directory of the local cache; handed to each
#                           species object, whose directories_for_mkdir are
#                           created when they do not exist yet.
# database_name::           name of a single database, or nil (the default)
#                           to download from all of them.
#
# For every species, five files are fetched into its local_download_directory:
# protein FASTA, GFF, transcript FASTA, gene information table and genomic
# FASTA. A file that already exists locally is not downloaded again.
def self.download(base_download_directory, database_name=nil)
  # by default, download everything
  if database_name.nil?
    EuPathDBSpeciesData::DATABASES.each do |d|
      download base_download_directory, d
    end
  else
    # Download the new files from the relevant database
    EuPathDBSpeciesData.species_data_from_database(database_name, base_download_directory).each do |spd|
      # Parents come before children in directories_for_mkdir, so a plain
      # mkdir per entry is enough.
      spd.directories_for_mkdir.each do |directory|
        # File.exist? rather than File.exists?, which was removed in Ruby 3.2
        Dir.mkdir(directory) unless File.exist?(directory)
      end

      Dir.chdir(spd.local_download_directory) do
        p spd.eu_path_db_fasta_download_directory

        # [remote directory, filename] for each file to fetch
        wanted_files = [
          [spd.eu_path_db_fasta_download_directory, spd.protein_fasta_filename],    # protein
          [spd.eu_path_db_gff_download_directory,   spd.gff_filename],              # gff
          [spd.eu_path_db_fasta_download_directory, spd.transcript_fasta_filename], # transcripts
          [spd.eu_path_db_txt_download_directory,   spd.gene_information_filename], # gene information table
          [spd.eu_path_db_fasta_download_directory, spd.genomic_fasta_filename],    # genomic
        ]

        wanted_files.each do |remote_directory, filename|
          next if File.exist?(filename)

          url = "#{remote_directory}/#{filename}"
          # The argument-list form of system bypasses the shell, so no
          # character in the URL can be interpreted as shell syntax.
          warn "wget failed for #{url}" unless system('wget', url)
        end
      end
    end
  end
end
Create a new object about one particular species. The species can be specified by a nickname, which is either the full binomial name of the species e.g. “Plasmodium falciparum”, or simply the second part (the species name without the genus name) e.g. ‘falciparum’.
base_data_directory is the directory where locally cached versions of the downloaded files are stored.
# File lib/eupathdb_species_data.rb, line 190
# Create a new object about one particular species.
#
# nickname::            key used to find the species in @@data. It is tried
#                       as given, then capitalised with underscores replaced
#                       by spaces, and finally - when it consists of exactly
#                       two space-separated words - by its second word alone.
# base_data_directory:: directory where locally cached downloads are stored
#                       (may be nil).
# database_version::    database version to use; when nil, the entry of
#                       SOURCE_VERSIONS for the species' :source is used.
#
# Raises ArgumentError when none of the lookups finds the species.
def initialize(nickname, base_data_directory=nil, database_version=nil)
  @species_data = @@data[nickname] # try the full name
  @species_data ||= @@data[nickname.capitalize.gsub('_',' ')] # try replacing underscores
  if @species_data.nil?
    # try using just the second word
    splits = nickname.split(' ')
    @species_data = @@data[splits[1]] if splits.length == 2
  end
  # ArgumentError (a StandardError) instead of a bare Exception, so that a
  # plain `rescue` catches it; `rescue Exception` keeps working as before.
  raise ArgumentError, "Couldn't find species data for #{nickname}" unless @species_data

  @base_data_directory = base_data_directory

  # record what version of the db we are looking at, otherwise default
  @database_version = database_version
  @database_version ||= SOURCE_VERSIONS[@species_data[:source]]
end
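Return a list of EuPathDBSpeciesData objects, one for each species that is included in the given EuPathDB database. Only fully specified species names are considered, not the shortcut nicknames.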
# File lib/eupathdb_species_data.rb, line 358
# Build one EuPathDBSpeciesData object per species of the given database.
#
# database_name::           compared case-insensitively with each entry's :source.
# base_download_directory:: passed on unchanged to EuPathDBSpeciesData.new.
#
# Only entries of @@data whose key equals their own :name are used, i.e. the
# fully specified ones rather than the shortcut keys.
def self.species_data_from_database(database_name, base_download_directory=nil)
  full_names = []
  @@data.each do |name, info|
    next unless info[:source].downcase == database_name.downcase
    next unless name == info[:name] # skip shortcut entries

    full_names << name
  end

  full_names.map do |name|
    EuPathDBSpeciesData.new(name, base_download_directory)
  end
end
Public Instance Methods
# File lib/eupathdb_species_data.rb, line 296
# The :source entry of this species' data record.
def database
  source = @species_data[:source]
  source
end
Returns an array of directory names, from the base data directory down to the local download directory. mkdir is called on each of them in order, otherwise mkdir throws errors because the parent folders to build on do not exist yet.
# File lib/eupathdb_species_data.rb, line 338
# Returns the chain of directories leading to the local download directory,
# shortest path first, so that calling mkdir on each entry in order always
# finds the parent already in place.
#
# The path components are: @base_data_directory, the species :name,
# 'genome', the species :source and @database_version.
#
# Raises RuntimeError when @base_data_directory is not set.
def directories_for_mkdir
  if @base_data_directory.nil?
    # RuntimeError (a StandardError) rather than a bare Exception, so that a
    # plain `rescue` catches it; `rescue Exception` keeps working as before.
    raise "Unable to generate directories when @base_data_directory is not set"
  end

  components = [
    @base_data_directory,
    @species_data[:name],
    'genome',
    @species_data[:source],
    @database_version,
  ]

  # One entry per prefix of the component list: a, a/b, a/b/c, ...
  (1..components.length).map do |depth|
    components.first(depth).join('/')
  end
end
# File lib/eupathdb_species_data.rb, line 300
# Base download URL for this species: the lower-cased database name as the
# ".org" host, followed by the release folder for @database_version and the
# one-word species name.
def eu_path_db_download_directory
  host = "#{database.downcase}.org"
  release_folder = "release-#{@database_version}"
  "http://#{host}/common/downloads/#{release_folder}/#{one_word_name}"
end
# File lib/eupathdb_species_data.rb, line 304
# URL of the "fasta" folder below the species' download directory. When the
# species record sets :behind_usage_policy, the files sit one level deeper
# in a "data" folder.
def eu_path_db_fasta_download_directory
  fasta_directory = "#{eu_path_db_download_directory}/fasta"
  return fasta_directory unless @species_data[:behind_usage_policy]

  "#{fasta_directory}/data"
end
# File lib/eupathdb_species_data.rb, line 310
# URL of the "gff" folder below the species' download directory. When the
# species record sets :behind_usage_policy, the files sit one level deeper
# in a "data" folder.
def eu_path_db_gff_download_directory
  gff_directory = "#{eu_path_db_download_directory}/gff"
  return gff_directory unless @species_data[:behind_usage_policy]

  "#{gff_directory}/data"
end
# File lib/eupathdb_species_data.rb, line 316
# URL of the "txt" folder below the species' download directory. When the
# species record sets :behind_usage_policy, the files sit one level deeper
# in a "data" folder.
def eu_path_db_txt_download_directory
  txt_directory = "#{eu_path_db_download_directory}/txt"
  return txt_directory unless @species_data[:behind_usage_policy]

  "#{txt_directory}/data"
end
# File lib/eupathdb_species_data.rb, line 233
# Filename of the gene information table, without the ".gz" suffix.
#
# A species record may supply its own :gene_information_filename callable,
# which is invoked with the database version. Otherwise the name is built
# from the representative strain name, the database and the version.
def gene_information_filename
  custom = @species_data[:gene_information_filename]
  return "#{custom.call(version)}" if custom

  # Default pattern: <strain>Gene_<database>-<version>.txt
  "#{representative_strain_name}Gene_#{database}-#{version}.txt"
end
The filename of the EuPathDB gene information table (stored as a gzip)
# File lib/eupathdb_species_data.rb, line 220
# Filename of the gene information table with ".gz" appended.
def gene_information_gzfile_filename
  plain_filename = gene_information_filename
  "#{plain_filename}.gz"
end
The path to the EuPathDB gene information table (stored as a gzip)
# File lib/eupathdb_species_data.rb, line 215
# Location of the gzipped gene information table inside the local download
# directory.
def gene_information_gzfile_path
  directory = local_download_directory
  filename = gene_information_gzfile_filename
  "#{directory}/#{filename}"
end
# File lib/eupathdb_species_data.rb, line 224
# Location of the gene information table (no ".gz" suffix) inside the local
# download directory.
def gene_information_path
  directory = local_download_directory
  filename = gene_information_filename
  "#{directory}/#{filename}"
end
# File lib/eupathdb_species_data.rb, line 275
# Filename of the genomic FASTA file.
#
# A species record may supply its own :genomic_fasta_filename callable,
# which is invoked with the database version. Otherwise the name is built
# from the representative strain name, the database and the version.
def genomic_fasta_filename
  custom = @species_data[:genomic_fasta_filename]
  return "#{custom.call(version)}" if custom

  "#{representative_strain_name}Genomic_#{database}-#{version}.fasta"
end
# File lib/eupathdb_species_data.rb, line 284
# Filename of the GFF file.
#
# A species record may supply its own :gff_filename callable, which is
# invoked with the database version and whose result is returned as is.
# Otherwise the name is built from the representative strain name, the
# database and the version.
def gff_filename
  custom = @species_data[:gff_filename]
  return custom.call(version) if custom

  "#{representative_strain_name}_#{database}-#{version}.gff"
end
# File lib/eupathdb_species_data.rb, line 292
# Location of the GFF file inside the local download directory.
def gff_path
  directory = local_download_directory
  filename = gff_filename
  File.join(directory, filename)
end
# File lib/eupathdb_species_data.rb, line 330
# Directory of the local cache for this species and database version:
# <base data directory>/<species name>/genome/<source>/<database version>
def local_download_directory
  species_name = @species_data[:name]
  source = @species_data[:source]
  "#{@base_data_directory}/#{species_name}/genome/#{source}/#{@database_version}"
end
# File lib/eupathdb_species_data.rb, line 208
# Expose the entries of the species data record as reader methods: calling
# an otherwise undefined method without arguments returns
# @species_data[symbol] when that value is not nil.
#
# Any other call is passed to super and so ends in the usual NoMethodError.
# Accepting *args and &block keeps that true for calls made with arguments,
# which a one-parameter signature would turn into an ArgumentError.
def method_missing(symbol, *args, &block)
  answer = args.empty? ? @species_data[symbol] : nil
  return answer unless answer.nil?

  super
end

# Keep respond_to? in step with method_missing: a name is answered when the
# species data record holds a non-nil value for it.
def respond_to_missing?(symbol, include_private = false)
  !@species_data[symbol].nil? || super
end
Plasmodium chabaudi => Pchabaudi
# File lib/eupathdb_species_data.rb, line 323
# One-word form of the species name: the first character of the first word
# followed by the second word, e.g. Plasmodium chabaudi => Pchabaudi.
#
# When the species record defines :database_download_folder, that value is
# returned instead.
#
# Raises RuntimeError when the species :name does not consist of exactly two
# space-separated words.
def one_word_name
  folder = @species_data[:database_download_folder]
  return folder unless folder.nil?

  full_name = @species_data[:name]
  splits = full_name.split(' ')
  unless splits.length == 2
    # A bare `raise` here gave only "unhandled exception"; say what was wrong.
    raise "Expected a species name of exactly two words, found #{full_name.inspect}"
  end

  "#{splits[0][0..0]}#{splits[1]}"
end
# File lib/eupathdb_species_data.rb, line 259
# The protein FASTA filename placed under the fixed "/blastdb" directory.
def protein_blast_database_path
  filename = protein_fasta_filename
  "/blastdb/#{filename}"
end
# File lib/eupathdb_species_data.rb, line 413
# Creates a Bio::EuPathDB::FastaParser for the locally stored protein FASTA
# file of this species.
#
# NOTE(review): fasta_file_species_name is not defined in the visible part
# of this class - presumably it is answered by method_missing from the
# species data record; confirm.
def protein_fasta_file_iterator
  parser_class = Bio::EuPathDB::FastaParser
  parser_class.new(fasta_file_species_name, protein_fasta_path)
end
# File lib/eupathdb_species_data.rb, line 247
# Filename of the annotated proteins FASTA file.
#
# A species record may supply its own :proteins_fasta_filename callable,
# which is invoked with the database version. Otherwise the name is built
# from the representative strain name, the database and the version.
def protein_fasta_filename
  custom = @species_data[:proteins_fasta_filename]
  return "#{custom.call(version)}" if custom

  "#{representative_strain_name}AnnotatedProteins_#{database}-#{version}.fasta"
end
# File lib/eupathdb_species_data.rb, line 255
# Location of the protein FASTA file inside the local download directory.
def protein_fasta_path
  directory = local_download_directory
  filename = protein_fasta_filename
  File.join(directory, filename)
end
# File lib/eupathdb_species_data.rb, line 228
# The :representative_strain_name of the species record when it is set,
# otherwise the one-word species name.
def representative_strain_name
  strain = @species_data[:representative_strain_name]
  strain.nil? ? one_word_name : strain
end
# File lib/eupathdb_species_data.rb, line 263
# Filename of the annotated transcripts FASTA file.
#
# A species record may supply its own :transcripts_fasta_filename callable,
# which is invoked with the database version. Otherwise the name is built
# from the representative strain name, the database and the version.
def transcript_fasta_filename
  custom = @species_data[:transcripts_fasta_filename]
  return "#{custom.call(version)}" if custom

  "#{representative_strain_name}AnnotatedTranscripts_#{database}-#{version}.fasta"
end
# File lib/eupathdb_species_data.rb, line 271
# Location of the transcript FASTA file inside the local download directory.
def transcript_fasta_path
  directory = local_download_directory
  filename = transcript_fasta_filename
  File.join(directory, filename)
end
# File lib/eupathdb_species_data.rb, line 243
# The database version this object refers to (@database_version).
def version
  current_version = @database_version
  current_version
end