class Object
Public Instance Methods
db_schema()
click to toggle source
# File lib/snp_db_schema.rb, line 1 def db_schema ActiveRecord::Schema.define do unless table_exists? :strains create_table :strains do |t| t.column :name, :string t.column :description, :string end end unless table_exists? :features_snps create_table :features_snps, :id => false do |t| t.column :feature_id, :integer, :null => false t.column :snp_id, :integer, :null => false end end unless table_exists? :features create_table :features do |t| t.column :name, :string t.column :start, :integer t.column :end, :integer t.column :strand, :integer t.column :sequence, :string end end unless table_exists? :snps create_table :snps do |t| t.column :ref_pos, :integer t.column :qual, :float t.column :reference_allele_id, :integer end end unless table_exists? :alleles create_table :alleles do |t| t.column :snp_id, :integer t.column :base, :string end end unless table_exists? :genotypes create_table :genotypes do |t| t.column :allele_id, :integer t.column :strain_id, :integer t.column :geno_qual, :float end end unless table_exists? :annotations create_table :annotations do |t| t.column :qualifier, :string t.column :value, :string t.column :feature_id, :integer end end # indices unless index_exists? :strains, :id add_index :strains, :id end unless index_exists? :strains, :name add_index :strains, :name end unless index_exists? :features, :id add_index :features, :id end unless index_exists? :features, :name add_index :features, :name end unless index_exists? :features, :start add_index :features, :start end unless index_exists? :features, :end add_index :features, :end end unless index_exists? :features, :strand add_index :features, :strand end unless index_exists? :features, :sequence add_index :features, :sequence end unless index_exists? :snps, :id add_index :snps, :id end unless index_exists? :snps, :ref_pos add_index :snps, :ref_pos end unless index_exists? :snps, :qual add_index :snps, :qual end unless index_exists? :snps, :reference_allele_id add_index :snps, :reference_allele_id end unless index_exists? :features_snps, :feature_id add_index :features_snps, :feature_id end unless index_exists? :features_snps, :snp_id add_index :features_snps, :snp_id end unless index_exists? :snps, :qual add_index :snps, :qual end unless index_exists? :alleles, :snp_id add_index :alleles, :snp_id end unless index_exists? :alleles, :base add_index :alleles, :base end unless index_exists? :genotypes, :id add_index :genotypes, :id end unless index_exists? :genotypes, :allele_id add_index :genotypes, :allele_id end unless index_exists? :genotypes, :strain_id add_index :genotypes, :strain_id end unless index_exists? :genotypes, :geno_qual add_index :genotypes, :geno_qual end unless index_exists? :annotations, :feature_id add_index :annotations, :feature_id end unless index_exists? :annotations, :qualifier add_index :annotations, :qualifier end unless index_exists? :annotations, :value add_index :annotations, :value end end end
establish_connection(db_location)
click to toggle source
# File lib/snp_db_connection.rb, line 5 def establish_connection(db_location) ActiveRecord::Base.establish_connection( :adapter => "sqlite3", :database => db_location) end
find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp)
click to toggle source
# File lib/snp-search.rb, line 10 def find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp) *strain_names = strain_names where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "") outfile = File.open(out, "w") snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cutoff_genotype} AND snps.qual >= #{cutoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}") # puts "The number of unique snps are #{snps.size}" output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, false) end
get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path)
click to toggle source
# File lib/filter_ignore_snps_methods.rb, line 7 def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path) strains = Strain.all sequence_hash = Hash.new sequence_hash["ref"] = Array.new strains.each do |strain| sequence_hash[strain.name] = Array.new end snps_array = Array.new snp_positions = Array.new # output opened for data input output = File.open(out, "w") tab_delim_file_name = File.basename(out, File.extname(out)) + "_snps.tsv" tab_delim_file = File.open(tab_delim_file_name, "w") position_map_file_name = File.basename(out, File.extname(out)) + "_snps_positions.txt" position_map_file = File.open(position_map_file_name, "w") snps_within_features_with_annotation = "" # Perform query # puts ignore_snps_on_annotation.inspect if ignore_snps_on_annotation annotations_where = ignore_snps_on_annotation.split(",").map{|annotation| "annotations.value LIKE '%#{annotation}%'"}.join(" OR ") features_with_annotation = Feature.joins(:annotations).where(annotations_where) snps_within_features_with_annotation = Snp.joins(:features).where("features.id IN (?)", features_with_annotation.collect{|feature| feature.id}) end if snps_within_features_with_annotation.empty? snps = Snp.all else snps = Snp.where("snps.id NOT IN (?)", snps_within_features_with_annotation.collect{|snp| snp.id}) end positions_to_ignore = Array.new if ignore_snps_in_range range_strings = ignore_snps_in_range.split(",") range_strings.each do |range| start_position, end_position = range.split("..") positions_to_ignore += (start_position.to_i..end_position.to_i).to_a end end if ignore_strains strains_to_ignore = ignore_strains.split(",") end i = 0 puts "Your Query is submitted and is being processed......." strains = Strain.find(:all) if ignore_strains strains_to_ignore = ignore_strains.split(",") strains.reject!{|strain| strains_to_ignore.include?(strain.name)} end snps.each do |snp| ActiveRecord::Base.transaction do i += 1 next if positions_to_ignore.include?(snp.ref_pos) # Ignore positions that user specified alleles = snp.alleles genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}") # ignore snp if the snp qual is less than cutoff. next if snp_qual.any?{|snps_quality| snps_quality.qual < cutoff_snp.to_i} next if alleles.any?{|allele| allele.base.length > 1} # indel next unless genotypes.all?{|genotype| genotype.geno_qual >= cutoff_genotype.to_f} # all geno quals > cutoff # puts "#{i} SNPs processed so far" if i % 100 == 0 strain_alleles = Hash.new strains.each do |strain| strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first # next if strain_genotype == nil strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first strain_alleles[strain.name] = strain_allele.base end if remove_non_informative_snps next if strain_alleles.values.uniq.size == 1 # remove non-informative SNPs end snp_positions << snp.ref_pos snps_array << snp strain_alleles.each do |strain_name, allele_base| sequence_hash[strain_name] << allele_base end sequence_hash["ref"] << snp.reference_allele.base end end # If user has specified a tabular output if tabular_output output_information_methods(snps_array, output, cutoff_genotype, cutoff_snp, true) # If user has specified a fasta output elsif fasta_output # generate FASTA file output.puts ">ref\n#{sequence_hash["ref"].join("")}" tab_delim_file.puts "\t#{snp_positions.join("\t")}" tab_delim_file.puts "ref\t#{sequence_hash["ref"].join("\t")}" strains.each do |strain| output.puts ">#{strain.name}\n#{sequence_hash[strain.name].join("")}" tab_delim_file.puts "#{strain.name}\t#{sequence_hash[strain.name].join("\t")}" end snp_positions.each_with_index do |snp_position, index| position_map_file.puts "#{index+1} => #{snp_position}" end end # If user has chosen a newick output. if tree nwk_out_file_name = File.basename(out, File.extname(out)) + ".nwk" puts "running phylogeny" `#{fasttree_path} -fastest -nt #{output} > #{nwk_out_file_name}` end output.close tab_delim_file.close end
guess_sequence_format(reference_genome)
click to toggle source
This method guesses the reference sequence file format
# File lib/create_methods.rb, line 7 def guess_sequence_format(reference_genome) file_extension = File.extname(reference_genome).downcase file_format = nil case file_extension when ".gbk", ".genbank", ".gb" file_format = :genbank when ".embl", ".emb" file_format = :embl end return file_format end
information()
click to toggle source
This method outputs information for each SNP, e.g. synonymous and non-synonymous etc. Its is called in lib/snp-search.rb
# File lib/information_methods.rb, line 5 def information() strains = Strain.all output = File.open("#{out}", "w") output.puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tSNP_base\tsynonymous or non-synonymous\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t")}" snp_info = 0 snps = Snp.find_by_sql("SELECT snps.* from snps inner join alleles on snps.id = alleles.snp_id") snps.each do |snp| snp.alleles.each do |allele| features = Feature.joins(:snps).where("snps.id = ?", snp.id) features.each do |feature| if feature.feature == "CDS" || "RNA" annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first else annotation = Annotation.where("annotations.qualifier = 'note' and annotations.feature_id = ?", feature.id).first end all_seqs_mutated = genome_sequence.seq mutated_seq_translated = [] original_seq_translated = [] # Mutate the sequence with the SNP all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base # The reason why we use all the sequence from the genome and not just the feature sequence is because the ref_pos is based on the genome sequence and when we come to mutate it we have to have the full sequence to find the position. mutated_seq = Bio::Sequence.auto(all_seqs_mutated[feature.start-1..feature.end-1]) original_seq = Bio::Sequence.auto(all_seqs_original[feature.start-1..feature.end-1]) #If the strand is negative then reverse complement if feature.strand == -1 mutated_seq_translated << mutated_seq.reverse_complement.translate original_seq_translated << original_seq.reverse_complement.translate else mutated_seq_translated << mutated_seq.translate original_seq_translated << original_seq.translate end # Remove the star at the end of each translated sequence. mutated_seq_translated.zip(original_seq_translated).each do |mut, org| mutated_seq_translated_clean = mut.gsub(/\*$/,"") original_seq_translated_clean = org.gsub(/\*$/,"") # Amino acid properties hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"] non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"] polar = ["R", "N", "D", "E", "Q", "H", "K", "S", "T", "Y"] non_polar = ["A", "C", "G", "I", "L", "M", "F", "P", "W", "V"] small = ["V","C","A","G","D","N","S","T","P"] non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"] alleles_array = [] strains.each do |strain| allele = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first alleles_array << allele.base end snp_info +=1 if original_seq_translated_clean == mutated_seq_translated_clean if mutated_seq_translated_clean =~ /\*/ output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tsynonymous\tYes\t\t\t\t\t\t#{alleles_array.join("\t")}" else output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tsynonymous\t\t\t\t\t\t\t#{alleles_array.join("\t")}" end else diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean) if mutated_seq_translated_clean =~ /\*/ output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}" puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}" else output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\t\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}" puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\t\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}" end end puts "Total SNPs outputted so far: #{snp_info}" if snp_info % 50 == 0 end end end #Take all SNP positions in ref genome # snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos} # # Take all SNP nucleotide # snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base} # # Mutate (substitute) the original sequence with the SNPs # # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them # #Get start position of CDS with SNP # coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start}
output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true)
click to toggle source
This method finds info_snps from a list of strains given by user.
Its is called in lib/snp-search.rb
# File lib/output_information_methods.rb, line 5 def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true) strains = Strain.all outfile.puts "pos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tGene_annotation\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t") if info}" snps_counter = 0 cds_snps_counter = 0 total_number_of_syn_snps = 0 total_number_of_non_syn_snps = 0 total_number_of_pseudo = 0 snps.each do |snp| ActiveRecord::Base.transaction do snp.alleles.each do |allele| next if snp.alleles.any?{|allele| allele.base.length > 1} # indel if allele.id != snp.reference_allele_id # get annotation (if there is any) for each SNP features = Feature.joins(:snps).where("snps.id = ?", snp.id) # get snp quality for each snp snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}") # ignore snp if the snp qual is less than cuttoff. next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i} # get all genotype qualities for each snp. gqs = Genotype.find_by_sql("select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = #{snp.id}") # ignore snp if any of its genotype qualities is lower than the cuttoff. next if gqs.any?{|genotype_quality| genotype_quality.geno_qual < cuttoff_genotype.to_i} ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base) snp_base = Bio::Sequence.auto(allele.base) # count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold. snps_counter += 1 # If the feature is empty then just output basic information about the snp. if features.empty? outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}" else features.each do |feature| if feature.name == "CDS" cds_snps_counter +=1 annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first #if annotation is nil, or empty if annotation.nil? outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}" else feature_sequence = feature.sequence feature_sequence_bio = Bio::Sequence::NA.new(feature_sequence) #Mutate sequence with SNP feature_sequence_mutated = feature.sequence feature_sequence_snp_pos = (snp.ref_pos-1) - (feature.start-1) feature_sequence_mutated[feature_sequence_snp_pos] = allele.base feature_sequence_mutated_bio = Bio::Sequence::NA.new(feature_sequence_mutated) # Translate the sequences if feature.strand == -1 mutated_seq_translated = feature_sequence_mutated_bio.reverse_complement.translate original_seq_translated = feature_sequence_bio.reverse_complement.translate else mutated_seq_translated = feature_sequence_mutated_bio.translate original_seq_translated = feature_sequence_bio.translate end # Remove the star at the end of each translated sequence. mutated_seq_translated_clean = mutated_seq_translated.gsub(/\*$/,"") original_seq_translated_clean = original_seq_translated.gsub(/\*$/,"") # Amino acid properties hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"] non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"] polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"] non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"] small = ["V","C","A","G","D","N","S","T","P"] non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"] # Get alleles for each strain bases_from_alleles = [] strains.each do |strain| allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first # allele_for_strains = Allele.find_by_sql("select * from alleles inner join genotypes on genotypes.allele_id = alleles.id inner join strains on strains.id = genotypes.strain_id where strains.id = #{strain.id} and alleles.snp_id = #{snp.id}") puts allele_for_strains.inspect # next if bases_from_alleles.empty? bases_from_alleles << allele_for_strains.base end # If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous. if original_seq_translated_clean == mutated_seq_translated_clean total_number_of_syn_snps +=1 if mutated_seq_translated_clean =~ /\*/ total_number_of_pseudo +=1 outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}" else outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}" end else total_number_of_non_syn_snps +=1 diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean) if mutated_seq_translated_clean =~ /\*/ total_number_of_pseudo +=1 outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}" else outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}" end end end end end end puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0 end end end end puts "Total number of snps: #{snps_counter} with Genotype quality cutoff at #{cuttoff_genotype} and SNP quality cutoff at #{cuttoff_snp}" puts "Total number of snps in CDS region: #{cds_snps_counter}" puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}" puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}" puts "Total number of pseudogenes: #{total_number_of_pseudo}" outfile.puts "Total number of snps: #{snps_counter}" outfile.puts "Total number of snps in CDS region: #{cds_snps_counter}" outfile.puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}" outfile.puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}" outfile.puts "Total number of possible pseudogenes: #{total_number_of_pseudo}" end
populate_features_and_annotations(sequence_file)
click to toggle source
A method to populate the database with the features (genes etc) and the annotations from the gbk/embl file. We include all features that are not 'source' or 'gene' as they are repetitive info. 'CDS' is the gene. The annotation table includes also the start and end coordinates of the CDS. The strand is also included. the 'locations' method is defined in bioruby under genbank. It must be required at the top (bio).
Also, the qualifier and value are extracted from the gbk/embl file and added to the database.
# File lib/create_methods.rb, line 23 def populate_features_and_annotations(sequence_file) puts "Adding features and their annotations...." ActiveRecord::Base.transaction do counter = 0 sequence_file.features.each do |feature| unless feature.feature == "source" || feature.feature == "gene" counter += 1 puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0 db_feature = Feature.new db_feature.name = feature.feature db_feature.start = feature.locations.first.from db_feature.end = feature.locations.first.to db_feature.strand = feature.locations.first.strand #Add nucleotide sequence from ORIGIN of genbank file. db_feature.sequence = sequence_file.seq[feature.locations.first.from-1..feature.locations.first.to-1] db_feature.save # Populate the Annotation table with qualifier information from the genbank file feature.qualifiers.each do |qualifier| a = Annotation.new a.qualifier = qualifier.qualifier a.value = qualifier.value a.save db_feature.annotations << a end end end end end
populate_snps_alleles_genotypes(vcf_file, cutoff_ad)
click to toggle source
This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
# File lib/create_methods.rb, line 53 def populate_snps_alleles_genotypes(vcf_file, cutoff_ad) puts "Adding SNPs........" # open vcf file and parse each line File.open(vcf_file) do |f| # header names while line = f.gets if line =~ /CHROM/ line.chomp! column_headings = line.split("\t") potential_strain_names = column_headings[9..-1] potential_strain_names.map!{|name| name.sub(/\..*/, '')} # strain_names = column_headings[9..-1] # strain_names.map!{|name| name.sub(/\..*/, '')} strains = Array.new # strain_names.each do |str| # ss = Strain.new # ss.name = str # ss.save # end # strains = Array.new # strain_names.each do |strain_name| # strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first # strains << strain # end good_snps = 0 # start parsing snps while line = f.gets line.chomp! details = line.split("\t") ref = details[0] ref_pos = details[1].to_i ref_base = details[3] snp_bases = details[4].split(",") snp_qual = details [5] number_of_colons_in_format = details[8].count ":" format = details[8].split(":") gt_array_position = format.index("GT") gq_array_position = format.index("GQ") ad_array_position = format.index("AD") # dp = format.index("DP") # columns_after_format = details[9..-1] # samples = columns_after_format.select{|column_after_format| column_after_format.count(":") == number_of_colons_in_format} # puts samples samples = details[9..-1] if strains.empty? samples.zip(potential_strain_names).each do |column_after_format, potential_strain_name| if column_after_format.count(":") == number_of_colons_in_format ss = Strain.new ss.name = potential_strain_name ss.save strains << ss end end end gts = [] gqs = [] ad_ratios = [] next if samples.any?{|sample| sample =~ /\.\/\./} # no coverage in at least one sample samples.map do |sample| format_values = sample.split(":") # output (e.g.): ["0/0 ", "0,255,209", "99"] if gt_array_position gt = format_values[gt_array_position] # e.g. gt = gt.split("/") next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore next if gt.first == "." # no coverage gt = gt.first.to_i else puts "GT field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting" exit end if gq_array_position gq = format_values[gq_array_position].to_f else puts "GQ field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting" exit end if ad_array_position # If there is AD in vcf. Typically AD is Allele specific depth. i.e. if ref is 'A' and alt is 'G' and AD is '6,9' you got 6 A reads and 9 G reads. # ad below is 6 and 9 in the example above. ad = format_values[ad_array_position].split(",").map{|ad_value| ad_value.to_i} # Find the sum of all bases (sum_of_ad) reported by the ad, so its 15 in the example. sum_of_ad = ad.inject{|sum,x| sum + x } ad_ratios << ad[gt]/sum_of_ad.to_f end gqs << gq gts << gt end next if ad_ratios.any?{|ad_ratio| ad_ratio < cutoff_ad.to_i} # exclude if any samples have a call ratio of less than a cutoff set by user if gts.size == samples.size # if some gts have been rejected due to heterozygote or no coverage good_snps +=1 # populate snps ActiveRecord::Base.transaction do s = Snp.new s.ref_pos = ref_pos s.qual = snp_qual s.save # create ref allele ref_allele = Allele.new ref_allele.base = ref_base ref_allele.snp = s ref_allele.save s.reference_allele = ref_allele s.save snp_alleles = Array.new gts.uniq.select{|gt| gt > 0}.each do |gt| # create snp allele snp_allele = Allele.new snp_bases_index = gt - 1 snp_allele.base = snp_bases[snp_bases_index] snp_allele.snp = s snp_allele.save snp_alleles << snp_allele end genos = [] gts.each_with_index do |gt, index| genotype = Genotype.new genotype.strain = strains[index] #Adding the genotype quality with Genotype genotype.geno_qual = gqs[index] if gt == 0# wild type genotype.allele = ref_allele else # snp type genotype.allele = snp_alleles[gt - 1] end genos << genotype end # Using activerecord-import to speed up importing Genotype.import genos, :validate => false puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0 # puts "Total SNPs added so far: #{good_snps}" end end end end end end #Here we link the features to snps. puts "Linking features to SNPs" ActiveRecord::Base.transaction do Snp.all.each_with_index do |snp, index| puts "Total SNPs linked to features added so far: #{index}" if index % 100 == 0 features = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos) unless features.empty? features.each do |feature| snp.features << feature end end end end end