class Object

Public Instance Methods

db_schema() click to toggle source
# File lib/snp_db_schema.rb, line 1
def db_schema
  ActiveRecord::Schema.define do
    unless table_exists? :strains
      create_table :strains do |t|
        t.column :name, :string
        t.column :description, :string
      end
    end

    unless table_exists? :features_snps
      create_table :features_snps, :id => false do |t|
        t.column :feature_id, :integer, :null => false
        t.column :snp_id, :integer, :null => false
      end
    end

    unless table_exists? :features
      create_table :features do |t|
        t.column :name, :string
        t.column :start, :integer
        t.column :end, :integer
        t.column :strand, :integer
        t.column :sequence, :string
      end
    end

    unless table_exists? :snps
      create_table :snps do |t|
        t.column :ref_pos, :integer
        t.column :qual, :float 
        t.column :reference_allele_id, :integer
      end
    end

    unless table_exists? :alleles
      create_table :alleles do |t|
        t.column :snp_id, :integer
        t.column :base, :string
      end
    end
   
    unless table_exists? :genotypes
      create_table :genotypes do |t|
        t.column :allele_id, :integer
        t.column :strain_id, :integer
        t.column :geno_qual, :float
      end
    end

    unless table_exists? :annotations
      create_table :annotations do |t|
        t.column  :qualifier, :string
        t.column :value, :string
        t.column :feature_id, :integer
      end
    end
     
    # indices
    unless index_exists? :strains, :id
      add_index :strains, :id
    end
    unless index_exists? :strains, :name
      add_index :strains, :name
    end
    unless index_exists? :features, :id
      add_index :features, :id
    end
    unless index_exists? :features, :name
      add_index :features, :name
    end
    unless index_exists? :features, :start
      add_index :features, :start
    end
    unless index_exists? :features, :end
      add_index :features, :end
    end
    unless index_exists? :features, :strand
      add_index :features, :strand
    end
    unless index_exists? :features, :sequence
      add_index :features, :sequence
    end
    unless index_exists? :snps, :id
      add_index :snps, :id
    end
    unless index_exists? :snps, :ref_pos
      add_index :snps, :ref_pos
    end
    unless index_exists? :snps, :qual
      add_index :snps, :qual
    end
    unless index_exists? :snps, :reference_allele_id
      add_index :snps, :reference_allele_id
    end
    unless index_exists? :features_snps, :feature_id
      add_index :features_snps, :feature_id
    end
    unless index_exists? :features_snps, :snp_id
      add_index :features_snps, :snp_id
    end
    unless index_exists? :snps, :qual
      add_index :snps, :qual
    end
    unless index_exists? :alleles, :snp_id
      add_index :alleles, :snp_id
    end
    unless index_exists? :alleles, :base
      add_index :alleles, :base
    end
    unless index_exists? :genotypes, :id
      add_index :genotypes, :id
    end
    unless index_exists? :genotypes, :allele_id
      add_index :genotypes, :allele_id
    end
    unless index_exists? :genotypes, :strain_id
      add_index :genotypes, :strain_id
    end
    unless index_exists? :genotypes, :geno_qual
      add_index :genotypes, :geno_qual
    end
    unless index_exists? :annotations, :feature_id
        add_index :annotations, :feature_id
    end
    unless index_exists? :annotations, :qualifier
      add_index :annotations, :qualifier
    end
    unless index_exists? :annotations, :value
      add_index :annotations, :value
    end
  end
end
establish_connection(db_location) click to toggle source
# File lib/snp_db_connection.rb, line 5
def establish_connection(db_location)
  ActiveRecord::Base.establish_connection(
    :adapter => "sqlite3",
    :database => db_location)
end
find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp) click to toggle source
# File lib/snp-search.rb, line 10
def find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp)

  *strain_names = strain_names

  where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "")

  outfile = File.open(out, "w")

   snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cutoff_genotype} AND snps.qual >= #{cutoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
   # puts "The number of unique snps are #{snps.size}"

   output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, false)
end
get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path) click to toggle source
# File lib/filter_ignore_snps_methods.rb, line 7
def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path)

  strains = Strain.all

  sequence_hash = Hash.new
  sequence_hash["ref"] = Array.new
  strains.each do |strain|
    sequence_hash[strain.name] = Array.new
  end

  snps_array = Array.new
  snp_positions = Array.new

  # output opened for data input
  output = File.open(out, "w")
  tab_delim_file_name = File.basename(out, File.extname(out)) + "_snps.tsv"
  tab_delim_file = File.open(tab_delim_file_name, "w")
  position_map_file_name = File.basename(out, File.extname(out)) + "_snps_positions.txt"
  position_map_file = File.open(position_map_file_name, "w")


  snps_within_features_with_annotation = ""
  # Perform query
  # puts ignore_snps_on_annotation.inspect
  if ignore_snps_on_annotation
    annotations_where = ignore_snps_on_annotation.split(",").map{|annotation| "annotations.value LIKE '%#{annotation}%'"}.join(" OR ")
    features_with_annotation = Feature.joins(:annotations).where(annotations_where)
    snps_within_features_with_annotation = Snp.joins(:features).where("features.id IN (?)", features_with_annotation.collect{|feature| feature.id})
  end

  if snps_within_features_with_annotation.empty?
    snps = Snp.all
  else
    snps = Snp.where("snps.id NOT IN (?)", snps_within_features_with_annotation.collect{|snp| snp.id})
  end

  positions_to_ignore = Array.new
  if ignore_snps_in_range
    range_strings = ignore_snps_in_range.split(",")
    range_strings.each do |range|
      start_position, end_position = range.split("..")
      positions_to_ignore += (start_position.to_i..end_position.to_i).to_a
    end
  end

  if ignore_strains
    strains_to_ignore = ignore_strains.split(",")
  end


  i = 0
  puts "Your Query is submitted and is being processed......."
  strains = Strain.find(:all)
  if ignore_strains
    strains_to_ignore = ignore_strains.split(",")
    strains.reject!{|strain| strains_to_ignore.include?(strain.name)}
  end

  snps.each do |snp|

    ActiveRecord::Base.transaction do
      i += 1
      next if positions_to_ignore.include?(snp.ref_pos) # Ignore positions that user specified
      alleles = snp.alleles
            
      genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten

      snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
      # ignore snp if the snp qual is less than cutoff.
      next if snp_qual.any?{|snps_quality| snps_quality.qual < cutoff_snp.to_i}
      
      next if alleles.any?{|allele| allele.base.length > 1} # indel
      next unless genotypes.all?{|genotype| genotype.geno_qual >= cutoff_genotype.to_f} # all geno quals > cutoff
      # puts "#{i} SNPs processed so far" if i % 100 == 0
      strain_alleles = Hash.new
      strains.each do |strain|
        strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first
        # next if strain_genotype == nil
        strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first
        strain_alleles[strain.name] = strain_allele.base
     end

      if remove_non_informative_snps
        next if strain_alleles.values.uniq.size == 1 # remove non-informative SNPs
      end

      snp_positions << snp.ref_pos
      snps_array << snp
      strain_alleles.each do |strain_name, allele_base|
        sequence_hash[strain_name] << allele_base
      end
      sequence_hash["ref"] << snp.reference_allele.base
    end
  end

  # If user has specified a tabular output
  if tabular_output
    output_information_methods(snps_array, output, cutoff_genotype, cutoff_snp, true)
  # If user has specified a fasta output
  elsif fasta_output
    # generate FASTA file
    output.puts ">ref\n#{sequence_hash["ref"].join("")}"
    tab_delim_file.puts "\t#{snp_positions.join("\t")}"
    tab_delim_file.puts "ref\t#{sequence_hash["ref"].join("\t")}"
    strains.each do |strain|
      output.puts ">#{strain.name}\n#{sequence_hash[strain.name].join("")}"
      tab_delim_file.puts "#{strain.name}\t#{sequence_hash[strain.name].join("\t")}"
    end

    snp_positions.each_with_index do |snp_position, index|
      position_map_file.puts "#{index+1} => #{snp_position}"
    end
  end
  # If user has chosen a newick output.
  if tree
    nwk_out_file_name = File.basename(out, File.extname(out)) + ".nwk"
    puts "running phylogeny"
    `#{fasttree_path} -fastest -nt #{output} > #{nwk_out_file_name}`
  end

  output.close
  tab_delim_file.close
end
guess_sequence_format(reference_genome) click to toggle source

This method guesses the reference sequence file format

# File lib/create_methods.rb, line 7
def guess_sequence_format(reference_genome)
  file_extension = File.extname(reference_genome).downcase
  file_format = nil
  case file_extension
  when ".gbk", ".genbank", ".gb"
    file_format = :genbank
  when ".embl", ".emb"
    file_format = :embl
  end
  return file_format
end
information() click to toggle source

This method outputs information for each SNP, e.g. synonymous and non-synonymous etc. Its is called in lib/snp-search.rb

# File lib/information_methods.rb, line 5
def information()

  strains = Strain.all

  output = File.open("#{out}", "w")

  output.puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tSNP_base\tsynonymous or non-synonymous\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t")}"

  snp_info = 0

  snps = Snp.find_by_sql("SELECT snps.* from snps inner join alleles on snps.id = alleles.snp_id")

  snps.each do |snp|
    snp.alleles.each do |allele|

      features = Feature.joins(:snps).where("snps.id = ?", snp.id)

      features.each do |feature|
        if feature.feature == "CDS" || "RNA"
          annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first
        else
          annotation = Annotation.where("annotations.qualifier = 'note' and annotations.feature_id = ?", feature.id).first
        end
 
  
    all_seqs_mutated = genome_sequence.seq
    mutated_seq_translated = []
    original_seq_translated = []

    # Mutate the sequence with the SNP
    all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base

    # The reason why we use all the sequence from the genome and not just the feature sequence is because the ref_pos is based on the genome sequence and when we come to mutate it we have to have the full sequence to find the position.

    mutated_seq = Bio::Sequence.auto(all_seqs_mutated[feature.start-1..feature.end-1])

    original_seq =  Bio::Sequence.auto(all_seqs_original[feature.start-1..feature.end-1])

    #If the strand is negative then reverse complement

    if feature.strand == -1
      mutated_seq_translated << mutated_seq.reverse_complement.translate
      original_seq_translated << original_seq.reverse_complement.translate

    else
      mutated_seq_translated << mutated_seq.translate
      original_seq_translated << original_seq.translate

    end

    # Remove the star at the end of each translated sequence.

    mutated_seq_translated.zip(original_seq_translated).each do |mut, org|
      mutated_seq_translated_clean = mut.gsub(/\*$/,"")
      original_seq_translated_clean = org.gsub(/\*$/,"")

      # Amino acid properties

      hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
      non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]

      polar = ["R", "N", "D", "E", "Q", "H", "K", "S", "T", "Y"]
      non_polar = ["A", "C", "G", "I", "L", "M", "F", "P", "W", "V"]

      small = ["V","C","A","G","D","N","S","T","P"]
      non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]

      alleles_array = []
      strains.each do |strain|
        allele = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
        alleles_array << allele.base
      end

      snp_info +=1

      if original_seq_translated_clean == mutated_seq_translated_clean
        if mutated_seq_translated_clean =~ /\*/
          output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tsynonymous\tYes\t\t\t\t\t\t#{alleles_array.join("\t")}"
        else
          output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tsynonymous\t\t\t\t\t\t\t#{alleles_array.join("\t")}"
        end
      else

        diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)

        if mutated_seq_translated_clean =~ /\*/
          output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
          puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
        else
          output.puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\t\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
          puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{Allele.find(snp.reference_allele_id).base}\tnon-synonymous\t\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}\t#{alleles_array.join("\t")}"
        end
      end
      puts "Total SNPs outputted so far: #{snp_info}" if snp_info % 50 == 0 
    end
  end
end

#Take all SNP positions in ref genome
# snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos}

# # Take all SNP nucleotide
# snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base}

# # Mutate (substitute) the original sequence with the SNPs

# # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them

# #Get start position of CDS with SNP
# coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start}
output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true) click to toggle source

This method finds info_snps from a list of strains given by user.

Its is called in lib/snp-search.rb
# File lib/output_information_methods.rb, line 5
def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true)

  strains = Strain.all

  outfile.puts "pos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tGene_annotation\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t") if info}"

  snps_counter = 0
  cds_snps_counter = 0
  total_number_of_syn_snps = 0
  total_number_of_non_syn_snps = 0
  total_number_of_pseudo = 0

  snps.each do |snp|
    ActiveRecord::Base.transaction do
      snp.alleles.each do |allele|
        next if snp.alleles.any?{|allele| allele.base.length > 1} # indel
        if allele.id != snp.reference_allele_id

          # get annotation (if there is any) for each SNP
          features = Feature.joins(:snps).where("snps.id = ?", snp.id)
          
          # get snp quality for each snp
          snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}")
          # ignore snp if the snp qual is less than cuttoff.
          next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i}
          

          # get all genotype qualities for each snp.
          gqs = Genotype.find_by_sql("select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = #{snp.id}")
          # ignore snp if any of its genotype qualities is lower than the cuttoff.
          next if gqs.any?{|genotype_quality| genotype_quality.geno_qual < cuttoff_genotype.to_i}
          
          ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base)
          snp_base = Bio::Sequence.auto(allele.base)
          # count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold.
          snps_counter += 1 
          # If the feature is empty then just output basic information about the snp.

          if features.empty?
            outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
          else 
            features.each do |feature|
              if feature.name == "CDS"

                cds_snps_counter +=1

                annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first
                #if annotation is nil, or empty
                if annotation.nil?
                  outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}"
                else

                  feature_sequence = feature.sequence

                  feature_sequence_bio = Bio::Sequence::NA.new(feature_sequence)

                  #Mutate sequence with SNP
                  feature_sequence_mutated = feature.sequence
                  feature_sequence_snp_pos = (snp.ref_pos-1) - (feature.start-1)
                  feature_sequence_mutated[feature_sequence_snp_pos] = allele.base
                  feature_sequence_mutated_bio = Bio::Sequence::NA.new(feature_sequence_mutated)

                  # Translate the sequences
                  if feature.strand == -1
                    mutated_seq_translated = feature_sequence_mutated_bio.reverse_complement.translate
                    original_seq_translated = feature_sequence_bio.reverse_complement.translate

                  else
                    mutated_seq_translated = feature_sequence_mutated_bio.translate
                    original_seq_translated = feature_sequence_bio.translate

                  end

                  # Remove the star at the end of each translated sequence.
                  mutated_seq_translated_clean = mutated_seq_translated.gsub(/\*$/,"")
                  original_seq_translated_clean = original_seq_translated.gsub(/\*$/,"")

                  # Amino acid properties
                  hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
                  non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]

                  polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
                  non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]

                  small = ["V","C","A","G","D","N","S","T","P"]
                  non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]

                  # Get alleles for each strain
                  bases_from_alleles = []
                    strains.each do |strain|
                      
                      allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first
                      # allele_for_strains =  Allele.find_by_sql("select * from alleles inner join genotypes on genotypes.allele_id = alleles.id inner join strains on strains.id = genotypes.strain_id where strains.id = #{strain.id} and alleles.snp_id = #{snp.id}")
                      puts allele_for_strains.inspect
                      # next if bases_from_alleles.empty?
                      bases_from_alleles << allele_for_strains.base
                    end

                  # If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous.
                  if original_seq_translated_clean == mutated_seq_translated_clean
                    total_number_of_syn_snps +=1
                    if mutated_seq_translated_clean =~ /\*/
                      total_number_of_pseudo +=1
                      outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
                    else
                      outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}"
                    end
                  else
                    total_number_of_non_syn_snps +=1
                    diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)

                    if mutated_seq_translated_clean =~ /\*/
                      total_number_of_pseudo +=1
                      outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
                    else
                      outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}"
                    end
                  end
                end
              end
            end
          end
        puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0 
        end
      end
    end
  end
  puts "Total number of snps: #{snps_counter} with Genotype quality cutoff at #{cuttoff_genotype} and SNP quality cutoff at #{cuttoff_snp}"
  puts "Total number of snps in CDS region: #{cds_snps_counter}"
  puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
  puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
  puts "Total number of pseudogenes: #{total_number_of_pseudo}"
  outfile.puts "Total number of snps: #{snps_counter}"
  outfile.puts "Total number of snps in CDS region: #{cds_snps_counter}"
  outfile.puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}"
  outfile.puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}"
  outfile.puts "Total number of possible pseudogenes: #{total_number_of_pseudo}"
end
populate_features_and_annotations(sequence_file) click to toggle source
A method to populate the database with the features (genes etc) and the annotations from the gbk/embl file.  
We include all features that are not 'source' or 'gene' as they are repetitive info.  'CDS' is the gene.
The annotation table includes also the start and end coordinates of the CDS.  The strand is also included.  the 'locations' method is defined in bioruby under genbank.  It must be required at the top (bio).

Also, the qualifier and value are extracted from the gbk/embl file and added to the database.

# File lib/create_methods.rb, line 23
def populate_features_and_annotations(sequence_file)
  puts "Adding features and their annotations...."
  ActiveRecord::Base.transaction do
    counter = 0
    sequence_file.features.each do |feature|
      unless feature.feature == "source" || feature.feature == "gene"
        counter += 1
        puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0
        db_feature = Feature.new
        db_feature.name = feature.feature
        db_feature.start = feature.locations.first.from
        db_feature.end = feature.locations.first.to
        db_feature.strand = feature.locations.first.strand
        #Add nucleotide sequence from ORIGIN of genbank file.
        db_feature.sequence = sequence_file.seq[feature.locations.first.from-1..feature.locations.first.to-1]
        db_feature.save
        # Populate the Annotation table with qualifier information from the genbank file
        feature.qualifiers.each do |qualifier|
          a = Annotation.new
          a.qualifier = qualifier.qualifier
          a.value = qualifier.value
          a.save
          db_feature.annotations << a
        end
      end
    end
  end
end
populate_snps_alleles_genotypes(vcf_file, cutoff_ad) click to toggle source

This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.

# File lib/create_methods.rb, line 53
def populate_snps_alleles_genotypes(vcf_file, cutoff_ad)

  puts "Adding SNPs........"
  
  # open vcf file and parse each line
  File.open(vcf_file) do |f|
    # header names
    while line = f.gets
      if  line =~ /CHROM/
        line.chomp!
        column_headings = line.split("\t")
        potential_strain_names = column_headings[9..-1]
        potential_strain_names.map!{|name| name.sub(/\..*/, '')}
        # strain_names = column_headings[9..-1]
        # strain_names.map!{|name| name.sub(/\..*/, '')}
        strains = Array.new

        # strain_names.each do |str|
        #   ss = Strain.new
        #   ss.name = str
        #   ss.save
        # end

        # strains = Array.new
        # strain_names.each do |strain_name|
        #   strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
        #   strains << strain
        # end

        good_snps = 0
        # start parsing snps
        while line = f.gets
          line.chomp!
          details = line.split("\t")
          ref = details[0]
          ref_pos = details[1].to_i

          ref_base = details[3]
          snp_bases = details[4].split(",")
          snp_qual = details [5]
          number_of_colons_in_format = details[8].count ":"
          format = details[8].split(":")
          
          gt_array_position = format.index("GT")
          gq_array_position = format.index("GQ")
          ad_array_position = format.index("AD")
          # dp = format.index("DP")
          # columns_after_format = details[9..-1]
          # samples = columns_after_format.select{|column_after_format| column_after_format.count(":")  == number_of_colons_in_format}
          # puts samples
          samples = details[9..-1]
          if strains.empty?
            samples.zip(potential_strain_names).each do |column_after_format, potential_strain_name|
              if column_after_format.count(":")  == number_of_colons_in_format
                ss = Strain.new
                ss.name = potential_strain_name
                ss.save
                strains << ss
              end
            end
          end

          gts = []
          gqs = []
          ad_ratios = []
          
          next if samples.any?{|sample| sample =~ /\.\/\./}  # no coverage in at least one sample

          samples.map do |sample|
            format_values = sample.split(":") # output (e.g.): ["0/0 ", "0,255,209", "99"]
            if gt_array_position
              gt = format_values[gt_array_position] # e.g.
              gt = gt.split("/")
              next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore
              next if gt.first == "." # no coverage
              gt = gt.first.to_i
            else
              puts "GT field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
              exit
            end

            if gq_array_position
              gq = format_values[gq_array_position].to_f
            else
              puts "GQ field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting"
              exit
            end

            if ad_array_position
              # If there is AD in vcf.  Typically AD is Allele specific depth. i.e. if ref is 'A' and alt is 'G' and AD is '6,9' you got 6 A reads and 9 G reads.
              # ad below is 6 and 9 in the example above.
              ad = format_values[ad_array_position].split(",").map{|ad_value| ad_value.to_i}
              # Find the sum of all bases (sum_of_ad) reported by the ad, so its 15 in the example.
              sum_of_ad = ad.inject{|sum,x| sum + x }
              ad_ratios << ad[gt]/sum_of_ad.to_f
            end
            
            gqs << gq
            gts << gt
          end
          
          next if ad_ratios.any?{|ad_ratio| ad_ratio < cutoff_ad.to_i} # exclude if any samples have a call ratio of less than a cutoff set by user
          if gts.size == samples.size # if some gts have been rejected due to heterozygote or no coverage
            good_snps +=1
            
            # populate snps

            ActiveRecord::Base.transaction do
              s = Snp.new
              s.ref_pos = ref_pos
              s.qual = snp_qual
              s.save

              #  create ref allele
              ref_allele = Allele.new
              ref_allele.base = ref_base
              ref_allele.snp = s
              ref_allele.save

              s.reference_allele = ref_allele
              s.save

              
              snp_alleles = Array.new
              gts.uniq.select{|gt| gt > 0}.each do |gt|
                # create snp allele
                snp_allele = Allele.new
                snp_bases_index = gt - 1
                snp_allele.base = snp_bases[snp_bases_index]
                snp_allele.snp = s
                snp_allele.save
                snp_alleles << snp_allele
              end

              genos = []
              gts.each_with_index do |gt, index|
                genotype = Genotype.new
                genotype.strain = strains[index]
                #Adding the genotype quality with Genotype

                genotype.geno_qual = gqs[index]
                if gt == 0# wild type
                  genotype.allele = ref_allele
                else # snp type
                  genotype.allele = snp_alleles[gt - 1]
                end
                genos << genotype
              end

              # Using activerecord-import to speed up importing
              Genotype.import genos, :validate => false 
              puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
              # puts "Total SNPs added so far: #{good_snps}"
            end
          end
        end
      end
    end
  end
  #Here we link the features to snps.
  puts "Linking features to SNPs"
  ActiveRecord::Base.transaction do
    Snp.all.each_with_index do |snp, index|
      puts "Total SNPs linked to features added so far: #{index}" if index % 100 == 0
      features = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos)

      unless features.empty?
        features.each do |feature|
          snp.features << feature
        end
      end
    end
  end
end