class Ensembl::Variation::VariationFeature

The VariationFeature class gives information about the genomic position of each Variation, including also validation status and consequence type.

This class uses ActiveRecord to access data in the Ensembl database. See the general documentation of the Ensembl module for more information on what this means and what methods are available.

@example

# SLOWER QUERY
vf = VariationFeature.find_by_variation_name('rs10111')
# FASTER QUERY
vf = Variation.find_by_name('rs10111').variation_feature

puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
puts vf.variation.ancestral_allele
genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
genomic_region.genes
up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)

The VariationFeature class gives information about the genomic position of each Variation, including also validation status and consequence type.

This class uses ActiveRecord to access data in the Ensembl database. See the general documentation of the Ensembl module for more information on what this means and what methods are available.

@example

# SLOWER QUERY
vf = VariationFeature.find_by_variation_name('rs10111')
# FASTER QUERY
vf = Variation.find_by_name('rs10111').variation_feature

puts vf.seq_region_start, vf.seq_region_end, vf.allele_string
puts vf.variation.ancestral_allele
genomic_region = vf.fetch_region (returns an Ensembl::Core::Slice)
genomic_region.genes
up_region,down_region = vf.flanking_seq (returns two Ensembl::Core::Slice)

Public Instance Methods

consequence_type() click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 61
def consequence_type # workaround as ActiveRecord do not parse SET field in MySQL
  "#{attributes_before_type_cast['consequence_type']}" 
end
consequence_types() click to toggle source
# File lib/bio-ensembl/variation/variation_feature62.rb, line 74
def consequence_types # workaround as ActiveRecord do not parse SET field in MySQL
  "#{attributes_before_type_cast['consequence_types']}" 
end
fetch_region(up = 5000, down = 5000) click to toggle source

Based on Perl API 'get_all_Genes' method for Variation class. Get a genomic region starting from the Variation coordinates, expanding the region upstream and downstream.

@param [Integer] up Length of upstream flanking region @param [Integer] down Length of downstream flanking region @return [Slice] Slice object containing the variation

# File lib/bio-ensembl/variation/variation_feature.rb, line 72
def fetch_region(up = 5000, down = 5000)
  sr = core_connection(self.seq_region_id)
  slice = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,self.seq_region_start-up,self.seq_region_end+down)
  return slice
end
flanking_seq() click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 78
def flanking_seq
  sr = core_connection(self.seq_region_id)
  f = Variation.find(self.variation_id).flanking_sequence
  slice_up = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,f.up_seq_region_start,f.up_seq_region_end,self.seq_region_strand)
  slice_down = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,f.down_seq_region_start,f.down_seq_region_end,self.seq_region_strand)
  return slice_up,slice_down
end
transcript_variations() click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 86
def transcript_variations
  tvs = TranscriptVariation.find_all_by_variation_feature_id(self.variation_feature_id)
  if tvs[0].nil? then # the variation is not stored in the database, so run the calculations
    sr = core_connection(self.seq_region_id)
    return custom_transcript_variation(self,sr)
  else
    return tvs # the variation is already present in the database
  end  
end

Private Instance Methods

check_aa_change(vf,t) click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 259
def check_aa_change(vf,t)
    alleles = vf.allele_string.split('/') # get the different alleles for this variation
    # if the variation is an InDel then it produces a frameshift
    if vf.seq_region_start != vf.seq_region_end or alleles.include?("-") then
      return "FRAMESHIFT_CODING",nil
    end

    # Find the position inside the CDS
    
    mutation_position = t.genomic2cds(vf.seq_region_start)
    
    mutation_base = Bio::Sequence::NA.new(alleles[1])
    if t.seq_region_strand == -1
       mutation_base.reverse_complement!
    end
    # The rank of the codon
    target_codon = (mutation_position)/3 + 1
    cds_sequence = nil
    cds_sequence = t.cds_seq
    mut_sequence = cds_sequence.dup
    # Replace base with the variant allele
    mut_sequence[mutation_position] = mutation_base.seq
    refcodon =  cds_sequence[(target_codon*3 -3)..(target_codon*3-1)]
    mutcodon =  mut_sequence[(target_codon*3 -3)..(target_codon*3-1)]
    codontable = Bio::CodonTable[1]
    refaa = codontable[refcodon]
    mutaa = codontable[mutcodon.downcase]
    if mutaa == nil
      raise RuntimeError "Codon #{mutcodon.downcase} wasn't recognized."
    end
    pep_string = refaa+"/"+mutaa
    if mutaa == "*" and refaa != "*"
      return "STOP_GAINED",pep_string
    elsif mutaa != "*" and refaa == "*"
      return "STOP_LOST",pep_string
    elsif mutaa != refaa
      return "NON_SYNONYMOUS_CODING",pep_string 
    elsif mutaa == refaa
      return "SYNONYMOUS_CODING",pep_string 
    end
     
 end
check_intergenic(vf,t) click to toggle source

CONSEQUENCE CALCULATION FUNCTIONS ##

# File lib/bio-ensembl/variation/variation_feature.rb, line 182
def check_intergenic(vf,t)
  if vf.seq_region_end < t.seq_region_start and (t.seq_region_start - vf.seq_region_end) > 5000 then
     return "INTERGENIC"
  elsif vf.seq_region_start > t.seq_region_end and (vf.seq_region_start - t.seq_region_end) > 5000 then
     return "INTERGENIC"      
  end
  return nil        
end
check_near_exons(feature,exons_ranges) click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 303
def check_near_exons(feature,exons_ranges)
 exons_ranges.each do |exon_range|
   if feature.is_a? Range
     return exon_range if (feature.first <= exon_range.last) && (exon_range.first <= feature.last)
   else
     return exon_range if exon_range.include? feature
   end  
 end
 return false
end
check_non_coding(vf,t) click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 206
def check_non_coding(vf,t)
    if t.biotype == "miRNA" then 
       return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_MATURE_miRNA" : "COMPLEX_INDEL"
    elsif t.biotype == "nonsense_mediated_decay"
       return (vf.seq_region_start == vf.seq_region_end) ? "NMD_TRANSCRIPT" : "COMPLEX_INDEL"
    else
       return (vf.seq_region_start == vf.seq_region_end) ? "WITHIN_NON_CODING_GENE" : "COMPLEX_INDEL"
    end
    return nil
end
check_partial_codon(vf,t) click to toggle source
# File lib/bio-ensembl/variation/variation_feature62.rb, line 349
def check_partial_codon(vf,t)
  begin
    mutation_position = t.genomic2cds(vf.seq_region_start)
    cds_sequence = t.cds_seq
    @cache[:mutation_position] = mutation_position
    @cache[:cds_sequence] = cds_sequence
    # check if the mutation is on the last codon and if it's a partial codon
    if (cds_sequence.length - mutation_position) <= 3
      return (cds_sequence.length % 3 == 0) ? nil : "incomplete_terminal_codon_variant"
    end
  rescue Exception => e
    return nil
  end 
end
check_splice_site(vf,t) click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 226
def check_splice_site(vf,t)
  @cache[:exons] = []
  var_start,var_end = (vf.seq_region_strand == 1) ? [vf.seq_region_start,vf.seq_region_end] : [vf.seq_region_end,vf.seq_region_start]
  t.exons.each {|ex| @cache[:exons] << Range.new(ex.seq_region_start,ex.seq_region_end)}
  
  exon_up = check_near_exons(var_start,@cache[:exons])
  exon_down = check_near_exons(var_end,@cache[:exons])
  if !exon_up and !exon_down # we are inside an intron
     # checking boundaries
     near_exon_up_2bp = check_near_exons(var_start-2..var_start,@cache[:exons])
     near_exon_down_2bp = check_near_exons(var_end..var_end+2,@cache[:exons])
     if near_exon_up_2bp or near_exon_down_2bp then
        return "ESSENTIAL_SPLICE_SITE"
     else
        near_exon_up_8bp = check_near_exons(var_start+8..var_start,@cache[:exons])
        near_exon_down_8bp = check_near_exons(var_end..var_end+8,@cache[:exons])    
        if near_exon_up_8bp or near_exon_down_8bp then
           return "SPLICE_SITE"
        else
           return "INTRONIC"   
        end
     end
  elsif exon_up and exon_down # the variation is inside an exon
       # check if it is a splice site
       if (var_start-exon_up.first) <= 3 or (exon_down.last-var_end) <= 3 then
          return "SPLICE_SITE"                   
       end
  else # a complex indel spanning intron/exon boundary
       return "COMPLEX_INDEL"
  end
  return nil      
end
check_upstream_downstream(vf,t) click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 191
def check_upstream_downstream(vf,t)
  if vf.seq_region_end < t.seq_region_start and (t.seq_region_start - vf.seq_region_end) <= 5000 then
     return (t.strand == 1) ? "UPSTREAM" : "DOWNSTREAM"
  elsif vf.seq_region_start > t.seq_region_end and (vf.seq_region_start - t.seq_region_end) <= 5000 then
     return (t.strand == 1) ? "DOWNSTREAM" : "UPSTREAM"
  
  # check if it's an InDel and if overlaps the transcript start / end
  elsif t.seq_region_start > vf.seq_region_start and t.seq_region_start < vf.seq_region_end then
      return "COMPLEX_INDEL"
  elsif t.seq_region_end > vf.seq_region_start and t.seq_region_end < vf.seq_region_end then
      return "COMPLEX_INDEL"                
  end
  return nil
end
check_utr(vf,t) click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 217
def check_utr(vf,t)
    if vf.seq_region_start > t.seq_region_start and vf.seq_region_end < t.coding_region_genomic_start then
       return (t.strand == 1) ? "5PRIME_UTR" : "3PRIME_UTR"
    elsif vf.seq_region_start > t.coding_region_genomic_end and vf.seq_region_end < t.seq_region_end then
       return (t.strand == 1) ? "3PRIME_UTR" : "5PRIME_UTR"   
    end
    return nil   
end
core_connection(seq_region_id) click to toggle source
# File lib/bio-ensembl/variation/variation_feature.rb, line 98
def core_connection(seq_region_id) 
  if !Ensembl::Core::DBConnection.connected? then  
    host,user,password,db_name,port,species,release = Ensembl::Variation::DBConnection.get_info
    begin
      Ensembl::Core::DBConnection.connect(species,release.to_i,:username => user, :password => password,:host => host, :port => port)
    rescue
      raise NameError, "Can't derive Core database name from #{db_name}. Are you using non conventional names?"
    end
  end
  # Check if SeqRegion already exists in Ensembl::SESSION
  seq_region = nil
  if Ensembl::SESSION.seq_regions.has_key?(seq_region_id)
    seq_region = Ensembl::SESSION.seq_regions[seq_region_id]
  else
    seq_region = Ensembl::Core::SeqRegion.find(seq_region_id)
    Ensembl::SESSION.seq_regions[seq_region.id] = seq_region
  end
  return seq_region
end
custom_transcript_variation(vf,sr) click to toggle source

Calculate a consequence type for a user-defined variation

# File lib/bio-ensembl/variation/variation_feature.rb, line 119
def custom_transcript_variation(vf,sr)
          
  @variation_name = vf.variation_name
  @seq_region = sr

  downstream = 5000
  upstream = 5000
  tvs = [] # store all the calculated TranscriptVariations
   # retrieve the slice of the genomic region where the variation is located
   region = Ensembl::Core::Slice.fetch_by_region(Ensembl::Core::CoordSystem.find(sr.coord_system_id).name,sr.name,vf.seq_region_start-upstream,vf.seq_region_end+downstream-1)
   # iterate through all the transcripts present in the region
  genes = region.genes(inclusive = true)
   if genes[0] != nil
    genes.each do |g|
      g.transcripts.each do |t|
        @cache = {}
        tv = TranscriptVariation.new() # create a new TranscriptVariation object for every transcript present
        # do the calculations
        
        # check if the variation is intergenic for this transcript (no effects)
        tv.consequence_type = check_intergenic(vf,t)
        
        # check if the variation is upstram or downstram the transcript
        tv.consequence_type = check_upstream_downstream(vf,t) if tv.consequence_type == ""
        
        # if no consequence type is found, then the variation is inside the transcript
        # check for non coding gene
        tv.consequence_type = check_non_coding(vf,t) if tv.consequence_type == "" and t.biotype != 'protein_coding'

        # if no consequence type is found, then check intron / exon boundaries
        tv.consequence_type = check_splice_site(vf,t) if tv.consequence_type == ""

        # if no consequence type is found, check if the variation is inside UTRs
        tv.consequence_type = check_utr(vf,t) if tv.consequence_type == ""    
                  
        # if no consequence type is found, then variation is inside an exon.
        # Check the codon change
        (tv.consequence_type,tv.peptide_allele_string) = check_aa_change(vf,t) if tv.consequence_type == ""
          
        
        begin # this changed from release 58
           tv.transcript_stable_id = t.stable_id
        rescue NoMethodError
           tv.transcript_id = t.id
        end
        
        tv.consequence_type = "INTERGENIC" if tv.consequence_type == ""
        tvs << tv 
      end   
    end
   end
   # if there are no transcripts/genes within 5000 bases upstream and downstream set the variation as INTERGENIC (no effects)
   if tvs.size == 0 then
    tv = TranscriptVariation.new()
    tv.consequence_type = "INTERGENIC"
    tvs << tv
   end

   return tvs
 end