class BioInterchange::Genomics::VCFReader

Public Class Methods

new(name = nil, name_uri = nil, date = nil, batch_size = nil) click to toggle source

Creates a new instance of a Variant Call Format (VCF) reader.

name

Optional name of the person who generated the VCF file.

name_uri

Optional e-mail address of the person who generated the VCF file.

date

Optional date of when the VCF file was produced.

# File lib/biointerchange/genomics/vcf_reader.rb, line 26
# Creates a new reader instance; all work is done by the parent class
# constructor.
#
# +name+:: optional name, passed on unchanged
# +name_uri+:: optional URI/e-mail, passed on unchanged
# +date+:: optional date, passed on unchanged
# +batch_size+:: optional batch size, passed on unchanged
def initialize(name = nil, name_uri = nil, date = nil, batch_size = nil)
  # Spell the arguments out instead of relying on the bare-super shortcut.
  super(name, name_uri, date, batch_size)
end

Protected Instance Methods

add_comment(feature_set, comment) click to toggle source

Adds a comment to the feature set; ignores the header line that precedes VCF features. Comments are added on a line-by-line basis.

feature_set

VCF feature set to which the comment line is being added

comment

comment line in the VCF file

# File lib/biointerchange/genomics/vcf_reader.rb, line 116
# Handles one comment line of a VCF file. The column header line (the one
# that starts with the CHROM/POS/ID/REF/ALT columns) is not kept as a
# comment; the names following its ninth column are remembered in @samples
# instead. Every other line is appended to @comment.
#
# +feature_set+:: VCF feature set the line belongs to (not used here)
# +comment+:: comment line in the VCF file
def add_comment(feature_set, comment)
  is_header = comment.start_with?("CHROM\tPOS\tID\tREF\tALT")
  return @comment << comment unless is_header

  # Columns ten and onwards carry the sample names; the slice is nil when
  # the header has fewer than nine columns.
  @samples = comment.split("\t")[9..-1]
  @samples = [] if @samples.nil?
end
add_feature(feature_set, line) click to toggle source

Adds a VCF feature to a VCF feature set.

feature_set

feature set to which the feature should be added

line

line from the VCF that describes the feature

# File lib/biointerchange/genomics/vcf_reader.rb, line 130
# Parses one tab-separated VCF data line and adds the resulting feature to a
# VCF feature set.
#
# +feature_set+:: feature set to which the feature is added
# +line+:: line from the VCF file that describes the feature; its trailing
#          line break is removed in place
def add_feature(feature_set, line)
  line.chomp!
  # Split into at most ten fields: the tenth keeps *all* sample columns
  # (still tab-separated), so that none of them is dropped.
  chrom, pos, id, ref, alt, qual, filter, info, format, samples = line.split("\t", 10)

  # Replace an unknown ID by nil, so that feature coordinates are used during serialization:
  id = nil if id == '.'

  #
  # Split composite fields
  #

  # Alternative alleles:
  alt = alt.split(',')

  # Filters:
  filter = filter.split(';')

  # Feature information: "key=v1,v2" becomes key => [ v1, v2 ]; a bare "key"
  # (flag) becomes key => true.
  info = info.split(';').each_with_object({}) { |key_value_pair, pairs|
    key, value = key_value_pair.split('=', 2)
    pairs[key] = value ? value.split(',') : true
  }

  # Format for following sample columns (absent when the file carries no
  # genotype data):
  format = format ? format.split(':') : []

  # Sample columns: one hash per sample, keyed by the format identifiers.
  samples = (samples ? samples.split("\t") : []).map { |value|
    # Dot: no data provided for the sample
    if value == '.' then
      {}
    else
      Hash[format.zip(value.split(':'))]
    end
  }

  feature_set.add(BioInterchange::Genomics::VCFFeature.new(chrom, pos, id, ref, alt, qual, filter, info, samples))
end
add_pragma(feature_set, line) click to toggle source
# File lib/biointerchange/genomics/vcf_reader.rb, line 37
# Interprets one VCF meta-information line ("##name=value") and stores it as
# a pragma of the feature set.
#
# Pragmas that may occur several times (contig, FILTER, FORMAT, INFO,
# PEDIGREE) are accumulated through add_vcf_pragma, so that later lines do
# not overwrite earlier ones. Unknown pragmas are stored verbatim; they are
# not passed to the super class, because GFF3 has inherently different
# pragma statements.
#
# +feature_set+:: feature set that receives the pragma
# +line+:: meta-information line including the leading "##"; its trailing
#          line break is removed in place
def add_pragma(feature_set, line)
  line.chomp!
  name, value = line[2..-1].split(/=/, 2)
  # A line without "=" has no value; treat it as empty instead of failing.
  value = value.to_s.strip

  case name
  when 'contig', 'FILTER', 'FORMAT', 'INFO', 'PEDIGREE'
    add_vcf_pragma(feature_set, name, value)
  when 'fileDate'
    feature_set.set_pragma(name, { name => Date.parse(value) })
  when 'fileformat'
    # "VCFv4.1" is stored as the number 4.1.
    feature_set.set_pragma(name, { name => value.sub(/^VCFv/, '').to_f })
  when 'reference'
    # 'reference' is not specified in VCF 4.1, but used in examples and real-world
    # VCF files nevertheless.
    # TODO What if reference already set?
    feature_set.set_pragma(name, value)
  when 'assembly', 'center', 'geneAnno', 'ID', 'Number', 'phasing',
       'SAMPLE', 'tcgaversion', 'Type', 'vcfProcessLog'
    # Recognised names for which nothing is stored (yet).
  else
    feature_set.set_pragma(name, { name => value })
  end
end
add_vcf_pragma(feature_set, name, value) click to toggle source

Adds pragma information where the pragma can appear multiple times in the input (application: VCF). Each pragma information is still a hash, which is stored in an array.

feature_set

feature set to which the pragma information is added

name

name of the pragma under which the information is being stored

value

hashmap of the actual pragma information (will be passed through vcf_mapping call)

# File lib/biointerchange/genomics/vcf_reader.rb, line 101
# Appends pragma information for pragmas that may be given more than once in
# a VCF file. The pragma is kept as an array holding one mapping per
# occurrence.
#
# +feature_set+:: feature set to which the pragma information is added
# +name+:: name of the pragma under which the information is stored
# +value+:: raw pragma value; converted with vcf_mapping before storing
def add_vcf_pragma(feature_set, name, value)
  # Start a new list on the first occurrence, otherwise extend the stored one.
  entries = feature_set.pragma(name) || []
  entries << vcf_mapping(value)
  feature_set.set_pragma(name, entries)
end
create_feature_set() click to toggle source
# File lib/biointerchange/genomics/vcf_reader.rb, line 33
# Returns a new, empty VCF feature set.
def create_feature_set
  BioInterchange::Genomics::VCFFeatureSet.new
end

Private Instance Methods

vcf_mapping(value) click to toggle source

Takes a VCF meta-information string and returns a key-value mapping.

value

value of a meta-information assignment in VCF (key/value mappings of the form “<ID=value,…>”)

# File lib/biointerchange/genomics/vcf_reader.rb, line 181
# Takes a VCF meta-information string and returns a key-value mapping.
#
# +value+:: value of a meta-information assignment in VCF (key/value
#           mappings of the form "<ID=value,...>")
#
# Returns a Hash from key strings to value strings. Values may be plain
# (terminated by a comma) or enclosed in double quotes, in which case the
# quotes are removed and commas inside them are kept. A key without "="
# maps to an empty string.
def vcf_mapping(value)
  # Drop the first and the last character (the enclosing angle brackets).
  content = value.to_s[1..-2].to_s

  mapping = {}
  identifier = ''.dup
  assignment = ''.dup
  state = :id

  # Stores the pair read so far and starts over with fresh buffers, so that
  # no pair is stored twice and no value leaks into the next key.
  commit = lambda {
    mapping[identifier] = assignment
    identifier = ''.dup
    assignment = ''.dup
  }

  content.each_char { |character|
    case state
    when :id
      # Reading a key, up to the "=" sign.
      if character == '=' then
        state = :value
      else
        identifier << character
      end
    when :value
      # First character after "=" decides between quoted and plain values.
      if character == '"' then
        state = :quoted
      elsif character == ',' then
        # Empty plain value.
        commit.call
        state = :id
      else
        assignment << character
        state = :plain
      end
    when :plain
      if character == ',' then
        commit.call
        state = :id
      else
        assignment << character
      end
    when :quoted
      if character == '"' then
        commit.call
        state = :separator
      else
        assignment << character
      end
    when :separator
      # After a closing quote only a comma is expected; the pair has been
      # stored already, so the comma merely starts the next key.
      # TODO Report a format error for any other character.
      state = :id if character == ','
    end
  }

  # Pending pair at the end of the input (plain value or key without value).
  mapping[identifier] = assignment unless identifier.empty?

  mapping
end