class EupathDBGFF
Unlike JGI genes files, ApiDB files have several differences:
- genes on the reverse strand appear in order of their exons, and so the exons are not all in the correct order with respect to the underlying sequence.
Attributes
features_to_ignore[RW]
Public Class Methods
new(path)
click to toggle source
# File lib/eupathdb_gff.rb, line 14 def initialize(path) @file = File.open path, 'r' @next_gff = read_record @features_to_ignore = [ 'rRNA', 'tRNA', 'snRNA', 'transcript', 'ncRNA', ] end
Public Instance Methods
ignore_line?(cur)
click to toggle source
ignore this line when parsing the file
# File lib/eupathdb_gff.rb, line 117 def ignore_line?(cur) return ['supercontig', 'introgressed_chromosome_region'].include?(cur.feature) end
ignore_record?(record)
click to toggle source
Certain things I don’t want uploaded, like apicoplast genome, etc.
# File lib/eupathdb_gff.rb, line 122 def ignore_record?(record) if !record or !record.seqname or @features_to_ignore.include?(record.feature) or record.seqname.match(/^apidb\|NC\_/) or record.seqname.match(/^apidb\|API_IRAB/) or record.seqname.match(/^apidb\|M76611/) or record.seqname.match(/^apidb\|X95276/) #or # record.seqname.match(/^apidb\|Pf/) return true else return false end end
next_gene()
click to toggle source
# File lib/eupathdb_gff.rb, line 26 def next_gene cur = @next_gff if !cur return nil end # Ignore the supercontigs at the start of the file while ignore_line?(cur) or ignore_record?(cur) @next_gff = read_record cur = @next_gff if !cur return nil end end if cur.feature != 'gene' raise Exception, "Badly parsed apidb line: #{cur}. Expected gene first." end # save line so can set these values later, # i gene_line = cur # First mRNA cur = read_record if cur.feature != 'mRNA' # skip rRNA type genes because they are not relevant if ignore_record?(cur) # skip forward to the next gene while cur.feature != 'gene' cur = read_record return nil if cur.nil? # we have reached the end on an ignored gene end @next_gff = cur if cur return next_gene else return nil end else raise Exception, "Badly parsed apidb line: #{cur}. Expected mRNA next." end end # Setup the gene in itself gene = setup_gene_from_first_line gene_line # setup stuff from mRNA line ids = cur.attributes['Ontology_term'] if ids gene.go_identifiers = ids.split ',' end # Next CDS cur = read_record if cur.feature != 'CDS' raise Exception, "Badly parsed apidb line: #{cur}. Expected CDS next." end gene.cds = [] while cur.feature == 'CDS' f = Bio::Location.new f.from = cur.start f.to = cur.end gene.cds.push f cur = read_record end #next exons if cur.feature != 'exon' raise Exception, "Badly parsed apidb line: #{cur}. Expected exon next." end gene.exons = [] while cur and cur.feature == 'exon' f = Bio::Location.new f.from = cur.start f.to = cur.end gene.exons.push f cur = read_record end @next_gff = cur return gene end
Private Instance Methods
read_record()
click to toggle source
# File lib/eupathdb_gff.rb, line 137 def read_record line = "" # while blank or comment lines, skip, except for ##Fasta, which # means all the genes have already been defined while line.lstrip.rstrip.empty? or line.match(/^\#/) line = @file.gets if !line or line.match(/^\#\#FASTA/) return nil end end whole = EupathDBGFFRecord.new(line) return whole end
setup_gene_from_first_line(gene_line)
click to toggle source
Given a line describing a gene in an apidb gff file, setup all the stuff associated with the ‘gene’ line
# File lib/eupathdb_gff.rb, line 158 def setup_gene_from_first_line(gene_line) gene = PositionedGeneWithOntology.new gene.start = gene_line.start gene.strand = gene_line.strand aliai = gene_line.attributes['Alias'] if aliai aliai.chomp! gene.alternate_ids = aliai.split ',' end # make description proper description = gene_line.attributes['description'] gene.description = CGI::unescape(description) # yey for useful functions I didn't write # name - remove the 'apidb|' bit match = gene_line.attributes['ID'].match('apidb\|(.*)') if !match or !match[1] or match[1] === '' raise Exception, "Badly parsed gene name: #{gene_line}.attributes['ID']}." end gene.name = match[1] gene.seqname = gene_line.seqname return gene end