class Ezgff::GffDb

Public Class Methods

attributes_as_json(gffline) click to toggle source
# File lib/ezgff/gffsqlitedb.rb, line 151
def self.attributes_as_json(gffline)
  keys_multi_val_allowed = %{Parent Alias Note Dbxref Ontology_term}

  gr = Bio::GFF::GFF3::Record.new(gffline.chomp)

  h = Hash.new
  gr.attributes.each do |att|
    k, v = att
    unless h.has_key?(k)
      h[k] = []
    end
    h[k] << v
  end
  h2 = Hash.new
  h.each do |key, values|
    if key == "Dbxref2" # dummy (not used currently)
      h3 = Hash.new
      values.each do |val|
        m = /(.+?):/.match(val)
        dbtag = m[1]
        dbval = m.post_match
        h3.update({dbtag => dbval})
      end
      h2[key] = h3
    elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
      h2[key] = values
    else
      h2[key] = values.join(",")
    end
  end
  h2.to_json
end
build_db(gff_in, ezdb_base = nil) click to toggle source

sqlite3 schema

gff_records (
  line_num     integer primary key,
  record       text,            # original record
  id           text,
  parent       text,
  seqid        text not null,
  source       text,
  type         text,
  start        integer not null,
  end          integer not null,
  score        real,
  strand       varchar(1),
  phase        integer,
  attributes   text,
  attributes_json json
)
# File lib/ezgff/gffsqlitedb.rb, line 35
    def self.build_db(gff_in, ezdb_base = nil)
      ezdb_base = (ezdb_base || ".")
      ezdb_path = ezdb_base + "/" + File.basename(gff_in) + ".ezdb"
      gff_file = ezdb_path + "/" + File.basename(gff_in)
      Dir.mkdir(ezdb_path)
      File.open(gff_file, "w") do |o|
        File.open(gff_in).each do |l|
          break if /^\#\#FASTA/.match(l)
          ## skip header section
          next if /^\#/.match(l)
          o.puts l
        end
      end
      
  #    FileUtils.cp(gff_in, gff_file)
      sq3_file = gff_file + ".sqlite3"

      ## Create table in sqlite3 RDBMS
      ##   table name: gff_record

      sq3_db = SQLite3::Database.new(sq3_file)

      sql = <<-SQL
      CREATE TABLE gff_records (
        line_num     integer primary key,
        record       text,
        id           text,
        parent       text,
        seqid        text not null,
        source       text,
        type         text,
        start        integer not null,
        end          integer not null,
        score        real,
        strand       varchar(1),
        phase        integer,
        attributes   text,
        attributes_json json
      );
      SQL

      sq3_db.execute(sql)

      ## Read GFF file and insert data into
      ## the sqlite3 table

      sq3_db.transaction do 
        File.open(gff_file).each_with_index do |l, i|
      #    puts l
          ## skip FASTA seq section
          break if /^\#\#FASTA/.match(l)
      
          ## skip header section
          next if /^\#/.match(l)
          gr = Bio::GFF::GFF3::Record.new(l.chomp)
      #    p gr.attributes
          id = nil
          id_found = gr.attributes.select{|a| a[0] == "ID"}
          if id_found.size == 1
            id = id_found[0][1]
          elsif id_found.size == 0
            ## do nothing (id = nil)
          elsif id_found > 1
            STDERR.puts gr.attributes
            raise "Multiple IDs found."
          end
          parent =  ((gr.attributes.select{|a| a[0] == "Parent"}[0]) || [])[1]
          a = l.chomp.split(/\t/)
      
          sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
          values = [
            i,       # line number
            l.chomp, # raw record
            id,      # ID
            parent,  # parent ID
            a[0],    # seqid
            a[1],    # source
            a[2],    # type
            a[3],    # start
            a[4],    # end
            (a[5] == "." ? nil : a[5]),    # score
            a[6],    # strand
            (a[7] == "." ? nil : a[7]),    # phase
            a[8],    # attributes
            attributes_as_json(l)]
          sq3_db.execute(sql, values)
        end
      end

      ## Indexing the sqlite3 table
      table = "gff_records"
      %w{id parent source type}.each do |col|
        idxname = "index_#{table}_on_#{col}"
        sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
        sq3_db.execute(sql)
      end

      return ezdb_path

    end
build_tabix(gff_in) click to toggle source
# File lib/ezgff/gffsqlitedb.rb, line 137
def self.build_tabix(gff_in)
  ## sort gff by position
  gfffile_sorted = gff_in + ".gz"
  cmd = %Q{(grep ^"#" #{gff_in}; grep -v ^"#" #{gff_in} | sort -t $'\t' -k1,1 -k4,4n) | bgzip > #{gfffile_sorted};}
  STDERR.puts cmd
  system cmd

  cmd = "tabix -p gff #{gfffile_sorted}"
  STDERR.puts cmd
  system cmd
  
  STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
end
new(path) click to toggle source
# File lib/ezgff/gffsqlitedb.rb, line 184
def initialize(path)
  @db = SQLite3::Database.new(path)
end

Public Instance Methods

each_record() { |an| ... } click to toggle source
# File lib/ezgff/gffsqlitedb.rb, line 188
def each_record
  sql = "SELECT * FROM gff_records"
  @db.execute(sql).each do |r|
    an = Annotation.new()
    an.build_from_db_record(r)
    yield an
  end
end
get(id) click to toggle source
# File lib/ezgff/gffsqlitedb.rb, line 197
  def get(id)
    sql = %Q{SELECT * FROM gff_records WHERE id=="#{id}";}
#    puts sql
    res = @db.execute(sql)
    if res.size == 1
      an = Annotation.new(@db)
      an.build_from_db_record(res[0])
      return an
    else
      if res.size >= 2
        raise "multiple hits"
      elsif res.size == 0
        raise "not found: #{id}"
      end
    end
  end
get_by_line_number(n) click to toggle source
# File lib/ezgff/gffsqlitedb.rb, line 214
def get_by_line_number(n)
  sql = %Q{SELECT * FROM gff_records WHERE line_num=="#{n}";}
  res = @db.execute(sql)
  if res.size == 1
    an = Annotation.new(@db)
    an.build_from_db_record(res[0])
    return an
  else
    if res.size >= 2
      raise "multiple hits"
    elsif res.size == 0
      raise "not found: #{id}"
    end
  end
end