module Exodb::Utils

Public Instance Methods

guess_miriam(str) click to toggle source

Guess the type of id

@param [String] str the identifier string @return [String] the MIRIAM URN of the guessed registry, or the string itself if the type cannot be guessed

# File lib/exodb/utils.rb, line 29
# Guess which MIRIAM registry an identifier belongs to.
#
# Patterns are tried from most to least specific. The Ensembl pattern ends in
# a catch-all alternative that accepts almost any alphanumeric token, so it is
# tested last; when it was tested earlier, CCDS ids and bare numeric HGNC ids
# were swallowed by it and never reached their own branches.
#
# @param str [String] identifier to classify
# @return [String] "urn:miriam:<registry>:<str>" when a pattern matches,
#   otherwise +str+ unchanged
def guess_miriam(str)
  case str
  when /\A((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+|(NZ\_[A-Z]{4}\d+))(\.\d+)?\z/
    "urn:miriam:refseq:#{str}"
  when /\ACCDS\d+\.\d+\z/
    "urn:miriam:ccds:#{str}"
  when /\A((HGNC|hgnc):)?\d{1,5}\z/
    "urn:miriam:hgnc:#{str}"
  when /\A((ENS[A-Z]*[FPTG]\d{11}(\.\d+)?)|(FB\w{2}\d{7})|(Y[A-Z]{2}\d{3}[a-zA-Z](\-[A-Z])?)|([A-Z_a-z0-9]+(\.)?(t)?(\d+)?([a-z])?))\z/
    "urn:miriam:ensembl:#{str}"
  else
    str
  end
end
load_indel_from_csv(csvfile) click to toggle source
# File lib/exodb/rositza/load.rb, line 59
# Load indel calls from a tab-separated file with a header row and save one
# Exodb::Variant per data row.
#
# Columns read: "chr", "pos", "Somatic Status", "symbol", "cell lines".
# The alternate allele is always the fixed value '-2N'; every record is
# flagged as passing the filter and as predicted damaging. The reference
# allele and the tumour pile-up are not set here.
#
# @param csvfile [String] path to the tab-separated input file
def load_indel_from_csv(csvfile)
  table = CSV.read(csvfile, col_sep: "\t", headers: true)

  table.each do |row|
    indel = Exodb::Variant.new
    indel.location = "#{row['chr']}:#{row['pos']}"
    indel.alternatet = ['-2N']
    indel.somstatus = row['Somatic Status'].downcase
    indel.passfilter = true
    indel.predicted_damage = true
    indel.temp = {} if indel.temp.blank?
    indel.temp['aachange'] = "#{row['symbol']}:InDel"
    indel.add_to_sample(row['cell lines'])

    # Echo the result of the save for each row.
    p indel.save!
  end
end
load_sample_from_csv(csvfile) click to toggle source
# File lib/exodb/rositza/load.rb, line 78
# Load sample descriptions from a tab-separated file with a header row and
# save one Cell per data row, attached to dataset 'internal.ds:000001'.
#
# Columns read: "SampleFinal", "Type", "Preferred", "merge41final".
#
# @param csvfile [String] path to the tab-separated input file
def load_sample_from_csv(csvfile)
  CSV.read(csvfile, col_sep: "\t", headers: true).each do |row|
    sample_id = row['SampleFinal']
    raw_type = row['Type']
    type = raw_type.downcase

    # The MedDRA code is chosen from the leading word of the type column.
    meddra_code =
      if raw_type =~ /^spitz /i
        '10041632'
      elsif raw_type =~ /^spitzoid /i
        '10072450'
      else
        '10028679'
      end

    attrs = {
      oid: sample_id,
      type: type,
      typeid: "urn:miriam:bioportal.meddra:#{meddra_code}",
      # Patient id is whatever precedes the first 'T' of the sample id.
      patient: sample_id.split('T').first,
      preferred: row['Preferred'] == 'Y',
      paired: !(row['merge41final'] =~ /\Apaired\z/i).nil?
    }

    sample = Cell.new(attrs, labels: {})
    sample.add_to_dataset('internal.ds:000001')

    p sample.save!
  end
end
load_snv_from_csv(csvfile) click to toggle source
# File lib/exodb/rositza/load.rb, line 20
# Load single-nucleotide variant calls from a tab-separated file with a header
# row and save one Exodb::Variant per data row.
#
# Columns read: "chromosome", "start position", "ref nucleotide",
# "var nucleotide", "Somatic Status", "Reads", "PolyPhen", "SIFT", "PROVEAN",
# "symbol", "AA Change", "cell".
#
# @param csvfile [String] path to the tab-separated input file
def load_snv_from_csv(csvfile)
  table = CSV.read(csvfile, col_sep: "\t", headers: true)

  table.each do |row|
    snv = Exodb::Variant.new
    snv.location = "chr#{row['chromosome']}:#{row['start position']}"
    # Genotypes are '/'-separated; the first reference allele is kept and the
    # variant alleles are de-duplicated.
    snv.reference = row['ref nucleotide'].split('/').first
    snv.alternatet = row['var nucleotide'].split('/').uniq
    snv.somstatus = row['Somatic Status'].downcase
    snv.passfilter = true
    snv.pileupt = row['Reads']

    # Damaging as soon as any one predictor says so. Note that the PolyPhen
    # pattern is case-sensitive while the other two are not.
    predictor_checks = [
      [row['PolyPhen'], /probably_damaging/],
      [row['SIFT'], /deleterious/i],
      [row['PROVEAN'], /deleterious/i]
    ]
    snv.predicted_damage = predictor_checks.any? { |value, pattern| value =~ pattern }

    snv.temp = {} if snv.temp.blank?
    snv.temp['aachange'] = "#{row['symbol']}:#{row['AA Change']}"
    snv.add_to_sample(row['cell'])

    p snv.save!
  end
end
load_splice_from_csv(csvfile) click to toggle source
# File lib/exodb/rositza/load.rb, line 40
# Load splice-site variant calls from a tab-separated file with a header row
# and save one Exodb::Variant per data row.
#
# Columns read: "chr", "Position", "Reference Genotype", "Gene Symbol",
# "Sample". Every record gets the fixed alternate allele 'N', the somatic
# status 'somatic', and is flagged as passing the filter and as predicted
# damaging. The tumour pile-up is not set here.
#
# @param csvfile [String] path to the tab-separated input file
def load_splice_from_csv(csvfile)
  table = CSV.read(csvfile, col_sep: "\t", headers: true)

  table.each do |row|
    splice = Exodb::Variant.new
    splice.location = "chr#{row['chr']}:#{row['Position']}"
    splice.reference = row['Reference Genotype']
    splice.alternatet = ['N']
    splice.somstatus = 'somatic'
    splice.passfilter = true
    splice.predicted_damage = true
    splice.temp = {} if splice.temp.blank?
    splice.temp['aachange'] = "#{row['Gene Symbol']}:SpV"
    splice.add_to_sample(row['Sample'])

    p splice.save!
  end
end
load_variant_from_merge(mergefile, normal = false, assembly = Exodb::DEFAULTASSEMBLY) click to toggle source
# File lib/exodb/rositza/load.rb, line 96
# Load variants from a tab-separated "merge" file.
#
# Each line holds 16 header fields followed by any number of 7-field sample
# groups. Fields used: header[2] and header[3] build the location, header[10]
# is the reference allele, group[0] is the sample name and group[6] is the
# pile-up string.
#
# In tumour mode (+normal+ falsy) a new Exodb::Variant is created and saved for
# every sample group. In normal mode the existing variant with oid
# "<header[2]>:<header[3]>:<assembly>:<sample>" is looked up and, when found,
# its +pileupn+ is set and the record is saved; groups without a matching
# variant are skipped.
#
# Fixes over the previous version:
# * the sample group is taken off the list before branching; normal mode used
#   to read +sampledata+ before it was ever assigned (NameError) and never
#   consumed the list;
# * the variant updated in normal mode is now saved;
# * the input file is closed once it has been read;
# * lines with fewer than 16 fields no longer crash on a nil sample list.
#
# @param mergefile [String] path to the tab-separated merge file
# @param normal [Boolean] true to attach normal pile-ups to existing variants,
#   false to create tumour variants
# @param assembly [String] assembly name appended to the location / oid
def load_variant_from_merge(mergefile, normal = false, assembly = Exodb::DEFAULTASSEMBLY)
  File.foreach(mergefile) do |line|
    record = line.chomp.split("\t")
    header = record[0..15]
    # Array#[] with a range returns nil when the line is shorter than 16 fields.
    list = record[16..-1] || []

    until list.empty?
      sampledata = list.shift(7)

      if normal
        oid = "#{header[2]}:#{header[3]}:#{assembly}:#{sampledata[0]}"
        snv = Exodb::Variant.where(oid: oid).first
        next unless snv

        snv.pileupn = sampledata[6]
        p snv.save!
      else
        snv = Exodb::Variant.new
        snv.location = "#{header[2]}:#{header[3]}:#{assembly}"
        snv.reference = header[10]
        snv.pileupt = sampledata[6]
        snv.temp = {} if snv.temp.blank?
        snv.add_to_sample(sampledata[0])
        p snv.save!
      end
    end
  end
end
upload_generef_from_gff3(filename, assembly = Exodb::DEFAULTASSEMBLY) click to toggle source

Upload gene information to the database from a GFF3 file and the genome sequence FASTA files. Example: Exodb::Utils.upload_generef_from_gff3('ref_GRCh37.p5_top_level.gff3')

@param [String] filename path to the GFF3 file @param [String] assembly assembly name [default: Exodb::DEFAULTASSEMBLY]

# File lib/exodb/extra/upload_generef.rb, line 24
# Upload gene reference records built from a GFF3 annotation file.
#
# The file is parsed with Bio::GFF::GFF3 and handled in two passes:
#
# 1. Every record is visited in file order. 'region' records create and save
#    an Exodb::Chrref; gene/tRNA, transcript/RNA, exon and CDS records are
#    collected in an in-memory table keyed by their GFF3 ID.
# 2. Every collected gene is turned into an Exodb::Generef, with one Isoform
#    per child RNA, and saved.
#
# Gene sequences are cut from "./genome/<seqname>.fa" (relative to the current
# working directory) when such a file exists for a region.
#
# Child records index their parent directly, so an RNA, exon or CDS whose
# Parent ID has not been collected earlier in the file raises NoMethodError.
#
# Changes over the previous version: the GFF3 and FASTA files are closed after
# reading, and a failed save is logged with its own message instead of the
# success message.
#
# @param filename [String] path to the GFF3 file
# @param assembly [String] assembly name, looked up (downcased) in
#   Exodb::ASSEMBLY; a blank value falls back to Exodb::DEFAULTASSEMBLY
def upload_generef_from_gff3(filename, assembly = Exodb::DEFAULTASSEMBLY)
        
        # File.read closes the handle; File.open(...).read left it open.
        gff = Bio::GFF::GFF3.new(File.read(filename))
        
        # Map one Dbxref value ("<Database>:<id>") to a MIRIAM URN.
        # Unrecognised databases are passed through unchanged.
        processDbxref = lambda do |str|
                case str
                when /^GeneID/
                        "urn:miriam:ncbigene:#{str.split(/:/)[1]}"
                when /^HGNC/
                        # The whole value, prefix included, is kept for HGNC.
                        "urn:miriam:hgnc:#{str}"
                when /^HPRD/
                        "urn:miriam:hprd:#{str.split(/:/)[1]}"
                when /^miRBase/
                        "urn:miriam:mirbase:#{str.split(/:/)[1]}"
                when /^Genbank/
                        "urn:miriam:refseq:#{str.split(/:/)[1]}"
                when /^CCDS/
                        "urn:miriam:ccds:#{str.split(/:/)[1]}"
                when /^MIM/
                        "urn:miriam:omim:#{str.split(/:/)[1]}"
                else
                        str
                end
        end
        
        assembly = assembly.blank? ? Exodb::DEFAULTASSEMBLY : Exodb::ASSEMBLY[assembly.downcase]
        
        regions = {}   # seqname => chromosome label
        genes = {}     # GFF3 ID => collected gene or RNA entry
        seq = {}       # accession.version => sequence of the current region
        # Sequences without a 'region' record are reported on 'chr0'.
        regions.default='chr0'
        
        gff.records.each do |e|
                
                case e.feature
                when 'region'
                        e.attributes.each do |attr|
                                case attr[0]
                                when 'chromosome'
                                        # NC_ accessions are renamed to "chr<N>"; others keep their seqname.
                                        regions[e.seqname] = e.seqname =~ /\ANC_/ ? "chr#{attr[1]}" : e.seqname
                                end
                        end
                        
                        chr = Exodb::Chrref.new()
                        chr.location=("#{regions.has_key?(e.seqname) ? regions[e.seqname] : e.seqname}:#{e.start}..#{e.end}:#{assembly}")
                        chr.oid = "#{chr.chr}:#{assembly}"
                        chr.add_to_set(:xrefs, guess_miriam(e.seqname))
                        chr.save!
                        
                        fasta_path = "./genome/#{e.seqname}.fa"
                        if File.exist?(fasta_path)
                                # Only the sequences of the most recent region with a FASTA file are kept.
                                seq = {}
                                # Block form so the FASTA file is closed once it has been read.
                                Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |flatfile|
                                        flatfile.each {|fasta| seq[fasta.acc_version] = fasta.to_seq}
                                end
                        end
                        
                when 'gene', 'tRNA'
                        
                        gene = {type: 'gene', xrefs: [], strand: e.strand, chrrefseq: "#{guess_miriam(e.seqname)}", location: "#{regions[e.seqname]}:#{e.start}..#{e.end}:#{assembly}", childs: [], exon: [], cds: []}
                        
                        e.attributes.each do |attr|
                                case attr[0]
                                when 'Dbxref'
                                        gene[:xrefs].push(processDbxref.call(attr[1]))
                                when 'Name'
                                        # Placeholder LOC<number> names are not recorded as symbols.
                                        gene[:xrefs].push("urn:miriam:hgnc.symbol:#{attr[1]}") if attr[1] !~ /^LOC\d+$/
                                when 'pseudo'
                                        # The key spelling is kept: it becomes the setter name (psuedo=) below.
                                        gene[:psuedo] = attr[1] == 'true' ? true : false
                                when 'ID'
                                        gene[:id] = attr[1]
                                end
                        end
                        
                        # The stored sequence window extends Exodb::Generef.expanding bases past both ends.
                        gene[:sequence] = seq[e.seqname].subseq(e.start.to_i - Exodb::Generef.expanding, e.end.to_i + Exodb::Generef.expanding).to_s if seq.has_key?(e.seqname)
                        gene[:seqstart] = e.start.to_i - Exodb::Generef.expanding
                        gene[:seqstop] = e.end.to_i + Exodb::Generef.expanding
                        gene[:oid] = "#{gene[:location]}:#{assembly}"
                        genes[gene[:id]] = gene
                        
                when /\A(transcript|[^t]*RNA)/
                        # tRNA never reaches this branch: it is handled as a gene above.
                        rna = {type: 'rna', xrefs: [], strand: e.strand, chr: regions[e.seqname], location: "#{regions[e.seqname]}:#{e.start}..#{e.end}:#{assembly}", exon: [], cds: []}
                        
                        e.attributes.each do |attr|
                                case attr[0]
                                when 'Dbxref'
                                        rna[:xrefs].push(processDbxref.call(attr[1]))
                                when 'pseudo'
                                        rna[:psuedo] = attr[1] == 'true' ? true : false
                                when 'ID'
                                        rna[:id] = attr[1]
                                when 'Parent'
                                        rna[:parent] = attr[1]
                                end
                        end
                        
                        genes[rna[:id]] = rna
                        genes[rna[:parent]][:childs].push(rna[:id]) if rna[:parent]
                        
                when 'exon'
                        e.attributes.each do |attr|
                                case attr[0]
                                when 'Parent'
                                        genes[attr[1]][:exon].push([e.start, e.end].sort)
                                end
                        end
                when 'CDS'
                        e.attributes.each do |attr|
                                case attr[0]
                                when 'Parent'
                                        genes[attr[1]][:cds].push([e.start, e.end].sort)
                                end
                        end
                end
        end
        
        count = {succ: 0, fail: 0}
        
        genes.each_pair do |id, entry|
                if entry[:type] == 'gene'
                        
                        gene = Exodb::Generef.new()
                        
                        # Every collected key except the bookkeeping ones is assigned through its setter.
                        entry.each_pair do |k, v|
                                gene.method(:"#{k}=").call(v) if ![:type, :childs, :exon, :cds].include?(k)
                        end
                        
                        entry[:childs].each do |child|
                                
                                rna = Isoform.new()
                                data = genes[child]
                                rna.xrefs = data[:xrefs]
                                rna.exon = data[:exon].sort
                                rna.cds = data[:cds].sort
                                
                                gene.isoforms.push(rna)
                                
                        end
                        
                        # NOTE(review): if save! raises on failure rather than returning
                        # false, the failure branch is never reached - confirm against the model.
                        if gene.save!
                                count[:succ] += 1
                                Exodb::putstv "Deposit Gene reference #{gene.xrefs[0]}"
                        else
                                count[:fail] += 1
                                Exodb::putstv "Failed to deposit Gene reference #{gene.xrefs[0]}"
                        end
                        
                end
                
        end
        
        Exodb::putst "SUCCESS: #{count[:succ]} , FAIL: #{count[:fail]}"
        
end