module Exodb::Utils
Public Instance Methods
guess_miriam(str)
click to toggle source
Guess the type of id
@param [String] str the identifier string @return [String] a MIRIAM registry URN, or the string itself if the type cannot be guessed
# File lib/exodb/utils.rb, line 29
# Guess which MIRIAM registry an identifier belongs to from its shape.
#
# Patterns are tried top to bottom and the first match wins, so the order
# matters: the Ensembl pattern ends in a catch-all alternative
# ([A-Z_a-z0-9]+ with optional suffixes) that also matches every CCDS id.
# CCDS is therefore tested before Ensembl; with the previous order the CCDS
# branch could never be reached.
#
# NOTE(review): the same catch-all also matches bare digit strings, so only
# "HGNC:"/"hgnc:"-prefixed ids reach the HGNC branch. That behaviour is kept
# as it was; confirm whether bare numbers should resolve to HGNC instead.
#
# @param [String] str identifier to classify
# @return [String] "urn:miriam:<registry>:<str>", or str itself when no
#   pattern matches
def guess_miriam(str)
  case str
  when /\A((AC|AP|NC|NG|NM|NP|NR|NT|NW|XM|XP|XR|YP|ZP)_\d+|(NZ\_[A-Z]{4}\d+))(\.\d+)?\z/
    "urn:miriam:refseq:#{str}"
  when /\ACCDS\d+\.\d+\z/
    "urn:miriam:ccds:#{str}"
  when /\A((ENS[A-Z]*[FPTG]\d{11}(\.\d+)?)|(FB\w{2}\d{7})|(Y[A-Z]{2}\d{3}[a-zA-Z](\-[A-Z])?)|([A-Z_a-z0-9]+(\.)?(t)?(\d+)?([a-z])?))\z/
    "urn:miriam:ensembl:#{str}"
  when /\A((HGNC|hgnc):)?\d{1,5}\z/
    "urn:miriam:hgnc:#{str}"
  else
    str
  end
end
load_indel_from_csv(csvfile)
click to toggle source
# File lib/exodb/rositza/load.rb, line 59
# Create and save one Exodb::Variant per row of a tab-separated file that
# has a header row.
#
# Columns read: "chr", "pos", "Somatic Status", "symbol", "cell lines".
# Every variant gets the fixed alternate allele '-2N', is marked as passing
# the filter and as predicted damaging. The reference allele and tumour
# pileup are not set by this loader.
#
# @param [String] csvfile path to the tab-separated input file
def load_indel_from_csv(csvfile)
  rows = CSV.read(csvfile, col_sep: "\t", headers: true)

  rows.each do |row|
    variant = Exodb::Variant.new
    variant.location = "#{row['chr']}:#{row['pos']}"
    variant.alternatet = ['-2N']
    variant.somstatus = row['Somatic Status'].downcase
    variant.passfilter = true
    variant.predicted_damage = true

    # Only create the scratch hash when the model has not provided one.
    variant.temp = {} if variant.temp.blank?
    variant.temp['aachange'] = "#{row['symbol']}:InDel"

    variant.add_to_sample(row['cell lines'])
    p variant.save!
  end
end
load_sample_from_csv(csvfile)
click to toggle source
# File lib/exodb/rositza/load.rb, line 78
# Create and save one Cell per row of a tab-separated file that has a
# header row, attaching each to dataset 'internal.ds:000001'.
#
# Columns read: "SampleFinal", "Type", "Preferred", "merge41final".
#
# @param [String] csvfile path to the tab-separated input file
def load_sample_from_csv(csvfile)
  rows = CSV.read(csvfile, col_sep: "\t", headers: true)

  rows.each do |row|
    sample_id = row["SampleFinal"]
    kind = row["Type"]

    # The MedDRA code is chosen from the prefix of the Type column; anything
    # that is neither "spitz " nor "spitzoid " gets the last code.
    meddra_code =
      if kind =~ /^spitz /i
        '10041632'
      elsif kind =~ /^spitzoid /i
        '10072450'
      else
        '10028679'
      end

    attributes = {
      oid: sample_id,
      type: kind.downcase,
      typeid: "urn:miriam:bioportal.meddra:#{meddra_code}",
      # Patient id is whatever precedes the first 'T' in the sample id.
      patient: sample_id.split('T').first,
      preferred: row["Preferred"] == 'Y',
      paired: !(row["merge41final"] =~ /\Apaired\z/i).nil?
    }

    sample = Cell.new(attributes, labels: {})
    sample.add_to_dataset('internal.ds:000001')
    p sample.save!
  end
end
load_snv_from_csv(csvfile)
click to toggle source
# File lib/exodb/rositza/load.rb, line 20
# Create and save one Exodb::Variant per row of a tab-separated file that
# has a header row.
#
# Columns read: "chromosome", "start position", "ref nucleotide",
# "var nucleotide", "Somatic Status", "Reads", "PolyPhen", "SIFT",
# "PROVEAN", "symbol", "AA Change", "cell".
#
# @param [String] csvfile path to the tab-separated input file
def load_snv_from_csv(csvfile)
  rows = CSV.read(csvfile, col_sep: "\t", headers: true)

  rows.each do |row|
    variant = Exodb::Variant.new
    variant.location = "chr#{row['chromosome']}:#{row['start position']}"

    # Alleles are '/'-separated: the first reference entry is used, and the
    # alternate alleles are de-duplicated.
    variant.reference = row['ref nucleotide'].split('/').first
    variant.alternatet = row['var nucleotide'].split('/').uniq

    variant.somstatus = row['Somatic Status'].downcase
    variant.passfilter = true
    variant.pileupt = row['Reads']

    # Damaging as soon as any one of the three predictor columns says so.
    damaging_hit = row['PolyPhen'] =~ /probably_damaging/ ||
                   row['SIFT'] =~ /deleterious/i ||
                   row['PROVEAN'] =~ /deleterious/i
    variant.predicted_damage = !damaging_hit.nil?

    variant.temp = {} if variant.temp.blank?
    variant.temp['aachange'] = "#{row['symbol']}:#{row['AA Change']}"

    variant.add_to_sample(row['cell'])
    p variant.save!
  end
end
load_splice_from_csv(csvfile)
click to toggle source
# File lib/exodb/rositza/load.rb, line 40
# Create and save one Exodb::Variant per row of a tab-separated file that
# has a header row.
#
# Columns read: "chr", "Position", "Reference Genotype", "Gene Symbol",
# "Sample". Every variant gets the fixed alternate allele 'N', somatic
# status 'somatic', is marked as passing the filter and as predicted
# damaging. The tumour pileup is not set by this loader.
#
# @param [String] csvfile path to the tab-separated input file
def load_splice_from_csv(csvfile)
  rows = CSV.read(csvfile, col_sep: "\t", headers: true)

  rows.each do |row|
    variant = Exodb::Variant.new
    variant.location = "chr#{row['chr']}:#{row['Position']}"
    variant.reference = row['Reference Genotype']
    variant.alternatet = ['N']
    variant.somstatus = 'somatic'
    variant.passfilter = true
    variant.predicted_damage = true

    # Only create the scratch hash when the model has not provided one.
    variant.temp = {} if variant.temp.blank?
    variant.temp['aachange'] = "#{row['Gene Symbol']}:SpV"

    variant.add_to_sample(row['Sample'])
    p variant.save!
  end
end
load_variant_from_merge(mergefile, normal = false, assembly = Exodb::DEFAULTASSEMBLY)
click to toggle source
# File lib/exodb/rositza/load.rb, line 96
# Load variants from a tab-separated "merge" file.
#
# Each line holds 16 header columns followed by any number of 7-column
# per-sample groups. Header columns 2 and 3 form the start of the location /
# oid string and column 10 is stored as the reference. In each sample group,
# column 0 is the sample name and column 6 the pileup value.
#
# * normal == false: a new Exodb::Variant is created and saved for every
#   sample group, with the pileup stored in +pileupt+.
# * normal == true: the existing variant whose oid is
#   "<col2>:<col3>:<assembly>:<sample>" is looked up and its +pileupn+ is
#   set and saved; groups with no matching variant are skipped.
#
# Fixes over the previous version:
# * the normal branch read +sampledata+ before it was ever assigned and never
#   consumed the sample list, so it could not run to completion;
# * the normal pileup is now saved after being assigned;
# * the file handle is closed (File.foreach) instead of being left open;
# * lines with fewer than 16 columns (e.g. blank lines) are skipped instead
#   of failing on nil.
#
# @param [String] mergefile path to the merge file
# @param [Boolean] normal update normal pileups instead of creating variants
# @param [String] assembly assembly name appended to locations and oids
def load_variant_from_merge(mergefile, normal = false, assembly = Exodb::DEFAULTASSEMBLY)
  File.foreach(mergefile) do |line|
    record = line.chomp.split("\t")
    header = record[0..15]
    # record[16..-1] is nil when the line has fewer than 16 columns.
    samples = Array(record[16..-1])

    samples.each_slice(7) do |sampledata|
      if normal
        snv = Exodb::Variant.where(oid: "#{header[2]}:#{header[3]}:#{assembly}:#{sampledata[0]}").first
        next unless snv

        snv.pileupn = sampledata[6]
        p snv.save!
      else
        snv = Exodb::Variant.new
        snv.location = "#{header[2]}:#{header[3]}:#{assembly}"
        snv.reference = header[10]
        snv.pileupt = sampledata[6]
        snv.temp = {} if snv.temp.blank?
        snv.add_to_sample(sampledata[0])
        p snv.save!
      end
    end
  end
end
upload_generef_from_gff3(filename, assembly = Exodb::DEFAULTASSEMBLY)
click to toggle source
Upload gene information to the database from a GFF3 file and genome sequence FASTA files, e.g. Exodb::Utils.upload_generef_from_gff3('ref_GRCh37.p5_top_level.gff3')
@param [String] filename path to the GFF3 file @param [String] assembly assembly name [default: Exodb::DEFAULTASSEMBLY]
# File lib/exodb/extra/upload_generef.rb, line 24
# Load gene reference records from a GFF3 annotation file into the database.
#
# The file is processed in two passes:
#
# 1. Every GFF3 record is visited in file order. 'region' records are saved
#    as Exodb::Chrref and, when ./genome/<seqname>.fa exists, its sequences
#    replace the in-memory sequence table. 'gene'/'tRNA' records and
#    transcript/RNA records are collected into a hash keyed by their GFF3 ID;
#    'exon' and 'CDS' records append their sorted [start, end] pair to the
#    entry named by their Parent attribute.
# 2. Every collected gene entry is turned into an Exodb::Generef with one
#    Isoform per child transcript, and saved.
#
# Because children are attached through the hash, a Parent must appear in the
# file before the records that refer to it.
#
# Compared with the previous version, the GFF3 file and the FASTA files are
# now closed after reading instead of being left open.
#
# @param [String] filename path to the GFF3 file
# @param [String] assembly assembly name; a non-blank value is looked up in
#   Exodb::ASSEMBLY by its downcased form, a blank one falls back to
#   Exodb::DEFAULTASSEMBLY
def upload_generef_from_gff3(filename, assembly = Exodb::DEFAULTASSEMBLY)
  # File.read closes the handle; File.open(filename).read did not.
  gff = Bio::GFF::GFF3.new(File.read(filename))

  # Map a Dbxref value to a MIRIAM URN for the known prefixes. All but HGNC
  # keep only the part after the first colon; HGNC keeps the whole value.
  process_dbxref = lambda do |str|
    case str
    when /^GeneID/  then "urn:miriam:ncbigene:#{str.split(/:/)[1]}"
    when /^HGNC/    then "urn:miriam:hgnc:#{str}"
    when /^HPRD/    then "urn:miriam:hprd:#{str.split(/:/)[1]}"
    when /^miRBase/ then "urn:miriam:mirbase:#{str.split(/:/)[1]}"
    when /^Genbank/ then "urn:miriam:refseq:#{str.split(/:/)[1]}"
    when /^CCDS/    then "urn:miriam:ccds:#{str.split(/:/)[1]}"
    when /^MIM/     then "urn:miriam:omim:#{str.split(/:/)[1]}"
    else str
    end
  end

  assembly = assembly.blank? ? Exodb::DEFAULTASSEMBLY : Exodb::ASSEMBLY[assembly.downcase]

  # seqname => chromosome label; seqnames never seen in a 'chromosome'
  # region resolve to 'chr0'.
  regions = {}
  regions.default = 'chr0'
  genes = {}
  seq = {}

  gff.records.each do |e|
    case e.feature
    when 'region'
      e.attributes.each do |attr|
        next unless attr[0] == 'chromosome'

        regions[e.seqname] = e.seqname =~ /\ANC_/ ? "chr#{attr[1]}" : e.seqname
      end

      # key? is checked explicitly so that the 'chr0' default is not used here.
      chr_name = regions.key?(e.seqname) ? regions[e.seqname] : e.seqname
      chr = Exodb::Chrref.new
      chr.location = "#{chr_name}:#{e.start}..#{e.end}:#{assembly}"
      chr.oid = "#{chr.chr}:#{assembly}"
      chr.add_to_set(:xrefs, guess_miriam(e.seqname))
      chr.save!

      fasta_path = "./genome/#{e.seqname}.fa"
      if File.exist?(fasta_path)
        # Only the sequences of the most recently loaded FASTA file are kept.
        seq = {}
        # Block form so the FASTA file is closed once it has been read.
        Bio::FlatFile.open(Bio::FastaFormat, fasta_path) do |flatfile|
          flatfile.each { |fasta| seq[fasta.acc_version] = fasta.to_seq }
        end
      end
    when 'gene', 'tRNA'
      gene = {
        type: 'gene',
        xrefs: [],
        strand: e.strand,
        chrrefseq: "#{guess_miriam(e.seqname)}",
        location: "#{regions[e.seqname]}:#{e.start}..#{e.end}:#{assembly}",
        childs: [],
        exon: [],
        cds: []
      }

      e.attributes.each do |attr|
        case attr[0]
        when 'Dbxref'
          gene[:xrefs].push(process_dbxref.call(attr[1]))
        when 'Name'
          # Names of the form LOC<digits> are not recorded as symbols.
          gene[:xrefs].push("urn:miriam:hgnc.symbol:#{attr[1]}") if attr[1] !~ /^LOC\d+$/
        when 'pseudo'
          # The key spelling is kept: it becomes the setter name called on
          # Exodb::Generef in the second pass.
          gene[:psuedo] = attr[1] == 'true'
        when 'ID'
          gene[:id] = attr[1]
        end
      end

      # The stored window is the feature extended by Generef.expanding on
      # both sides; the sequence itself is only set when one is loaded for
      # this seqname.
      flank = Exodb::Generef.expanding
      gene[:sequence] = seq[e.seqname].subseq(e.start.to_i - flank, e.end.to_i + flank).to_s if seq.key?(e.seqname)
      gene[:seqstart] = e.start.to_i - flank
      gene[:seqstop] = e.end.to_i + flank
      gene[:oid] = "#{gene[:location]}:#{assembly}"
      genes[gene[:id]] = gene
    when /\A(transcript|[^t]*RNA)/
      rna = {
        type: 'rna',
        xrefs: [],
        strand: e.strand,
        chr: regions[e.seqname],
        location: "#{regions[e.seqname]}:#{e.start}..#{e.end}:#{assembly}",
        exon: [],
        cds: []
      }

      e.attributes.each do |attr|
        case attr[0]
        when 'Dbxref'
          rna[:xrefs].push(process_dbxref.call(attr[1]))
        when 'pseudo'
          rna[:psuedo] = attr[1] == 'true'
        when 'ID'
          rna[:id] = attr[1]
        when 'Parent'
          rna[:parent] = attr[1]
        end
      end

      genes[rna[:id]] = rna
      genes[rna[:parent]][:childs].push(rna[:id]) if rna[:parent]
    when 'exon'
      e.attributes.each do |attr|
        genes[attr[1]][:exon].push([e.start, e.end].sort) if attr[0] == 'Parent'
      end
    when 'CDS'
      e.attributes.each do |attr|
        genes[attr[1]][:cds].push([e.start, e.end].sort) if attr[0] == 'Parent'
      end
    end
  end

  # Bookkeeping keys of the collected hash that are not copied to Generef.
  internal_keys = [:type, :childs, :exon, :cds]
  count = {succ: 0, fail: 0}

  genes.each_value do |entry|
    next unless entry[:type] == 'gene'

    gene = Exodb::Generef.new
    entry.each_pair do |k, v|
      next if internal_keys.include?(k)

      gene.method(:"#{k}=").call(v)
    end

    entry[:childs].each do |child|
      data = genes[child]
      rna = Isoform.new
      rna.xrefs = data[:xrefs]
      rna.exon = data[:exon].sort
      rna.cds = data[:cds].sort
      gene.isoforms.push(rna)
    end

    # NOTE(review): both branches log the same text; confirm the message
    # intended for the failure case.
    if gene.save!
      count[:succ] += 1
      Exodb::putstv "Deposit Gene reference #{gene.xrefs[0]}"
    else
      count[:fail] += 1
      Exodb::putstv "Deposit Gene reference #{gene.xrefs[0]}"
    end
  end

  Exodb::putst "SUCCESS: #{count[:succ]} , FAIL: #{count[:fail]}"
end