require 'protk/constants'
require 'protk/randomize'
require 'uri'
require 'digest/md5'
require 'net/ftp'
require 'net/ftp/list'
require 'bio'
require 'tempfile'
require 'pp'
require 'set'
require 'yaml'
require 'pathname'
require 'fileutils'

dbname=ARGV[0]

# Load database spec file
#
$genv=Constants.instance()
dbdir="#{$genv.protein_database_root}/#{dbname}"

dbspec_file="#{dbdir}/.protkdb.yaml"
dbspec=YAML.load_file "#{dbspec_file}"

format = dbspec[:format]!=nil ? dbspec[:format] : "fasta"
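
# For reference, an illustrative .protkdb.yaml showing only the keys this script
# reads; every value below is made up:
#
#   :format: fasta
#   :sources:
#     - ["ftp://ftp.example.org/pub/db/*.fasta.gz", "ftp://ftp.example.org/pub/db/relnotes.txt"]
#     - contaminants/contaminants.fasta
#   :include_filters: [['.*'], ['.*']]
#   :id_regexes: ['^.*\|(\S+)\|', '^(\S+)']
#   :decoys: true
#   :decoy_prefix: decoy_
#   :archive_old: false
#   :is_annotation_db: false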

# Output database filename
#
db_filename="#{dbdir}/current.#{format}"

#####################
# Utility Functions #
#####################

def check_ftp_release_notes(release_notes)

rn_uri = URI.parse(release_notes)

rn_path="#{$genv.database_downloads}/#{rn_uri.host}/#{rn_uri.path}"

update_needed=false

host=rn_uri.host
Net::FTP.open(host) do |ftp|

  ftp.login
  rn_dir=Pathname.new(rn_uri.path).dirname.to_s
  rn_file=Pathname.new(rn_uri.path).basename.to_s
  ftp.chdir(rn_dir)

  ftp.passive=true

  p "Checking release notes"

  # Is the last path component a wildcard expression (we only allow *)
  # If so we need to find the file with the most recent modification time
  #
  if ( rn_file =~ /\*/)
    entries=ftp.list(rn_file)
    p entries
    latest_file=nil
    latest_file_mtime=nil
    entries.each do |dir_entry|
      info=Net::FTP::List.parse(dir_entry)
      if ( info.file? )
        latest_file_mtime = info.mtime if ( latest_file_mtime ==nil )
        latest_file = info.basename if ( latest_file ==nil )

        if ( info.mtime > latest_file_mtime ) # this entry's mtime is later
          latest_file_mtime=info.mtime
          latest_file=info.basename
        end

      end        
    end

    throw "No release notes found" if ( latest_file ==nil)

    rn_file=latest_file

    # Adjust the rn_path to be the path of the latest file
    #
    rn_path="#{Pathname.new(rn_path).dirname}/#{latest_file}"

  end

  # Hash existing release notes data if it exists
  #
  existing_digest=nil
  existing_digest=Digest::MD5.hexdigest(File.read(rn_path))  if  Pathname.new(rn_path).exist? 

  rn_data=""
  dl_file=Tempfile.new("rn_file")

  ftp.getbinaryfile(rn_file,dl_file.path) { |data|  rn_data << data }

  rn_digest=Digest::MD5.hexdigest(rn_data)

  p "Done Downloading release notes #{ftp} #{rn_file} to #{dl_file.path} #{ftp.pwd}"

  throw "No release notes data at #{release_notes}" unless rn_digest!=nil

  # Update release notes data
  case
  when ( existing_digest != rn_digest )
    FileUtils.mkpath(Pathname.new(rn_path).dirname.to_s)
    File.open(rn_path, "w") {|file| file.puts(rn_data) }
    update_needed = true
  else
    p "Release notes are up to date"
  end 
end
update_needed

end

def download_ftp_file(ftp,file_name,dest_dir)

dest_path="#{dest_dir}/#{file_name}"

download_size=ftp.size(file_name)
mod_time=ftp.mtime(file_name,true)

percent_size=[download_size/100,1].max # avoid divide-by-zero for files under 100 bytes
i=1
pc_complete=0
last_time=Time.new
p "Downloading #{file_name}"
ftp.passive=true

ftp.getbinaryfile(file_name,dest_path,1024) { |data| 

  progress=i*1024
  if ( pc_complete < progress.divmod(percent_size)[0] && ( Time.new - last_time) > 10 )
    pc_complete=progress.divmod(percent_size)[0]
    p "Downloading #{file_name} #{pc_complete} percent complete"
    last_time=Time.new
  end
  i=i+1
}

end

def download_ftp_source(source)

data_uri = URI.parse(source)

data_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
# Make sure our destination dir is available
#
FileUtils.mkpath(Pathname.new(data_path).dirname.to_s)

Net::FTP.open(data_uri.host) do |ftp|
  p "Connected to #{data_uri.host}"
  ftp.login

  ftp.chdir(Pathname.new(data_uri.path).dirname.to_s)

  last_path_component=Pathname.new(data_uri.path).basename.to_s

  case 
  when last_path_component=~/\*/  # A wildcard match. Need to download them all
    p "Getting directory listing for #{last_path_component}"
    ftp.passive=true
    matching_items=ftp.list(last_path_component)

    PP.pp(matching_items)

    matching_items.each do |dir_entry|
      info=Net::FTP::List.parse(dir_entry)
      download_ftp_file(ftp,info.basename,Pathname.new(data_path).dirname)
    end

  else # Just one file to download
    download_ftp_file(ftp,last_path_component,Pathname.new(data_path).dirname)
  end

end

end

def archive_fasta_file(filename)

if ( Pathname.new(filename).exist? )
  mt=File.new(filename).mtime
  timestamp="#{mt.year}_#{mt.month}_#{mt.day}"
  archive_filename="#{filename.gsub(/.fasta$/,'')}_#{timestamp}.fasta"
  p "Moving old database to #{archive_filename}"
  FileUtils.mv(filename,archive_filename)
end

end

def cleanup_file(filename)

if  (File.exist? filename  )
  archive_filename="#{filename}.tmp"
  p "Cleaning up #{filename}"
  FileUtils.mv(filename,archive_filename,:force=>true)
end

end

#####################
# Source Files      #
#####################

def file_source(raw_source)

full_path=raw_source
full_path = "#{$genv.protein_database_root}/#{raw_source}" unless ( raw_source =~ /^\//) # relative paths should be relative to datbases dir
throw "File source #{full_path} does not exist" unless Pathname.new(full_path).exist?
full_path

end

def db_source(db_source)

current_release_path = "#{$genv.protein_database_root}/#{db_source}/current.fasta"
throw "Database source #{current_release_path} does not exist" unless Pathname.new(current_release_path).exist?
current_release_path

end

def ftp_source(ftpsource)

data_uri=URI.parse(ftpsource[0])
data_file_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
unpacked_data_path=data_file_path.gsub(/\.gz$/,'')

release_notes_url=ftpsource[1]
release_notes_exist=true
release_notes_exist=false if (release_notes_url =~ /^\s*none\s*$/) || (release_notes_url==nil)

release_notes_show_update_needed = true

if release_notes_exist

  data_rn=URI.parse(release_notes_url)

  if ( data_rn != nil )
    release_notes_file_path="#{$genv.database_downloads}/#{data_rn.host}/#{data_rn.path}"

    task :check_rn do
      release_notes_show_update_needed = check_ftp_release_notes(release_notes_url) 
    end

    file release_notes_file_path => :check_rn
  end
else
  task :check_date do

  end
end

if ( data_file_path=~/\*/) # A wildcard
  unpacked_data_path=data_file_path.gsub(/\*/,"_all_").gsub(/\.gz$/,'')
end

task unpacked_data_path do # Unpacking. Includes unzipping and/or concatenating
  if ( release_notes_show_update_needed )
    download_ftp_source(ftpsource[0])
    file_pattern = Pathname.new(data_file_path).basename.to_s

    case
    when data_file_path=~/\*/ # Multiple files to unzip/concatenate and we don't know what they are yet

      if file_pattern =~ /\.gz$/
        unzipcmd="gunzip -vdf #{file_pattern}"
        p "Unzipping #{unzipcmd} ... this could take a while"
        sh %{ cd #{Pathname.new(data_file_path).dirname}; #{unzipcmd} }
      end

      file_pattern.gsub!(/\.gz$/,'')
      catcmd="cat #{file_pattern} > #{unpacked_data_path}"

      p "Concatenating files #{catcmd} ... this could take a while"
      sh %{ cd #{Pathname.new(data_file_path).dirname}; #{catcmd} }

    else # Simple case. A single file
      if file_pattern =~ /\.gz$/
        p "Unzipping #{Pathname.new(data_file_path).basename} ... "
        sh %{ cd #{Pathname.new(data_file_path).dirname}; gunzip -f #{Pathname.new(data_file_path).basename} }
      end
    end
  end
end

file unpacked_data_path => release_notes_file_path if release_notes_exist

unpacked_data_path

end
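
# Each entry in the spec's source list resolves to a local file path below:
# an Array is treated as an FTP source ([data_url, release_notes_url]), a path
# ending in .fasta/.txt/.dat is used as-is (relative paths are taken relative to
# the protein database root), and anything else is assumed to be the name of
# another managed database, whose current.fasta is used.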

source_files=dbspec[:sources].collect do |raw_source|

sf=""
case 
when raw_source.class==Array
  sf=ftp_source(raw_source)
when (raw_source =~ /\.fasta$/ || raw_source =~ /\.txt$/ || raw_source =~ /\.dat$/ )
  sf=file_source(raw_source)
else
  sf=db_source(raw_source)
end
sf

end

########################
# Concat Filter Copy   #
########################

raw_db_filename = "#{dbdir}/raw.#{format}"

file raw_db_filename => [source_files,dbspec_file].flatten do

source_filters=dbspec[:include_filters]

if ( format == "fasta" && source_filters.length > 0 ) # We can perform concat and filter for fasta only

  archive_fasta_file(raw_db_filename) if dbspec[:archive_old]

  cleanup_file(raw_db_filename)

  output_fh=File.open(raw_db_filename, "w")

  id_regexes=dbspec[:id_regexes]
  source_i=0
  throw "The number of source files #{source_files.length} should equal the number of source filters #{source_filters.length}" unless source_filters.length == source_files.length
  throw "The number of source files #{source_files.length} should equal the number of id regexes #{id_regexes.length}" unless source_filters.length == id_regexes.length

  added_ids=Set.new

  source_files.each do |source|
    # Open source as Fasta
    #
    Bio::FlatFile.open(Bio::FastaFormat, source) do |ff|
      p "Reading source file #{source}"

      n_match=0

      filters=source_filters[source_i] #An array of filters for this input file
      id_regex=/#{id_regexes[source_i]}/

      ff.each do |entry|
        filters.each do |filter|
          if ( entry.definition =~ /#{filter}/)
            n_match=n_match+1
            idmatch=id_regex.match(entry.definition)
            case 
            when idmatch==nil || idmatch[1]==nil
              p "No match to id regex #{id_regex} for #{entry.definition}. Skipping this entry"              
            else
              new_def="#{idmatch[1]}"
              if ( added_ids.include?(new_def) )
                p "Warning: Skipping duplicate definition for #{new_def}"
              else
                entry.definition=new_def
                output_fh.puts(entry.to_s)
                added_ids.add new_def
              end
              #              p entry.definition.to_s
            end
            break
          end
        end
      end
      p "Warning no match to any filter in #{filters} for source file #{source}" unless n_match > 0
    end
    source_i=source_i+1
  end
  output_fh.close
else # Other formats just copy a file across ... must be a single source

  throw "Only a single source file is permitted for formats other than fasta" unless source_files.length == 1

  cleanup_file(raw_db_filename)

  sh "cp #{source_files[0]} #{raw_db_filename}" do |ok,res|
    if ! ok 
      puts "Unable to copy #{source_files[0]} to #{raw_db_filename}"
    end
  end

end

end

#####################
# Decoys            #
#####################

decoy_db_filename = "#{dbdir}/with_decoys.fasta"

file decoy_db_filename => raw_db_filename do

archive_fasta_file(decoy_db_filename) if dbspec[:archive_old]

cleanup_file(decoy_db_filename)

decoys_filename = "#{dbdir}/decoys_only.fasta"
decoy_prefix=dbspec[:decoy_prefix]

# Count entries in the raw input file
#
ff=Bio::FlatFile.open(Bio::FastaFormat, raw_db_filename)
db_length=0
ff.each do |entry| 
  db_length=db_length+1 
end

p "Generating decoy sequences ... this could take a while"  
# Make decoys, concatenate and delete decoy only file
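# (Assumption: Randomize.make_decoys, from the protk/randomize require above, writes
# db_length decoy entries to decoys_filename, prefixing their accessions with decoy_prefix.)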
Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
cmd = "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
sh %{ #{cmd} }

end

# Adjust dependencies depending on whether we're making decoys
#
case dbspec[:decoys]
when true

throw "Decoys are only supported for fasta formatted databases" unless format=="fasta"
file db_filename => decoy_db_filename

else

file db_filename => raw_db_filename

end

###################
# Symlink Current #
###################

# Current database file should symlink to raw or decoy
#
file db_filename do

if ( dbspec[:is_annotation_db])
  db_filename=raw_db_filename # For annotation databases we don't use symlinks at all
else
  # if we are an annotation db we can't symlink so do nothing

  # source db filename is either decoy or raw
  #
  case dbspec[:decoys]
  when true
    source_db_filename = decoy_db_filename
  when false
    source_db_filename = raw_db_filename
  end

  p "Current db links to #{source_db_filename}"

  # Symlink to the source file
  #
  source_db_filename_relative = Pathname.new(source_db_filename).basename.to_s
  File.symlink(source_db_filename_relative,db_filename)
end

end

###################
# Indexing        #
###################

if dbspec[:make_blast_index]

#  blast_index_files=FileList.new(.collect {|ext| "#{db_filename}#{ext}" })

#  task :make_blast_index => blast_index_files  do
blast_index_files=["#{db_filename}.phr"]
blast_index_files.each do |indfile|
  file indfile => db_filename do
    cmd="cd #{dbdir}; #{$genv.makeblastdb} -in #{db_filename} -parse_seqids -dbtype prot -max_file_sz 20000000000"
    p "Creating blast index"
    sh %{ #{cmd} }
  end
end

task dbname => blast_index_files

end

if dbspec[:make_msgf_index]

msgf_index_files=FileList.new([".canno"].collect {|ext| "#{db_filename}#{ext}"  })
#  task :make_blast_index => blast_index_files  do
msgf_index_files.each do |indfile|
  file indfile => db_filename do
    cmd="cd #{dbdir}; java -Xmx3500M -cp #{$genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{db_filename} -tda 0"
    p "Creating msgf index"
    sh %{ #{cmd} }
  end
end

task dbname => msgf_index_files

end

if format=="dat" && dbspec[:is_annotation_db]

dat_index_file= "#{dbdir}/id_AC.index"

cleanup_file dat_index_file #Regenerate indexes every time

file dat_index_file => db_filename do
  puts "Indexing annotation database"
  dbclass=Bio::SPTR
  parser = Bio::FlatFileIndex::Indexer::Parser.new(dbclass, nil, nil)
  Bio::FlatFileIndex::Indexer::makeindexFlat(dbdir, parser, {}, db_filename)
end

task dbname => dat_index_file

end

#################
# Root task     #
#################

task dbname => db_filename
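
# Task graph, as wired up above:
#   <dbname> => current.<format> => with_decoys.fasta (if :decoys) or raw.<format> => source files
# with any blast/msgf/dat index files also hung off the <dbname> task.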