require 'csv'
# Rake tasks for seeding sample GeoBlacklight documents and for managing
# harvested sidecar images (SolrDocumentSidecar records and their image
# state transitions).
#
# Every task depends on :environment, so the Rails application is loaded
# before the task body runs.
namespace :gblsci do
  namespace :sample_data do
    desc 'Ingests a directory of geoblacklight.json files'
    task seed: :environment do
      docs_glob = File.join(Rails.root, 'solr', 'geoblacklight', 'example_docs', '**', '*.json')

      Dir.glob(docs_glob).each do |fn|
        puts "Ingesting #{fn}"
        begin
          Blacklight.default_index.connection.add(JSON.parse(File.read(fn)))
        rescue StandardError => e
          # Best effort: report the bad file and continue with the rest.
          puts "Failed to ingest #{fn}: #{e.inspect}"
        end
      end

      puts 'Committing changes to Solr'
      Blacklight.default_index.connection.commit
    end
  end

  namespace :images do
    # Usage: rake gblsci:images:harvest_doc_id[<doc_id>]
    desc 'Harvest image for specific document'
    task :harvest_doc_id, [:doc_id] => [:environment] do |_t, args|
      GeoblacklightSidecarImages::StoreImageJob.perform_later(args[:doc_id])
    end

    desc 'Harvest all images'
    task harvest_all: :environment do
      index = Geoblacklight::SolrDocument.index
      # Match every document; the very large row count asks Solr for the whole
      # result set in a single response.
      results = index.send_and_receive(
        index.blacklight_config.solr_path,
        q: '*:*',
        fl: '*',
        rows: 100_000_000
      )

      results.docs.each do |document|
        # Pause one second between enqueues.
        sleep(1)
        begin
          GeoblacklightSidecarImages::StoreImageJob.perform_later(document.id)
        rescue Blacklight::Exceptions::RecordNotFound
          next
        end
      end
    end

    desc 'Hash of SolrDocumentSidecar image state counts'
    task harvest_states: :environment do
      states = %i[initialized queued processing succeeded failed placeheld]

      # Collect every count first, then print, so a failing query prints nothing.
      counts = states.each_with_object({}) do |state, tally|
        tally[state] = SolrDocumentSidecar.in_state(state).size
      end

      counts.each do |state, count|
        puts "#{state} - #{count}"
      end
    end

    desc 'Re-queues incomplete states for harvesting'
    task harvest_retry: :environment do
      # Every state except :succeeded.
      states = %i[initialized queued processing failed placeheld]

      states.each do |state|
        sidecars = SolrDocumentSidecar.in_state(state)
        puts "#{state} - #{sidecars.size}"

        sidecars.each do |sc|
          begin
            document = Geoblacklight::SolrDocument.find(sc.document_id)
            GeoblacklightSidecarImages::StoreImageJob.perform_later(document.id)
          rescue StandardError
            # Any failure to look up or enqueue is reported as an orphan;
            # nothing is deleted here.
            puts "orphaned / #{sc.document_id}"
          end
        end
      end
    end

    desc 'Write harvest state report (CSV)'
    task harvest_report: :environment do
      # Create a CSV dump of results
      file = "#{Rails.root}/public/#{Time.now.strftime('%Y-%m-%d_%H-%M-%S')}.sidecar_report.csv"
      header = [
        'Sidecar ID',
        'Document ID',
        'Current State',
        'Doc Data Type',
        'Doc Title',
        'Doc Institution',
        'Error',
        'Viewer Protocol',
        'Image URL',
        'GBLSI Thumbnail URL'
      ]

      CSV.open(file, 'w') do |writer|
        writer << header

        SolrDocumentSidecar.all.each do |sc|
          begin
            document = Geoblacklight::SolrDocument.find(sc.document_id)
            # NOTE(review): last_transition is presumably nil for sidecars that
            # have never transitioned; such rows get blank metadata columns
            # instead of being reported as orphans. Confirm against the state
            # machine implementation.
            metadata = sc.image_state.last_transition&.metadata || {}

            writer << [
              sc.id,
              sc.document_id,
              sc.image_state.current_state,
              document._source['layer_geom_type_s'],
              document._source['dc_title_s'],
              document._source['dct_provenance_s'],
              metadata['exception'],
              metadata['viewer_protocol'],
              metadata['image_url'],
              metadata['gblsi_thumbnail_uri']
            ]
          rescue StandardError => e
            # StandardError (not Exception) so Ctrl-C and exit still stop the task.
            puts "Exception: #{e.inspect}"
            puts "orphaned / #{sc.document_id}"
          end
        end
      end
    end

    desc 'Destroy all harvested images and sidecar AR objects'
    task harvest_purge_all: :environment do
      # Remove all images
      SolrDocumentSidecar.all.each do |sc|
        sc.image.purge
      end

      # Delete all Transitions and Sidecars
      SidecarImageTransition.destroy_all
      SolrDocumentSidecar.destroy_all
    end

    desc 'Destroy orphaned images and sidecar AR objects'
    # When a SolrDocumentSidecar AR object exists,
    # but its corresponding SolrDocument is no longer in the Solr index.
    task harvest_purge_orphans: :environment do
      SolrDocumentSidecar.all.each do |sc|
        begin
          Geoblacklight::SolrDocument.find(sc.document_id)
        rescue Blacklight::Exceptions::RecordNotFound
          # Only a definite "not found" counts as orphaned. Any other error
          # (for example Solr being unreachable) propagates and aborts the
          # task rather than destroying sidecars that may still be valid.
          sc.destroy
          puts "orphaned / #{sc.document_id} / destroyed"
        end
      end
    end

    desc 'Destroy select sidecar AR objects by CSV file'
    task harvest_destroy_batch: :environment do
      # Expects a CSV file in Rails.root/tmp/destroy_batch.csv
      # with a header row and the document id in the first column.
      #
      # From your local machine, copy it up to production server like this:
      # scp destroy_batch.csv swadm@geoprod:/swadm/var/www/geoblacklight/current/tmp/
      CSV.foreach("#{Rails.root}/tmp/destroy_batch.csv", headers: true) do |row|
        document_id = row[0]
        sc = SolrDocumentSidecar.find_by(document_id: document_id)

        # find_by returns nil for unknown ids; skip them instead of aborting
        # the whole batch with a NoMethodError.
        unless sc
          puts "document_id - #{document_id} - not found, skipped"
          next
        end

        sc.destroy
        puts "document_id - #{document_id} - destroyed"
      end
    end

    desc 'Inspect failed state objects'
    task harvest_failed_state_inspect: :environment do
      states = %i[failed]

      states.each do |state|
        SolrDocumentSidecar.in_state(state).each do |sc|
          puts "#{state} - #{sc.document_id} - #{sc.image_state.last_transition.metadata.inspect}"
        end
      end
    end
  end
end