class SearchSolrTools::Harvesters::Base

Base class for Solr harvesters.

Constants

DELETE_DOCUMENTS_RATIO
JSON_CONTENT_TYPE
XML_CONTENT_TYPE

Attributes

environment[RW]

Public Class Methods

new(env = 'development', die_on_failure: false) click to toggle source
# File lib/search_solr_tools/harvesters/base.rb, line 26
# Builds a harvester bound to one Solr environment.
#
# env            - name of the environment whose Solr settings to use
# die_on_failure - when true, failed source requests raise instead of
#                  returning nil (see #get_results)
def initialize(env = 'development', die_on_failure: false)
  @die_on_failure = die_on_failure
  @environment = env
end

Public Instance Methods

create_new_solr_add_doc() click to toggle source

returns a Nokogiri XML document with content '<?xml version="1.0"?><add/>'

# File lib/search_solr_tools/harvesters/base.rb, line 203
# Returns a fresh Nokogiri XML document containing only an empty
# Solr <add/> root element.
def create_new_solr_add_doc
  add_doc = Nokogiri::XML::Document.new
  add_doc.root = Nokogiri::XML::Node.new('add', add_doc)
  add_doc
end
create_new_solr_add_doc_with_child(child) click to toggle source

returns a Nokogiri XML document with content '<?xml version="1.0"?><add> <child /> </add>'

# File lib/search_solr_tools/harvesters/base.rb, line 211
# Wraps +child+ inside a new Solr <add> document and returns that document.
def create_new_solr_add_doc_with_child(child)
  wrapper = create_new_solr_add_doc
  wrapper.root.add_child(child)
  wrapper
end
delete_old_documents(timestamp, constraints, solr_core, force: false) click to toggle source
# File lib/search_solr_tools/harvesters/base.rb, line 80
# Removes documents in +solr_core+ matching +constraints+ whose last_update
# precedes +timestamp+. Delegates the actual delete (and the safety-ratio
# check) to #remove_documents; +force+ skips that ratio check.
def delete_old_documents(timestamp, constraints, solr_core, force: false)
  constraints = sanitize_data_centers_constraints(constraints)
  delete_query = "last_update:[* TO #{timestamp}] AND #{constraints}"
  solr = RSolr.connect url: solr_url + "/#{solr_core}"

  count_response = solr.get 'select', params: { wt: :ruby, q: delete_query, rows: 0 }
  unchanged_count = count_response['response']['numFound'].to_i

  if unchanged_count.positive?
    logger.info "Begin removing documents older than #{timestamp}"
    remove_documents(solr, delete_query, constraints, force, unchanged_count)
  else
    logger.info "All documents were updated after #{timestamp}, nothing to delete"
  end
end
doc_valid?(doc) click to toggle source

Make sure that Solr is able to accept this doc in a POST

# File lib/search_solr_tools/harvesters/base.rb, line 218
# Make sure that Solr is able to accept this doc in a POST.
# Only docs with four (or more) spatial_coverages values have been seen to
# crash Solr, so anything else is accepted without further checks.
def doc_valid?(doc)
  coverage_node = doc.xpath(".//field[@name='spatial_coverages']").first
  return true if coverage_node.nil?

  coverage_values = coverage_node.text.split
  return true if coverage_values.size < 4

  valid_solr_spatial_coverage?(coverage_values)
end
encode_data_provider_url(url) click to toggle source

Some data providers require encoding (such as URI.encode), while others barf on encoding. The default is to just return url, override this in the subclass if special encoding is needed.

# File lib/search_solr_tools/harvesters/base.rb, line 40
# Default is a no-op: most providers accept the URL unmodified. Subclasses
# override this when a provider needs special encoding (e.g. URI encoding).
def encode_data_provider_url(url) = url
get_results(request_url, metadata_path, content_type = 'application/xml') click to toggle source

Get results from an end point specified in the request_url

# File lib/search_solr_tools/harvesters/base.rb, line 175
# Fetches +request_url+ and returns the nodes matching +metadata_path+
# (with ISO namespaces) from the parsed XML response. Retries up to three
# times on HTTP/timeout errors; after that, raises when @die_on_failure is
# set, otherwise returns nil.
def get_results(request_url, metadata_path, content_type = 'application/xml')
  read_timeout = 300
  attempts_remaining = 3

  request_url = encode_data_provider_url(request_url)

  begin
    logger.debug "Request: #{request_url}"
    response = URI.parse(request_url).open(read_timeout:, 'Content-Type' => content_type)
  rescue OpenURI::HTTPError, Timeout::Error, Errno::ETIMEDOUT => e
    attempts_remaining -= 1
    logger.error "## REQUEST FAILED ## #{e.class} ## Retrying #{attempts_remaining} more times..."

    retry if attempts_remaining.positive?

    # TODO: Do we really need this "die_on_failure" anymore?  The empty return
    #  will cause the "No Documents" error to be thrown in the harvester class
    #  now, so it will pretty much always "die on failure"
    raise e if @die_on_failure

    return
  end

  parsed = Nokogiri.XML(response)
  parsed.xpath(metadata_path, Helpers::IsoNamespaces.namespaces(parsed))
end
get_serialized_doc(doc, content_type) click to toggle source
# File lib/search_solr_tools/harvesters/base.rb, line 164
# Serializes +doc+ to suit +content_type+: XML via #to_xml when available,
# JSON via MultiJson, otherwise the document is passed through untouched.
def get_serialized_doc(doc, content_type)
  case content_type
  when XML_CONTENT_TYPE
    doc.respond_to?(:to_xml) ? doc.to_xml : doc
  when JSON_CONTENT_TYPE
    MultiJson.dump(doc)
  else
    doc
  end
end
harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name]) click to toggle source
# File lib/search_solr_tools/harvesters/base.rb, line 71
# Runs +harvest_method+, then deletes any documents matching
# +delete_constraints+ that were not touched during the harvest (i.e. whose
# last_update predates the harvest start). Returns the harvest result.
def harvest_and_delete(harvest_method, delete_constraints, solr_core = SolrEnvironments[@environment][:collection_name])
  harvest_started_at = Time.now.utc.iso8601
  result = harvest_method.call
  delete_old_documents harvest_started_at, delete_constraints, solr_core
  result
end
insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name]) click to toggle source

TODO: Need to return a specific type of failure:

- Bad record content identified and no ingest attempted
- Solr tries to ingest document and fails (bad content not detected prior to ingest)
- Solr cannot insert document for reasons other than the document structure and content.
# File lib/search_solr_tools/harvesters/base.rb, line 136
# POSTs a single document to the Solr update handler and returns one of the
# Helpers::HarvestStatus ingest codes.
#
# TODO: Need to return a specific type of failure:
#  - Bad record content identified and no ingest attempted
#  - Solr tries to ingest document and fails (bad content not detected prior to ingest)
#  - Solr cannot insert document for reasons other than the document structure and content.
def insert_solr_doc(doc, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
  update_url = solr_url + "/#{core}/update?commit=true"

  # Some of the docs will cause Solr to crash - CPU goes to 195% with `top` and it
  # doesn't seem to recover.
  return Helpers::HarvestStatus::INGEST_ERR_INVALID_DOC if content_type == XML_CONTENT_TYPE && !doc_valid?(doc)

  serialized = get_serialized_doc(doc, content_type)
  status = Helpers::HarvestStatus::INGEST_OK

  # Some docs will cause solr to time out during the POST
  begin
    RestClient.post(update_url, serialized, content_type:) do |response, _request, _result|
      unless response.code == 200
        logger.error "Error for #{serialized}\n\n response: #{response.body}"
        status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
      end
    end
  rescue StandardError => e
    # TODO: Need to provide more detail re: this failure so we know whether to
    #  exit the job with a status != 0
    logger.error "Rest exception while POSTing to Solr: #{e}, for doc: #{serialized}"
    status = Helpers::HarvestStatus::INGEST_ERR_SOLR_ERROR
  end
  status
end
insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name]) click to toggle source

Update Solr with an array of Nokogiri xml documents, report number of successfully added documents

# File lib/search_solr_tools/harvesters/base.rb, line 115
# Update Solr with an array of Nokogiri xml documents; logs how many were
# added and how many failed, and returns the accumulated HarvestStatus.
def insert_solr_docs(docs, content_type = XML_CONTENT_TYPE, core = SolrEnvironments[@environment][:collection_name])
  tally = { ok: 0, failed: 0 }
  status = Helpers::HarvestStatus.new

  docs.each do |doc|
    result = insert_solr_doc(doc, content_type, core)
    status.record_status result
    result == Helpers::HarvestStatus::INGEST_OK ? tally[:ok] += 1 : tally[:failed] += 1
  end

  logger.info "#{tally[:ok]} document#{tally[:ok] == 1 ? '' : 's'} successfully added to Solr."
  logger.info "#{tally[:failed]} document#{tally[:failed] == 1 ? '' : 's'} not added to Solr."

  status
end
ping_solr(core = SolrEnvironments[@environment][:collection_name]) click to toggle source

Ping the Solr instance to ensure that it’s running. The ping query is specified to manually check the title, as it’s possible there is no “default” query in the solr instance.

# File lib/search_solr_tools/harvesters/base.rb, line 47
# Ping the Solr instance to ensure that it's running. The ping query
# manually specifies the title field (df=title) because the Solr instance
# may have no default query field configured. Returns true on HTTP 200.
def ping_solr(core = SolrEnvironments[@environment][:collection_name])
  ping_url = solr_url + "/#{core}/admin/ping?df=title"
  reachable = false

  begin
    RestClient.get(ping_url) do |response, _request, _result|
      reachable = response.code == 200
      logger.error "Error in ping request: #{response.body}" unless reachable
    end
  rescue StandardError => e
    logger.error "Rest exception while pinging Solr: #{e}"
  end

  reachable
end
ping_source() click to toggle source

This should be overridden by child classes to implement the ability to “ping” the data center. Returns true if the ping is successful (or, as in this default, no ping method was defined)

# File lib/search_solr_tools/harvesters/base.rb, line 66
# Overridden by child classes that can actually "ping" their data center.
# This default just logs that no check exists and reports success.
def ping_source
  logger.info 'Harvester does not have ping method defined, assuming true'
  true
end
remove_documents(solr, delete_query, constraints, force, numfound) click to toggle source
# File lib/search_solr_tools/harvesters/base.rb, line 101
# Deletes the +numfound+ stale documents matched by +delete_query+, but only
# when they are a small enough share (< DELETE_DOCUMENTS_RATIO) of all
# documents matching +constraints+ — a guard against wiping a data center
# after a bad harvest. +force+ bypasses the guard.
def remove_documents(solr, delete_query, constraints, force, numfound)
  total_count = (solr.get 'select', params: { wt: :ruby, q: constraints, rows: 0 })['response']['numFound']
  stale_fraction = numfound / total_count.to_f

  if force || stale_fraction < DELETE_DOCUMENTS_RATIO
    logger.info "Deleting #{numfound} documents for #{constraints}"
    solr.delete_by_query delete_query
    solr.commit
  else
    logger.info "Failed to delete records older than current harvest start because they exceeded #{DELETE_DOCUMENTS_RATIO} of the total records for this data center."
    logger.info "\tTotal records: #{total_count}"
    logger.info "\tNon-updated records: #{numfound}"
  end
end
sanitize_data_centers_constraints(query_string) click to toggle source
# File lib/search_solr_tools/harvesters/base.rb, line 93
# Strips Lucene special characters from +query_string+, restores the
# field-name colons for the data_centers/source parameters, and compresses
# any leftover whitespace.
def sanitize_data_centers_constraints(query_string)
  cleaned = query_string.gsub(/[:&|!~\-\(\)\{\}\[\]\^\*\?\+]+/, ' ')
  cleaned = cleaned.gsub('data_centers ', 'data_centers:').gsub('source ', 'source:')
  cleaned.squeeze(' ').strip
end
solr_url() click to toggle source
# File lib/search_solr_tools/harvesters/base.rb, line 31
# Base URL of the Solr instance for the current environment (no core path).
def solr_url
  settings = SolrEnvironments[@environment]
  format('http://%s:%s/%s', settings[:host], settings[:port], settings[:collection_path])
end
valid_solr_spatial_coverage?(spatial_coverages) click to toggle source

spatial_coverages is an array with length 4:

North, East, South, West
# File lib/search_solr_tools/harvesters/base.rb, line 232
# spatial_coverages is a 4-element array: North, East, South, West.
# A coverage is rejected only when it is a polar point (north == south at
# latitude ±90) whose east and west values disagree.
def valid_solr_spatial_coverage?(spatial_coverages)
  north, east, south, west = spatial_coverages
  polar_point = north == south && north.to_f.abs == 90
  east == west || !polar_point
end