class Stanford::ContentInventory

Stanford-specific utility methods for transforming contentMetadata to versionInventory and doing comparisons

Data Model

@note Copyright © 2012 by The Board of Trustees of the Leland Stanford Junior University.

All rights reserved.  See {file:LICENSE.rdoc} for details.

Public Instance Methods

generate_content_metadata(file_group, object_id, version_id) click to toggle source

@api external @param file_group [FileGroup] The {FileGroup} object used as the data source @return [String] The contentMetadata instance generated from the FileGroup @example {include:file:spec/features/stanford/content_metadata_write_spec.rb}

# File lib/stanford/content_inventory.rb, line 99
# Serializes a file group as a contentMetadata XML document.
#
# The output holds a single <resource> element containing one <file> element
# per instance of every file in +file_group+. Each <file> gets a <checksum>
# child for every digest whose key is truthy in the signature's fixity hash.
#
# @param file_group [FileGroup] the data source; its +files+ are enumerated
# @param object_id [String] written to the root element's objectId attribute
# @param version_id [Integer] interpolated into the resource id ("version-<id>")
# @return [String] the generated contentMetadata XML
def generate_content_metadata(file_group, object_id, version_id)
  # checksum "type" attribute value => fixity key / signature reader, in output order
  digest_elements = { 'MD5' => :md5, 'SHA-1' => :sha1, 'SHA-256' => :sha256 }

  builder = Nokogiri::XML::Builder.new do |xml|
    xml.contentMetadata(type: 'sample', objectId: object_id) do
      xml.resource(type: 'version', sequence: '1', id: "version-#{version_id}") do
        file_group.files.each do |manifestation|
          signature = manifestation.signature
          manifestation.instances.each do |instance|
            xml.file(
              id: instance.path,
              size: signature.size,
              datetime: instance.datetime,
              shelve: 'yes',
              publish: 'yes',
              preserve: 'yes'
            ) do
              fixity = signature.fixity
              digest_elements.each do |type_name, digest|
                next unless fixity[digest]

                xml.checksum(type: type_name) { xml.text signature.public_send(digest) }
              end
            end
          end
        end
      end
    end
  end
  builder.to_xml
end
generate_instance(node) click to toggle source

@api internal @param node (see generate_signature) @return [FileInstance] The {FileInstance} object generated from the XML data

# File lib/stanford/content_inventory.rb, line 84
# Builds a file instance from a <file> node of the contentMetadata.
#
# The node's "id" attribute becomes the instance path. The "datetime"
# attribute is optional: when it is absent the instance datetime is set to nil.
#
# @param node [Nokogiri::XML::Node] the XML node containing file information
# @return [FileInstance] the instance populated from the node's attributes
def generate_instance(node)
  instance = Moab::FileInstance.new
  instance.path = node.attributes['id'].content
  # Test for the optional attribute explicitly instead of rescuing every
  # StandardError, so that unrelated failures are no longer hidden.
  datetime_attribute = node.attributes['datetime']
  instance.datetime = datetime_attribute ? datetime_attribute.content : nil
  instance
end
generate_signature(node) click to toggle source

@api internal @param node [Nokogiri::XML::Node] The XML node containing file information @return [FileSignature] The {FileSignature} object generated from the XML data

# File lib/stanford/content_inventory.rb, line 64
# Builds a file signature from a <file> node of the contentMetadata.
#
# The size comes from the node's "size" attribute. Each child <checksum>
# element is matched on its upper-cased "type" attribute (MD5, SHA1/SHA-1,
# SHA256/SHA-256); checksums of any other type are ignored.
#
# @param node [Nokogiri::XML::Node] the XML node containing file information
# @return [FileSignature] the signature populated from the node
def generate_signature(node)
  Moab::FileSignature.new.tap do |sig|
    sig.size = node.attributes['size'].content
    node.xpath('checksum').each do |checksum_node|
      algorithm = checksum_node.attributes['type'].content.upcase
      if algorithm == 'MD5'
        sig.md5 = checksum_node.text
      elsif %w[SHA1 SHA-1].include?(algorithm)
        sig.sha1 = checksum_node.text
      elsif %w[SHA256 SHA-256].include?(algorithm)
        sig.sha256 = checksum_node.text
      end
    end
  end
end
group_from_cm(content_metadata, subset) click to toggle source

@api external @param content_metadata [String] The contentMetadata as a string @param subset [String] Specifies which subset of files to list (all|preserve|publish|shelve) @return [FileGroup] The {FileGroup} object generated from a contentMetadata instance @example {include:file:spec/features/stanford/content_metadata_read_spec.rb}

# File lib/stanford/content_inventory.rb, line 37
# Converts contentMetadata XML into a "content" file group.
#
# The XML is parsed and validated, then the <file> elements selected by
# +subset+ (matched case-insensitively) are each turned into a signature and
# an instance that are added to the group.
#
# @param content_metadata [String] the contentMetadata as a string
# @param subset [String] which files to list (all|preserve|publish|shelve)
# @return [FileGroup] the group generated from the contentMetadata
# @raise [Moab::MoabRuntimeError] if +subset+ is not one of the known values
def group_from_cm(content_metadata, subset)
  parsed = Nokogiri::XML(content_metadata)
  validate_content_metadata(parsed)

  xpath_for_subset = {
    'preserve' => "//file[@preserve='yes']",
    'publish' => "//file[@publish='yes']",
    'shelve' => "//file[@shelve='yes']",
    'all' => '//file'
  }
  query = xpath_for_subset.fetch(subset.to_s.downcase) do
    raise(Moab::MoabRuntimeError, "Unknown disposition subset (#{subset})")
  end
  file_nodes = parsed.xpath(query)

  group = Moab::FileGroup.new(group_id: 'content', data_source: "contentMetadata-#{subset}")
  file_nodes.each do |file_node|
    group.add_file_instance(generate_signature(file_node), generate_instance(file_node))
  end
  group
end
inventory_from_cm(content_metadata, object_id, subset, version_id = nil) click to toggle source

@param content_metadata [String] The content metadata to be transformed into a versionInventory @param object_id [String] The identifier of the digital object @param subset [String] Specifies which subset of files to list (all|preserve|publish|shelve) @param version_id [Integer] The ID of the version whose content metadata is to be transformed @return [FileInventory] The versionInventory equivalent of the contentMetadata

If the supplied content_metadata is blank or empty, then a skeletal FileInventory will be returned.
# File lib/stanford/content_inventory.rb, line 20
# Wraps the file group derived from contentMetadata in a 'version' inventory.
#
# @param content_metadata [String] the content metadata to be transformed
# @param object_id [String] the identifier of the digital object
# @param subset [String] which files to list (all|preserve|publish|shelve)
# @param version_id [Integer] the version whose content metadata is transformed
# @return [FileInventory] an inventory whose only group is the content group
def inventory_from_cm(content_metadata, object_id, subset, version_id = nil)
  # A contentMetadata datastream is not required for ingest: some object types
  # (collections, APOs) do not need one. Many such objects carry a
  # contentMetadata with no child elements, for example
  #    <contentMetadata objectId="bd608mj3166" type="file"/>
  # while others have no datastream of this name at all.
  # NOTE(review): handling of nil/blank input relies on how Nokogiri parses it
  # inside group_from_cm -- confirm.
  Moab::FileInventory.new(type: 'version', digital_object_id: object_id, version_id: version_id).tap do |inventory|
    group = group_from_cm(content_metadata, subset)
    inventory.groups << group
  end
end
remediate_checksum_nodes(file_node, signature) click to toggle source

@param [Nokogiri::XML::Element] file_node the File stanza being remediated @param [FileSignature] signature the fixity data for the file from the FileGroup @return [void] update the file’s checksum elements if data missing, raise exception if inconsistent

# File lib/stanford/content_inventory.rb, line 212
# Fills in, or cross-checks, the <checksum> children of a <file> element.
#
# Reads the @type_for_name and @names_for_type lookup tables, which
# remediate_content_metadata assigns before calling this method.
#
# @param file_node [Nokogiri::XML::Element] the file stanza being remediated
# @param signature [FileSignature] the fixity data for the file from the FileGroup
# @return [void]
# @raise [Moab::MoabRuntimeError] if an existing checksum value differs from the signature's
def remediate_checksum_nodes(file_node, signature)
  # index the <checksum> elements already present by checksum type
  nodes_by_type = file_node.xpath('checksum').each_with_object({}) do |existing, found|
    found[@type_for_name[existing['type']]] = existing
  end

  # append an empty <checksum> element, labelled with the type's first name,
  # for every known type that has none yet
  @names_for_type.each do |type, names|
    next if nodes_by_type.key?(type)

    created = Nokogiri::XML::Element.new('checksum', file_node.document)
    created['type'] = names[0]
    file_node << created
    nodes_by_type[type] = created
  end

  # blank elements receive the signature's value; non-blank ones must match it
  nodes_by_type.each do |type, node|
    recorded = node.content
    expected = signature.checksums[type]
    if recorded.nil? || recorded.empty?
      node.content = expected
    elsif recorded != expected
      raise(Moab::MoabRuntimeError, "Inconsistent #{type} for #{file_node['id']}: #{recorded} != #{expected}")
    end
  end
end
remediate_content_metadata(content_metadata, content_group) click to toggle source

@param content_metadata [String] The contentMetadata as a string @param content_group [FileGroup] The {FileGroup} object used as the fixity data source @return [String] Returns a remediated copy of the contentMetadata with fixity data filled in @see blog.slashpoundbang.com/post/1454850669/how-to-pretty-print-xml-with-nokogiri

# File lib/stanford/content_inventory.rb, line 179
# Returns a copy of the contentMetadata with size and checksum data filled in
# from the file group's signatures.
#
# @param content_metadata [String] the contentMetadata as a string
# @param content_group [FileGroup] the file group used as the fixity data source
# @return [String, nil] nil when +content_metadata+ is nil; the input unchanged
#   when +content_group+ is nil or has no files; otherwise the remediated XML
#   serialized with a two-space indent
def remediate_content_metadata(content_metadata, content_group)
  return nil if content_metadata.nil?
  return content_metadata if content_group.nil? || content_group.files.empty?

  signatures = content_group.path_hash
  # lookup tables read by remediate_checksum_nodes
  @type_for_name = Moab::FileSignature.checksum_type_for_name
  @names_for_type = Moab::FileSignature.checksum_names_for_type

  # noblanks is set on parse; see the pretty-printing reference in the method docs
  doc = Nokogiri::XML(content_metadata) { |config| config.noblanks }
  doc.xpath('//file').each do |file_node|
    # NOTE(review): presumably nil when the file id is absent from content_group;
    # both helpers below dereference the signature -- confirm intended handling.
    signature = signatures[file_node['id']]
    remediate_file_size(file_node, signature)
    remediate_checksum_nodes(file_node, signature)
  end
  doc.to_xml(indent: 2)
end
remediate_file_size(file_node, signature) click to toggle source

@param [Nokogiri::XML::Element] file_node the File stanza being remediated @param [FileSignature] signature the fixity data for the file from the FileGroup @return [void] update the file size attribute if missing, raise exception if inconsistent

# File lib/stanford/content_inventory.rb, line 200
# Fills in, or cross-checks, the "size" attribute of a <file> element.
#
# A missing or empty size is replaced by the signature's size as a string.
# A size that is present must equal the signature's size as a string.
#
# @param file_node [Nokogiri::XML::Element] the file stanza being remediated
# @param signature [FileSignature] the fixity data for the file from the FileGroup
# @return [void]
# @raise [Moab::MoabRuntimeError] if the recorded size differs from the signature's
def remediate_file_size(file_node, signature)
  recorded = file_node['size']
  return (file_node['size'] = signature.size.to_s) if recorded.nil? || recorded.empty?
  return if recorded == signature.size.to_s

  raise(Moab::MoabRuntimeError, "Inconsistent size for #{file_node['id']}: #{recorded} != #{signature.size}")
end
validate_content_metadata(content_metadata) click to toggle source

@param content_metadata [String, Nokogiri::XML::Document] The contentMetadata as a string or XML doc @return [Boolean] True if contentMetadata has essential file attributes, else raise exception

# File lib/stanford/content_inventory.rb, line 129
# Checks that the contentMetadata passes validate_content_metadata_details.
#
# @param content_metadata [String, Nokogiri::XML::Document] the contentMetadata
#   as a string or XML doc
# @return [Boolean] true when no problems are reported
# @raise [Moab::InvalidMetadataException] carrying the first reported problem
#   followed by " ..."
def validate_content_metadata(content_metadata)
  problems = validate_content_metadata_details(content_metadata)
  return true if problems.empty?

  raise Moab::InvalidMetadataException, "#{problems.first} ..."
end
validate_content_metadata_details(content_metadata) click to toggle source

@param content_metadata [String, Pathname, Nokogiri::XML::Document] The contentMetadata as a string, a path to a file containing it, or an XML doc @return [Array<String>] List of problems found

# File lib/stanford/content_inventory.rb, line 138
# Lists the essential data missing from each <file> element of the contentMetadata.
#
# Every <file> must have "id" and "size" attributes plus MD5 and SHA-1
# <checksum> children (checksum type names are compared case-insensitively).
# A file lacking an id is reported by its position among the <file> nodes;
# any other incomplete file is reported by its id.
#
# @param content_metadata [String, Pathname, Nokogiri::XML::Document] the
#   contentMetadata as XML text, a path to read it from, or a parsed document;
#   the input is recognised by its exact class name
# @return [Array<String>] one message per incomplete <file>; empty when all are complete
# @raise [Moab::InvalidMetadataException] if the input is of any other class
def validate_content_metadata_details(content_metadata)
  content_metadata_doc =
    case content_metadata.class.name
    when 'String'
      Nokogiri::XML(content_metadata)
    when 'Pathname'
      Nokogiri::XML(content_metadata.read)
    when 'Nokogiri::XML::Document'
      content_metadata
    else
      raise Moab::InvalidMetadataException, 'Content Metadata is in unrecognized format'
    end

  problems = []
  # each_with_index supplies the position directly, avoiding a search of the
  # node set for every file that lacks an id
  content_metadata_doc.xpath('//file').each_with_index do |file_node, position|
    missing = %w[id size md5 sha1]
    missing.delete('id') if file_node.has_attribute?('id')
    missing.delete('size') if file_node.has_attribute?('size')
    file_node.xpath('checksum').each do |checksum_node|
      type_attribute = checksum_node.attributes['type']
      # A <checksum> without a type attribute satisfies no requirement; skip it
      # so the gap is reported as a problem rather than raising NoMethodError.
      next if type_attribute.nil?

      case type_attribute.content.upcase
      when 'MD5'
        missing.delete('md5')
      when 'SHA1', 'SHA-1'
        missing.delete('sha1')
      end
    end
    next if missing.empty?

    subject = if missing.include?('id')
                "File node #{position}"
              else
                "File node having id='#{file_node['id']}'"
              end
    problems << "#{subject} is missing #{missing.join(',')}"
  end
  problems
end