class NewspaperWorks::Ingest::BatchIssueIngester

Attributes

issues[RW]
lccn[RW]
opts[RW]
path[RW]
publication[RW]

Public Class Methods

new(path, opts = {}) click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 16
def initialize(path, opts = {})
  @path = path
  lccn = opts[:lccn]
  @lccn = normalize_lccn(lccn.nil? ? lccn_from_path(path) : lccn)
  # get publication info for LCCN from authority web service:
  @publication = NewspaperWorks::Ingest::PublicationInfo.new(@lccn)
  # issues for publication, as enumerable of PDFIssue
  @issues = issue_enumerator
  @opts = opts
  configure_logger('ingest')
end

Public Instance Methods

create_issue(issue_data) click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 44
def create_issue(issue_data)
  issue = NewspaperIssue.create
  copy_issue_metadata(issue_data, issue)
  NewspaperWorks::Ingest.assign_administrative_metadata(issue, @opts)
  issue.save!
  write_log(
    "Created new NewspaperIssue work with date, lccn, edition metadata:"\
    "\n"\
    "\tLCCN: #{@lccn}\n"\
    "\tPublication Date: #{issue_data.publication_date}\n"\
    "\tEdition number: #{issue_data.edition_number}"
  )
  link_publication(issue)
  issue
end
create_page(page_image, issue) click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 67
def create_page(page_image, issue)
  page = NewspaperPage.create
  page.title = page_image.title
  page.page_number = page_image.page_number
  page.save!
  # Link page as a child of issue, via ordered members:
  issue.ordered_members << page
  NewspaperWorks::Ingest.assign_administrative_metadata(page, @opts)
  issue.save!
  # Ensure we have a source TIFF file, attach to page:
  path = page_image.path
  path = page_image.path.end_with?('jp2') ? make_tiff(path) : path
  attach_file(page, path)
end
ingest() click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 113
def ingest
  write_log("Beginning issue(s) batch ingest for #{@path}")
  write_log("\tPublication: #{@publication.title} (LCCN: #{@lccn})")
  @issues.each do |path, issue_data|
    issue = create_issue(issue_data)
    tactic = ingest_type
    ingest_pdf(issue, path) if tactic == 'issue_pdf'
    ingest_pages(issue, issue_data) if tactic == 'page_image'
  end
  write_log(
    "Issue ingest completed for LCCN #{@lccn}. Asyncrhonous jobs "\
    "may still be creating derivatives for issue, and child page works."
  )
end
ingest_pages(issue, issue_data) click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 82
def ingest_pages(issue, issue_data)
  # Create pages in order they appear (lexical)
  issue_data.each_value { |page_image| create_page(page_image, issue) }
  # Make an issue PDF from constituent pages, via retryable async job,
  #   which will not succeed until the PDF derivatives are created
  #   for each page, but should eventually succeed on that condition:
  NewspaperWorks::ComposeIssuePDFJob.perform_later(issue)
end
ingest_pdf(issue, path) click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 60
def ingest_pdf(issue, path)
  # ingest primary PDF for issue:
  attach_file(issue, path)
  # queue page creation job:
  CreateIssuePagesJob.perform_later(issue, [path], nil, nil)
end
ingest_type() click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 108
def ingest_type
  return 'issue_pdf' if @issues.class == NewspaperWorks::Ingest::PDFIssues
  'page_image'
end
issue_enumerator() click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 28
def issue_enumerator
  impl = NewspaperWorks::Ingest::PDFIssues
  impl = NewspaperWorks::Ingest::ImageIngestIssues if detect_media(path) == 'image'
  # issue enumerator depends on detected media:
  impl.new(path, publication)
end
make_tiff(path) click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 91
def make_tiff(path)
  raise ArgumentError unless path.end_with?('jp2')
  Hyrax.config.whitelisted_ingest_dirs |= [Dir.tmpdir]
  name = File.basename(path).split('.')[0]
  # OpenJPEG2000 has weird quirk, only likes 3-char file ext TIF:
  tiff_path = File.join(Dir.mktmpdir, "#{name}.tif")
  cmd = "opj_decompress -i #{path} -o #{tiff_path}"
  Open3.popen3(cmd) do |_stdin, _stdout, stderr, _wait_thr|
    unless stderr.read.strip.empty?
      msg = "Error converting JP2 to TIFF: #{path}"
      write_log(msg, Logger::ERROR)
      raise msg
    end
  end
  tiff_path
end