class NewspaperWorks::Ingest::BatchIssueIngester
Attributes
issues[RW]
lccn[RW]
opts[RW]
path[RW]
publication[RW]
Public Class Methods
new(path, opts = {})
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 16
# Set up a batch ingester for the issues found at +path+.
#
# The LCCN comes from opts[:lccn] when that key is non-nil; otherwise it is
# derived from +path+ via lccn_from_path. Either way it is passed through
# normalize_lccn before being stored.
#
# @param path [String] location of the batch to ingest
# @param opts [Hash] ingest options; only :lccn is read here, the whole hash
#   is kept in @opts for later use by other methods
def initialize(path, opts = {})
  @path = path
  explicit_lccn = opts[:lccn]
  raw_lccn =
    if explicit_lccn.nil?
      lccn_from_path(path)
    else
      explicit_lccn
    end
  @lccn = normalize_lccn(raw_lccn)
  # Publication info for this LCCN (original note: fetched from an
  # authority web service):
  @publication = NewspaperWorks::Ingest::PublicationInfo.new(@lccn)
  # Enumerable of issues; concrete type is chosen by issue_enumerator:
  @issues = issue_enumerator
  @opts = opts
  configure_logger('ingest')
end
Public Instance Methods
create_issue(issue_data)
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 44
# Create and save a NewspaperIssue work for one issue of the batch, log what
# was created, and link the new issue to its publication.
#
# @param issue_data [Object] source issue; must respond to publication_date
#   and edition_number, and is handed to copy_issue_metadata
# @return [NewspaperIssue] the newly created issue work
def create_issue(issue_data)
  NewspaperIssue.create.tap do |new_issue|
    copy_issue_metadata(issue_data, new_issue)
    NewspaperWorks::Ingest.assign_administrative_metadata(new_issue, @opts)
    new_issue.save!
    # One log entry, four lines: header plus tab-indented details.
    summary = [
      'Created new NewspaperIssue work with date, lccn, edition metadata:',
      "\tLCCN: #{@lccn}",
      "\tPublication Date: #{issue_data.publication_date}",
      "\tEdition number: #{issue_data.edition_number}"
    ].join("\n")
    write_log(summary)
    link_publication(new_issue)
  end
end
create_page(page_image, issue)
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 67
# Create a NewspaperPage work for one page image, append it to the issue's
# ordered members, and attach the page's source file.
#
# @param page_image [Object] page source; must respond to title, page_number
#   and path
# @param issue [NewspaperIssue] parent issue that receives the page
# @return [Object] whatever attach_file returns
def create_page(page_image, issue)
  new_page = NewspaperPage.create.tap do |page|
    page.title = page_image.title
    page.page_number = page_image.page_number
    page.save!
  end
  # Page becomes a child of the issue through its ordered members:
  issue.ordered_members << new_page
  NewspaperWorks::Ingest.assign_administrative_metadata(new_page, @opts)
  issue.save!
  # A JP2 source is converted to TIFF first; any other file is attached as-is:
  source = page_image.path
  source = make_tiff(source) if page_image.path.end_with?('jp2')
  attach_file(new_page, source)
end
ingest()
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 113
# Ingest every issue yielded by @issues, logging the start and end of the run.
#
# For each (path, issue_data) pair an issue work is created via create_issue;
# the issue is then ingested either from its PDF (ingest_pdf) or from its page
# images (ingest_pages), according to ingest_type.
#
# @return [Object] the return value of the final write_log call
def ingest
  write_log("Beginning issue(s) batch ingest for #{@path}")
  write_log("\tPublication: #{@publication.title} (LCCN: #{@lccn})")
  # The tactic depends only on the enumerator's class, so resolve it once
  # rather than on every iteration:
  tactic = ingest_type
  @issues.each do |path, issue_data|
    issue = create_issue(issue_data)
    case tactic
    when 'issue_pdf' then ingest_pdf(issue, path)
    when 'page_image' then ingest_pages(issue, issue_data)
    end
  end
  write_log(
    "Issue ingest completed for LCCN #{@lccn}. Asynchronous jobs "\
    "may still be creating derivatives for issue, and child page works."
  )
end
ingest_pages(issue, issue_data)
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 82
# Ingest an issue from its page images: create one page work per image, then
# queue a job that composes the issue-level PDF.
#
# @param issue [NewspaperIssue] issue work that receives the pages
# @param issue_data [#each_value] page images for the issue, iterated in the
#   collection's own order (original note: lexical)
# @return [Object] whatever ComposeIssuePDFJob.perform_later returns
def ingest_pages(issue, issue_data)
  issue_data.each_value do |image|
    create_page(image, issue)
  end
  # Per the original author: this job is retryable and cannot succeed until
  # each page has its PDF derivative, after which it should complete.
  NewspaperWorks::ComposeIssuePDFJob.perform_later(issue)
end
ingest_pdf(issue, path)
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 60
# Ingest an issue from a single PDF: attach the PDF to the issue work, then
# queue the job that creates its pages.
#
# @param issue [NewspaperIssue] issue work that receives the PDF
# @param path [String] path to the issue's PDF file
# @return [Object] whatever CreateIssuePagesJob.perform_later returns
def ingest_pdf(issue, path)
  # The PDF is the issue's primary file:
  attach_file(issue, path)
  # The job takes a list of PDF paths; the last two arguments are passed as nil:
  pdf_paths = [path]
  CreateIssuePagesJob.perform_later(issue, pdf_paths, nil, nil)
end
ingest_type()
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 108
# Name the ingest tactic implied by the issue enumerator held in @issues.
#
# Only an exact NewspaperWorks::Ingest::PDFIssues instance selects the PDF
# tactic; anything else (including subclasses) selects page images.
#
# @return [String] 'issue_pdf' or 'page_image'
def ingest_type
  @issues.instance_of?(NewspaperWorks::Ingest::PDFIssues) ? 'issue_pdf' : 'page_image'
end
issue_enumerator()
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 28
# Build the issue enumerator for this batch, choosing the implementation by
# the media type detected at +path+.
#
# @return [NewspaperWorks::Ingest::ImageIngestIssues,
#   NewspaperWorks::Ingest::PDFIssues] ImageIngestIssues when detect_media
#   reports 'image', PDFIssues otherwise
def issue_enumerator
  enumerator_class =
    if detect_media(path) == 'image'
      NewspaperWorks::Ingest::ImageIngestIssues
    else
      NewspaperWorks::Ingest::PDFIssues
    end
  enumerator_class.new(path, publication)
end
link_publication(issue)
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 35
# Associate an issue with the publication for this batch's LCCN, delegating
# to find_or_create_publication_for_issue.
#
# @param issue [NewspaperIssue] issue work to link
# @return [Object] whatever find_or_create_publication_for_issue returns
def link_publication(issue)
  publication_title = @publication.title
  find_or_create_publication_for_issue(issue, @lccn, publication_title, @opts)
end
make_tiff(path)
click to toggle source
# File lib/newspaper_works/ingest/batch_issue_ingester.rb, line 91
# Convert a JP2 source image to a TIFF written into a fresh temporary
# directory, using the external opj_decompress command.
#
# The command is run as an argument vector (no shell), so paths containing
# spaces or shell metacharacters are passed through verbatim. Both output
# streams are drained, so the child cannot block on a full stdout pipe.
#
# @param path [String] path to the source file; must end with 'jp2'
# @return [String] path of the TIFF file inside a new temporary directory
# @raise [ArgumentError] if +path+ does not end with 'jp2'
# @raise [RuntimeError] if opj_decompress writes to stderr or exits non-zero
def make_tiff(path)
  raise ArgumentError, "Expected a JP2 file path, got: #{path}" unless path.end_with?('jp2')
  # Output lands under the system temp dir, so add it to Hyrax's ingest
  # whitelist (presumably required for the later file attachment — confirm):
  Hyrax.config.whitelisted_ingest_dirs |= [Dir.tmpdir]
  name = File.basename(path).split('.')[0]
  # OpenJPEG2000 has weird quirk, only likes 3-char file ext TIF:
  tiff_path = File.join(Dir.mktmpdir, "#{name}.tif")
  _stdout, stderr, status = Open3.capture3(
    'opj_decompress', '-i', path, '-o', tiff_path
  )
  # Any stderr output is treated as failure, as is a non-zero exit status:
  unless status.success? && stderr.strip.empty?
    msg = "Error converting JP2 to TIFF: #{path}"
    write_log(msg, Logger::ERROR)
    raise msg
  end
  tiff_path
end