class NewspaperWorks::Ingest::PdfImages

PdfImages uses the poppler 0.19+ pdfimages command to extract image
listing metadata from PDF files.
For dpi extraction, falls back to calculating using MiniMagick,
if necessary.

Constants

COL_BITS
COL_CHANNELS
COL_COLOR
COL_HEIGHT
COL_WIDTH

class constant column numbers

COL_XPPI

Only poppler 0.25+ includes this column in its output.

Public Class Methods

new(path) click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 20
# Set up an image-listing reader for a single PDF.
#
# Builds (but does not run) the `pdfimages -list` command line and resets
# the memo slots for the raw command output and the parsed rows.
#
# @param path [String, #to_s] path to the PDF file to inspect
def initialize(path)
  # Required here because no file-level require for it is visible from
  # this method; a repeated require is a cheap no-op.
  require 'shellwords'
  @path = path
  # @cmd is one command string, so the path is shell-escaped: otherwise a
  # path containing spaces or shell metacharacters would be split or
  # interpreted instead of reaching pdfimages as a single argument.
  @cmd = format('pdfimages -list %<path>s', path: Shellwords.escape(path))
  @output = nil
  @entries = nil
end

Public Instance Methods

color() click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 62
# Summarise the colour characteristics across all listed images.
#
# @return [Array] three items: a colour description, the largest channel
#   count, and the largest bit depth found in the listing
def color
  rows = entries
  # The description is 'gray' only when every image is gray; any other
  # colour value in the listing is reported as 'rgb'.
  # NOTE(review): a 'cmyk' image is therefore also reported as 'rgb' —
  # confirm that callers expect this.
  # 1-bit gray means black/white, which is why channels and bits are
  # returned alongside the description.
  all_gray = rows.all? { |row| row[COL_COLOR] == 'gray' }
  label = all_gray ? 'gray' : 'rgb'
  max_channels = rows.map { |row| row[COL_CHANNELS].to_i }.max
  max_bits = rows.map { |row| row[COL_BITS].to_i }.max
  [label, max_channels, max_bits]
end
entries() click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 37
# Parsed rows of the image listing, one array of column strings per line
# returned by #process.
#
# The rows are computed on first use and memoized in @entries.
#
# @return [Array<Array<String>>] whitespace-separated fields of each line;
#   empty when #process yields no lines
def entries
  # The memo is assigned only after parsing succeeds, so an exception
  # raised by #process cannot leave a misleading empty list cached.
  # Array() tolerates #process returning nil (no usable output).
  # split(' ') ignores leading/trailing whitespace and collapses runs of
  # whitespace, so no separate normalisation pass is needed.
  @entries ||= Array(process).map { |line| line.split(' ') }
end
height() click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 58
# Largest value in the height column across all listed images.
#
# @return [Integer, nil] maximum height; nil when there are no rows
def height
  heights = selectcolumn(COL_HEIGHT) { |value| value.to_i }
  heights.max
end
ppi() click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 72
# Resolution of the PDF's images in pixels per inch.
#
# @return [Integer, nil] the largest value in the x-ppi column, or an
#   estimate computed via MiniMagick when the listing has 12 or fewer
#   columns; nil when the listing has no image rows
def ppi
  rows = entries
  # A listing without image rows has no first row to inspect; return nil,
  # as #width and #height do for an empty listing, instead of failing
  # with NoMethodError on rows[0].
  return if rows.empty?
  if rows.first.size <= 12
    # poppler < 0.25: the listing has no ppi column, so estimate instead.
    # NOTE(review): this treats the width MiniMagick reports for the PDF
    # as a size in points (72 per inch) — confirm.
    pdf = MiniMagick::Image.open(@path)
    width_points = pdf.width
    width_px = width
    return (72 * width_px / width_points).to_i
  end
  # with poppler 0.25+, pdfimages just gives us this:
  selectcolumn(COL_XPPI, &:to_i).max
end
process() click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 27
# Run the prepared pdfimages command (at most once per instance) and
# return its output lines with the first two lines removed.
#
# The first two lines are presumably the column header and the dashed
# separator printed by `pdfimages -list` — confirm against poppler output.
#
# @return [Array<String>] remaining output lines; empty when the command
#   printed two lines or fewer
def process
  # capture3 reads stdout and stderr to completion, so the child cannot
  # block on a full, never-read stderr pipe; the raw lines are memoized.
  @output ||= begin
    stdout, _stderr, _status = Open3.capture3(@cmd)
    stdout.split("\n")
  end
  # drop(2) yields [] for short output, where slice(2, size - 1) gave nil.
  @output.drop(2)
end
selectcolumn(i, &block) click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 48
# Extract one column from every parsed row.
#
# @param i [Integer] index of the column to take from each row
# @yield [value] optional conversion applied to every extracted value
# @return [Array] the column values, converted by the block when given
def selectcolumn(i, &block)
  column = entries.map { |row| row[i] }
  block ? column.map(&block) : column
end
width() click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 54
# Largest value in the width column across all listed images.
#
# @return [Integer, nil] maximum width; nil when there are no rows
def width
  widths = selectcolumn(COL_WIDTH) { |value| value.to_i }
  widths.max
end