class NewspaperWorks::Ingest::PdfImages
PdfImages
Uses the poppler 0.19+ pdfimages command to extract image
listing metadata from PDF files. For DPI extraction, falls back to calculating the value with MiniMagick, if necessary.
Constants
- COL_BITS
- COL_CHANNELS
- COL_COLOR
- COL_HEIGHT
- COL_WIDTH
class constant column numbers
- COL_XPPI
only poppler 0.25+ has this column in output:
Public Class Methods
new(path)
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 20
require 'shellwords'

# Prepares (but does not yet run) the `pdfimages -list` command for a PDF
# and resets the cached output and parsed entries.
#
# The command is stored as a single string, so the path is shell-escaped:
# without that, a path containing spaces or shell metacharacters would be
# split into several arguments or interpreted by the shell.
#
# @param path [String] path to the PDF file to inspect
def initialize(path)
  @path = path
  @cmd = format('pdfimages -list %<path>s', path: Shellwords.escape(path.to_s))
  @output = nil
  @entries = nil
end
Public Instance Methods
color()
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 62
# Summarizes color information across all listed images.
#
# The description is 'gray' only when every entry's color column is 'gray';
# any other value in any entry yields 'rgb'. Channels and bits are the
# largest integer values found in their columns, so for mixed color spaces
# the maximum is reported. 1-bit gray is black/white, which is why the bit
# depth is returned alongside the description.
#
# NOTE(review): a 'cmyk' entry is reported as 'rgb' here; confirm that
# this is intended.
#
# @return [Array] [description, maximum channels, maximum bits]
def color
  rows = entries
  all_gray = rows.all? { |row| row[COL_COLOR] == 'gray' }
  description = all_gray ? 'gray' : 'rgb'
  max_channels = rows.map { |row| row[COL_CHANNELS].to_i }.max
  max_bits = rows.map { |row| row[COL_BITS].to_i }.max
  [description, max_channels, max_bits]
end
entries()
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 37
# Parsed rows of the image listing: one array of column strings for each
# line returned by +process+. The result is computed once and cached in
# @entries.
#
# @return [Array<Array<String>>] empty when +process+ yields no lines
def entries
  # Array() turns a nil result from process into an empty list instead of
  # raising. split(' ') splits on runs of whitespace and ignores leading
  # and trailing whitespace, so no separate squeeze/strip pass is needed.
  @entries ||= Array(process).map { |line| line.split(' ') }
end
height()
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 58
# Largest value in the height column across all listed images, with each
# cell converted via to_i.
#
# @return [Integer, nil] nil when entries is empty
def height
  heights = selectcolumn(COL_HEIGHT) { |cell| cell.to_i }
  heights.max
end
ppi()
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 72
# Resolution of the listed images in pixels per inch.
#
# When the listing rows have more than 12 columns (poppler 0.25+), the
# largest value of the x-ppi column is returned. Otherwise the value is
# derived from the maximum image pixel width and the width MiniMagick
# reports for the PDF.
#
# @return [Integer, nil] nil when the listing contains no image rows
def ppi
  first_row = entries.first
  # No images listed: return nil, as width and height do for an empty
  # listing, instead of raising on a nil row.
  return nil if first_row.nil?
  # with poppler 0.25+, pdfimages just gives us this:
  return selectcolumn(COL_XPPI, &:to_i).max if first_row.size > 12

  # poppler < 0.25: compute from pixel width versus page width.
  # NOTE(review): assumes MiniMagick reports the PDF width in points
  # (72 per inch); confirm against the MiniMagick density in use.
  pdf = MiniMagick::Image.open(@path)
  width_points = pdf.width
  width_px = width
  (72 * width_px / width_points).to_i
end
process()
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 27
# Runs the prepared command (at most once; the raw lines are cached in
# @output) and returns its output lines without the first two lines,
# presumably the column header and its separator rule.
#
# @return [Array<String>] data lines; empty when the command printed
#   fewer than three lines
def process
  # call just once
  if @output.nil?
    # capture3 drains stdout and stderr together, so a command that writes
    # a lot to stderr cannot fill that pipe and stall the stdout read.
    stdout, _stderr, _status = Open3.capture3(@cmd)
    @output = stdout.split("\n")
  end
  # drop(2) returns [] for short or empty output, where slicing with a
  # computed length would return nil.
  @output.drop(2)
end
selectcolumn(i, &block)
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 48
# Collects column +i+ from every entry. When a block is given, each
# collected cell is replaced by the block's result.
#
# @param i [Integer] column index within each entry
# @return [Array] one value per entry
def selectcolumn(i, &block)
  column = entries.map { |row| row[i] }
  column.map!(&block) if block_given?
  column
end
width()
click to toggle source
# File lib/newspaper_works/ingest/pdf_images.rb, line 54
# Largest value in the width column across all listed images, with each
# cell converted via to_i.
#
# @return [Integer, nil] nil when entries is empty
def width
  widths = selectcolumn(COL_WIDTH) { |cell| cell.to_i }
  widths.max
end