class NewspaperWorks::Ingest::PdfPages

Public Class Methods

new(path) click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 10
# Set up state for one source PDF.
#
# A fresh UUID is stored in @baseid (it is used when naming generated page
# files), the source path is kept in @pdfpath, and every other instance
# variable starts out nil until another method fills it in.
#
# @param path [String] path to the source PDF
def initialize(path)
  @pdfpath = path
  @baseid = SecureRandom.uuid
  # Not computed yet; set later by other methods of this class.
  @info = @entries = @tmpdir = nil
  @size = @pagecount = @pdftext = nil
end

Public Instance Methods

colordevice(channels, bpc) click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 32
# Name of the Ghostscript color TIFF device for a given pixel layout.
#
# Output is either 24-bit (8 bpc) or 48-bit (16 bpc) color TIFF. Any other
# total bit depth, such as 32-bit CMYK, is rendered as 24-bit (8 bpc RGB).
#
# @param channels [Integer] number of color channels
# @param bpc [Integer] bits per channel
# @return [String] "tiff24nc" or "tiff48nc"
def colordevice(channels, bpc)
  depth = bpc * channels
  supported = depth == 24 || depth == 48
  "tiff#{supported ? depth : 24}nc"
end
each() { |e| ... } click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 107
# Iterate over the per-page entries returned by +entries+.
#
# Without a block an Enumerator is returned instead of raising
# LocalJumpError, so callers can chain (e.g. +each.with_index+).
#
# @yieldparam e [Object] one element of +entries+
# @return [Enumerator] when no block is given
# @return [Object] the result of +entries.each+ when a block is given
def each
  return enum_for(:each) unless block_given?

  entries.each { |e| yield(e) }
end
entries() click to toggle source

entries for each page

# File lib/newspaper_works/ingest/pdf_pages.rb, line 102
# Entries for each page, computed by +gsconvert+ on first access and
# cached in @entries for subsequent calls.
#
# @return [Object] whatever +gsconvert+ returned
def entries
  @entries ||= gsconvert
end
gsconvert() click to toggle source

ghostscript convert all pages to TIFF

# File lib/newspaper_works/ingest/pdf_pages.rb, line 88
# Ghostscript-convert all pages of the PDF to TIFF files under +tmpdir+.
#
# The command is handed to Open3 as an argument list rather than one
# string, so a PDF path or temp directory containing spaces or shell
# metacharacters reaches gs as a single, uninterpreted argument.
#
# Side effect: @size is set to the number of stdout lines that begin
# with "Page ".
#
# @return [Array<String>] expected TIFF filename for each counted page
def gsconvert
  outdir = tmpdir
  output_base = File.join(outdir, "#{@baseid}-page%d.tiff")
  cmd = [
    'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}",
    '-dTextAlphaBits=4',
    "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s
  ]
  # capture3 reads stdout and stderr concurrently, so gs cannot block on
  # a full stderr pipe that nobody is reading.
  stdout, _stderr, _status = Open3.capture3(*cmd)
  @size = stdout.lines.count { |line| line.start_with?('Page ') }
  # Return an array of expected filenames
  (1..@size).map { |n| File.join(outdir, "#{@baseid}-page#{n}.tiff") }
end
gsdevice() click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 40
# Choose the Ghostscript output device from the color description
# reported by +pdfinfo.color+ (destructured as color, channels, bpc).
#
# @return [String] 'tiffg4', 'tiffgray', or the result of +colordevice+
def gsdevice
  color, channels, bpc = pdfinfo.color
  # Anything that is not grayscale is rendered as color:
  return colordevice(channels, bpc) unless color == 'gray'
  # CCITT Group 4 black and white for 1-bit gray:
  return 'tiffg4' if bpc == 1
  # Grayscale TIFF for deeper gray:
  return 'tiffgray' if bpc > 1

  colordevice(channels, bpc)
end
gstext() click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 52
# Run Ghostscript's txtwrite device over the PDF and return what it
# writes to stdout, also storing it in @pdftext.
#
# The command is passed as an argument list (no shell string), so a path
# with spaces or shell metacharacters is delivered to gs unchanged.
#
# @return [String] stdout of the gs txtwrite run
def gstext
  cmd = [
    'gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=txtwrite',
    '-sOutputFile=-', '-f', @pdfpath.to_s
  ]
  # capture3 also drains stderr, so gs cannot block on an unread pipe.
  @pdftext, _stderr, _status = Open3.capture3(*cmd)
  @pdftext
end
looks_scanned() click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 71
# Heuristic: does this PDF look like scanned media?
#
# True only when the number of entries in +pdfinfo.entries+ equals
# +pagecount+ (one image per page) and +pdfinfo.width * pdfinfo.height+
# exceeds 10 * 1024 * 1024 pixels.
#
# @return [Boolean]
def looks_scanned
  pixel_area = pdfinfo.width * pdfinfo.height
  image_total = pdfinfo.entries.length
  # Single image per page?
  return false unless image_total == pagecount

  # ...and is that image 10mp+?
  pixel_area > 1024 * 1024 * 10
end
pagecount() click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 61
# Page count as reported by the +pdfinfo+ command-line tool; also stored
# in @pagecount. The tool is run on every call.
#
# The command is passed as an argument list (no shell string), so a path
# with spaces or shell metacharacters is delivered to pdfinfo unchanged.
#
# NOTE: if the output has no line starting with "Pages:", +pages_line+ is
# nil and this raises NoMethodError, as the previous implementation did.
#
# @return [Integer] value of the "Pages:" line
def pagecount
  # capture3 also drains stderr, so pdfinfo cannot block on an unread pipe.
  stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
  pages_line = stdout.lines.find { |line| line.start_with?('Pages:') }
  @pagecount = pages_line.split.last.to_i
end
pdfinfo() click to toggle source

Returns the PdfImages object for this PDF, creating it on first access.

# File lib/newspaper_works/ingest/pdf_pages.rb, line 22
# PdfImages object for this PDF, built from @pdfpath on first access and
# cached in @info afterwards.
#
# @return [PdfImages]
def pdfinfo
  @info ||= PdfImages.new(@pdfpath)
end
ppi() click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 78
# Resolution to use for this PDF.
#
# For scanned-looking media, defer to the image PPI from +pdfinfo.ppi+;
# for anything else, use a fixed 400 dpi.
#
# @return [Object] +pdfinfo.ppi+ when +looks_scanned+ is truthy, else 400
def ppi
  looks_scanned ? pdfinfo.ppi : 400
end
tmpdir() click to toggle source
# File lib/newspaper_works/ingest/pdf_pages.rb, line 27
# Temporary working directory, created with Dir.mktmpdir on first access
# and cached in @tmpdir afterwards.
#
# @return [String] path of the temporary directory
def tmpdir
  @tmpdir ||= Dir.mktmpdir
end