module ActAsPageExtractor

:nocov:

Constants

EXTRACTING_STATES
FILE_STORAGE
PDF_STORAGE
TMP_EXTRACTION_FILE_STORAGE
VALIDATE_COMPRESS_TYPES
VALIDATE_DOC_TYPES
VERSION

Public Class Methods

start_extraction() click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 16
# Runs page extraction for every document whose page_extraction_state is
# still EXTRACTING_STATES[:new].
#
# Returns the collection produced by the `where` query.
def self.start_extraction
  pending_documents = document_class.where(page_extraction_state: EXTRACTING_STATES[:new])
  pending_documents.each { |document| document.page_extract! }
end
statistics() click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 20
# Builds a summary hash of extraction progress across all documents.
#
# Returned keys:
#   :total                 - count of all documents
#   :supported_documents   - documents whose page_extraction_doctype matches
#                            (ILIKE) any entry of VALIDATE_DOC_TYPES
#   :unsupported_documents - total minus supported
#   :states                - per-state counts keyed like EXTRACTING_STATES
#
# NOTE(review): ILIKE ANY(array[...]) is PostgreSQL syntax - confirm the
# target database. The patterns come from the VALIDATE_DOC_TYPES constant.
def self.statistics
  total = document_class.count

  like_patterns = VALIDATE_DOC_TYPES.map { |doc_type| "'%#{doc_type}%'" }.join(',')
  supported = document_class.where("page_extraction_doctype ILIKE ANY (array[#{like_patterns}])").count

  per_state = EXTRACTING_STATES.each_with_object({}) do |(state, value), counts|
    counts[state] = document_class.where(page_extraction_state: value).count
  end

  {
    total: total,
    supported_documents: supported,
    unsupported_documents: total - supported,
    states: per_state
  }
end

Public Instance Methods

cleanup_pages() click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 38
# Destroys every page record currently associated with this document
# through `extracted_pages`, and returns the result of `destroy_all`.
def cleanup_pages
  pages = extracted_pages
  pages.destroy_all
end
convert_to_pdf() click to toggle source
# File lib/act_as_page_extractor/modules/extracting.rb, line 7
# Makes sure a PDF version of the working document exists and stores its
# path in @pdf_path (also returned).
#
# * When @document_path already ends in ".pdf" (case-insensitive) the
#   document itself is used.
# * Otherwise Docsplit converts it into @tmp_dir under timeout_wrapper;
#   the expected result is the document path with its last extension
#   replaced by "pdf".
#
# @pdf_path is nil when conversion fails, times out, or the expected file
# is missing.
def convert_to_pdf
  @pdf_path =
    if @document_path.split('.').last.downcase == 'pdf'
      @document_path
    elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
      converted_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      converted_path if File.exist?(converted_path)
    end
end
convert_to_text() click to toggle source
# File lib/act_as_page_extractor/modules/extracting.rb, line 18
# Reads the page count of @pdf_path into @pdf_pages and, when there are
# pages, asks Docsplit to write per-page text files into @tmp_dir
# (OCR disabled), guarded by timeout_wrapper.
#
# Best effort by design: any StandardError is swallowed. @pdf_pages is reset
# to nil when text extraction reports failure. Always returns nil.
def convert_to_text
  @pdf_pages = PdfUtils.info(@pdf_path).pages
  return unless @pdf_pages

  text_extracted = timeout_wrapper do
    Docsplit.extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
  end
  # A failed or timed-out extraction means the page count cannot be trusted.
  @pdf_pages = nil unless text_extracted
  nil
# :nocov:
rescue
  nil
# :nocov:
end
debug_info() click to toggle source

:nocov:

# File lib/act_as_page_extractor/modules/tools.rb, line 43
# Debug helper: prints the working document path and the detected page
# count through `ap` (presumably awesome_print - confirm against the
# gem's dependencies). Returns whatever the last `ap` call returns.
#
# Other state that can be inspected here when debugging: @tmp_dir,
# @copy_document_path, @pdf_path.
def debug_info
  ap(@document_path)
  ap(@pdf_pages)
end
extract_pages() click to toggle source
# File lib/act_as_page_extractor/modules/extracting.rb, line 2
# Runs the two extraction stages in order: convert_to_pdf, then
# convert_to_text. Returns whatever convert_to_text returns.
def extract_pages
  convert_to_pdf
  convert_to_text
end
initialized() click to toggle source
# File lib/act_as_page_extractor.rb, line 54
# Prepares the record for an extraction run; at present this only ensures
# the PDF storage directory exists (see create_pdf_dir).
#
# TODO: register the remaining callbacks, e.g. remove the stored PDF when
#       the record is destroyed.
# TODO: document the setup in the README:
#         rails g act_as_page_extractor:migration Document category_id user_id
#       and add to the [Document] model:
#         has_many :extracted_pages, dependent: :destroy
def initialized
  create_pdf_dir
end
is_extracted() click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 16
# True when a positive page count was detected and the number of stored
# extracted_pages equals that count.
def is_extracted
  return false unless @pdf_pages.to_i > 0

  extracted_pages.count == @pdf_pages
end
origin_file_name() click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 2
# Returns the last path segment of the uploaded file's URL, i.e. the
# original file name, or nil when the URL is empty.
def origin_file_name
  file_url = send(:extracted_filename).url.to_s
  file_url.split('/').last
end
page_extract!() click to toggle source
# File lib/act_as_page_extractor.rb, line 65
# Runs the complete extraction pipeline for this document:
#
#   1. initialized / cleanup_pages / create_tmp_dir - prepare storage, drop
#      previously extracted pages, create a scratch directory
#   2. copy_document / unzip_document - bring the file into the scratch dir
#   3. extract_pages / save_to_db - only when valid_document is truthy
#   4. update_state / save_pdf / debug_info - always attempted
#   5. finish - always runs
#
# Exceptions raised by any step still propagate to the caller.
def page_extract!
  initialized
  cleanup_pages
  create_tmp_dir
  begin
    copy_document
    unzip_document
    if valid_document
      extract_pages
      save_to_db
    end
  ensure
    begin
      update_state
      save_pdf
      debug_info
    ensure
      # Inner ensure: a failure while recording state or saving the PDF
      # must not skip cleanup of the temporary directory.
      finish
    end
  end
end
pdf_path() click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 6
# Location of the stored PDF rendition of this document.
#
# Returns nil unless the document is in the :extracted state, and also nil
# when its recorded doctype is already "pdf" (case-insensitive). Otherwise
# returns "<pdf_storage>/<origin file name up to its first dot>.pdf".
def pdf_path
  return unless page_extraction_state == EXTRACTING_STATES[:extracted]
  return if page_extraction_doctype&.downcase == 'pdf'

  base_name = origin_file_name.split('.').first
  "#{pdf_storage}/#{base_name}.pdf"
end
remove_files() click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 12
# Deletes the stored PDF rendition (see pdf_path) when it exists on disk.
# Does nothing when pdf_path is nil or the file is absent.
def remove_files
  # Resolve the path once; pdf_path may be nil, hence the to_s.
  target = pdf_path.to_s
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.rm_rf(target) if File.exist?(target)
end
remove_last_byte(file_name) click to toggle source

Workaround for OpenOffice/JODConverter: removes the trailing form-feed byte ("\f") that the converter leaves at the end of each converted text page.

# File lib/act_as_page_extractor/modules/saving.rb, line 38
# Strips a single trailing form-feed ("\f") from the given file, if present.
#
# file_name - path of the text page to fix up in place.
#
# The file is opened in 'a+' mode, so a missing file is created empty
# rather than raising (callers read the file right afterwards). The block
# form of File.open guarantees the handle is closed even when seeking,
# reading or truncating raises. Returns nil.
def remove_last_byte(file_name)
  File.open(file_name, 'a+') do |file|
    size = file.size
    next if size.zero?

    file.seek(size - 1)
    file.truncate(size - 1) if file.getc == "\f"
  end
  nil
end
save_pdf() click to toggle source
# File lib/act_as_page_extractor/modules/saving.rb, line 2
# Copies the generated PDF (@pdf_path) into pdf_storage.
#
# The copy happens only when all of these hold: save_as_pdf is enabled,
# extraction succeeded (is_extracted), the working document is not itself
# a PDF, and a generated PDF path is known.
def save_pdf
  return unless save_as_pdf && is_extracted
  return if @document_path.split('.').last&.downcase == 'pdf'
  return unless @pdf_path

  FileUtils.cp(@pdf_path, pdf_storage)
end
save_to_db() click to toggle source
# File lib/act_as_page_extractor/modules/saving.rb, line 13
# Persists the extracted text pages as ExtractedPage records.
#
# First marks the document as :extracting, then, inside one ExtractedPage
# transaction, for each of the @pdf_pages pages:
#   * reads "<tmp_dir>/<document name up to its first dot>_<n>.txt",
#   * strips the trailing form feed (remove_last_byte) and removes the
#     characters < > & U+0001 U+25A0 and BEL from the text,
#   * creates an ExtractedPage holding the text, the 1-based page number,
#     this record's id under the `extracted_document_id` key, and a copy of
#     every attribute listed in `additional_fields`.
#
# Does nothing beyond the state update when @pdf_pages is nil.
#
# NOTE(review): assumes the host model is ActiveRecord. `update` replaces
# `update_attributes`, which was removed in Rails 6.1.
def save_to_db
  update(page_extraction_state: EXTRACTING_STATES[:extracting])

  ExtractedPage.transaction do
    (@pdf_pages || 0).times do |page_index|
      page_number = page_index + 1
      page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number}.txt"
      remove_last_byte(page_filename)
      # File.read instead of IO.read: IO.read treats a leading "|" as a command.
      content = File.read(page_filename).delete("<>&\u0001\u25A0\a")

      page_attributes = {
        page:        content,
        page_number: page_number
      }
      page_attributes[extracted_document_id] = id

      additional_fields.each do |additional_field|
        page_attributes[additional_field] = send(additional_field.to_sym)
      end

      ExtractedPage.create(page_attributes)
    end
  end
end
timeout_wrapper() { || ... } click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 4
# Runs the given block under a time limit and never raises StandardError.
#
# seconds - time limit passed to Timeout.timeout (default: 5 minutes).
#
# Returns the block's value, or nil when the block raised a StandardError
# or exceeded the limit (Timeout::Error is a StandardError).
def timeout_wrapper(seconds = 60 * 5)
  Timeout.timeout(seconds) { yield }
rescue StandardError
  # Deliberate best effort: callers treat nil as "conversion failed".
  nil
end
unzip_document() click to toggle source
# File lib/act_as_page_extractor/modules/unzipping.rb, line 2
# Selects the file the pipeline will work on and stores it in @document_path.
#
# By default this is the copied upload (@copy_document_path). When that
# copy has a supported archive extension (validate_compress_types) and
# TotalCompressor unpacks it into exactly one file, that file is renamed
# to "<tmp_dir>/<original name up to its first dot>.<unpacked extension>"
# and becomes @document_path instead. In every other case the copy is kept.
def unzip_document
  @document_path = @copy_document_path
  return unless validate_compress_types

  unpacked = TotalCompressor.decompress(@copy_document_path)
  return unless unpacked[:success] && unpacked[:files].length == 1

  base_name = @origin_document_path.split('/').last.split('.').first
  extracted_file = unpacked[:files].first
  extension = extracted_file.split('/').last.split('.').last
  @document_path = "#{@tmp_dir}/#{base_name}.#{extension}"
  File.rename(extracted_file, @document_path)
end
update_state() click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 20
# Records the outcome of the extraction run on this document.
#
# On success (is_extracted) the state becomes :extracted with the detected
# page count; otherwise it becomes :'error.extraction' with 0 pages. In both
# cases the working document's extension and human-readable size are stored.
#
# This runs from page_extract!'s ensure block, possibly after an early
# failure, so @document_path may be nil or point to a missing file. The
# doctype and filesize are then stored as nil instead of raising, so the
# error state still gets written.
#
# NOTE(review): assumes the host model is ActiveRecord. `update` replaces
# `update_attributes`, which was removed in Rails 6.1.
def update_state
  outcome =
    if is_extracted
      {
        page_extraction_state: EXTRACTING_STATES[:extracted],
        page_extraction_pages: @pdf_pages
      }
    else
      {
        page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
        page_extraction_pages: 0
      }
    end

  readable = @document_path && File.file?(@document_path)
  pretty_size = readable ? Filesize.from("#{File.size(@document_path)} B").pretty : nil

  update(
    outcome.merge(
      page_extraction_doctype: @document_path&.split('.')&.last,
      page_extraction_filesize: pretty_size
    )
  )
end
valid_document() click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 6
# A document is processable when it passes the size check and, only then,
# the document-type check (the type check is skipped for oversized files).
def valid_document
  size_ok = validate_size
  size_ok && validate_doc_types
end
validate_compress_types() click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 15
# True when the copied upload's extension (text after the last dot,
# lower-cased) is listed in VALIDATE_COMPRESS_TYPES.
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  VALIDATE_COMPRESS_TYPES.include?(extension)
end
validate_doc_types() click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 19
# True when the working document's extension (text after the last dot,
# lower-cased) is listed in VALIDATE_DOC_TYPES.
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  VALIDATE_DOC_TYPES.include?(extension)
end
validate_size() click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 10
# True when the copied upload is at most 1 MiB (1024 * 1024 bytes).
def validate_size
  max_bytes = 1024 * 1024
  File.size(@copy_document_path) <= max_bytes
end

Private Instance Methods

copy_document() click to toggle source
# File lib/act_as_page_extractor.rb, line 97
# Copies the uploaded file into the scratch directory and records where
# things are:
#   @origin_document_path - file_storage joined with the upload's URL
#   @copy_document_path   - the copy inside @tmp_dir
#   @document_filename    - the bare file name (also the return value)
def copy_document
  @origin_document_path = "#{file_storage}#{send(:extracted_filename).url}"
  FileUtils.cp(@origin_document_path, @tmp_dir)

  file_name = @origin_document_path.split('/').last
  @copy_document_path = "#{@tmp_dir}/#{file_name}"
  @document_filename = file_name
end
create_pdf_dir() click to toggle source
# File lib/act_as_page_extractor.rb, line 86
# Creates the pdf_storage directory (including parents) when PDF saving is
# enabled and the directory does not exist yet. No-op otherwise.
def create_pdf_dir
  return unless save_as_pdf

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
end
create_tmp_dir() click to toggle source
# File lib/act_as_page_extractor.rb, line 92
# Creates a fresh scratch directory for this extraction run under
# TMP_EXTRACTION_FILE_STORAGE, named with 12 random hex characters, and
# stores its path in @tmp_dir.
def create_tmp_dir
  @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
end
finish() click to toggle source
# File lib/act_as_page_extractor.rb, line 104
# Final step of an extraction run; currently it only delegates to
# remove_tmp_dir and returns its result.
def finish
  remove_tmp_dir
end
remove_tmp_dir() click to toggle source
# File lib/act_as_page_extractor.rb, line 108
# Recursively deletes the scratch directory.
#
# Safety guard: nothing is deleted unless @tmp_dir contains "/tmp/", so a
# nil or unexpected path can never trigger a recursive delete.
def remove_tmp_dir
  return unless @tmp_dir =~ %r{/tmp/}

  FileUtils.rm_rf(@tmp_dir)
end