module ActAsPageExtractor
:nocov:
Constants
- EXTRACTING_STATES
- FILE_STORAGE
- PDF_STORAGE
- TMP_EXTRACTION_FILE_STORAGE
- VALIDATE_COMPRESS_TYPES
- VALIDATE_DOC_TYPES
- VERSION
Public Class Methods
start_extraction()
click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 16
#
# Calls +page_extract!+ on every record of +document_class+ whose
# +page_extraction_state+ equals EXTRACTING_STATES[:new].
def self.start_extraction
  pending_documents = document_class.where(page_extraction_state: EXTRACTING_STATES[:new])
  pending_documents.each { |document| document.page_extract! }
end
statistics()
click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 20
#
# Builds a summary Hash of extraction statistics for +document_class+:
#
# total::                 count of all documents
# supported_documents::   count of documents whose +page_extraction_doctype+
#                         matches (ILIKE) any entry of VALIDATE_DOC_TYPES
# unsupported_documents:: total minus supported
# states::                Hash of each EXTRACTING_STATES key to the number of
#                         documents currently in that state
def self.statistics
  total_count = document_class.count

  # One quoted '%type%' pattern per supported doctype, comma separated.
  doctype_patterns = VALIDATE_DOC_TYPES.map { |doc_type| "'%" + doc_type + "%'" }.join(',')
  supported_count = document_class
                    .where("page_extraction_doctype ILIKE ANY (array[#{doctype_patterns}])")
                    .count

  # One count query per extraction state.
  per_state = EXTRACTING_STATES.each_with_object({}) do |(state, value), counts|
    counts[state] = document_class.where(page_extraction_state: value).count
  end

  {
    total: total_count,
    supported_documents: supported_count,
    unsupported_documents: total_count - supported_count,
    states: per_state
  }
end
Public Instance Methods
cleanup_pages()
click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 38
#
# Destroys every record in this document's +extracted_pages+ collection.
def cleanup_pages
  pages = extracted_pages
  pages.destroy_all
end
convert_to_pdf()
click to toggle source
# File lib/act_as_page_extractor/modules/extracting.rb, line 7
#
# Determines the PDF to extract text from and stores it in +@pdf_path+.
#
# * When +@document_path+ already has a "pdf" extension (case-insensitive),
#   the document itself is used.
# * Otherwise Docsplit.extract_pdf is run (through +timeout_wrapper+) with
#   +@tmp_dir+ as output, and the expected PDF path is the document path with
#   its last extension replaced by "pdf".
#
# Returns the PDF path, or nil when the conversion failed or the expected PDF
# file does not exist.
def convert_to_pdf
  @pdf_path =
    if @document_path.split('.').last.downcase == 'pdf'
      @document_path
    elsif timeout_wrapper { Docsplit.extract_pdf(@document_path, output: @tmp_dir) }
      converted_path = (@document_path.split('.')[0..-2] + ['pdf']).join('.')
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      converted_path if File.exist?(converted_path)
    end
end
convert_to_text()
click to toggle source
# File lib/act_as_page_extractor/modules/extracting.rb, line 18
#
# Reads the page count of +@pdf_path+ into +@pdf_pages+ and, when a page count
# is available, runs Docsplit.extract_text (no OCR, all pages, output into
# +@tmp_dir+) through +timeout_wrapper+.
#
# If the text extraction reports failure, +@pdf_pages+ is reset to nil.
# Any StandardError raised along the way is swallowed. Always returns nil.
def convert_to_text
  @pdf_pages = PdfUtils.info(@pdf_path).pages
  return unless @pdf_pages

  text_extracted = timeout_wrapper do
    Docsplit::extract_text(@pdf_path, ocr: false, pages: 'all', output: @tmp_dir)
  end
  # :nocov:
  @pdf_pages = nil unless text_extracted
  nil
rescue
  nil
  # :nocov:
end
debug_info()
click to toggle source
:nocov:
# File lib/act_as_page_extractor/modules/tools.rb, line 43 def debug_info # ap "@tmp_dir" # ap @tmp_dir # ap "@copy_document_path" # ap @copy_document_path # ap "@document_path" ap @document_path # ap "@pdf_path" # ap @pdf_path # ap "@pdf_pages" ap @pdf_pages end
extract_pages()
click to toggle source
# File lib/act_as_page_extractor/modules/extracting.rb, line 2
#
# Runs the two extraction stages in order: first +convert_to_pdf+, then
# +convert_to_text+. Returns whatever +convert_to_text+ returns.
def extract_pages
  convert_to_pdf   # stage 1: sets @pdf_path
  convert_to_text  # stage 2: reads @pdf_path
end
initialized()
click to toggle source
# File lib/act_as_page_extractor.rb, line 54
#
# Preparation step run at the start of an extraction; currently it only
# calls +create_pdf_dir+.
def initialized
  # TODO: add all needed callbacks
  # TODO: on destroy, remove the pdf
  # TODO: add to the README:
  #   rails g act_as_page_extractor:migration Document category_id user_id
  #   and add to the [Document] model:
  #     has_many :extracted_pages, dependent: :destroy
  create_pdf_dir
end
is_extracted()
click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 16
#
# Tells whether extraction produced a complete result: +@pdf_pages+ must be a
# positive number and the count of +extracted_pages+ must equal it.
def is_extracted
  return false unless @pdf_pages.to_i > 0

  extracted_pages.count == @pdf_pages
end
origin_file_name()
click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 2
#
# Returns the last "/"-separated segment of the +extracted_filename+ URL,
# i.e. the file name of the source document (nil when the URL is empty).
def origin_file_name
  file_url = send(:extracted_filename).url.to_s
  file_url.split('/').last
end
page_extract!()
click to toggle source
# File lib/act_as_page_extractor.rb, line 65
#
# Runs the whole extraction pipeline for this document:
#
# 1. +initialized+, +cleanup_pages+, +create_tmp_dir+
# 2. +copy_document+, +unzip_document+ and, when +valid_document+ is truthy,
#    +extract_pages+ followed by +save_to_db+
# 3. always afterwards: +update_state+, +save_pdf+, +debug_info+, +finish+
#
# Returns the value of the main section (the +save_to_db+ result, or nil when
# the document is not valid). Exceptions from any step propagate to the caller.
def page_extract!
  initialized
  cleanup_pages
  create_tmp_dir
  begin
    copy_document
    unzip_document
    if valid_document
      extract_pages
      save_to_db
    end
  ensure
    begin
      update_state
      save_pdf
      debug_info
    ensure
      # Nested ensure: the temp directory cleanup must run even when one of
      # the bookkeeping steps above raises, otherwise the directory leaks.
      finish
    end
  end
end
pdf_path()
click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 6
#
# Returns the path of the stored PDF copy: +pdf_storage+ joined with the part
# of +origin_file_name+ before its first dot, plus ".pdf".
#
# Returns nil unless +page_extraction_state+ equals
# EXTRACTING_STATES[:extracted], and also when the document's doctype is
# already "pdf" (case-insensitive).
def pdf_path
  return unless page_extraction_state == EXTRACTING_STATES[:extracted]
  return if page_extraction_doctype&.downcase == 'pdf'

  storage_dir = pdf_storage
  base_name = origin_file_name.split('.').first
  "#{storage_dir}/#{base_name}.pdf"
end
remove_files()
click to toggle source
# File lib/act_as_page_extractor/modules/interface.rb, line 12
#
# Deletes the stored PDF copy at +pdf_path+ when it exists.
# Does nothing (and returns nil) when +pdf_path+ is nil or the file is absent.
def remove_files
  stored_pdf = pdf_path.to_s
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.rm_rf(stored_pdf) if File.exist?(stored_pdf)
end
remove_last_byte(file_name)
click to toggle source
Fix for OpenOffice/JODConverter: deletes the trailing form-feed byte ("\f") from a converted text page.
# File lib/act_as_page_extractor/modules/saving.rb, line 38
#
# Strips a trailing form feed ("\f") from the file at +file_name+, if present.
# Empty files and files ending in any other character are left untouched.
# The file is opened in 'a+' mode, so a missing file is created empty.
#
# file_name:: path of the text page to fix up
#
# Returns nil.
def remove_last_byte(file_name)
  # Block form guarantees the handle is closed even if seek/getc/truncate raise.
  File.open(file_name, 'a+') do |file|
    size = file.size
    next if size.zero?

    file.seek(size - 1)
    file.truncate(size - 1) if file.getc == "\f"
  end
  nil
end
save_pdf()
click to toggle source
# File lib/act_as_page_extractor/modules/saving.rb, line 2
#
# Copies the converted PDF (+@pdf_path+) into +pdf_storage+.
#
# The copy happens only when +save_as_pdf+ is enabled, the extraction
# succeeded (+is_extracted+), the source document is not itself a PDF, and a
# converted PDF path is available. Returns nil otherwise.
def save_pdf
  return unless save_as_pdf && is_extracted
  return if @document_path.split('.').last&.downcase == 'pdf'
  return unless @pdf_path

  FileUtils.cp(@pdf_path, pdf_storage)
end
save_to_db()
click to toggle source
# File lib/act_as_page_extractor/modules/saving.rb, line 13
#
# Persists the extracted text pages.
#
# Marks the document as EXTRACTING_STATES[:extracting], then, inside an
# ExtractedPage transaction, creates one ExtractedPage per PDF page. Each page
# is read from "<@tmp_dir>/<document base name>_<page number>.txt" after
# +remove_last_byte+ has been applied to it, with the characters
# < > & U+0001 U+25A0 and BEL deleted from the text.
#
# Every page record gets +page+, +page_number+, the owning document id under
# the +extracted_document_id+ key, and a copy of each +additional_fields+
# attribute of the document. Nothing is created when +@pdf_pages+ is nil.
def save_to_db
  # `update` replaces `update_attributes`, which was removed in Rails 6.1.
  update(page_extraction_state: EXTRACTING_STATES[:extracting])

  ExtractedPage.transaction do
    @pdf_pages&.times&.each do |page_index|
      page_number = page_index + 1
      page_filename = "#{@tmp_dir}/#{@document_filename.split('.').first}_#{page_number.to_s}.txt"
      remove_last_byte(page_filename)

      # File.read (not IO.read) so the path can never be interpreted as a pipe command.
      content = File.read(page_filename).delete("<" ">" "&" "\u0001" "\u25A0" "\a")

      page_attributes = { page: content, page_number: page_number }
      page_attributes[extracted_document_id] = id
      additional_fields.each do |additional_field|
        page_attributes[additional_field] = send(additional_field.to_sym)
      end

      ExtractedPage.create(page_attributes)
    end
  end
end
timeout_wrapper() { || ... }
click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 4
#
# Runs the given block under a time limit and returns the block's value.
# Best-effort by design: a timeout or any other StandardError raised by the
# block is swallowed and nil is returned instead.
#
# seconds:: time limit in seconds; defaults to five minutes
def timeout_wrapper(seconds = 60 * 5)
  Timeout.timeout(seconds) { yield }
rescue StandardError
  # :nocov:
  nil
  # :nocov:
end
unzip_document()
click to toggle source
# File lib/act_as_page_extractor/modules/unzipping.rb, line 2
#
# Chooses the document to work on and stores its path in +@document_path+.
#
# By default that is the copied file (+@copy_document_path+). When the copy is
# a supported archive (+validate_compress_types+) and decompressing it
# succeeds with exactly one file, that file is renamed to
# "<@tmp_dir>/<original base name>.<unpacked extension>" and used instead.
def unzip_document
  @document_path = @copy_document_path
  return unless validate_compress_types

  result = TotalCompressor.decompress(@copy_document_path)
  return unless result[:success] && result[:files].length == 1

  unpacked_path = result[:files].first
  base_name = @origin_document_path.split("/").last.split('.').first
  unpacked_extension = unpacked_path.split('/').last.split('.').last

  @document_path = "#{@tmp_dir}/#{base_name}.#{unpacked_extension}"
  File.rename(result[:files].first, @document_path)
end
update_state()
click to toggle source
# File lib/act_as_page_extractor/modules/tools.rb, line 20
#
# Records the outcome of the extraction on the document:
#
# * state EXTRACTING_STATES[:extracted] with the page count when
#   +is_extracted+ is truthy, otherwise EXTRACTING_STATES[:'error.extraction']
#   with zero pages;
# * +page_extraction_doctype+: the last dot-separated part of +@document_path+;
# * +page_extraction_filesize+: the human-readable size of that file, or nil
#   when there is no such file.
def update_state
  state_attributes =
    if is_extracted
      {
        page_extraction_state: EXTRACTING_STATES[:extracted],
        page_extraction_pages: @pdf_pages
      }
    else
      {
        page_extraction_state: EXTRACTING_STATES[:'error.extraction'],
        page_extraction_pages: 0
      }
    end

  # This method runs from page_extract!'s ensure block, where @document_path
  # may be nil or point at a file that was never copied; File.size would raise
  # there and hide the real failure, so the size is only read when available.
  pretty_filesize =
    if @document_path && File.exist?(@document_path)
      Filesize.from("#{File.size(@document_path)} B").pretty
    end

  # `update` replaces `update_attributes`, which was removed in Rails 6.1.
  update(
    state_attributes.merge(
      page_extraction_doctype: @document_path&.split('.')&.last,
      page_extraction_filesize: pretty_filesize
    )
  )
end
valid_document()
click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 6
#
# A document is valid when it passes the size check and its type is
# supported. The type check is skipped when the size check fails.
def valid_document
  size_ok = validate_size
  size_ok && validate_doc_types
end
validate_compress_types()
click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 15
#
# Tells whether the copied file's extension (the lowercased text after the
# last dot of +@copy_document_path+) is listed in VALIDATE_COMPRESS_TYPES.
def validate_compress_types
  extension = @copy_document_path.split('.').last&.downcase
  VALIDATE_COMPRESS_TYPES.include?(extension)
end
validate_doc_types()
click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 19
#
# Tells whether the working document's extension (the lowercased text after
# the last dot of +@document_path+) is listed in VALIDATE_DOC_TYPES.
def validate_doc_types
  extension = @document_path.split('.').last&.downcase
  VALIDATE_DOC_TYPES.include?(extension)
end
validate_size()
click to toggle source
# File lib/act_as_page_extractor/modules/validating.rb, line 10
#
# Tells whether the copied document (+@copy_document_path+) is small enough
# to be processed.
#
# max_bytes:: largest accepted file size in bytes, inclusive; defaults to
#             1 MiB (2**20 bytes)
#
# Raises the usual File.size errors when the file does not exist.
def validate_size(max_bytes = 2**20)
  File.size(@copy_document_path) <= max_bytes
end
Private Instance Methods
copy_document()
click to toggle source
# File lib/act_as_page_extractor.rb, line 97
#
# Copies the source document into the temporary working directory.
#
# Sets +@origin_document_path+ (+file_storage+ followed by the
# +extracted_filename+ URL), +@copy_document_path+ (the copy inside
# +@tmp_dir+) and +@document_filename+ (the file name without directories).
def copy_document
  storage_root = file_storage
  source_url = send(:extracted_filename).url.to_s
  @origin_document_path = "#{storage_root}#{source_url}"

  FileUtils.cp(@origin_document_path, @tmp_dir)

  file_name = @origin_document_path.split("/").last
  @copy_document_path = "#{@tmp_dir}/#{file_name}"
  @document_filename = file_name
end
create_pdf_dir()
click to toggle source
# File lib/act_as_page_extractor.rb, line 86
#
# Creates the +pdf_storage+ directory (including missing parents) when
# +save_as_pdf+ is enabled and the path does not exist yet.
def create_pdf_dir
  return unless save_as_pdf

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.mkdir_p(pdf_storage) unless File.exist?(pdf_storage)
end
create_tmp_dir()
click to toggle source
# File lib/act_as_page_extractor.rb, line 92
#
# Picks a fresh working directory under TMP_EXTRACTION_FILE_STORAGE, named
# with 12 random hex characters, stores its path in +@tmp_dir+ and creates it
# if it does not exist yet.
def create_tmp_dir
  @tmp_dir = "#{TMP_EXTRACTION_FILE_STORAGE}/#{SecureRandom.hex(6)}"
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  FileUtils.mkdir_p(@tmp_dir) unless File.exist?(@tmp_dir)
end
finish()
click to toggle source
# File lib/act_as_page_extractor.rb, line 104
#
# Final step of an extraction run: removes the temporary working directory
# via +remove_tmp_dir+.
def finish
  remove_tmp_dir
end
remove_tmp_dir()
click to toggle source
# File lib/act_as_page_extractor.rb, line 108
#
# Recursively deletes the working directory +@tmp_dir+.
# As a safety guard nothing is deleted unless the path contains "/tmp/";
# in that case (or when +@tmp_dir+ is nil) the method returns nil.
def remove_tmp_dir
  return unless @tmp_dir =~ %r{/tmp/}

  FileUtils.rm_rf(@tmp_dir)
end