module Pdftocsv

Parses PDF files into CSV-like data.

Constants

VERSION

Public Class Methods

parse(file_path) click to toggle source

Parses a PDF file into CSV-like data: an array of pages, each page an array of rows, and each row an array of cell strings.

Example:

>> Pdftocsv.parse("example.pdf")
=> [[['a1', 'b1', 'c1'], ['a2', 'b2', 'c2']], [['A1', 'B1', 'C1'], ['A2', 'B2', 'C2']]]

Arguments:

file_path: (String)
# File lib/pdftocsv.rb, line 22
# Parses a PDF file into CSV-like data, one entry per page.
#
# Example:
#   >> Pdftocsv.parse("example.pdf")
#   => [[['a1', 'b1', 'c1'], ['a2', 'b2', 'c2']], [['A1', 'B1', 'C1'], ['A2', 'B2', 'C2']]]
#
# Arguments:
#   file_path: (String) path of the PDF file to read
#
# Returns an Array with one element per page, in page order; each element
# is whatever to_page_csv returns for that page.
def self.parse(file_path)
  # Binary mode avoids any newline/encoding translation of the PDF bytes;
  # the block form closes the file even if reading raises.
  File.open(file_path, "rb") do |io|
    # The result is built locally instead of in a module-level instance
    # variable, so overlapping calls cannot append into each other's array
    # and no state is left behind after the call.
    PDF::Reader.new(io).pages.map { |page| to_page_csv(page) }
  end
end
to_page_csv(page) click to toggle source

Splits the text of a whole page into lines and converts each line into a list of cells; lines that yield no cells are skipped.

Arguments:

page: (PDF::Reader::Page) a page object responding to text
# File lib/pdftocsv.rb, line 36
# Converts one page into a list of rows.
#
# The page text is cut at newlines, every line is handed to to_text_list,
# and only the rows for which `any?` is true are kept.
#
# Arguments:
#   page: object responding to `text` (the text is split on "\n")
#
# Returns an Array of rows, each row being a to_text_list result.
def to_page_csv(page)
  rows = page.text.split("\n").map { |line| to_text_list(line) }
  rows.select(&:any?)
end
to_text_list(text_line) click to toggle source

Splits a single line of text into its individual cells (units), using runs of two or more spaces as the separator.

Arguments:

text_line: (String)
# File lib/pdftocsv.rb, line 50
# Splits one line of text into its cells.
#
# Cells are separated by two consecutive spaces; a single space is treated
# as part of a cell's content. Every fragment is stripped of surrounding
# whitespace and fragments that are empty after stripping are dropped.
#
# Arguments:
#   text_line: (String) one line of page text
#
# Returns an Array of non-empty, stripped Strings (possibly empty).
def to_text_list(text_line)
  # Strip BEFORE filtering: an odd-length run of spaces leaves a fragment
  # such as " " behind, which is not empty until it has been stripped.
  # Filtering first would let it through as an empty "" cell.
  text_line.split("  ").map(&:strip).reject(&:empty?)
end