module DarwinCore::Ingester

This module encapsulates the logic for reading a CSV file so that it can be shared by the classes that need this functionality.

Attributes

data[R]
encoding[R]
fields[R]
fields_separator[R]
file_path[R]
ignore_headers[R]
line_separator[R]
properties[R]
quote_character[R]
size[R]

Public Instance Methods

read(batch_size = 10_000) { |res, errors| ... } click to toggle source
# File lib/dwc_archive/ingester.rb, line 12
# Reads the CSV file at @file_path and collects its rows.
#
# Rows that have fewer columns than the highest declared field index
# requires are appended to the error list; every other row is handed to
# process_csv_row. When a block is given, the accumulated [rows, errors]
# pair is yielded each time the row index reaches a non-zero multiple of
# batch_size and once more after the last row; both accumulators are reset
# after each batch yield.
#
# batch_size - row-index interval between progress log entries and yields.
#
# Returns [rows, errors]: whatever was accumulated since the last batch
# yield (that is everything when no block is given).
def read(batch_size = 10_000)
  DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data")
  res = []
  errors = []
  args = define_csv_args
  # nil.to_i is 0, so a field without an :index counts as column 0.
  min_size = @fields.map { |f| f[:index].to_i }.max + 1
  # The block form of File.open closes the handle when parsing finishes or
  # raises, and, unlike Kernel#open, never interprets the path as a command.
  File.open(@file_path) do |io|
    CSV.new(io, **args).each_with_index do |r, i|
      next if @ignore_headers && i == 0

      if min_size > r.size
        errors << r
      else
        process_csv_row(res, errors, r)
      end
      next if i == 0 || i % batch_size != 0

      DarwinCore.logger_write(@dwc.object_id,
                              format("Ingested %s records from %s",
                                     i, name))
      next unless block_given?

      yield [res, errors]
      res = []
      errors = []
    end
  end
  yield [res, errors] if block_given?
  [res, errors]
end

Private Instance Methods

define_csv_args() click to toggle source
# File lib/dwc_archive/ingester.rb, line 40
# Builds the keyword options passed to CSV.new.
#
# An empty @quote_character is replaced (and stored) as the backspace
# character "\b" before the options are assembled.
#
# Returns a Hash with :col_sep and :quote_char.
def define_csv_args
  @quote_character = "\b" if @quote_character.empty?
  { col_sep: @field_separator, quote_char: @quote_character }
end
init_attributes() click to toggle source
# File lib/dwc_archive/ingester.rb, line 60
# Derives the reader settings from @data[:attributes].
#
# Sets @properties, @field_separator, @quote_character (default ""),
# @line_separator (default "\n") and @ignore_headers, and delegates to
# init_encoding, init_field_separator, init_file_path and init_fields.
def init_attributes
  @properties = @data[:attributes]
  init_encoding
  @field_separator = init_field_separator

  enclosed_by = @properties[:fieldsEnclosedBy]
  @quote_character = enclosed_by || ""

  terminated_by = @properties[:linesTerminatedBy]
  @line_separator = terminated_by || "\n"

  # Only the values 1 and true switch header skipping on; a falsy setting
  # is stored unchanged.
  header_setting = @properties[:ignoreHeaderLines]
  @ignore_headers = header_setting && [1, true].include?(header_setting)

  init_file_path
  init_fields
end
init_encoding() click to toggle source
# File lib/dwc_archive/ingester.rb, line 72
# Stores the declared encoding in @encoding, defaulting to "UTF-8".
#
# Raises DarwinCore::EncodingError unless the encoding, compared
# case-insensitively, is one of utf-8, utf8, utf-16 or utf16.
def init_encoding
  @encoding = @properties[:encoding] || "UTF-8"
  return if %w[utf-8 utf8 utf-16 utf16].include?(@encoding.downcase)

  raise(
    DarwinCore::EncodingError,
    "No support for encodings other than utf-8 or utf-16 at the moment"
  )
end
init_field_separator() click to toggle source
# File lib/dwc_archive/ingester.rb, line 101
# Returns the field separator from the :fieldsTerminatedBy property,
# defaulting to ",". The two-character sequence backslash + "t" is
# translated into a real tab character.
def init_field_separator
  separator = @properties[:fieldsTerminatedBy] || ","
  separator == "\\t" ? "\t" : separator
end
init_fields() click to toggle source
# File lib/dwc_archive/ingester.rb, line 92
# Normalises @data[:field] to an Array and stores the :attributes value of
# each entry in @fields.
#
# A single entry is wrapped in an Array. A missing (nil) entry becomes an
# empty Array so that it is reported as an invalid archive instead of
# failing with NoMethodError while the entries are mapped.
#
# Raises DarwinCore::InvalidArchiveError when there are no fields.
def init_fields
  raw = @data[:field]
  # Kernel#Array is deliberately avoided: it would turn a single field
  # Hash into an Array of key/value pairs.
  @data[:field] = case raw
                  when nil then []
                  when Array then raw
                  else [raw]
                  end
  @fields = @data[:field].map { |f| f[:attributes] }
  return unless @fields.empty?

  raise DarwinCore::InvalidArchiveError,
        "No data fields are found"
end
init_file_path() click to toggle source
# File lib/dwc_archive/ingester.rb, line 84
# Resolves the data file location and stores its full path in @file_path.
#
# The location is looked up, in order, in @data[:location],
# @data[:attributes][:location] and @data[:files][:location]; missing
# intermediate hashes are tolerated.
#
# Raises DarwinCore::FileNotFoundError when no location is declared.
def init_file_path
  file = @data[:location] ||
         @data.dig(:attributes, :location) ||
         @data.dig(:files, :location)
  # Check before joining: File.join raises TypeError on nil, which would
  # hide the intended error.
  raise DarwinCore::FileNotFoundError, "No file data" unless file

  @file_path = File.join(@path, file)
end
init_size() click to toggle source
# File lib/dwc_archive/ingester.rb, line 107
# Counts the newline characters in the file at @file_path, i.e. the same
# number `wc -l` reports.
#
# The count is done in Ruby rather than by shelling out, so paths that
# contain spaces or shell metacharacters are safe and no external `wc`
# binary is required.
#
# Returns the count as an Integer.
def init_size
  newlines = 0
  # Binary mode counts raw "\n" bytes without any encoding validation.
  File.open(@file_path, "rb") do |io|
    while (chunk = io.read(65_536))
      newlines += chunk.count("\n")
    end
  end
  newlines
end
name() click to toggle source
# File lib/dwc_archive/ingester.rb, line 46
# Returns the receiver's class name without its namespace, in lower case
# (the segment after the final "::").
def name
  class_path = self.class.to_s
  class_path.split("::").last.downcase
end
process_csv_row(result, errors, row) click to toggle source
# File lib/dwc_archive/ingester.rb, line 50
# Sorts one parsed CSV row into result or errors by UTF-8 validity.
#
# Each non-nil field is re-tagged as UTF-8 and validated on its own.
# Validating the joined row instead would accept a row whose fields are
# individually invalid but happen to concatenate into valid bytes (for
# example a multibyte sequence split across two fields).
#
# result - Array receiving the row with all fields tagged as UTF-8.
# errors - Array receiving the untouched row when any field is invalid.
# row    - Array of String or nil fields.
def process_csv_row(result, errors, row)
  # Work on copies so a rejected row keeps its original encoding tags.
  utf8_row = row.map do |field|
    field.nil? ? nil : field.dup.force_encoding("utf-8")
  end
  if utf8_row.all? { |field| field.nil? || field.valid_encoding? }
    result << utf8_row
  else
    errors << row
  end
end