module DatasetsArrow::Arrowable

Public Instance Methods

each_record_batch() { |record_batch| ... }
# File lib/datasets-arrow/arrowable.rb, line 22
# Iterates over the dataset one Arrow record batch at a time.
#
# When the file at +arrow_data_path+ exists, it is opened through a
# memory-mapped input stream and its record batches are yielded in reader
# order. Otherwise iteration is delegated to the table returned by
# +to_arrow+.
#
# Without a block, an Enumerator for this method is returned instead.
def each_record_batch(&block)
  return to_enum(__method__) unless block_given?

  cache_file = arrow_data_path
  # No cached file yet: let the table from to_arrow drive the iteration.
  return to_arrow.each_record_batch(&block) unless cache_file.exist?

  stream = Arrow::MemoryMappedInputStream.new(cache_file.to_path)
  Arrow::RecordBatchFileReader.new(stream).each do |batch|
    # Every yielded batch carries a reference to the stream it came from.
    # NOTE(review): presumably this keeps the memory mapping alive for as
    # long as the batch is referenced — confirm against Red Arrow.
    batch.instance_variable_set(:@input, stream)
    yield(batch)
  end
end
to_arrow()
# File lib/datasets-arrow/arrowable.rb, line 5
# Returns the dataset as an Arrow::Table, using the file at
# +arrow_data_path+ as an on-disk cache.
#
# If that file exists it is loaded and returned. Otherwise a table is built
# from +to_table.to_h+ (one Arrow array per column), saved to that path
# after creating the parent directory when it is missing, and returned.
def to_arrow
  cache_file = arrow_data_path
  return Arrow::Table.load(cache_file) if cache_file.exist?

  # Convert each column's values into an Arrow array, keyed by column name.
  columns = to_table.to_h.each_with_object({}) do |(name, values), built|
    built[name] = Arrow::ArrayBuilder.build(values)
  end
  arrow_table = Arrow::Table.new(columns)

  cache_dir = cache_file.parent
  cache_dir.mkpath unless cache_dir.exist?
  arrow_table.save(cache_file)
  arrow_table
end

Private Instance Methods

arrow_data_path()
# File lib/datasets-arrow/arrowable.rb, line 39
# Returns the location of the Arrow data file: "data.arrow" appended to
# +cache_dir_path+ via +.
# NOTE(review): cache_dir_path is defined outside this listing; presumably
# it is a Pathname — confirm.
def arrow_data_path
  cache_directory = cache_dir_path
  cache_directory + "data.arrow"
end