module ACSV::Detect

Constants

CONFIDENCE

Default confidence level for encoding detection to succeed

ENCODING_DETECTORS_ALL
ENCODING_DETECTORS_AVAIL
PREVIEW_BYTES

Number of bytes to test encoding on

SEPARATORS

Possible CSV separators to check

Public Class Methods

encoding(file_or_data, options={}) click to toggle source

Tries to detect the file encoding.

@param file_or_data [File, String] CSV file or data to probe
@option options [Number] :confidence minimum confidence level (0-1)
@option options [String] :method try only a specific method, one of {encoding_methods}
@return [String] most probable encoding

# File lib/acsv/detect/encoding.rb, line 20
# Asks the encoding detectors for the encoding of the given CSV input.
#
# For a File, up to PREVIEW_BYTES are read from the current position and the
# position is restored afterwards; any other input is handed to the detectors
# unchanged.
#
# @param file_or_data [File, String] CSV file or data to probe
# @param options [Hash] passed on to detector_do and to each detector
# @return [String, nil] the first truthy result returned by a detector,
#   or nil when no detector returned one
def encoding(file_or_data, options={})
  data =
    if file_or_data.is_a?(File)
      start = file_or_data.tell
      preview = file_or_data.read(PREVIEW_BYTES)
      # rewind so the caller can keep reading from where it was
      file_or_data.seek(start)
      preview
    else
      file_or_data
    end

  detector_do(options) do |detector|
    enc = detector.encoding(data, options)
    # leaves the whole method as soon as one detector has an answer
    return enc if enc
  end
  nil
end
encoding_methods() click to toggle source

@return [Array<String>] List of available methods for encoding

# File lib/acsv/detect/encoding.rb, line 38
# Names of the detectors listed in ENCODING_DETECTORS_AVAIL.
#
# @return [Array] the require_name of each entry, in constant order
def encoding_methods
  ENCODING_DETECTORS_AVAIL.map { |detector| detector.require_name }
end
encoding_methods_all() click to toggle source

@return [Array<String>] List of possible methods for encoding (even if the required gem is missing)

# File lib/acsv/detect/encoding.rb, line 43
# Names of the detectors listed in ENCODING_DETECTORS_ALL.
#
# @return [Array] the require_name of each entry, in constant order
def encoding_methods_all
  ENCODING_DETECTORS_ALL.map { |detector| detector.require_name }
end
separator(file_or_data) click to toggle source

@param file_or_data [File, String] CSV file or data to probe
@return [String] most probable column separator character from the first line, or nil when none is found
@todo return whichever character yields the same number of columns over multiple lines

# File lib/acsv/detect/separator.rb, line 10
# Guesses the column separator by counting each candidate from SEPARATORS in
# the first line of the input.
#
# For a File, the first line is read from the current position and the
# position is restored afterwards; for a String, everything before the first
# "\n" is used.
#
# @param file_or_data [File, String] CSV file or data to probe
# @return [String, nil] the candidate occurring most often in the first line
#   (re-encoded as ASCII), or nil when there is no first line (empty data,
#   File at end of file) or no candidate occurs in it
def self.separator(file_or_data)
  firstline =
    if file_or_data.is_a? File
      position = file_or_data.tell
      # gets returns nil at end of file, where readline would raise EOFError
      line = file_or_data.gets
      file_or_data.seek(position)
      line
    else
      # "".split gives [], so first is nil for empty data
      file_or_data.split("\n", 2).first
    end
  return nil if firstline.nil? || firstline.empty?

  # Transcode the candidates to the line's encoding before counting them
  candidates = SEPARATORS.map { |s| s.encode(firstline.encoding) }
  # On equal counts the winner depends on sort_by's ordering, which Ruby does
  # not guarantee to be stable
  count, best = candidates.map { |c| [firstline.count(c), c] }.sort_by(&:first).last
  count == 0 ? nil : best.encode('ascii')
end

Protected Class Methods

detector_do(options) { |detector| ... } click to toggle source

Runs the supplied block on the available detectors.
@option options [String] :method Only try this method, instead of trying all

# File lib/acsv/detect/encoding.rb, line 54
# Yields encoding detectors from ENCODING_DETECTORS_AVAIL to the given block.
#
# With options[:method] set, only the first detector whose require_name
# equals it is yielded; when no detector matches, nothing is yielded (instead
# of yielding nil). Without it, every detector for which present? is truthy
# is yielded in order.
#
# @param options [Hash] only the :method key is read here
# @yieldparam detector an entry of ENCODING_DETECTORS_AVAIL
def detector_do(options)
  wanted = options[:method]
  if wanted
    detector = ENCODING_DETECTORS_AVAIL.find { |d| d.require_name == wanted }
    # an unknown or unavailable method name matches nothing; skip the block
    # rather than hand it nil
    yield detector if detector
  else
    ENCODING_DETECTORS_AVAIL.each do |detector|
      yield detector if detector.present?
    end
  end
end