module LZUTF8

LZUTF8 contains functions to compress and decompress UTF-8 text with the LZUTF-8 algorithm @author Ian Katz <ianfixes@gmail.com>

Constants

MAXIMUM_MATCH_DISTANCE
MAXIMUM_SEQUENCE_LENGTH
MINIMUM_SEQUENCE_LENGTH
VERSION

Public Class Methods

compress(string) click to toggle source

Decompress a string using the LZUTF8 algorithm

Due to the nature of this compression algorithm, only valid UTF-8 codepoints can be compressed; arbitrary binary data will fail.

@param string [String] The input string @return [String] compresed string

# File lib/lzutf8.rb, line 156
def self.compress(string)
  raise ArgumentError unless string.is_a? String

  input = string.bytes
  hash = {}
  match_score = proc { |dist, len| dist < 128 ? len * 1.5 : len } # 2 byte vs 3 byte compression

  pointer = -1
  output = []
  until pointer + 1 == input.size do
    pointer += 1
    c1, c2, c3, c4 = key = input[pointer, 4]
    key = key.pack('C*')
    next (output << c1) if [c2, c3, c4].any?(&:nil?) # near end of input, just iterate until it's consumed

    max_len = [input.size - pointer, MAXIMUM_SEQUENCE_LENGTH].min  # max length of a match
    matches = begin
      next (output << c1) if hash[key].nil? # no matches if no bucket

      hash[key].map do |start|              # all known bucket entries as [distance, length_of_match]
        matchable = input[pointer, max_len] # max length of the matchable input segment
        distance = pointer - start          # relative distance must be less than max expressable
        next nil if distance > MAXIMUM_MATCH_DISTANCE # this would mean a bucket entry we didn't clean up

        # linear comparison to find longest common prefix
        len = input[start, max_len].zip(matchable).index { |a, b| a.nil? || b.nil? || a != b }
        case len
        when nil then                   [distance, matchable.size] # hit end of input, fully matched
        when 0..MINIMUM_SEQUENCE_LENGTH then nil                   # no match
        else                            [distance, len - 1]        # index is the one BEFORE the mismatch
        end
      end
    ensure  # that the pointer is added to the hash for this sequence
      hash[key] = [] if hash[key].nil?
      hash[key] << pointer
      hash[key] = hash[key].select { |p| pointer - p < MAXIMUM_MATCH_DISTANCE } # prune too-distant matches
    end.compact

    best_match = matches.max { |a, b| match_score.call(a) <=> match_score.call(b) }
    next (output << c1) if best_match.nil?

    # Output a pointer to the match and advance the input pointer by the amount matched
    match_distance, match_length = best_match
    output += if match_distance < 0b10000000
      [0b11000000 | match_length, match_distance]
    else
      [0b11100000 | match_length, match_distance >> 8, match_distance & 0b00000000_11111111]
    end
    pointer += (match_length - 1)
  end
  output.pack('C*').force_encoding(Encoding::UTF_8)
end
decompress(string) click to toggle source

Decompress an LZUTF8-compressed string.

Due to the nature of this decompression algorithm, any uncompressed UTF-8 string can be passed to this function and will be returned unmodified

@param string [String] The input LZUTF8-compressed string @return [String] Uncompressed UTF-8 string

# File lib/lzutf8.rb, line 126
def self.decompress(string)
  raise ArgumentError unless string.is_a? String

  input = string.bytes
  output = []
  until input.empty? do
    c1 = input.shift # consume
    c2 = input.first # peek
    next (output << c1) unless (c1 & 0b11000000) == 0b11000000 && (c2 & 0b10000000).zero?

    # By this point we know it's not a literal char and we must actually consume the 2nd byte
    # either llllll_0ddddddd or 0b111lllll_0ddddddd_dddddddd
    c2       = input.shift & 0b01111111
    length   = c1 & 0b00011111
    distance = (c1 & 0b00100000).zero? ? c2 : (c2 << 8) + input.shift # consume 3rd byte if needed

    # get text from pointer, wrap in enumartor, take data until length satisfied and append
    output += Enumerator.new { |y| loop { output[-distance, length].each { |v| y << v } } }.lazy.take(length).to_a
  end

  output.pack('C*').force_encoding(Encoding::UTF_8)
end
explain(string) click to toggle source

Explain a compressed (or uncompressed) UTF-8 string at the sequence-of-bits level by printing to STDOUT

@param string [String] the input string

# File lib/lzutf8.rb, line 30
def self.explain(string)
  raise ArgumentError unless string.is_a? String

  input = string.bytes
  position = -1
  outsize = 0

  pointer2 = lambda do |seq|
    c1, c2 = seq
    [c2].none?(&:nil?) &&
      (c1 & 0b11100000) == 0b11000000 &&
      (c2 & 0b10000000).zero?
  end

  pointer3 = lambda do |seq|
    c1, c2, c3 = seq
    [c2, c3].none?(&:nil?) &&
      (c1 & 0b11100000) == 0b11100000 &&
      (c2 & 0b10000000).zero?
  end

  codepoint1 = ->(seq) { (seq.first & 0b10000000).zero? }

  codepoint2 = lambda do |seq|
    c1, c2 = seq
    [c2].none?(&:nil?) &&
      (c1 & 0b11100000) == 0b11000000 &&
      (c2 & 0b11000000) == 0b10000000
  end

  codepoint3 = lambda do |seq|
    c1, c2, c3 = seq
    [c2, c3].none?(&:nil?) &&
      (c1 & 0b11110000) == 0b11100000 &&
      (c2 & 0b11000000) == 0b10000000 &&
      (c3 & 0b11000000) == 0b10000000
  end

  codepoint4 = lambda do |seq|
    c1, c2, c3, c4 = seq
    [c2, c3, c4].none?(&:nil?) &&
      (c1 & 0b11111000) == 0b11110000 &&
      (c2 & 0b11000000) == 0b10000000 &&
      (c3 & 0b11000000) == 0b10000000 &&
      (c4 & 0b11000000) == 0b10000000
  end

  binarize = proc { |bytes| "0b" + bytes.map { |b| b.to_s(2).rjust(8, '0') }.join("_") }
  dump = proc do |pos, bytes, inc, meaning|
    outsize += inc
    puts "#{pos.to_s.rjust(4, '0')} #{binarize.call(bytes)} #{meaning} OS=#{outsize}"
  end

  dumpc = proc { |pos, bytes| dump.call(pos, bytes, bytes.size, "literal - #{bytes.pack('C*')}") }
  dumpp = proc do |pos, bytes|
    length, distance = self.pointer_info(bytes)
    dump.call(pos, bytes, length, "pointer l=#{length} d=#{distance}")
  end

  until input.empty? do
    position += 1
    c1 = input.shift
    c2, c3, c4 = input[0, 3]
    sequence = [c1, c2, c3, c4]

    case sequence
    when codepoint1
      dumpc.call(position, [c1])
    when codepoint2
      dumpc.call(position, [c1, c2])
      position += 1.times { input.shift } # rubocop:disable Lint/UselessTimes
    when codepoint3
      dumpc.call(position, [c1, c2, c3])
      position += 2.times { input.shift }
    when codepoint4
      dumpc.call(position, [c1, c2, c3, c4])
      position += 3.times { input.shift }
    when pointer2
      dumpp.call(position, [c1, c2])
      position += 1.times { input.shift } # rubocop:disable Lint/UselessTimes
    when pointer3
      dumpp.call(position, [c1, c2, c3])
      position += 2.times { input.shift }
    else
      dump.call(position, [c1], 1, "Assumed part of a sequence")
    end
  end
end
pointer_info(bytes) click to toggle source

Extract the information contained in an LZUTF8 Sized Pointer

@param bytes [Array<Integer>] an array of character codes @return [Integer, Integer] the pointer length and distance

# File lib/lzutf8.rb, line 15
def self.pointer_info(bytes)
  c1, c2, c3 = bytes
  length = c1 & 0b00011111
  # either llllll_0ddddddd or 0b111lllll_0ddddddd_dddddddd
  distance = if (c1 & 0b00100000).zero?
    c2
  else
    (c2 << 8) + c3
  end
  [length, distance]
end