class ChupaText::Extractor

Public Class Methods

new() click to toggle source
# File lib/chupa-text/extractor.rb, line 25
# Creates an extractor that starts with an empty list of decomposers.
def initialize
  @decomposers = []
end

Public Instance Methods

add_decomposer(decomposer) click to toggle source
# File lib/chupa-text/extractor.rb, line 44
# Appends +decomposer+ to the end of this extractor's decomposer list.
#
# @param decomposer the decomposer to register.
# @return the decomposer list after the append.
def add_decomposer(decomposer)
  @decomposers.push(decomposer)
end
apply_configuration(configuration) click to toggle source

Sets the extractor up by the configuration. It adds decomposers enabled in the configuration.

@param [Configuration] configuration The configuration to be applied.

@return [void]

# File lib/chupa-text/extractor.rb, line 36
# Builds decomposers from the decomposer registry and the decomposer
# section of +configuration+, then registers each one on this extractor.
def apply_configuration(configuration)
  Decomposers.create(Decomposer.registry, configuration.decomposer).each { |created|
    add_decomposer(created)
  }
end
extract(input, &block) click to toggle source

Extracts texts from input. Each extracted text is passed to the given block.

@param [Data, String] input The input to extract texts from.

If `input` is `String`, it is treated as the local file path or URI
of input data.

@yield [text_data] Gives extracted text data to the block.

The block may be called zero or more times.

@yieldparam [Data] text_data The extracted text data.

You can get text data by `text_data.body`.

@return [void]

# File lib/chupa-text/extractor.rb, line 61
# Normalizes +input+ with ensure_data and hands the result to
# extract_recursive together with the caller's block.
def extract(input, &block)
  target = ensure_data(input)
  extract_recursive(target, &block)
end

Private Instance Methods

ensure_data(input) click to toggle source
# File lib/chupa-text/extractor.rb, line 66
# Returns +input+ unchanged when it is already a Data; anything else is
# wrapped in a new InputData.
def ensure_data(input)
  return input if input.is_a?(Data)

  InputData.new(input)
end
extract_recursive(target) { |utf8_data| ... } click to toggle source
# File lib/chupa-text/extractor.rb, line 86
# Extracts text from +target+ and yields each piece of UTF-8 text data to
# the block, recursing into everything a matching decomposer produces.
#
# When no decomposer accepts +target+, the target is converted with
# +to_utf8_body_data+ and yielded if it answers true to +text_plain?+ or
# +text?+; otherwise nothing is yielded.
#
# @param target the data to extract text from.
# @yield [utf8_data] the UTF-8 converted text data.
# @return [void]
def extract_recursive(target, &block)
  debug do
    "#{log_tag}[extract][target] <#{target.uri}>:<#{target.mime_type}>"
  end
  decomposer = find_decomposer(target)
  if decomposer.nil?
    if target.text_plain?
      debug {"#{log_tag}[extract][text-plain]"}
    else
      debug {"#{log_tag}[extract][decomposer] not found"}
      return unless target.text?
    end
    utf8_data = target.to_utf8_body_data
    begin
      yield(utf8_data)
    ensure
      # Release the converted data even when the block raises, matching the
      # ensure used for decomposed data below. Nothing is released here when
      # the conversion returned something equal to the target itself.
      utf8_data.release unless target == utf8_data
    end
  else
    debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
    with_timeout(target) do
      decomposer.decompose(target) do |decomposed|
        begin
          debug do
            "#{log_tag}[extract][decomposed] " +
              "#{decomposer.class}: " +
              "<#{target.uri}>: " +
              "<#{target.mime_type}> -> <#{decomposed.mime_type}>"
          end
          extract_recursive(decomposed, &block)
        ensure
          # Each decomposed item is released once its subtree is processed.
          decomposed.release
        end
      end
    end
  end
end
find_decomposer(data) click to toggle source
# File lib/chupa-text/extractor.rb, line 74
# Picks the registered decomposer with the highest +target_score+ for
# +data+. Decomposers whose score is nil are not considered.
#
# @param data the data a decomposer is wanted for.
# @return the highest-scoring decomposer, or nil when none gives a score.
def find_decomposer(data)
  scored = @decomposers.map { |decomposer| [decomposer.target_score(data), decomposer] }
  candidates = scored.reject { |score, _| score.nil? }
  # max_by selects the winner in one pass instead of sorting every candidate;
  # it returns nil for an empty list.
  best = candidates.max_by { |score, _| score }
  best && best.last
end
log_tag() click to toggle source
# File lib/chupa-text/extractor.rb, line 138
# Returns the tag that prefixes this class's log messages.
def log_tag
  "[extractor]"
end
with_timeout(data) { || ... } click to toggle source
# File lib/chupa-text/extractor.rb, line 125
# Runs the block under the timeout configured on +data+.
#
# The raw limit comes from TimeoutValue built on +data.timeout+. When the
# limit is nil or false the block runs without any limit. A Timeout::Error
# from the timed run is re-raised as TimeoutError carrying +data+ and the
# limit.
def with_timeout(data, &block)
  limit = TimeoutValue.new("#{log_tag}[timeout]", data.timeout).raw
  return yield unless limit

  begin
    Timeout.timeout(limit, &block)
  rescue Timeout::Error
    raise TimeoutError.new(data, limit)
  end
end