class ChupaText::Extractor
Public Class Methods
new()
click to toggle source
# File lib/chupa-text/extractor.rb, line 25
# Creates an extractor that has no decomposers yet.
def initialize
  # Filled in later through #add_decomposer.
  @decomposers = []
end
Public Instance Methods
add_decomposer(decomposer)
click to toggle source
# File lib/chupa-text/extractor.rb, line 44
# Appends a decomposer to the end of this extractor's decomposer list.
#
# @param decomposer The decomposer to be added.
# @return The decomposer list, now including `decomposer`.
def add_decomposer(decomposer)
  @decomposers.push(decomposer)
end
apply_configuration(configuration)
click to toggle source
Sets up the extractor according to the configuration. It adds the decomposers that are enabled in the configuration.
@param [Configuration] configuration The configuration to be
applied.
@return [void]
# File lib/chupa-text/extractor.rb, line 36
# Builds decomposers with `Decomposers.create`, passing it
# `Decomposer.registry` and `configuration.decomposer`, and adds each
# one to this extractor through #add_decomposer.
#
# @param [Configuration] configuration The configuration to be applied.
# @return [void]
def apply_configuration(configuration)
  Decomposers.create(Decomposer.registry,
                     configuration.decomposer).each do |created|
    add_decomposer(created)
  end
end
extract(input, &block)
click to toggle source
Extracts texts from input. Each extracted text is passed to the given block.
@param [Data, String] input The input from which texts are extracted.
If `input` is `String`, it is treated as the local file path or URI of input data.
@yield [text_data] Gives extracted text data to the block.
The block may be called zero or more times.
@yieldparam [Data] text_data The extracted text data.
You can get text data by `text_data.body`.
@return [void]
# File lib/chupa-text/extractor.rb, line 61
# Normalizes `input` with #ensure_data and hands the result, together
# with the given block, to #extract_recursive.
#
# @param [Data, String] input The input to extract texts from.
# @yield [text_data] Forwarded to #extract_recursive.
# @return [void]
def extract(input, &block)
  data = ensure_data(input)
  extract_recursive(data, &block)
end
Private Instance Methods
ensure_data(input)
click to toggle source
# File lib/chupa-text/extractor.rb, line 66
# Makes sure the extraction target is a `Data`.
#
# @param input A `Data`, or any other value accepted by `InputData.new`.
# @return `input` itself when it already is a `Data`; otherwise a new
#   `InputData` built from it.
def ensure_data(input)
  return input if input.is_a?(Data)

  InputData.new(input)
end
extract_recursive(target) { |utf8_data| ... }
click to toggle source
# File lib/chupa-text/extractor.rb, line 86
# Extracts texts from `target`, recursing into every piece of data that a
# matching decomposer produces.
#
# When no decomposer matches, `target` is yielded as UTF-8 body data if it
# reports `text_plain?` or `text?`; otherwise nothing is yielded.
#
# @param target The data to extract texts from.
# @yield [utf8_data] The result of `target.to_utf8_body_data`.
# @return [void]
def extract_recursive(target, &block)
  debug do
    "#{log_tag}[extract][target] <#{target.uri}>:<#{target.mime_type}>"
  end
  decomposer = find_decomposer(target)
  if decomposer.nil?
    if target.text_plain?
      debug {"#{log_tag}[extract][text-plain]"}
      textual = true
    else
      debug {"#{log_tag}[extract][decomposer] not found"}
      textual = target.text?
    end
    if textual
      utf8_data = target.to_utf8_body_data
      begin
        yield(utf8_data)
      ensure
        # Release the converted data even when the block raises, the same
        # way decomposed data is released below. `target` itself is never
        # released here, so skip the release when no separate object was
        # returned by the conversion.
        utf8_data.release unless target == utf8_data
      end
    end
  else
    debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
    with_timeout(target) do
      decomposer.decompose(target) do |decomposed|
        begin
          debug do
            "#{log_tag}[extract][decomposed] " +
              "#{decomposer.class}: " +
              "<#{target.uri}>: " +
              "<#{target.mime_type}> -> <#{decomposed.mime_type}>"
          end
          extract_recursive(decomposed, &block)
        ensure
          decomposed.release
        end
      end
    end
  end
end
find_decomposer(data)
click to toggle source
# File lib/chupa-text/extractor.rb, line 74
# Picks the decomposer that reports the highest `target_score` for `data`.
#
# Decomposers whose `target_score` returns `nil` are skipped. When several
# decomposers report the same highest score, the one that was added first
# is chosen.
#
# @param data The data passed to each decomposer's `target_score`.
# @return The best scoring decomposer, or `nil` when there are no
#   decomposers or every score is `nil`.
def find_decomposer(data)
  best_decomposer = nil
  best_score = nil
  @decomposers.each do |decomposer|
    score = decomposer.target_score(data)
    next if score.nil?
    # Strictly greater, so an earlier decomposer keeps winning on ties.
    if best_score.nil? || score > best_score
      best_decomposer = decomposer
      best_score = score
    end
  end
  best_decomposer
end
log_tag()
click to toggle source
# File lib/chupa-text/extractor.rb, line 138
# Prefix used at the start of this class's log messages.
#
# @return [String] Always `"[extractor]"`.
def log_tag
  "[extractor]"
end
with_timeout(data) { || ... }
click to toggle source
# File lib/chupa-text/extractor.rb, line 125
# Runs the given block, limited by the timeout taken from `data.timeout`.
#
# The limit is obtained as `TimeoutValue.new(...).raw`. When that is `nil`
# or `false`, the block runs with no limit.
#
# @param data The data whose `timeout` is applied.
# @yield The work to run.
# @return The value of the block.
# @raise [TimeoutError] When `Timeout.timeout` raises `Timeout::Error`.
def with_timeout(data, &block)
  seconds = TimeoutValue.new("#{log_tag}[timeout]", data.timeout).raw
  return yield unless seconds

  # Keep the rescue scoped to the timed call only, so the unlimited path
  # above never has its errors translated.
  begin
    Timeout.timeout(seconds, &block)
  rescue Timeout::Error
    raise TimeoutError.new(data, seconds)
  end
end