class ChupaText::Data

Attributes

attributes[R]

@return [Attributes] The attributes of the data.

body[RW]

@return [String, nil] The content of the data, `nil` if the data

doesn't have any content.
expected_screenshot_size[RW]

@return [Array<Integer, Integer>] the expected screenshot size.

limit_as[RW]

@return [Numeric, String, nil] the max memory on extraction by

external command.
limit_cpu[RW]

@return [Numeric, String, nil] the max CPU time on extraction by

external command.
max_body_size[RW]

@return [Integer, nil] the max body size in bytes.

need_screenshot[W]

@param [Bool] value `true` when screenshot is needed. @return [Bool] the specified value

path[RW]

@return [String, nil] The path associated with the content of

the data, `nil` if the data isn't associated with any file.

The path may not be related to the original content. For
example, `"/tmp/XXX.txt"` may be returned for the data of
`"http://example.com/XXX.txt"`.

This value is useful to use an external command to extract
text and meta-data.
screenshot[RW]

@return [Screenshot, nil] The screenshot of the data. For example,

the first page image for a PDF file.
size[RW]

@return [Integer, nil] The byte size of the data, `nil` if the data

doesn't have any content.
source[RW]

@return [Data, nil] The source of the data. For example, text

data (`hello.txt`) in archive data (`hello.tar`) has the
archive data in {#source}.
timeout[RW]

@return [Numeric, String, nil] the timeout on extraction.

uri[R]

@return [URI, nil] The URI of the data if the data is for remote

or local file, `nil` if the data isn't associated with any
URIs.

Public Class Methods

new(options={}) click to toggle source
# File lib/chupa-text/data.rb, line 82
# Sets every field to its default and, when a source data is given,
# inherits that source's metadata.
#
# @param [Hash, nil] options The options for the new data. `nil` is
#   treated as an empty Hash.
# @option options [Data] :source_data The data this one is derived
#   from. Its metadata is merged with {#merge!} and it is stored as
#   the source of the new data.
def initialize(options={})
  # Location and content: unknown until assigned.
  @uri = nil
  @body = nil
  @size = nil
  @path = nil
  @mime_type = nil

  # Metadata and provenance.
  @attributes = Attributes.new
  @source = nil

  # Screenshot settings.
  @screenshot = nil
  @need_screenshot = true
  @expected_screenshot_size = [200, 200]

  # Extraction limits: `nil` means no limit is configured here.
  @max_body_size = nil
  @timeout = nil
  @limit_cpu = nil
  @limit_as = nil

  @options = options || {}
  parent = @options[:source_data]
  return unless parent

  # The merge must come after the defaults above, because it overwrites them.
  merge!(parent)
  @source = parent
end

Public Instance Methods

[](name) click to toggle source
# File lib/chupa-text/data.rb, line 174
# Reads an attribute of the data.
#
# @param name The attribute name, passed as is to `@attributes[]`.
# @return The value that `@attributes` holds for `name`.
def [](name)
  @attributes[name]
end
[]=(name, value) click to toggle source
# File lib/chupa-text/data.rb, line 178
# Writes an attribute of the data.
#
# @param name The attribute name, passed as is to `@attributes[]=`.
# @param value The value to store for `name`.
def []=(name, value)
  @attributes[name] = value
end
extension() click to toggle source

@return [String, nil] Normalized extension as String if {#uri}

is not `nil`, `nil` otherwise. The normalized extension uses
lower case like `pdf` not `PDF`.
# File lib/chupa-text/data.rb, line 200
# Computes the normalized extension from the path of {#uri}.
#
# @return [String] `"html"` if {#uri} is an HTTP(S) URI whose path
#   ends with `/`. Otherwise the extension of the path in lower case
#   without the leading dot (`pdf`, not `.PDF`); an empty String if the
#   path has no extension.
# @return [nil] If {#uri} is `nil` or has no path component.
def extension
  return nil if @uri.nil?

  uri_path = @uri.path
  # Opaque URIs such as `mailto:user@example.com` have no path
  # component. `File.extname(nil)` would raise TypeError, so treat
  # this case like "no URI".
  return nil if uri_path.nil?

  if @uri.is_a?(URI::HTTP) && uri_path.end_with?("/")
    # A directory-like HTTP URL is assumed to serve an HTML page.
    "html"
  else
    # File.extname returns "" or ".ext", so one anchored `sub` is enough.
    File.extname(uri_path).downcase.sub(/\A\./, "")
  end
end
initialize_copy(object) click to toggle source
Calls superclass method
# File lib/chupa-text/data.rb, line 105
# Copy hook. After the superclass has done its part, gives the copy
# its own duplicate of the attributes instead of sharing the
# original's `@attributes` object.
#
# @param [Data] object The data being copied from.
# @return [Data] The copy itself.
def initialize_copy(object)
  super(object)
  own_attributes = @attributes.dup
  @attributes = own_attributes
  self
end
merge!(data) click to toggle source

Merges metadata from data.

@param [Data] data The data to be merged.

@return [void]

# File lib/chupa-text/data.rb, line 116
# Merges metadata from data.
#
# Copies the URI, the path, every attribute, the screenshot settings
# and the extraction limits of `data` into this data. If `data` has a
# MIME type, it is put at the front of the `"source-mime-types"`
# attribute of this data.
#
# @param [Data] data The data to be merged.
#
# @return [void]
def merge!(data)
  self.uri = data.uri
  self.path = data.path
  data.attributes.each do |name, value|
    self[name] = value
  end
  source_mime_type = data.mime_type
  if source_mime_type
    # Build a new array instead of `unshift`ing in place. The current
    # value may be the very array object copied from `data.attributes`
    # by the loop above, and changing it in place would also alter the
    # "source-mime-types" of `data`.
    self["source-mime-types"] = [source_mime_type, *self["source-mime-types"]]
  end
  self.need_screenshot = data.need_screenshot?
  self.expected_screenshot_size = data.expected_screenshot_size
  self.max_body_size = data.max_body_size
  self.timeout = data.timeout
  self.limit_cpu = data.limit_cpu
  self.limit_as = data.limit_as
end
mime_type() click to toggle source

@return [String] The MIME type of the data. If MIME type

isn't set, guesses MIME type from path and body.

@return [nil] If MIME type isn't set and it can't guess MIME type

from path and body.
# File lib/chupa-text/data.rb, line 186
# Returns the MIME type that was set explicitly. If none was set,
# falls back to guessing.
#
# @return [String] The MIME type that was set, or the guessed one.
# @return [nil] If no MIME type is set and guessing fails.
def mime_type
  return @mime_type if @mime_type

  guess_mime_type
end
mime_type=(type) click to toggle source

@param [String, nil] type The MIME type of the data. You can

unset MIME type by `nil`. If you unset MIME type, MIME type
is guessed from path and body of the data.
# File lib/chupa-text/data.rb, line 193
# Sets the MIME type explicitly.
#
# @param [String, nil] type The MIME type of the data. `nil` unsets
#   it, after which {#mime_type} falls back to guessing.
def mime_type=(type)
  @mime_type = type
end
need_screenshot?() click to toggle source

@return [Bool] `true` when screenshot is needed if available.

# File lib/chupa-text/data.rb, line 222
# Reports the screenshot flag stored in `@need_screenshot`.
#
# @return [Bool] `true` when screenshot is needed if available.
def need_screenshot?
  @need_screenshot
end
open() { |string_io| ... } click to toggle source
# File lib/chupa-text/data.rb, line 161
# Wraps the body in a StringIO and passes it to the given block.
#
# @yield [string_io] Gives an in-memory stream over {#body}.
# @yieldparam [StringIO] string_io The stream over the body.
# @return The value returned by the block.
def open
  string_io = StringIO.new(body)
  yield(string_io)
end
peek_body(size) click to toggle source
# File lib/chupa-text/data.rb, line 168
# Returns the beginning of the body without reading past `size`.
#
# @param [Integer] size The maximum length of the returned chunk, as
#   understood by `body[0, size]`.
# @return [String] The leading chunk of the body.
# @return [nil] If the data has no body.
def peek_body(size)
  # Read `body` only once.
  content = body
  content.nil? ? nil : content[0, size]
end
release() click to toggle source
# File lib/chupa-text/data.rb, line 165
# Does nothing for this class.
# NOTE(review): presumably a hook that subclasses holding external
# resources override; confirm against the subclasses.
#
# @return [nil]
def release
  nil
end
text?() click to toggle source

@return [Bool] true if MIME type is “text/XXX”, false

otherwise.
# File lib/chupa-text/data.rb, line 211
# Tells whether the MIME type belongs to the `text/` family.
#
# @return [Bool] true if MIME type is "text/XXX", false otherwise,
#   including when the MIME type is unknown.
def text?
  # `nil.to_s` is "", so an unknown MIME type never matches.
  mime_type.to_s.start_with?("text/")
end
text_plain?() click to toggle source

@return [Bool] true if MIME type is “text/plain”, false

otherwise.
# File lib/chupa-text/data.rb, line 217
# Tells whether the MIME type is exactly `text/plain`.
#
# @return [Bool] true if MIME type is "text/plain", false otherwise.
def text_plain?
  mime_type == "text/plain"
end
to_utf8_body_data() click to toggle source
# File lib/chupa-text/data.rb, line 226
# Returns data whose body is in UTF-8.
#
# When `@max_body_size` is set, only that many bytes are read through
# {#open}. Otherwise the whole body is used.
#
# @return [Data] `self` if there is no body, or if no size limit is
#   set and the converter returned the very same String object.
# @return [TextData] Otherwise, new text data that wraps the converted
#   body and has this data as its source.
def to_utf8_body_data
  raw_body = nil
  if @max_body_size
    open { |input| raw_body = input.read(@max_body_size) }
  else
    raw_body = body
  end
  return self if raw_body.nil?

  utf8_body = UTF8Converter.new(raw_body).convert
  # `equal?` is an identity check: the converter handing back the same
  # object means nothing had to be converted.
  return self if @max_body_size.nil? && raw_body.equal?(utf8_body)

  TextData.new(utf8_body, source_data: self)
end
uri=(uri) click to toggle source

@param [String, URI, nil] uri The URI for the data. If `uri` is

`nil`, it means that the data isn't associated with any URIs.
# File lib/chupa-text/data.rb, line 136
# Sets the URI of the data and updates the path to match.
#
# @param [String, URI, Pathname, nil] uri The URI for the data.
#   * A Pathname is expanded and converted to a `file://` URI, and
#     the path is set to the given Pathname.
#   * `nil` means that the data isn't associated with any URIs. The
#     path is also reset to `nil`.
#   * Anything else is parsed with `URI.parse` unless it is already a
#     URI. The path is set to the path of that URI.
def uri=(uri)
  if uri.is_a?(Pathname)
    # Walk from the last component up to the root, escaping each
    # component and prepending it to the URI path built so far.
    remaining = uri.expand_path
    escaped_path = ""
    loop do
      remaining, leaf = remaining.split
      escaped_path = "/#{CGI.escape(leaf.to_s)}#{escaped_path}"
      break if remaining.root?
    end
    @uri = URI.parse("file://#{escaped_path}")
    self.path = uri
  elsif uri.nil?
    @uri = nil
    self.path = nil
  else
    @uri = uri.is_a?(URI) ? uri : URI.parse(uri)
    self.path = @uri.path
  end
end

Private Instance Methods

change_encoding(string, encoding) { |string| ... } click to toggle source
# File lib/chupa-text/data.rb, line 268
# Yields `string` with its encoding tag temporarily switched to
# `encoding`. The original tag is put back afterwards, even if the
# block raises or exits early.
#
# The string is changed in place with `force_encoding`, so its bytes
# are never transcoded.
#
# @param [String, nil] string The string to re-tag. Nothing is
#   yielded for `nil`.
# @param [Encoding, String] encoding The temporary encoding.
# @yieldparam [String] string The same object, tagged with `encoding`.
# @return The value of the block, or `nil` if `string` is `nil`.
def change_encoding(string, encoding)
  return if string.nil?

  saved_encoding = string.encoding
  begin
    string.force_encoding(encoding)
    yield(string)
  ensure
    string.force_encoding(saved_encoding)
  end
end
guess_mime_type() click to toggle source
# File lib/chupa-text/data.rb, line 247
# Guesses the MIME type: first from the URI, then, only if that
# yields nothing, from the body.
#
# @return [String, nil] The guessed MIME type, `nil` if both guesses fail.
def guess_mime_type
  guess_mime_type_from_uri || guess_mime_type_from_body
end
guess_mime_type_from_body() click to toggle source
# File lib/chupa-text/data.rb, line 256
# Guesses whether the body is plain text by inspecting at most its
# first 1024 units, as returned by `peek_body(1024)`.
#
# The sample counts as text when it is valid UTF-8 and NUL characters
# make up no more than 1% of its bytes.
#
# @return [String] `"text/plain"` if the sample looks like text.
# @return [nil] If there is no body or the sample doesn't look like text.
def guess_mime_type_from_body
  guessed_type = nil
  change_encoding(peek_body(1024), "UTF-8") do |sample|
    next unless sample.valid_encoding?

    # Text contains few NUL characters, if any; many of them suggest
    # binary content.
    null_budget = sample.bytesize * 0.01
    next if sample.count("\u0000") > null_budget

    guessed_type = "text/plain"
  end
  guessed_type
end
guess_mime_type_from_uri() click to toggle source
# File lib/chupa-text/data.rb, line 252
# Looks up the MIME type registered for the extension of the URI.
#
# @return The result of `MIMEType.registry.find` for {#extension}.
#   NOTE(review): presumably a MIME type String, or `nil` for an
#   unknown extension; confirm against the registry.
def guess_mime_type_from_uri
  registry = MIMEType.registry
  registry.find(extension)
end