class Paperweight::Download

Most of this is adapted from gist.github.com/janko-m/7cd94b8b4dd113c2c193

Constants

DOWNLOAD_ERRORS
Error

Public Class Methods

download(url) click to toggle source
# File lib/paperweight/download.rb, line 35
# Class-level shortcut: builds a fresh instance and forwards the URL to its
# #download method, returning whatever that call returns.
def self.download(url)
  downloader = new
  downloader.download(url)
end

Public Instance Methods

download(url) click to toggle source
# File lib/paperweight/download.rb, line 18
# Fetches +url+ and returns the result of #normalize_download applied to
# whatever open-uri hands back.
#
# Errors listed in DOWNLOAD_ERRORS are wrapped into Error, except for a
# RuntimeError whose message does not mention "redirection", which is
# re-raised untouched.
def download(url)
  # The URI object's own #open is used on purpose. The plain #open that
  # open-uri overrides is vulnerable to shell execution: given a string
  # starting with a pipe (e.g. "| ls") it runs the rest as a shell command.
  source = uri_from(url)
  normalize_download(source.open(open_options))
rescue *DOWNLOAD_ERRORS => e
  reason = e.message

  # open-uri signals a redirection loop with a RuntimeError; any other
  # RuntimeError is not ours to translate, so it propagates as-is.
  redirect_loop = reason =~ /redirection/
  raise if e.is_a?(RuntimeError) && !redirect_loop

  # Everything else is reported through the unified Error class.
  raise Error, "download failed (#{url}): #{reason}"
end

Private Instance Methods

normalize_download(file) click to toggle source

open-uri will return a StringIO instead of a Tempfile if the filesize is less than 10 KB, so we patch this behaviour by converting it into a Tempfile.

# File lib/paperweight/download.rb, line 44
# Ensures the downloaded object is a Tempfile.
#
# Anything that is not a StringIO is returned unchanged. A StringIO is
# written out to a new Tempfile, and the open-uri metadata carried by the
# StringIO (status, base_uri and header fields such as #content_type) is
# copied onto that Tempfile.
#
# file - the object returned by open-uri (StringIO or something else).
#
# Returns +file+ itself, or a Tempfile holding the same bytes.
def normalize_download(file)
  return file unless file.is_a?(StringIO)

  # We need to open it in binary mode for Windows users.
  Tempfile.new('download-', binmode: true).tap do |tempfile|
    # The data is written through the path, so the tempfile's own handle
    # stays positioned at the start and can be read right away.
    IO.copy_stream(file, tempfile.path)

    # OpenURI::Meta.init only copies status, base_uri and header fields when
    # it is given the source object as second argument; with one argument
    # the tempfile would end up with empty metadata. A StringIO that was
    # never extended with OpenURI::Meta has nothing to copy.
    source = file.is_a?(OpenURI::Meta) ? file : nil
    OpenURI::Meta.init(tempfile, source)
  end
end
open_options() click to toggle source
# File lib/paperweight/download.rb, line 58
# Builds the options hash handed to open-uri's #open.
#
# The hash carries a 'User-Agent' header and a :content_length_proc lambda
# that raises Error when the reported size exceeds
# Paperweight.config.max_size.
def open_options
  limit = Paperweight.config.max_size

  # It's good to shield ourselves from files that are too big. open-uri calls
  # this as soon as it gets the "Content-Length" header, so we can bail out
  # before the body is downloaded. A missing size is let through.
  size_guard = lambda do |size|
    raise Error, "file is too big (max is #{limit})" if size && size > limit
  end

  # It was shown that in a random sample approximately 20% of websites will
  # simply refuse a request which doesn't have a valid User-Agent.
  { 'User-Agent' => 'Paperweight', :content_length_proc => size_guard }
end
uri_from(url) click to toggle source

Disabling :reek:ManualDispatch here because we don't control the URI API

# File lib/paperweight/download.rb, line 78
# Converts +url+ into a URI object that can be downloaded from.
#
# Raises Error ('url was invalid') when the parsed URI has no public #open
# method; parsing failures raised by URI() itself propagate unchanged.
def uri_from(url)
  # URI() raises an InvalidURIError if the URL is very wrong, but strings
  # like "foo" still parse successfully.
  parsed = URI(url)

  # open-uri adds #open only to the schemes we can download from (http://,
  # https:// and ftp://), so its presence doubles as the scheme check.
  return parsed if parsed.respond_to?(:open)

  raise Error, 'url was invalid'
end