module UrlReader
Constants
- REQUEST_OPEN_TIMEOUT
- REQUEST_TIMEOUT
Public Instance Methods
read_url(url, options = {})
click to toggle source
# File lib/url_reader.rb, line 17 def read_url(url, options = {}) self.class.last_response_headers = nil self.class.last_response_cookies = nil self.class.last_cache_used = false if defined?(Rails) && Rails.env.development? if ENV['READ_URL_CACHE_NOT_USE'] != 'true' ckey = cache_key(url, options) if res = cache.read_entry(ckey) self.class.last_cache_used = true res else read_url_core_with_cache_write(url, options, ckey) end else read_url_core_with_cache_write(url, options) end else read_url_core(url, options) end end
Private Instance Methods
cache()
click to toggle source
# File lib/url_reader.rb, line 40 def cache @cache ||= begin %x(mkdir -p #{Rails.root}/tmp/cache/url_reader) FileCache.new(File.join(Rails.root, 'tmp/cache/url_reader')) end end
cache_key(url, options)
click to toggle source
# File lib/url_reader.rb, line 47 def cache_key(url, options) "#{url}?#{options.to_s}" end
read_url_core(url, options)
click to toggle source
# File lib/url_reader.rb, line 59 def read_url_core(url, options) valid_url = fixed_url(url) headers = {} headers.merge!(options[:headers]) if options[:headers] headers[:user_agent] = options[:user_agent] if options[:user_agent] cookies = {} cookies.merge!(options[:cookies]) if options[:cookies] hash = { url: valid_url, timeout: options[:request_timeout] || REQUEST_TIMEOUT, open_timeout: options[:request_open_timeout] || REQUEST_OPEN_TIMEOUT, headers: headers, cookies: cookies } response = begin if options[:method] == :post RestClient::Request.execute(hash.merge(method: :post, payload: options[:params])) else RestClient::Request.execute(hash.merge(method: :get)) end rescue RestClient::ResourceNotFound, RestClient::InternalServerError, RestClient::RequestTimeout, RestClient::ServerBrokeConnection, Errno::ECONNREFUSED, Errno::ECONNRESET => e ne = ReadError.new(e, "Read #{hash[:url]} failed") ignore_errors = options[:ignore_errors] || [] ignore_errors << 'PageNotFound' if options[:ignore_not_found] ignore_errors << 'InternalServerError' if options[:ignore_server_error] return nil if ignore_errors.map { |x| x.is_a?(Integer) ? x : ReadError.const_get(x) }.include?(ne.type) raise ne end return nil unless response self.class.last_response_headers = response.headers self.class.last_response_cookies = response.cookies image_content_type = options[:image_content_type] return resolve_encoding(response) unless response.headers[:content_type] =~ /^image\// || (image_content_type && response.headers[:content_type] == image_content_type) response.to_str end
read_url_core_with_cache_write(url, options, ckey = nil)
click to toggle source
# File lib/url_reader.rb, line 51 def read_url_core_with_cache_write(url, options, ckey = nil) ckey ||= cache_key(url, options) res = read_url_core(url, options) return nil if res.nil? cache.write_entry(ckey, res) res end
resolve_encoding(response)
click to toggle source
# File lib/url_reader.rb, line 101 def resolve_encoding(response) response_str = response.to_str encoding = response_encoding(response.headers, response_str) begin return response_str.encode(Encoding::UTF_8, encoding) rescue Encoding::UndefinedConversionError => e return response_str.encode(Encoding::UTF_8, Encoding::CP932) if encoding == Encoding::Shift_JIS return response_str.encode(Encoding::UTF_8, Encoding::CP51932) if encoding == Encoding::EUC_JP raise CannotResolveEncodingError, e end end
response_encoding(response_headers, response_str)
click to toggle source
# File lib/url_reader.rb, line 113 def response_encoding(response_headers, response_str) response_str_utf8 = response_str.toutf8 [response_headers[:content_type].try(:match, /charset=(?<charset>[^;]+)($|;)/), response_str_utf8.match(/<meta .*?content="[^"]*?charset=(?<charset>[^;"]+)/), response_str_utf8.match(/<meta .*?charset="(?<charset>[^"]+)"/)] .map { |x| x.try(:[], 'charset') }.compact .map { |x| Encoding.find(x) rescue nil }.compact .push(Encoding::UTF_8) .first end