class Object
Public Instance Methods
DirectLink(link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false)
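Resolves link to a direct image or video URL. Returns a struct with url, width, height and type members, or an Array of such structs for albums and galleries. A minimal usage sketch, assuming the gem is installed (the page URL below is hypothetical, and some adapters additionally require service API keys in environment variables — see DirectLink::ErrorMissingEnvVar):

  require "directlink"

  t = DirectLink "https://imgur.com/hypothetical", 30   # 30 is an assumed timeout in seconds
  t.url      # => direct URL of the image file
  t.width    # => width in pixels
  t.height   # => height in pixels
  t.type     # => :jpeg, :png, etc. (a FastImage type; the String "video" for reddit videos)

Unrecognized or unreachable links raise, e.g. DirectLink::ErrorBadLink or DirectLink::ErrorNotFound.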
# File lib/directlink.rb, line 312
def DirectLink link, timeout = nil, proxy = nil, giveup: false, ignore_meta: false
  timeout ||= DirectLink.timeout
  raise ArgumentError.new("link should be a <String>, not <#{link.class}>") unless link.is_a? String
  begin
    URI link
  rescue URI::InvalidURIError
    require "addressable"
    link = Addressable::URI.escape link
  end
  raise DirectLink::ErrorBadLink.new link, true unless URI(link).host
  struct = Module.const_get(__callee__).class_variable_get :@@directlink

  google_without_schema_crutch = lambda do
    if %w{ lh3 googleusercontent com } == URI(link).host.split(?.).last(3) ||
       %w{ lh4 googleusercontent com } == URI(link).host.split(?.).last(3) ||
       %w{ lh5 googleusercontent com } == URI(link).host.split(?.).last(3) ||
       %w{ lh6 googleusercontent com } == URI(link).host.split(?.).last(3) ||
       %w{ bp blogspot com }           == URI(link).host.split(?.).last(3)
      u = DirectLink.google link
      f = FastImage.new(u, raise_on_failure: true, http_header: {"User-Agent" => "Mozilla"})
      w, h = f.size
      struct.new u, w, h, f.type
    end
  end
  t = google_without_schema_crutch[] and return t

  # to test that we won't hang for too long if someone like aeronautica.difesa.it is silent for some reason:
  #   $ bundle console
  #   > NetHTTPUtils.logger.level = Logger::DEBUG
  #   > NetHTTPUtils.request_data "http://www.aeronautica.difesa.it/organizzazione/REPARTI/divolo/PublishingImages/6%C2%B0%20Stormo/2013-decollo%20al%20tramonto%20REX%201280.jpg",
  #       max_read_retry_delay: 5, timeout: 5
  begin
    header = {
      "User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
      **(
        %w{ reddit com } == URI(link).host.split(?.).last(2) ||
        %w{ redd it }    == URI(link).host.split(?.) ? {Cookie: "over18=1"} : {}
      ),
    }
    head = NetHTTPUtils.request_data link, :HEAD, header: header,
      **(proxy ? {proxy: proxy} : {}),
      **(timeout ? {
        timeout: timeout,
        max_start_http_retry_delay: timeout,
        max_read_retry_delay: timeout,
      } : {})
  rescue Net::ReadTimeout, Errno::ETIMEDOUT
  rescue NetHTTPUtils::Error => e
    raise unless 418 == e.code
  else
    raise DirectLink::ErrorAssert.new "last_response.uri is not set" unless head.instance_variable_get(:@last_response).uri
    link = head.instance_variable_get(:@last_response).uri.to_s
  end
  # why do we resolve redirects before trying the known adapters?
  # because they can be hidden behind URL shorteners
  # also it can surface NetHTTPUtils::Error(404) before trying the adapter

  t = google_without_schema_crutch[] and return t   # TODO: why again?

  begin
    imgur = DirectLink.imgur(link, timeout).sort_by{ |u, w, h, t| - w * h }.map do |u, w, h, t|
      struct.new u, w, h, t
    end
    # `DirectLink.imgur` return value is always an Array
    return imgur.size == 1 ? imgur.first : imgur
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ imgur com } == URI(link).host.split(?.).last(2)

  if %w{ 500px com } == URI(link).host.split(?.).last(2)
    w, h, u, t = DirectLink._500px(link)
    return struct.new u, w, h, t
  end

  begin
    w, h, u = DirectLink.flickr(link)
    f = FastImage.new(u, raise_on_failure: true)  # , http_header: {"User-Agent" => "Mozilla"}
    return struct.new u, w, h, f.type
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ www flickr com } == URI(link).host.split(?.) ||
         %w{ flic kr }        == URI(link).host.split(?.)

  if %w{ wikipedia org }         == URI(link).host.split(?.).last(2) ||
     %w{ commons wikimedia org } == URI(link).host.split(?.)
    u = DirectLink.wiki link
    f = FastImage.new(u, raise_on_failure: true)  # , http_header: {"User-Agent" => "Mozilla"}
    w, h = f.size
    return struct.new u, w, h, f.type
  end

  # TODO: protect in two places from infinite recursion
  begin
    s, u = DirectLink.reddit(link)
    unless s
      raise DirectLink::ErrorBadLink.new link if giveup   # TODO: print original url in such cases if there was a recursion
      f = ->_{ _.type == :a ? _.attr["href"] : _.children.flat_map(&f) }
      require "kramdown"
      return f[Kramdown::Document.new(u).root].flat_map do |sublink|
        DirectLink URI.join(link, sublink).to_s, timeout, giveup: giveup   # TODO: maybe subtract from timeout the time we've already wasted
      end
    end
    if u.is_a? Hash
      return struct.new *u.values_at(*%w{ fallback_url width height }), "video"
    elsif u.is_a? Array
      return u.map do |t, x, y, u|
        struct.new u, x, y, t
      end
    end
    raise DirectLink::ErrorNotFound.new link.inspect if link == u
    return DirectLink u, timeout, giveup: giveup
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ reddit com } == URI(link).host.split(?.).last(2) ||
         %w{ redd it }    == URI(link).host.split(?.)

  begin
    return DirectLink.vk(link).map do |w, h, u|
      struct.new u, w, h
    end
  rescue DirectLink::ErrorMissingEnvVar
  end if %w{ vk com } == URI(link).host.split(?.)

  begin
    f = FastImage.new link, raise_on_failure: true, timeout: timeout,
      **(proxy ? {proxy: "http://#{proxy}"} : {}),
      http_header: {"User-Agent" => "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
  rescue FastImage::UnknownImageType
    raise if giveup
    require "nokogiri"
    head = NetHTTPUtils.request_data link, :HEAD, header: {"User-Agent" => "Mozilla"},
      max_start_http_retry_delay: timeout,
      timeout: timeout,              # NetHTTPUtils passes this as read_timeout to Net::HTTP.start
      max_read_retry_delay: timeout  # and then compares accumulated delay to this
    # if we used :GET here we would download megabytes of files just to give up on a content type we can't process
    case head.instance_variable_get(:@last_response).content_type  # webmock should provide this
    when "text/html" ; nil
    else             ; raise
    end
    html = Nokogiri::HTML NetHTTPUtils.request_data link, :GET, header: {"User-Agent" => "Mozilla"}
    if t = html.at_css("meta[@property='og:image']")
      begin
        return DirectLink URI.join(link, t[:content]).to_s, nil, *proxy, giveup: true
      rescue URI::InvalidURIError
      end
    end unless ignore_meta
    h = {}   # TODO: maybe move it outside because of possible img[:src] recursion?..
    l = lambda do |node, s = []|
      node.element_children.flat_map do |child|
        next l[child, s + [child.node_name]] unless "img" == child.node_name
        begin
          [[s, (h[child[:src]] = h[child[:src]] || DirectLink(URI.join(link, child[:src]).to_s, nil, giveup: true))]]   # ...or wait, do we give up?
        rescue => e
          DirectLink.logger.error "#{e} (from no giveup)"
          []
        end
      end
    end
    l[html].
      tap{ |results| raise if results.empty? }.
      group_by(&:first).map{ |k, v| [k.join(?>), v.map(&:last)] }.
      max_by{ |_, v| v.map{ |i| i.width * i.height }.inject(:+) }.last
  else
    # TODO: maybe move this to right before the `rescue` line
    w, h = f.size
    struct.new f.instance_variable_get(:@parsed_uri).to_s, w, h, f.type
  end
end
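Because the imgur, reddit, and vk branches above can return an Array of structs while plain images yield a single struct, callers typically normalize the result. A sketch under those assumptions (the album URL is hypothetical; giveup: true makes unsupported links raise instead of falling back to scraping og:image and img tags):

  require "directlink"

  begin
    result = DirectLink "https://imgur.com/a/hypothetical", 30, giveup: true
    result = [result] unless result.is_a? Array   # don't use Array(): Struct#to_a would splat the members
    result.each do |img|
      puts "#{img.url} #{img.width}x#{img.height} #{img.type}"
    end
  rescue DirectLink::ErrorBadLink => e
    warn "not a recognized image link: #{e}"
  end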