class ProxyRotater

Constants

CONCURRENT_PROCESS
PROXY_UPDATE_WAIT
VERSION

Public Class Methods

new(domain, options = {}) click to toggle source
# File lib/proxy_rotater.rb, line 13
# Builds a rotater for +domain+ and immediately loads a first batch of
# proxies through get_proxies.
#
# @param domain [Object] stored as-is in @domain
# @param options [Hash] accepted but not read by this method
def initialize(domain, options = {})
  @domain = domain

  # Throttling defaults.
  @request_workers = CONCURRENT_PROCESS
  @req_limit_per_hour = 500
  @interval_sec = 1

  # Proxy pools; every pool starts out empty.
  @available, @over_heated, @failed = [], [], []

  # Caller-supplied failure detectors, appended by add_custom_fail.
  @custom_fail = []

  get_proxies
end

Public Instance Methods

add_custom_fail(&block) click to toggle source
# File lib/proxy_rotater.rb, line 80
# Registers a custom failure detector.
#
# Every registered block is later invoked by #get with the response a proxy
# returned; a truthy result makes #get treat that response as a failure.
#
# @yieldparam res [Object] the response returned by a proxy's get_url
# @return [Array<Proc>] the list of registered detectors
# @raise [ArgumentError] when called without a block
def add_custom_fail(&block)
  # Storing nil would only blow up later, when #get tries to call it.
  raise ArgumentError, 'add_custom_fail requires a block' if block.nil?

  @custom_fail << block
end
check() click to toggle source
# File lib/proxy_rotater.rb, line 71
# Runs one maintenance pass over the proxy pools.
#
# Steps, in order: disable_timeouted, check_req_limit, revival, a refill via
# get_proxies when fewer proxies are available than @request_workers, a
# pause of PROXY_UPDATE_WAIT when nothing is available, and finally sort.
#
# @return [Object] whatever #sort returns
def check
  disable_timeouted
  check_req_limit
  revival

  pool_too_small = @available.size < @request_workers
  get_proxies if pool_too_small

  if @available.empty?
    sleep PROXY_UPDATE_WAIT
  end

  sort
end
get(urls, do_retry = true) click to toggle source
# File lib/proxy_rotater.rb, line 28
# Fetches one or more URLs through the available proxies.
#
# URLs are handled in rounds of at most @request_workers. Within a round the
# i-th URL is requested through the i-th entry of @available via
# Parallel.map_with_index, and #check runs after every round.
#
# @param urls [Object, Array] a single URL or an array of URLs
# @param do_retry [Boolean] when true, URLs that produced no result are
#   requested one more time (the retry pass itself never retries)
# @return [Hash] maps every URL to { body:, response: }, or to nil when no
#   usable response was obtained
def get(urls, do_retry = true)
  urls = [urls] unless urls.kind_of?(Array)
  # each_slice(0) raises ArgumentError, so an empty list short-circuits.
  return {} if urls.empty?

  retry_url = []
  concurrency = [urls.size, @request_workers].min

  results = urls.each_slice(concurrency).map do |round_url|
    round_result = Parallel.map_with_index(round_url, in_processes: concurrency) do |url, i|
      proxy = @available[i]
      # Fewer proxies than URLs in this round: leave the slot nil so the URL
      # ends up in the retry list instead of raising on nil.
      next if proxy.nil?

      res = proxy.get_url(url)
      next if res.nil?

      if @custom_fail.any? { |error_proc| error_proc.call(res) }
        # Flag the proxy that served this response (the pool array itself
        # has no timeout= writer).
        # NOTE(review): with in_processes the block presumably runs in a
        # worker process, so this flag may not reach the parent's proxy
        # object -- confirm against the Parallel gem documentation.
        proxy.timeout = true
        next
      end

      { body: res.body, response: res.response }
    end
    check

    # Pair each URL with its result; nil results are queued for the retry.
    round_url.zip(round_result).each_with_object({}) do |(url, result), hash|
      retry_url << url if result.nil?
      hash[url] = result
    end
  end

  merged = results.each_with_object({}) { |round, acc| acc.merge!(round) }
  merged.merge!(get(retry_url, false)) if do_retry && !retry_url.empty?
  merged
end

Private Instance Methods

check_req_limit() click to toggle source
# File lib/proxy_rotater.rb, line 103
# Moves proxies that exceeded a request limit from @available to
# @over_heated and stamps each with the time it may return.
#
# A proxy over the per-second limit gets return_at = now + @interval_sec;
# one over the per-hour limit gets return_at = now + 30 minutes.
#
# @return [Array] the updated @over_heated pool
def check_req_limit
  throttled, usable = @available.partition do |proxy|
    if is_per_sec_limit_over?(proxy.get_request_intervals)
      proxy.return_at = Time.now.to_f + @interval_sec
      true
    elsif is_per_hour_limit_over?(proxy.requested_at)
      proxy.return_at = Time.now.to_f + 60 * 30
      true
    else
      false
    end
  end

  # Always replace the pool: when every proxy is throttled the pool must
  # become empty rather than keep proxies that now also sit in @over_heated.
  @available = usable
  @over_heated.concat(throttled)
end
disable_timeouted() click to toggle source
# File lib/proxy_rotater.rb, line 97
# Moves every proxy whose timeout flag is truthy from @available to @failed.
#
# @available is always left as an Array (possibly empty), so later pool
# operations never run into nil. Proxies whose flag is nil stay available.
#
# @return [Array] the updated @failed pool
def disable_timeouted
  timed_out, alive = @available.partition { |proxy| proxy.timeout }
  @available = alive
  @failed.concat(timed_out)
end
get_proxies() click to toggle source
# File lib/proxy_rotater.rb, line 85
# Fetches proxy entries not yet known to this rotater, builds a Proxy for
# each one in parallel, and adds the new proxies to the pools.
#
# Proxies whose timeout flag is truthy go to @failed, the rest to
# @available. New proxies are appended: get_proxylist_from_html has already
# filtered out the known ones, so replacing the pools would discard them.
#
# @return [Array] the updated @failed pool
def get_proxies
  values = get_proxylist_from_html

  proxies = Parallel.map(values, in_processes: values.size) do |value|
    Proxy.new(value)
  end
  dead, alive = proxies.partition { |proxy| proxy.timeout }

  # ||= keeps both pools usable even if one was left nil elsewhere.
  (@available ||= []).concat(alive)
  (@failed ||= []).concat(dead)
end
get_proxylist_from_html() click to toggle source
# File lib/proxy_rotater.rb, line 151
# Returns the entries from GetProxy.get whose :ip_address does not belong to
# any proxy already held in @available, @failed or @over_heated.
#
# The list returned by GetProxy.get is filtered in place.
#
# @return [Array<Hash>] the entries that are not yet known
def get_proxylist_from_html
  list = GetProxy.get

  # flatten/compact (non-bang) always return an Array; flatten! may return
  # nil, and a nil pool would otherwise leave nil entries behind.
  known_addresses = [@available, @failed, @over_heated]
    .flatten
    .compact
    .map { |proxy| proxy.ip_address }

  list.delete_if { |line| known_addresses.include?(line[:ip_address]) }
end
is_per_hour_limit_over?(req_times) click to toggle source
# File lib/proxy_rotater.rb, line 129
# Tells whether the number of requests counted from the first timestamp
# that falls within the last hour exceeds @req_limit_per_hour.
#
# Everything from that first recent timestamp to the end of the list is
# counted, whatever the later values are.
#
# @param req_times [Array<Numeric>] request timestamps in epoch seconds
# @return [Boolean]
def is_per_hour_limit_over?(req_times)
  cutoff = Time.now.to_f - (60 * 60)
  first_recent = req_times.index { |stamp| stamp > cutoff }
  return false unless first_recent

  recent_count = req_times.size - first_recent
  recent_count > @req_limit_per_hour
end
is_per_sec_limit_over?(req_intervals) click to toggle source
# File lib/proxy_rotater.rb, line 120
# Tells whether the average of the sampled request intervals is greater
# than @interval_sec.
#
# At most the first 10 entries of +req_intervals+ are sampled, and the
# average is taken over exactly those entries using float division.
#
# NOTE(review): the sample is the first 10 entries, as before; if the list
# is ordered oldest-first the most recent 10 may have been intended -- confirm.
# NOTE(review): "limit over" when the average interval is *greater* than
# @interval_sec looks inverted for a rate limit -- confirm against
# Proxy#get_request_intervals.
#
# @param req_intervals [Array<Numeric>] intervals between requests
# @return [Boolean]
def is_per_sec_limit_over?(req_intervals)
  window = req_intervals.slice(0, 10)
  return false if window.empty?

  # Divide by the number of sampled entries, not by the full list size.
  average = window.inject(0.0) { |sum, interval| sum + interval } / window.size
  average > @interval_sec
end
revival() click to toggle source
# File lib/proxy_rotater.rb, line 138
# Moves every proxy whose return_at lies in the past from @over_heated back
# to @available.
#
# @over_heated is always left as an Array (possibly empty), so the next call
# can iterate it again.
#
# @return [Array] the updated @available pool
def revival
  now = Time.now.to_f
  cooled_down, still_hot = @over_heated.partition { |proxy| proxy.return_at < now }

  @over_heated = still_hot
  @available.concat(cooled_down)
end
sort() click to toggle source
# File lib/proxy_rotater.rb, line 147
# Reorders @available in place by ascending last_response_time, so the
# lowest indices hold the proxies with the smallest response time.
#
# @return [Array] the sorted @available pool
def sort
  # sort_by! mutates the pool; plain sort_by only returned a sorted copy
  # that was thrown away.
  @available.sort_by! { |proxy| proxy.last_response_time }
end