class ProxyRotater
Constants
- CONCURRENT_PROCESS
- PROXY_UPDATE_WAIT
- VERSION
Public Class Methods
new(domain, options = {})
click to toggle source
# File lib/proxy_rotater.rb, line 13
#
# Sets the throttling defaults, creates the empty proxy pools and then
# calls +get_proxies+ to fill them.
#
# @param domain [Object] stored unchanged in @domain
# @param options [Hash] accepted by the signature; not read in this method
def initialize(domain, options = {})
  @domain = domain

  # Throttling defaults.
  @interval_sec = 1
  @req_limit_per_hour = 500
  @request_workers = CONCURRENT_PROCESS

  # Proxy pools and the list of caller-supplied failure predicates all
  # start out as separate empty arrays.
  @available, @over_heated, @failed, @custom_fail = [], [], [], []

  get_proxies
end
Public Instance Methods
add_custom_fail(&block)
click to toggle source
# File lib/proxy_rotater.rb, line 80
#
# Registers a block in @custom_fail. +get+ calls every registered block
# with a response and treats a truthy result as a failed request.
#
# @yieldparam res the response object handed over by +get+
# @return [Array] the @custom_fail list, including the new block
def add_custom_fail(&block)
  @custom_fail.push(block)
end
check()
click to toggle source
# File lib/proxy_rotater.rb, line 71
#
# Runs the pool-maintenance steps in a fixed order: move timed-out proxies
# out, apply the request limits, bring cooled-down proxies back, refill the
# pool if it is smaller than the worker count, wait if it is still empty,
# and finally call +sort+.
#
# @return whatever +sort+ returns
def check
  disable_timeouted
  check_req_limit
  revival

  # Fewer usable proxies than workers: fetch more.
  if @available.size < @request_workers
    get_proxies
  end

  # Still nothing usable: pause before carrying on.
  if @available.empty?
    sleep(PROXY_UPDATE_WAIT)
  end

  sort
end
get(urls, do_retry = true)
click to toggle source
# File lib/proxy_rotater.rb, line 28
#
# Fetches one or more URLs through the proxies in @available, in rounds of
# at most @request_workers parallel requests, and returns the results keyed
# by URL. +check+ is run after every round.
#
# @param urls [Object, Array] a single URL or a list of URLs
# @param do_retry [Boolean] when true, URLs whose fetch produced nil are
#   requested one more time (the retry pass itself does not retry again)
# @return [Hash] url => { body:, response: } on success, url => nil on failure
def get(urls, do_retry = true)
  urls = [urls] unless urls.is_a?(Array)
  # each_slice(0) raises ArgumentError, so an empty list short-circuits.
  return {} if urls.empty?

  retry_url = []
  concurrency = [urls.size, @request_workers].min

  results = urls.each_slice(concurrency).map do |round_url|
    round_result = Parallel.map_with_index(round_url, in_processes: concurrency) do |url, i|
      proxy = @available[i]
      # Fewer proxies than workers: treat this slot as a failed fetch so the
      # URL goes through the normal retry path instead of raising on nil.
      next if proxy.nil?

      res = proxy.get_url(url)
      next if res.nil?

      if @custom_fail.any? { |error_proc| error_proc.call(res) }
        # Flag the proxy that served this response; @available is an Array
        # and has no timeout= of its own.
        # NOTE(review): with in_processes the block may run in a forked
        # worker, in which case this flag would not reach the parent's
        # proxy object - confirm against the Parallel gem's behaviour.
        proxy.timeout = true
        next
      end

      { body: res.body, response: res.response }
    end
    check

    # Pair every URL of the round with its result; nil results are queued
    # for the retry pass but still recorded under their URL.
    round_url.zip(round_result).each_with_object({}) do |(url, result), fetched|
      retry_url << url if result.nil?
      fetched[url] = result
    end
  end

  merged = results.inject({}) { |all, round| all.merge!(round) }
  merged.merge!(get(retry_url, false)) if do_retry && !retry_url.empty?
  merged
end
Private Instance Methods
check_req_limit()
click to toggle source
# File lib/proxy_rotater.rb, line 103
#
# Moves every proxy that is over the per-second or per-hour limit from
# @available to @over_heated and stamps it with the time (return_at) after
# which +revival+ may bring it back.
#
# @return [Array] the updated @over_heated list
def check_req_limit
  proxies = (@available || []).group_by do |proxy|
    if is_per_sec_limit_over?(proxy.get_request_intervals)
      proxy.return_at = Time.now.to_f + @interval_sec
      :disabled
    elsif is_per_hour_limit_over?(proxy.requested_at)
      # 60 * 30 seconds: bench the proxy for half an hour.
      proxy.return_at = Time.now.to_f + 60 * 30
      :disabled
    else
      :available
    end
  end

  # Always replace @available, even when no proxy stayed available;
  # otherwise disabled proxies would remain in both pools.
  @available = proxies.fetch(:available, [])
  @over_heated.concat(proxies.fetch(:disabled, []))
end
disable_timeouted()
click to toggle source
# File lib/proxy_rotater.rb, line 97
#
# Moves every proxy whose +timeout+ flag is set from @available to @failed.
# @available is always left as an Array, even when no proxy survives.
# A proxy whose flag is nil counts as alive.
#
# @return [Array] the updated @failed list
def disable_timeouted
  timed_out, alive = (@available || []).partition(&:timeout)
  @available = alive
  @failed.concat(timed_out)
end
get_proxies()
click to toggle source
# File lib/proxy_rotater.rb, line 85
#
# Builds a Proxy for every entry returned by +get_proxylist_from_html+ and
# adds it to @available, or to @failed when its +timeout+ flag is set.
#
# The new proxies are appended to the existing pools: the candidate list
# already excludes known proxies, so replacing the pools would discard
# every proxy that is currently in use.
#
# @return [Array] the updated @failed list
def get_proxies
  values = get_proxylist_from_html
  proxies = Parallel.map(values, in_processes: values.size) do |value|
    Proxy.new(value)
  end

  dead, alive = proxies.partition(&:timeout)
  @available = (@available || []).concat(alive)
  @failed = (@failed || []).concat(dead)
end
get_proxylist_from_html()
click to toggle source
# File lib/proxy_rotater.rb, line 151
#
# Fetches the candidate list from GetProxy.get and removes, in place, every
# entry whose :ip_address already belongs to a proxy in @available, @failed
# or @over_heated.
#
# @return [Array] the list returned by GetProxy.get, minus known proxies
def get_proxylist_from_html
  list = GetProxy.get

  # Non-bang flatten plus compact: flatten! returns nil when nothing
  # changes, and a pool (or an entry in one) may itself be nil.
  known_ips = [@available, @failed, @over_heated].flatten.compact.map(&:ip_address)

  list.delete_if { |line| known_ips.include?(line[:ip_address]) }
end
is_per_hour_limit_over?(req_times)
click to toggle source
# File lib/proxy_rotater.rb, line 129
#
# Tells whether more than @req_limit_per_hour of the given timestamps fall
# within the last 60 * 60 seconds. Timestamps are counted individually, so
# the result does not depend on the order of +req_times+.
#
# @param req_times [Array<Numeric>] request times as epoch seconds
#   (compared against Time.now.to_f)
# @return [Boolean]
def is_per_hour_limit_over?(req_times)
  one_hour_ago = Time.now.to_f - (60 * 60)
  req_times.count { |timestamp| timestamp > one_hour_ago } > @req_limit_per_hour
end
is_per_sec_limit_over?(req_intervals)
click to toggle source
# File lib/proxy_rotater.rb, line 120
#
# Tells whether the mean of the most recent (up to 10) request intervals
# is greater than @interval_sec.
#
# The mean is taken over exactly the entries in the window, using float
# division so that integer intervals are not truncated.
# NOTE(review): the window is taken from the end of the list on the
# assumption that newer intervals are appended last - confirm against
# Proxy#get_request_intervals.
#
# @param req_intervals [Array<Numeric>] intervals between requests
# @return [Boolean] false when no interval has been recorded yet
def is_per_sec_limit_over?(req_intervals)
  recent = req_intervals.last(10)
  return false if recent.empty?

  average = recent.inject(0.0) { |sum, interval| sum + interval } / recent.size
  average > @interval_sec
end
revival()
click to toggle source
# File lib/proxy_rotater.rb, line 138
#
# Moves every proxy in @over_heated whose return_at lies in the past back
# to @available. @over_heated is always left as an Array, even when every
# proxy is revived.
#
# @return [Array] the updated @available list
def revival
  now = Time.now.to_f
  cooled, still_hot = (@over_heated || []).partition { |proxy| proxy.return_at < now }

  @over_heated = still_hot
  @available = (@available || []).concat(cooled)
end
sort()
click to toggle source
# File lib/proxy_rotater.rb, line 147
#
# Reorders @available in place by ascending last_response_time. The
# in-place sort_by! is required because +check+ discards the return value,
# so a non-mutating sort_by would leave the pool order untouched.
#
# @return [Array] @available, now sorted
def sort
  @available.sort_by!(&:last_response_time)
end