class TheMask::Socket
Constants
- DEFAULT_OPEN_TIMEOUT
- DEFAULT_READ_TIMEOUT
- FORCE_READ
- GENERAL_TIMEOUT
- MAXIMUM_TRIES
- MINIMUM_PAGE_LENGTH
- MIN_PROXY_RESPONSE_TIME
- RESET_USER_AGENT
- SOCKS_INCREASE_TIMEOUTS
Public Class Methods
new(options = {})
click to toggle source
# File lib/the_mask/socket.rb, line 17 def initialize(options = {}) @proxies = nil @socks_increase_timeouts = options[:socks_increase_timeouts] || SOCKS_INCREASE_TIMEOUTS @timeout = options[:timeout] || GENERAL_TIMEOUT @max_tries = options[:max_tries] || MAXIMUM_TRIES @force = options[:force] || FORCE_READ @min_page_length = options[:min_page_length] || MINIMUM_PAGE_LENGTH @reset_user_agent = options[:reset_ua] || RESET_USER_AGENT @min_proxy_response_time = options[:min_proxy_response_time] || MIN_PROXY_RESPONSE_TIME @agent = Mechanize.new @agent.history.max_size = 0 @open_timeout = options[:open_timeout] || DEFAULT_OPEN_TIMEOUT @read_timeout = options[:read_timeout] || DEFAULT_READ_TIMEOUT if options[:proxies] @proxies = TheMask::ProxyList.new(options[:proxies]) else @proxies = TheMask::ProxyList.new([options[:proxy]]) end @agent.user_agent = TheMask.get_random_user_agent_str unless @reset_user_agent end
Public Instance Methods
open_url(url)
click to toggle source
# File lib/the_mask/socket.rb, line 42 def open_url(url) read_proc = Proc.new do proxy = nil # Selected proxy tries = 0 # Total URL retrieval tries page_data = nil # Retrieved page html data timeout_adjustments = nil # Adjustments to timeouts based on proxy type # Variables for timing the GET request end_time = nil start_time = nil begin tries += 1 if !@force && tries > @max_tries raise "TheMask: maximum tries reached for URL = #{url} after #{tries} tries. Check the availability of the host or your proxy settings." end @agent.user_agent = TheMask.get_random_user_agent_str if @reset_user_agent begin unless @proxies.nil? begin proxy = @proxies.get_proxy if proxy.is_SOCKS? @agent.agent.set_socks proxy.ip, proxy.port timeout_adjustments = calculate_timeouts @socks_increase_timeouts elsif proxy.is_HTTP? if proxy.username && proxy.password @agent.set_proxy proxy.ip, proxy.port, proxy.username, proxy.password else @agent.set_proxy proxy.ip, proxy.port end timeout_adjustments = calculate_timeouts else raise "TheMask: unknown proxy type '#{proxy.type}'." end end end rescue Timeout::ExitException => e # Exception timeout from mechanize @proxies.remove_proxy!(proxy) retry end @agent.open_timeout = timeout_adjustments[:open_timeout] @agent.read_timeout = timeout_adjustments[:read_timeout] Timeout::timeout(timeout_adjustments[:timeout]) do start_time = Time.now page_data = @agent.get url end_time = Time.now end rescue Errno::ETIMEDOUT => e retry rescue Net::HTTP::Persistent::Error => e retry rescue Timeout::Error => e retry rescue SOCKSError => e retry rescue SignalException => e retry rescue Net::HTTPNotFound => e retry rescue URI::InvalidURIError => e retry rescue Mechanize::ResponseCodeError => e retry rescue Net::OpenTimeout => e retry rescue Net::HTTPInternalServerError => e retry rescue retry end unless @min_proxy_response_time.nil? || start_time.nil? || end_time.nil? # Remove proxy from list if response time is longer than the minimum response time provided in options response_time = end_time - start_time @proxies.remove_proxy!(proxy) if response_time > @min_proxy_response_time end page_data end if @force while true data = read_proc.call unless data.nil? || data.body.to_s.empty? || data.body.to_s.length < @min_page_length return data.body end end end read_proc.call.body end
Private Instance Methods
calculate_timeouts(increase_magnitude = 0)
click to toggle source
# File lib/the_mask/socket.rb, line 141 def calculate_timeouts(increase_magnitude = 0) { timeout: (@timeout + (@timeout * increase_magnitude)), open_timeout: (@open_timeout + (@open_timeout * increase_magnitude)), read_timeout: (@read_timeout + (@read_timeout * increase_magnitude)) } end