class Scruber::QueueAdapters::AbstractAdapter::Page

Queue page wrapper

@author Ivan Goncharov

@attr [Object] id ID of the page. Autogenerated if not passed
@attr [String] url URL of the page
@attr [String] method Request method: post, get, head
@attr [String] user_agent Fixed User-Agent for requesting this page
@attr [Hash] headers Headers for requesting this page
@attr [Object] fetcher_agent_id ID of the FetcherAgent assigned to this page
@attr [Object] proxy_id ID of the proxy assigned to this page
@attr [String] response_body Response body
@attr [Integer] response_code Response code
@attr [Hash] response_headers Response headers
@attr [Float] response_total_time Response total time
@attr [Integer] retry_at Minimal timestamp of the next retry
@attr [Integer] fetched_at Download completion timestamp
@attr [Integer] retry_count Number of download attempts
@attr [Integer] max_retry_times Max number of download attempts
@attr [Integer] enqueued_at Timestamp when the page was added to the queue
@attr [String] page_type Page type
@attr [Scruber::QueueAdapters::AbstractAdapter::Page] queue Queue object
@attr [Integer] priority Priority of the page in the queue for the fetcher
@attr [Integer] processed_at Timestamp when the page was processed by the parser
@attr [Hash] options All options

Attributes

body[RW]
enqueued_at[RW]
fetched_at[RW]
fetcher_agent_id[RW]
headers[RW]
id[RW]
max_retry_times[RW]
method[RW]
options[RW]
page_type[RW]
priority[RW]
processed_at[RW]
proxy_id[RW]
queue[RW]
response_body[RW]
response_code[RW]
response_headers[RW]
response_total_time[RW]
retry_at[RW]
retry_count[RW]
url[RW]
user_agent[RW]

Public Class Methods

new(queue, options={}) click to toggle source
# File lib/scruber/queue_adapters/abstract_adapter.rb, line 60
# Builds a page wrapper attached to +queue+.
#
# +options+ is converted with +with_indifferent_access+ and kept in @options.
# Every attribute is then read from it, falling back to a default when the
# key is absent.
#
# @param queue [Object] queue object this page belongs to
# @param options [Hash] page attributes; +:url+ is the only required key
# @raise [RuntimeError] "URL not provided" when +options+ has no +:url+ key
def initialize(queue, options={})
  @queue = queue
  @options = opts = options.with_indifferent_access

  # Request description. The id fallback calls generate_page_id, which reads
  # @options, so @options must already be assigned at this point.
  @id = opts.fetch(:id) { generate_page_id }
  @url = opts.fetch(:url) { raise "URL not provided" }
  @method = opts.fetch(:method, :get)
  @user_agent = opts.fetch(:user_agent, nil)
  @body = opts.fetch(:body, nil)
  @headers = opts.fetch(:headers, {})
  @fetcher_agent_id = opts.fetch(:fetcher_agent_id, nil)
  @proxy_id = opts.fetch(:proxy_id, nil)

  # Response data.
  @response_body = opts.fetch(:response_body, nil)
  @response_code = opts.fetch(:response_code, nil)
  @response_headers = opts.fetch(:response_headers, {})
  @response_total_time = opts.fetch(:response_total_time, nil)

  # Queue bookkeeping; timestamps and counters default to 0.
  @retry_at = opts.fetch(:retry_at, 0)
  @fetched_at = opts.fetch(:fetched_at, 0)
  @retry_count = opts.fetch(:retry_count, 0)
  @max_retry_times = opts.fetch(:max_retry_times, nil)
  @enqueued_at = opts.fetch(:enqueued_at, 0)
  @page_type = opts.fetch(:page_type, :seed)
  @priority = opts.fetch(:priority, 0)
  @processed_at = opts.fetch(:processed_at, 0)

  # false means "not looked up yet" for the lazily resolved agent and proxy.
  @_fetcher_agent = false
  @_proxy = false
  @_redownload = false
end

Public Instance Methods

[](k) click to toggle source
# File lib/scruber/queue_adapters/abstract_adapter.rb, line 147
# Hash-style read access to the page's state.
#
# @param k [#to_s] attribute name, without the leading "@"
# @return [Object, nil] value of the instance variable named "@<k>"
def [](k)
  ivar_name = "@#{k}"
  instance_variable_get(ivar_name)
end
delete() click to toggle source

Delete page from queue

@return [void]

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 155
# Delete page from queue.
#
# This base implementation always raises; a concrete page class is expected
# to provide its own version.
#
# @return [void]
# @raise [NotImplementedError] always
def delete
  # Name the receiver's class so the missing override is easy to locate.
  raise NotImplementedError, "#{self.class}#delete is not implemented"
end
fetcher_agent() click to toggle source

Returns the FetcherAgent assigned to this page

@return [Scruber::Helpers::FetcherAgent] Agent object

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 96
# Returns the FetcherAgent assigned to this page.
#
# The result is cached in @_fetcher_agent. The value +false+ marks "not looked
# up yet", so a nil result (no fetcher_agent_id) is cached too.
#
# @return [Scruber::Helpers::FetcherAgent, nil] agent object, or nil when
#   no fetcher_agent_id is set
def fetcher_agent
  return @_fetcher_agent unless @_fetcher_agent == false

  # An `if` without `else` evaluates to nil when there is no agent id.
  @_fetcher_agent =
    if @fetcher_agent_id
      Scruber::Helpers::FetcherAgent.find(@fetcher_agent_id)
    end
end
processed!() click to toggle source

Mark page as processed by parser and save it

@return [void]

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 163
# Marks the page as processed by the parser and saves it.
#
# Clears the redownload flag, stamps @processed_at with the current Unix time
# in seconds, then calls +save+.
#
# @return [void]
def processed!
  @_redownload = false
  @processed_at = Time.now.to_i
  save
end
proxy() click to toggle source

Returns the proxy assigned to this page

@return [Proxy] proxy object

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 108
# Returns the proxy assigned to this page.
#
# The result is cached in @_proxy. The value +false+ marks "not looked up
# yet", so a nil result (no proxy_id) is cached too.
#
# @return [Proxy, nil] proxy object, or nil when no proxy_id is set
def proxy
  return @_proxy unless @_proxy == false

  # An `if` without `else` evaluates to nil when there is no proxy id.
  @_proxy =
    if @proxy_id
      Scruber::Helpers::ProxyRotator.find(@proxy_id)
    end
end
redownload!(new_retry_count=nil) click to toggle source

Mark page as pending and return to queue

@param new_retry_count [Integer] new retry count. Allows resetting the number of retries

@return [void]

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 175
# Marks the page as pending again and saves it.
#
# Resets @processed_at and @fetched_at to 0, drops the stored response body
# and updates the retry counter before calling +save+.
#
# @param new_retry_count [Integer, nil] value to store as the retry count;
#   when nil the current count is incremented by one
# @return [void]
def redownload!(new_retry_count=nil)
  @_redownload = true

  @processed_at = 0
  # An explicit count (including 0, which is truthy in Ruby) replaces the
  # counter; otherwise this attempt is added to it.
  @retry_count = new_retry_count || @retry_count + 1
  @fetched_at = 0
  @response_body = nil
  save
end
response_cookies() click to toggle source

Returns cookies from response headers

@return [Array] array of cookies

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 120
# Returns cookies from response headers.
#
# Reads the 'Set-Cookie' entry of +response_headers+ (exact key match) and
# always returns an Array.
#
# @return [Array] array of cookies; empty when the header is missing or blank
def response_cookies
  # response_headers can be nil when it was passed explicitly as nil, which
  # bypasses the {} default applied in the constructor.
  headers = response_headers || {}
  cookies = headers['Set-Cookie']
  return [] if cookies.blank?

  # A single header value is wrapped so callers always get an Array.
  cookies.is_a?(Array) ? cookies : [cookies]
end
save() click to toggle source
# File lib/scruber/queue_adapters/abstract_adapter.rb, line 133
# Saves the page.
#
# This base implementation always raises; a concrete page class is expected
# to provide its own version. processed! and redownload! both finish by
# calling this method.
#
# @return [void]
# @raise [NotImplementedError] always
def save
  # Name the receiver's class so the missing override is easy to locate.
  raise NotImplementedError, "#{self.class}#save is not implemented"
end
sent_to_redownload?() click to toggle source

Whether the page has been marked for redownloading

@return [Boolean] true if need to redownload

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 193
# Reports whether the page is currently flagged for redownloading.
#
# The flag starts as false in the constructor, is set to true by redownload!
# and is reset to false by processed!.
#
# @return [Boolean] current value of the @_redownload flag
def sent_to_redownload?
  @_redownload
end
url_join(link_url) click to toggle source

Join the URL of the current page with another path or URL

@param link_url [String] link

@return [String] joined url

# File lib/scruber/queue_adapters/abstract_adapter.rb, line 143
# Resolves +link_url+ against this page's URL.
#
# @param link_url [String] relative path or absolute URL
# @return [String] joined url
def url_join(link_url)
  joined = URI.join(url, link_url)
  joined.to_s
end

Private Instance Methods

generate_page_id() click to toggle source
# File lib/scruber/queue_adapters/abstract_adapter.rb, line 199
# Derives a page id from the request-defining options.
#
# Only the :method, :url, :headers and :body entries actually present in
# @options are used; defaults applied to the instance variables in the
# constructor are not part of the digest input.
#
# @return [String] MD5 hex digest of the JSON-serialized subset of @options
def generate_page_id
  identity = @options.slice(:method, :url, :headers, :body)
  Digest::MD5.hexdigest(identity.to_json)
end