class Apify::Core::Fetcher

Attributes

result[RW]
sources[RW]

Public Class Methods

base_url_for(url_or_array, base_url) click to toggle source
# File lib/apify_core/fetcher.rb, line 165
# Makes one URL, or each URL in a collection, absolute by prefixing
# +base_url+ to entries that carry no host of their own.
#
# url_or_array - a single URL, or any object responding to +each+ that
#                yields URLs.
# base_url     - prefix for host-less URLs; may be nil when every URL
#                already has a host.
#
# Returns an Array of Strings for collection input. For a single URL the
# result is the parsed URI object when the URL already has a host, and a
# String (base_url + url) otherwise.
#
# Raises ArgumentError when a URL has no host and +base_url+ is nil.
def base_url_for(url_or_array, base_url)
  # Parses one entry and rejects it when nothing can supply a host.
  parse = lambda do |raw|
    parsed = URI(raw)
    raise ArgumentError, "No host provided." if parsed.host.nil? && base_url.nil?
    parsed
  end

  unless url_or_array.respond_to?(:each)
    uri = parse.call(url_or_array)
    return uri.host ? uri : "#{base_url}#{uri}"
  end

  absolute = []
  url_or_array.each do |entry|
    uri = parse.call(entry)
    absolute << (uri.host ? uri.to_s : "#{base_url}#{uri}")
  end
  absolute
end
download( url, method=:normal ) click to toggle source
# File lib/apify_core/fetcher.rb, line 130
# Fetches the HTML of a single page.
#
# url    - address to fetch.
# method - :js loads the page in a Watir browser running inside a Headless
#          display and returns the browser's HTML; :normal (the default)
#          issues a plain GET through RestClient with browser-like headers.
#
# Prints "+" after a successful fetch and "-" when a rescued request error
# occurs.
#
# Returns the HTML String, or nil when the request failed with one of the
# rescued errors or when +method+ is neither :js nor :normal.
def download( url, method=:normal )
  # NOTE(review): perform calls this via self.class, so @delay here is not the
  # instance variable assigned in initialize -- confirm the delay is ever applied.
  sleep @delay if @delay.to_i > 0

  case method
  when :js
    headless = Headless.new
    headless.start
    begin
      browser = Watir::Browser.new
      begin
        browser.goto url
        html = browser.html
      ensure
        # Close the browser even when navigation fails so it does not linger.
        browser.close
      end
    ensure
      # Always tear the virtual display down, including on errors.
      headless.destroy
    end
    print "+"
    html
  when :normal
    begin
      html = RestClient.get(url,
              'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
              'Accept-Language' => 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
              'Connection'      => 'keep-alive',
              'User-Agent'      => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36')
      d = Nokogiri::HTML(html)
      # Charset declared by the last <meta http-equiv="content-type"> tag;
      # any failure while extracting it (no tag, no charset=...) yields nil.
      charset = d.search('meta[@http-equiv="content-type"]').last['content'].match(/charset=(.+)/)[1].downcase rescue nil
      # Re-encode to UTF-8 only when the page declares a different charset.
      if charset.present? && charset != 'utf-8'
        html = html.force_encoding(charset).encode("utf-8", undef: :replace)
      end
      print "+"
      html
    # NOTE(review): the trailing bare `RestClient` is a module, not an
    # exception class, so it never matches -- confirm which class was intended.
    rescue RestClient::RequestTimeout, RestClient::ResourceNotFound, RestClient::InternalServerError, URI::InvalidURIError, RestClient::Forbidden,RestClient::BadGateway, RestClient
      print "-"
      nil
    end
  end
end
new( pages, processes=2, delay=0 ) click to toggle source
# File lib/apify_core/fetcher.rb, line 7
# Stores the fetcher configuration; no work is done until later calls.
#
# pages     - source definitions, kept as-is in @pages.
# processes - kept in @processes (default 2).
# delay     - kept in @delay (default 0).
def initialize( pages, processes=2, delay=0 )
  @pages, @processes, @delay = pages, processes, delay
end
paginate( opts={} ) click to toggle source

Example: Apify::Core::Fetcher.paginate(url: 'site.com', to_replace: '(/?)$', pagination: '?page=<% 1,5,1 %>')

# File lib/apify_core/fetcher.rb, line 102
# Expands one URL, or a collection of URLs, into a list of paginated URLs.
#
# opts - Hash of options:
#        :url        - a URL or an object responding to +each+ that yields
#                      URLs (required).
#        :pagination - template containing one "<% first,last,step %>"
#                      placeholder (default '?page=<% 1,5,1 %>').
#        :to_replace - regexp source; every match in the URL (after one
#                      trailing "/" is removed) is replaced by the rendered
#                      template (default '(\/?)\Z', i.e. append at the end).
#
# Returns an Array of Strings ordered page by page (all URLs for the first
# page, then all URLs for the next one, ...). When the template has no
# placeholder, returns the :url value wrapped in a one-element Array.
#
# Raises ArgumentError when :url is missing or when the template contains
# more than one placeholder.
def paginate( opts={} )
  template = opts[:pagination] || '?page=<% 1,5,1 %>'
  anchor   = opts[:to_replace] || '(\/?)\Z'
  targets  = opts[:url]
  raise ArgumentError, "URL parameter missing" if targets.nil?

  placeholder = /<%\s?+(\d+,\d+,\d+)\s?+%>/
  found = template.scan(placeholder)
  return [opts[:url]] if found.empty?
  raise ArgumentError, "Only one pagination pattern allowed." if found.count > 1

  first_page, last_page, stride = found.first.first.split(',').map { |part| part.strip.to_i }

  # Swaps the matched part of a single URL for the rendered page suffix.
  substitute = lambda do |target, suffix|
    target.to_s.chomp('/').gsub(Regexp.new(anchor), suffix)
  end

  urls = []
  (first_page..last_page).step(stride).each do |page|
    suffix = template.gsub(placeholder, page.to_s)
    if targets.respond_to?(:each)
      targets.each { |target| urls << substitute.call(target, suffix) }
    else
      urls << substitute.call(targets, suffix)
    end
  end
  urls
end

Public Instance Methods

perform() click to toggle source
# File lib/apify_core/fetcher.rb, line 43
# Resolves derived sources, downloads the pages of every source and, for
# sources that define a :pattern, parses the downloaded HTML.
#
# Side effects: each "@<source>" instance variable is overwritten with that
# source's final pages/parsed data, and the collected results are stored in
# @result and @json.
#
# Returns a Hash keyed by source name holding the parsed output of every
# source that has a :pattern.
def perform
  # Matches a parenthesised, quoted argument such as ("...") or ('...') and
  # captures the text between the quotes.
  parenthesis_args = /\([\"\'](.*?)[\"\']\)/
  self.sources.each do |source|

    # A :from source derives its page list from another source's pages.
    if @pages[source].key?(:from)
      expression = @pages[source][:from]
      # The expression is expected to contain select("..."), from("...") and
      # optionally filter("a|b|..."); a missing or malformed filter yields nil.
      statement = {
        select:   expression.match(/select#{parenthesis_args}/)[1],
        from:     expression.match(/from#{parenthesis_args}/)[1],
        filters:  (expression.match(/filter#{parenthesis_args}/)[1].split('|').map(&:strip) rescue nil),
      }
      # Default filters used when the expression names none.
      statement[:filters] = ['mapattr_href', 'map_urlencode'] unless statement[:filters].present?
      # statement[:from] is the name of another source; its accessor is called
      # and its :pages are handed to Parser.fetch together with the selector.
      # NOTE(review): presumably that source must already have been downloaded
      # earlier in this loop -- confirm ordering requirements with callers.
      urls = Filter.apply(Parser.fetch(statement[:select], self.send(statement[:from])[:pages]), statement[:filters])
      @pages[source][:pages] = self.class.base_url_for(urls, @pages[source][:host])
    end
    src = self.send(source)
    method = src[:js] ? :js : :normal
    # JS sources are fetched in a single process; others use @processes.
    processes_number = (method == :js ? 1 : @processes )
    # Each entry may be one URL or a collection of URLs; collections are
    # downloaded sequentially inside one worker and flattened afterwards.
    src[:pages] = ::Parallel.map(src[:pages], in_processes: processes_number) do |url_or_array|
      if url_or_array.respond_to?(:each)
        res = []
        url_or_array.each do |url|

          res << self.class.download( url, method )
        end
        res
      else
        self.class.download( url_or_array, method )
      end
    end.flatten
  end

  result = {}
  self.sources.each do |source|
    src = self.send(source)
    # Only the truthiness of this value is used below.
    pattern = src[:pattern] ? src[:pattern].dup : false
    # With a pattern every downloaded page is parsed; without one the
    # downloaded pages are kept unchanged.
    elem = if src[:pattern]
      src[:pages].map{ |html| Parser.new(html, src[:pattern]).perform }
    else

      src[:pages]
    end

    # Sources without a pattern are left out of the returned results.
    result[source] = elem if pattern
    # Replaces the source's configuration hash in "@<source>" with its data.
    instance_variable_set("@#{source}".to_sym, elem )
  end

  @result = result
  # Cached so that to_json does not trigger another run.
  @json = result
end
prepare() click to toggle source
# File lib/apify_core/fetcher.rb, line 13
# Turns the raw source definitions in @pages into per-source instance
# variables and records the source names.
#
# For every source this method:
# * replaces its :url with the result of the 'map_urlencode' filter,
# * defines an attr_reader named after the source on the class,
# * stores in "@<source>" either the definition itself (when it has :from,
#   so its pages are resolved later) or a Hash with :pages, :js, :pattern
#   and :host, where :pages is the paginated URL list when :paginate is
#   given and a one-element Array holding the resolved URL otherwise.
#
# Finally assigns the source names to +sources+.
def prepare
  @pages.each do |name, config|
    config[:url] = Filter.apply(config[:url], ['map_urlencode'])
    self.class.send(:attr_reader, name.to_sym)
    ivar = "@#{name}".to_sym

    # Derived sources keep their definition untouched.
    if config[:from]
      instance_variable_set(ivar, config)
      next
    end

    resolved = self.class.base_url_for(config[:url], config[:host])
    pages = if config[:paginate]
      to_replace, pagination = config[:paginate][0], config[:paginate][1]
      self.class.paginate(url: resolved, to_replace: to_replace, pagination: pagination)
    else
      # A single resolved URL may be a URI object; collections stay as they are.
      [resolved.respond_to?(:each) ? resolved : resolved.to_s]
    end

    instance_variable_set(ivar, {
      pages:   pages,
      js:      config[:js] || false,
      pattern: config[:pattern],
      host:    config[:host]
    })
  end
  self.sources = @pages.keys
end
to_json() click to toggle source
# File lib/apify_core/fetcher.rb, line 94
# Serializes the fetch results as JSON.
#
# Uses the results cached in @json, running #perform first when nothing has
# been cached yet.
#
# args - optional generator state/options, forwarded unchanged to the
#        results' own to_json. JSON generators pass such an argument when
#        this object is serialized as part of a larger structure, so a
#        zero-argument signature would raise ArgumentError there.
#
# Returns a JSON String.
def to_json(*args)
  (@json || perform).to_json(*args)
end