class PageByPage::Fetch

Public Class Methods

new(opt = {}, &block) click to toggle source
Calls superclass method PageByPage::Common::new
# File lib/page_by_page/fetch.rb, line 11
# Sets the paging defaults (start at page 1, advance by 1, no upper bound),
# lets the superclass initializer run with the same arguments and block, and
# then builds the page-number source from the resulting options.
#
# When @threads has been defined by the time the superclass initializer
# returns, the page-number source is wrapped in a MutexEnum.
def initialize(opt = {}, &block)
  @from = 1
  @step = 1
  @to = Float::INFINITY
  super
  page_numbers = Enum.new(enum_options)
  @enum = defined?(@threads) ? MutexEnum.new(page_numbers) : page_numbers
end

Public Instance Methods

enumerator(e) click to toggle source
# File lib/page_by_page/fetch.rb, line 34
# Stores +e+ in @enumerator, which is later handed to Enum through
# enum_options under the :enumerator key. Returns +e+.
def enumerator(e)
  @enumerator = e
end
from(n) click to toggle source
# File lib/page_by_page/fetch.rb, line 22
# Stores +n+ in @from, which is passed to Enum as the :from option.
# Returns +n+.
def from(n)
  @from = n
end
iterator() click to toggle source
# File lib/page_by_page/fetch.rb, line 47
# Returns an Enumerator over individual items, in the order items_enum
# produces them, with the page numbers dropped. Nothing is pulled from
# items_enum until the returned Enumerator is iterated.
def iterator
  Enumerator.new do |out|
    items_enum.each do |_page, page_items|
      page_items.each { |item| out << item }
    end
  end
end
process() click to toggle source
# File lib/page_by_page/fetch.rb, line 38
# Fetches every page and returns all items as one flat Array ordered by page
# number.
#
# Uses parallel_fetch when @threads has been defined, _fetch otherwise.
# Pages whose item list is nil are skipped. When @progress is set, a newline
# is printed once fetching has finished.
def process
  pages = defined?(@threads) ? parallel_fetch : _fetch
  puts if @progress

  pages.sort.each_with_object([]) do |(_page, items), collected|
    collected.concat(items) unless items.nil?
  end
end
step(n) click to toggle source
# File lib/page_by_page/fetch.rb, line 26
# Stores +n+ in @step, which is passed to Enum as the :step option.
# Returns +n+.
def step(n)
  @step = n
end
threads(n) click to toggle source
# File lib/page_by_page/fetch.rb, line 30
# Stores +n+ in @threads; parallel_fetch starts that many threads.
# Defining @threads at all is what switches process over to parallel_fetch.
# Returns +n+.
def threads(n)
  @threads = n
end
url(tmpl) click to toggle source
# File lib/page_by_page/fetch.rb, line 18
# Compiles the URL template string +tmpl+ with ERB and keeps the compiled
# template in @tmpl; items_enum renders it once per page.
# Returns the ERB instance.
def url(tmpl)
  @tmpl = ERB.new(tmpl)
end

Protected Instance Methods

_fetch() click to toggle source
# File lib/page_by_page/fetch.rb, line 59
# Drains items_enum and returns a Hash mapping each page number to the items
# found on that page. A page number yielded more than once keeps its last
# item list.
def _fetch
  items_enum.each_with_object({}) do |(page_num, items), pages|
    pages[page_num] = items
  end
end
enum_options() click to toggle source
# File lib/page_by_page/fetch.rb, line 101
# Collects the settings used to construct Enum: the :from and :step values,
# the result of limit, and the optional custom enumerator.
def enum_options
  options = { from: @from, step: @step }
  options[:limit] = limit
  options[:enumerator] = @enumerator
  options
end
items_enum() click to toggle source
# File lib/page_by_page/fetch.rb, line 69
# Builds an Enumerator that fetches pages one at a time and yields
# (page_number, items) for each page. No page is fetched until the
# Enumerator is iterated.
#
# Iteration stops when @enum.next returns nil, when a fetched page selects no
# items (that empty page is still yielded before the loop ends), or when
# :no_more is thrown from inside the loop.
def items_enum
  Enumerator.new do |yielder|
    # Non-empty placeholder so the `until` condition lets the first pass run.
    items = [nil]
    # NOTE(review): :no_more is presumably thrown by @enum.next or parse once
    # the pages run out - confirm where the throw lives.
    catch :no_more do
      until items.empty?
        n = @enum.next
        break if n.nil?

        # The template is rendered against this block's binding, so it can read
        # the local variables here. Presumably templates reference `n` as the
        # page number, in which case `n` must keep its name - verify.
        url = @tmpl.result binding
        doc = parse url
        items = doc.css @selector
        yielder.yield(n, items)

        # Progress reporting and the optional pause happen after the page has
        # been handed to the consumer.
        update_progress Thread.current, n if @progress
        sleep @interval if @interval
      end
    end
  end
end
parallel_fetch() click to toggle source
# File lib/page_by_page/fetch.rb, line 89
# Runs _fetch in @threads threads and merges the per-thread page Hashes into
# one Hash, visiting the threads in the order they were started.
#
# Thread#value waits for the thread to finish and re-raises any exception
# that terminated it, so a failing worker surfaces here.
def parallel_fetch
  workers = @threads.times.map { Thread.new { _fetch } }
  workers.reduce({}) { |pages, worker| pages.merge!(worker.value) }
end