class PageByPage::Jump

Public Instance Methods

iterate(selector)
# File lib/page_by_page/jump.rb, line 12
# Stores the selector that +process+ uses to find the element whose
# href leads to the next page.
#
# Returns the selector that was given.
def iterate(selector)
  @iterate = selector
end
process()
# File lib/page_by_page/jump.rb, line 16
# Walks pages one after another, beginning at @start, and gathers every
# node matching @selector from each parsed page.
#
# The walk ends when any of the following holds:
# * the number of parsed pages has reached +limit+,
# * the current page has no element matching @iterate,
# * that element has no usable href (attribute missing or empty).
#
# An href beginning with '/' is expanded through +concat_host+; any other
# href is handed to +parse+ unchanged.
#
# Returns the Array of collected items.
def process
  url = @start
  items = []
  page_count = 0

  loop do
    doc = parse(url)
    doc.css(@selector).each { |item| items << item }

    page_count += 1
    update_progress(Thread.current, page_count) if @progress
    break if page_count >= limit

    next_link = doc.at_css(@iterate)
    break unless next_link

    # An element without an href (or with an empty one) cannot be followed;
    # treat it as the last page instead of raising NoMethodError on nil.
    path = next_link.attr('href')
    break if path.nil? || path.empty?

    url = path.start_with?('/') ? concat_host(path) : path

    # Pause between page fetches when an interval is configured.
    sleep @interval if @interval
  end

  # Emits a newline after progress reporting (presumably to finish the
  # progress line written by update_progress -- confirm against that helper).
  puts if @progress
  items
end
start(url)
# File lib/page_by_page/jump.rb, line 8
# Stores the URL of the first page; +process+ begins its walk there and
# +concat_host+ derives the host prefix from it.
#
# Returns the URL that was given.
def start(url)
  @start = url
end

Private Instance Methods

concat_host(path)
# File lib/page_by_page/jump.rb, line 42
# Builds a full URL for +path+ by joining it onto a prefix cut from @start.
#
# * When +path+ starts with '/', everything in @start from the first
#   single '/' that follows a character other than ':', '|' or '/' is
#   dropped (for "http://host/a/b" that leaves "http://host").
# * Otherwise only the last '/'-separated segment of @start is dropped
#   (for "http://host/a/b" that leaves "http://host/a").
#
# The computed prefix is kept in @prefix. Returns the joined String.
def concat_host(path)
  pattern =
    if path.start_with?('/')
      /([^:|\/])\/.*/
    else
      /(.*[^:|\/])\/.*/
    end

  @prefix = @start.gsub(pattern, '\1')
  File.join(@prefix, path)
end