class PageByPage::Jump
Public Instance Methods
iterate(selector)
click to toggle source
# File lib/page_by_page/jump.rb, line 12
#
# Stores the CSS selector that #process passes to `at_css` to locate the
# "next page" link on each parsed page.
#
# @param selector [String] CSS selector for the next-page link
# @return [String] the selector that was stored
def iterate(selector)
  @iterate = selector
end
process()
click to toggle source
# File lib/page_by_page/jump.rb, line 16
#
# Follows "next page" links starting at @start, collecting every node that
# matches @selector on each page, and returns them in page order.
#
# Paging stops when any of the following holds:
# * the number of pages visited reaches `limit`,
# * the page has no element matching @iterate,
# * the matched element has no usable `href` (nil or empty).
#
# NOTE(review): `parse`, `limit`, `update_progress` and @selector are not
# defined in this block; they are presumably supplied by the including
# class/module - confirm against the rest of the library.
#
# @return [Array] all matched items from every visited page
def process
  url = @start
  items = []
  page_count = 0

  loop do
    doc = parse(url)
    items.concat(doc.css(@selector).to_a)
    page_count += 1
    update_progress(Thread.current, page_count) if @progress
    break if page_count >= limit

    next_link = doc.at_css(@iterate)
    break unless next_link

    path = next_link.attr('href')
    # A link element without an href gives us nowhere to go; stop instead of
    # raising NoMethodError on nil.
    break if path.nil? || path.empty?

    # Root-relative hrefs are resolved against the start URL's host; any
    # other href is used as the next URL unchanged.
    url = path.start_with?('/') ? concat_host(path) : path
    sleep @interval if @interval
  end

  # Emits a bare newline when progress reporting is enabled.
  puts if @progress
  items
end
start(url)
click to toggle source
# File lib/page_by_page/jump.rb, line 8
#
# Stores the URL of the first page. #process begins fetching from it and
# #concat_host derives the host prefix from it.
#
# @param url [String] URL of the first page to visit
# @return [String] the URL that was stored
def start(url)
  @start = url
end
Private Instance Methods
concat_host(path)
click to toggle source
# File lib/page_by_page/jump.rb, line 42
#
# Builds a full URL for +path+ by prefixing it with a base derived from
# @start, and records that base in @prefix.
#
# * For a root-relative path (leading "/"), the base is @start with
#   everything from the first single "/" after the host removed,
#   e.g. "http://example.com/list/1" -> "http://example.com".
# * For any other path, the base is @start with only its last "/segment"
#   removed, e.g. "http://example.com/list/1" -> "http://example.com/list".
#
# @param path [String] href taken from a page
# @return [String] the base and +path+ joined with a single "/"
def concat_host(path)
  # The character class skips ":" and "/" so the "//" after the scheme is
  # never treated as the start of the path. The "|" inside the class is a
  # literal pipe, kept as in the original pattern.
  pattern = path.start_with?('/') ? %r{([^:|/])/.*} : %r{(.*[^:|/])/.*}
  @prefix = @start.gsub(pattern, '\1')
  File.join(@prefix, path)
end