module ListSpider

Constants

DEFAULT_CONCURRNET_MAX
DEFAULT_INTERVAL
NO_LIMIT_CONCURRENT
RANDOM_TIME
VERSION

Attributes

save_file[RW]

Public Class Methods

add_task(task) click to toggle source
# File lib/list_spider.rb, line 147
# Queues additional work for the spider.
#
# task - either an Array of tasks, which is handed to filter_list as is, or a
#        single TaskStruct, which is wrapped in a one-element Array first.
#
# Any other argument is not queued; its class is reported on standard output.
def add_task(task)
  case task
  when Array
    filter_list(task)
  when TaskStruct
    filter_list([task])
  else
    puts "error task type:#{task.class}"
  end
end
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX) click to toggle source
# File lib/list_spider.rb, line 126
# Queues +down_list+ through filter_list, resets the success/failure counters
# and starts downloading the first batch.
#
# down_list - tasks to download.
# interval  - pause between batches. A Range is remembered in
#             @random_time_range and @interval is set to RANDOM_TIME instead;
#             any other value is stored in @interval unchanged.
# max       - batch size handed to next_task; NO_LIMIT_CONCURRENT means
#             "everything currently queued".
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  @random_time_range, interval = interval, RANDOM_TIME if interval.is_a?(Range)

  filter_list(down_list)
  @interval = interval
  # The unlimited marker is resolved against the queue as filled by filter_list.
  @max = max == NO_LIMIT_CONCURRENT ? @down_list.size : max
  @succeed_size = 0
  @failed_size = 0

  puts "total size:#{@down_list.size}"
  first_batch = next_task
  event_machine_start_list(first_batch, method(:complete))
end
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX) click to toggle source
# File lib/list_spider.rb, line 143
# Convenience wrapper around get_list for a single task: wraps +task+ in a
# one-element Array and forwards +interval+ and +max+ unchanged.
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  single_task_list = [task]
  get_list(single_task_list, interval: interval, max: max)
end
stop() click to toggle source
# File lib/list_spider.rb, line 157
# Public way to end a crawl: delegates to stop_machine.
def stop
  stop_machine
end

Private Class Methods

call_parse_method(task_struct) click to toggle source
# File lib/list_spider.rb, line 246
# Runs the task's parse method, if it has one, passing the task itself.
#
# Returns whatever the parse method returns, or nil when none is set.
def call_parse_method(task_struct)
  return unless task_struct.parse_method

  task_struct.parse_method.call(task_struct)
end
complete(_multi, success_list, failed_list) click to toggle source
# File lib/list_spider.rb, line 250
# Batch-completion callback: folds the batch results into the running totals,
# then either stops the machine (queue exhausted) or starts the next batch.
#
# _multi       - unused.
# success_list - tasks of the finished batch that succeeded.
# failed_list  - tasks of the finished batch that failed.
def complete(_multi, success_list, failed_list)
  @succeed_size += success_list.size
  @failed_size += failed_list.size
  @succeed_list.concat(success_list)
  @failed_list.concat(failed_list)

  todo = next_task
  return stop_machine if todo.empty?

  # Pause only when an interval is configured and the finished batch actually
  # reported something.
  if @interval != 0 && !(success_list.empty? && failed_list.empty?)
    pause = @interval == RANDOM_TIME ? rand(@random_time_range) : @interval
    sleep(pause)
  end

  event_machine_down(todo, method(:complete))
end
event_machine_down(link_struct_list, callback = nil) click to toggle source
# File lib/list_spider.rb, line 163
# Starts one asynchronous HTTP request per task and groups them in a single
# EventMachine::MultiRequest.
#
# Each request is built from the task's +href+, +connection_options+,
# +http_method+ and +request_options+ and stored in its +request_object+.
# When a task has a +stream_callback+, every received chunk is passed to it.
#
# On a 200 response with @save_file set, the body is written to the task's
# +local_path+ (through SpiderHelper.to_utf8 when @convert_to_utf8 is true),
# the task's parse method is run and the task is reported as succeeded.
# Tasks whose request errors out, or whose body cannot be saved or parsed,
# are reported as failed. Other responses are reported in neither list.
# The task's own +callback+ / +errback+, when present, are invoked with the
# task and the request.
#
# link_struct_list - tasks to download in this batch.
# callback         - optional callable invoked as
#                    callback.call(multi, succeed_list, failed_list) once the
#                    multi request completes; when nil the machine is stopped.
def event_machine_down(link_struct_list, callback = nil)
  failed_list = []
  succeed_list = []
  multi = EventMachine::MultiRequest.new
  begin_time = Time.now

  link_struct_list.each do |task_struct|
    connection = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options)
    http_req = connection.public_send(task_struct.http_method, task_struct.request_options)
    # The chunk handler is an attribute of the task, not a local of this method.
    http_req.stream { |chunk| task_struct.stream_callback.call(chunk) } if task_struct.stream_callback
    task_struct.request_object = http_req

    http_req.callback do
      status = http_req.response_header.status
      puts "#{Time.now}, http status code: #{status}"

      if status == 200 && @save_file
        begin
          # Directory creation is rescued too, so a filesystem error cannot
          # escape the reactor callback.
          FileUtils.mkdir_p(File.dirname(task_struct.local_path))
          File.open(task_struct.local_path, 'wb') do |f|
            f << if @convert_to_utf8 == true
                   SpiderHelper.to_utf8(http_req.response)
                 else
                   http_req.response
                 end
          end
          call_parse_method(task_struct)
          succeed_list << task_struct
        rescue StandardError => exception
          puts exception
          failed_list << task_struct
        end
      end
      task_struct.callback.call(task_struct, http_req) if task_struct.callback
    end

    http_req.errback do
      puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"

      # Record the failure so the completion callback can count it.
      failed_list << task_struct
      task_struct.errback.call(task_struct, http_req) if task_struct.errback
    end

    begin
      # Requests are registered under their local path when saving, otherwise
      # under a random name.
      request_name = @save_file ? task_struct.local_path : SecureRandom.uuid
      multi.add request_name, http_req
    rescue StandardError => exception
      puts exception
      puts task_struct.href
      puts task_struct.local_path
      stop_machine
    end
  end

  multi.callback do
    puts "use time:#{Time.now - begin_time} seconds"
    if callback.nil?
      stop_machine
    else
      callback.call(multi, succeed_list, failed_list)
    end
  end
end
event_machine_start_list(down_list, callback = nil) click to toggle source
# File lib/list_spider.rb, line 274
# Runs the EventMachine reactor for a crawl: resets the accumulated result
# lists, records the start time and kicks off the first batch.
#
# down_list - first batch of tasks.
# callback  - optional batch-completion callable. With an empty first batch
#             it is called immediately as callback.call(nil, [], []); without
#             a callback an empty batch stops the machine right away.
def event_machine_start_list(down_list, callback = nil)
  EventMachine.run do
    @succeed_list = []
    @failed_list = []
    @begin_time = Time.now

    if !down_list.empty?
      event_machine_down(down_list, callback)
    elsif callback
      callback.call(nil, [], [])
    else
      stop_machine
    end
  end
end
filter_list(down_list) click to toggle source
# File lib/list_spider.rb, line 291
# Appends the tasks of +down_list+ that still need downloading to @down_list.
#
# When @save_file is set:
# * a task whose +local_path+ already exists on disk and whose
#   +overwrite_exist+ is falsy is not queued; its parse method is run instead;
# * any other task is queued only the first time its +local_path+ is seen
#   (tracked in @local_path_set).
#
# When @save_file is not set nothing is written to disk, so there is no local
# path to check or de-duplicate on and every task is queued as is.
#
# down_list - tasks to consider.
def filter_list(down_list)
  unless @save_file
    # Without this branch no task would ever be queued in non-saving mode.
    @down_list.concat(down_list)
    return
  end

  down_list.each do |ts|
    if !ts.overwrite_exist && File.exist?(ts.local_path)
      call_parse_method(ts)
    elsif @local_path_set.add?(ts.local_path)
      @down_list << ts
    end
  end
end
next_task() click to toggle source
# File lib/list_spider.rb, line 242
# Removes and returns the next batch from the front of @down_list; the batch
# holds at most @max tasks and is empty once the queue is exhausted.
def next_task
  batch_size = @max
  @down_list.shift(batch_size)
end
stop_machine() click to toggle source
# File lib/list_spider.rb, line 233
# Prints the crawl summary (success count, failure count, elapsed time since
# @begin_time), stops the EventMachine reactor and clears @local_path_set.
def stop_machine
  puts "success size:#{@succeed_size}", "failed size:#{@failed_size}"

  @end_time = Time.now
  elapsed = @end_time - @begin_time
  puts "total use time:#{elapsed} seconds"

  EventMachine.stop
  @local_path_set.clear
end