module ListSpider
Constants
- DEFAULT_CONCURRNET_MAX
- DEFAULT_INTERVAL
- NO_LIMIT_CONCURRENT
- RANDOM_TIME
- VERSION
Attributes
save_file[RW]
Public Class Methods
add_task(task)
click to toggle source
# File lib/list_spider.rb, line 147
# Queues additional work, accepting either a list of tasks or a single task.
#
# @param task [Array, TaskStruct] an Array is handed to filter_list as-is;
#   a single TaskStruct is wrapped in a one-element Array first.
#   Any other type is reported on stdout and otherwise ignored.
def add_task(task)
  case task
  when Array
    filter_list(task)
  when TaskStruct
    filter_list([task])
  else
    puts "error task type:#{task.class}"
  end
end
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
click to toggle source
# File lib/list_spider.rb, line 126
# Filters and queues +down_list+, resets the counters and starts the
# EventMachine-driven download loop.
#
# @param down_list [Array] tasks passed to filter_list
# @param interval [Numeric, Range] pause between batches; a Range is stored in
#   @random_time_range and the interval itself becomes RANDOM_TIME
# @param max [Integer] batch size; NO_LIMIT_CONCURRENT means "everything that
#   is queued after filtering"
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  effective_interval = interval
  if interval.is_a?(Range)
    @random_time_range = interval
    effective_interval = RANDOM_TIME
  end

  filter_list(down_list)

  @interval = effective_interval
  # The queue size is only known once filter_list has run.
  @max = max == NO_LIMIT_CONCURRENT ? @down_list.size : max
  @succeed_size = 0
  @failed_size = 0

  puts "total size:#{@down_list.size}"
  event_machine_start_list(next_task, method(:complete))
end
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
click to toggle source
# File lib/list_spider.rb, line 143
# Convenience wrapper around get_list for a single task.
#
# @param task [Object] the task to download; wrapped in a one-element Array
# @param interval [Numeric, Range] forwarded unchanged to get_list
# @param max [Integer] forwarded unchanged to get_list
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
  single_task_list = [task]
  get_list(single_task_list, interval: interval, max: max)
end
stop()
click to toggle source
# File lib/list_spider.rb, line 157
# Public entry point for halting the spider; delegates to stop_machine.
def stop
  stop_machine
end
Private Class Methods
call_parse_method(task_struct)
click to toggle source
# File lib/list_spider.rb, line 246
# Invokes the task's parse_method, if it has one, with the task itself.
#
# @param task_struct [#parse_method] the task whose parser should run
# @return [Object, nil] whatever the parser returns, or nil when the task
#   has no parse_method
def call_parse_method(task_struct)
  return unless task_struct.parse_method

  task_struct.parse_method.call(task_struct)
end
complete(_multi, success_list, failed_list)
click to toggle source
# File lib/list_spider.rb, line 250
# Batch-completion callback: accumulates the results of the finished batch
# and either stops the machine or schedules the next batch.
#
# @param _multi [Object] unused
# @param success_list [Array] tasks that succeeded in the finished batch
# @param failed_list [Array] tasks that failed in the finished batch
def complete(_multi, success_list, failed_list)
  @succeed_size += success_list.size
  @failed_size += failed_list.size
  @succeed_list.concat(success_list)
  @failed_list.concat(failed_list)

  todo = next_task
  return stop_machine if todo.empty?

  # Only pause when an interval is configured and the batch reported results.
  batch_had_results = !(success_list.empty? && failed_list.empty?)
  if @interval != 0 && batch_had_results
    pause = @interval == RANDOM_TIME ? rand(@random_time_range) : @interval
    sleep(pause)
  end

  event_machine_down(todo, method(:complete))
end
event_machine_down(link_struct_list, callback = nil)
click to toggle source
# File lib/list_spider.rb, line 163
# Issues one batch of HTTP requests and wires up their callbacks.
#
# For every task an EventMachine::HttpRequest is created from the task's
# href and connection_options and dispatched with its http_method and
# request_options. The request object is stored on the task, and all
# requests are collected in one EventMachine::MultiRequest.
#
# On a 200 response with @save_file set, the body is written to the task's
# local_path (converted through SpiderHelper.to_utf8 when @convert_to_utf8
# is true), the task's parse method is run and the task is recorded as
# succeeded. Tasks whose request errors out are recorded as failed.
#
# @param link_struct_list [Array] tasks to download in this batch
# @param callback [#call, nil] called as callback.(multi, succeed_list,
#   failed_list) when the whole batch has finished; when nil the machine is
#   stopped instead
# @return [Object] whatever MultiRequest#callback returns
def event_machine_down(link_struct_list, callback = nil)
  failed_list = []
  succeed_list = []
  multi = EventMachine::MultiRequest.new
  begin_time = Time.now

  link_struct_list.each do |task_struct|
    http_req = EventMachine::HttpRequest
               .new(task_struct.href, task_struct.connection_options)
               .public_send(task_struct.http_method, task_struct.request_options)

    # The chunk handler lives on the task; a bare `stream_callback` is not
    # defined in this scope and raised NameError on the first chunk.
    if task_struct.stream_callback
      http_req.stream { |chunk| task_struct.stream_callback.call(chunk) }
    end
    task_struct.request_object = http_req

    http_req.callback do
      s = http_req.response_header.status
      puts "#{Time.now}, http status code: #{s}"

      if s == 200 && @save_file
        local_dir = File.dirname(task_struct.local_path)
        FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
        begin
          File.open(task_struct.local_path, 'wb') do |f|
            f << if @convert_to_utf8 == true
                   SpiderHelper.to_utf8(http_req.response)
                 else
                   http_req.response
                 end
          end
          call_parse_method(task_struct)
          succeed_list << task_struct
        rescue StandardError => exception
          # Best effort: a failed write/parse is reported but does not abort the batch.
          puts exception
        end
      end

      task_struct.callback.call(task_struct, http_req) if task_struct.callback
    end

    http_req.errback do
      puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
      # Record the failure before invoking user code so the batch result is
      # correct even if the task's errback raises. Previously nothing was ever
      # added to failed_list, so the failed counter always stayed at zero.
      failed_list << task_struct
      task_struct.errback.call(task_struct, http_req) if task_struct.errback
    end

    begin
      # MultiRequest needs a name per request: the local path when saving,
      # otherwise a random UUID.
      request_name = @save_file ? task_struct.local_path : SecureRandom.uuid
      multi.add request_name, http_req
    rescue StandardError => exception
      puts exception
      puts task_struct.href
      puts task_struct.local_path
      stop_machine
    end
  end

  multi.callback do
    end_time = Time.now
    puts "use time:#{end_time - begin_time} seconds"
    if callback.nil?
      stop_machine
    else
      callback.call(multi, succeed_list, failed_list)
    end
  end
end
event_machine_start_list(down_list, callback = nil)
click to toggle source
# File lib/list_spider.rb, line 274
# Starts the EventMachine reactor, resets the per-run result lists and the
# start time, and kicks off the first batch.
#
# @param down_list [Array] first batch of tasks
# @param callback [#call, nil] batch-completion callback; with an empty
#   batch it is invoked immediately as callback.(nil, [], []), and when it is
#   nil the machine is stopped instead
def event_machine_start_list(down_list, callback = nil)
  EventMachine.run do
    @succeed_list = []
    @failed_list = []
    @begin_time = Time.now

    if !down_list.empty?
      event_machine_down(down_list, callback)
    elsif callback
      callback.call(nil, [], [])
    else
      stop_machine
    end
  end
end
filter_list(down_list)
click to toggle source
# File lib/list_spider.rb, line 291
# Appends the tasks that still need downloading to @down_list.
#
# When @save_file is set:
# * a task whose local_path already exists on disk (and which does not ask
#   for overwrite_exist) is not downloaded again; only its parse method runs;
# * otherwise the task is queued unless another task with the same
#   local_path was already registered in @local_path_set.
#
# When @save_file is not set there is no file to check or de-duplicate
# against, so every task is queued as-is. Previously the method returned
# early in that mode and silently dropped all tasks.
#
# @param down_list [Array] candidate tasks
def filter_list(down_list)
  unless @save_file
    @down_list.concat(down_list)
    return
  end

  down_list.each do |ts|
    if !ts.overwrite_exist && File.exist?(ts.local_path)
      call_parse_method(ts)
    elsif @local_path_set.add?(ts.local_path)
      @down_list << ts
    end
  end
end
next_task()
click to toggle source
# File lib/list_spider.rb, line 242
# Removes and returns the next batch from the front of @down_list.
#
# @return [Array] at most @max tasks; empty once the queue is exhausted
def next_task
  batch_size = @max
  @down_list.shift(batch_size)
end
stop_machine()
click to toggle source
# File lib/list_spider.rb, line 233
# Prints the run summary (success/failure counts and total elapsed time),
# stops the EventMachine reactor and clears the set of registered local paths.
def stop_machine
  puts "success size:#{@succeed_size}", "failed size:#{@failed_size}"

  @end_time = Time.now
  elapsed = @end_time - @begin_time
  puts "total use time:#{elapsed} seconds"

  EventMachine.stop
  @local_path_set.clear
end