require "logstash/inputs/base"
require "logstash/namespace"
require "stud/interval"
require "mechanize"
require "set"
# Logstash input that crawls a web site breadth-first, starting at `url`.
#
# Every `interval` seconds a fresh crawl is performed. For each page that
# could be fetched one event with a "link" field is emitted, followed by a
# single summary event whose "numero_de_links" field holds the page count.
class LogStash::Inputs::Crawler < LogStash::Inputs::Base
  config_name "crawler"

  # If undefined, Logstash will complain, even if codec is unused.
  default :codec, "plain"

  # The URL at which the crawl starts.
  config :url, :validate => :string, :required => true

  # How deep the crawl goes. The start URL is level 1 and is always fetched;
  # pages on every level below `deep` are fetched as well (so the default of
  # 3 fetches the start page plus the pages it links to).
  config :deep, :validate => :number, :default => 3

  # Seconds to wait between two crawls (handed to Stud.stoppable_sleep).
  config :interval, :validate => :number, :default => 86400

  # Only absolute http(s) links are followed. Anchored with \A so that a
  # multi-line href cannot match on a later line.
  HTTP_LINK = %r{\Ahttps?://}

  public

  # Builds the HTTP agent. Crawl state is NOT created here: it is rebuilt at
  # the start of every crawl so that each interval starts from `url` again.
  def register
    @agent = Mechanize.new
    # NOTE(review): certificate verification is disabled, so HTTPS peers are
    # not authenticated. Kept because existing pipelines may rely on it for
    # self-signed hosts; consider making this configurable.
    @agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    @agent.redirection_limit = 500
    @links = []
  end # def register

  # Main plugin loop: crawl, emit the results, sleep, repeat until stopped.
  #
  # @param queue [#<<] the Logstash queue that receives the events
  def run(queue)
    until stop?
      @links = crawl
      # A crawl interrupted by shutdown is incomplete; do not report it.
      break if stop?

      @links.each do |link|
        event = LogStash::Event.new("link" => link)
        decorate(event)
        queue << event
      end

      summary = LogStash::Event.new("numero_de_links" => @links.length)
      decorate(summary)
      queue << summary

      Stud.stoppable_sleep(@interval) { stop? }
    end
  end # def run

  def stop
    # Nothing to release: `run` polls `stop?` between page fetches and
    # Stud.stoppable_sleep wakes up as soon as `stop?` becomes true.
  end

  private

  # Performs one breadth-first crawl starting at `url`.
  #
  # Each queued entry carries its own level, so the depth limit does not rely
  # on counting links. URLs are de-duplicated when they are queued, which
  # keeps the frontier small and guarantees termination.
  #
  # @return [Array<String>] URLs that were fetched successfully, in visit order
  def crawl
    deepest_level = [@deep.to_i - 1, 1].max
    visited = []
    seen = Set.new([@url])
    frontier = [[@url, 1]]

    until frontier.empty? || stop?
      current, level = frontier.shift
      page = fetch(current)
      next if page.nil?

      visited << current
      # Pages on the deepest level are reported but not expanded.
      next if level >= deepest_level

      outgoing_links(page).each do |href|
        # Set#add? returns nil when the URL was already queued or visited.
        frontier << [href, level + 1] if seen.add?(href)
      end
    end

    visited
  end

  # Downloads a single page. A failing URL (HTTP error status, network or
  # TLS failure, redirect limit, malformed URI, ...) is logged and skipped so
  # that one bad link cannot abort the crawl or take the input down.
  #
  # @param url [String]
  # @return [Object, nil] the Mechanize response, or nil when the fetch failed
  def fetch(url)
    @agent.get(url)
  rescue StandardError => e
    logger.warn("Crawler could not fetch page, skipping it",
                :url => url, :error => e.message)
    nil
  end

  # Absolute http(s) hrefs found on +page+. Non-HTML responses (plain files,
  # images, ...) do not respond to +links_with+ and yield no links.
  #
  # @return [Array<String>]
  def outgoing_links(page)
    return [] unless page.respond_to?(:links_with)

    page.links_with(:href => HTTP_LINK).map(&:href)
  end
end # class LogStash::Inputs::Crawler