require "logstash/inputs/base"
require "logstash/namespace"
require "stud/interval"
require "mechanize"
require "set"

# Logstash input that crawls a web site breadth-first, starting at `url`.
#
# On every pass it emits one event per successfully fetched page
# ("link" => <url>) followed by one summary event
# ("numero_de_links" => <number of pages>), then sleeps for `interval`
# seconds before crawling again from scratch.
class LogStash::Inputs::Crawler < LogStash::Inputs::Base

  config_name "crawler"

  # If undefined, Logstash will complain, even if codec is unused.
  default :codec, "plain"

  # The URL where the crawl starts.
  config :url, :validate => :string, :required => true

  # How deep the crawl should explore. The start URL is level 1 and the crawl
  # stops when level `deep` is reached, so pages on levels 1 .. deep - 1 are
  # fetched. The start URL itself is always fetched.
  config :deep, :validate => :number, :default => 3

  # Time to wait between two crawls; handed to Stud.stoppable_sleep.
  config :interval, :validate => :number, :default => 86400

  public
  # Creates the HTTP agent shared by all crawls.
  def register
    @links = []
    @agent = Mechanize.new
    # NOTE(review): TLS certificate verification is switched off here, which
    # allows man-in-the-middle tampering. Kept because existing deployments may
    # rely on it for self-signed hosts -- confirm whether this is still wanted.
    @agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    @agent.redirection_limit = 500
  end # def register

  # Crawls, publishes the result and sleeps, until the plugin is stopped.
  #
  # queue - the Logstash queue that receives the generated events.
  def run(queue)
    # we can abort the loop if stop? becomes true
    until stop?
      links = crawl_links
      # Do not publish a partial result when shutdown interrupted the crawl.
      break if stop?

      links.each { |link| push_event(queue, "link" => link) }
      push_event(queue, "numero_de_links" => links.length)

      Stud.stoppable_sleep(@interval) { stop? }
    end # loop
  end # def run

  def stop
    # nothing to do in this case so it is not necessary to define stop
    # examples of common "stop" tasks:
    #  * close sockets (unblocking blocking reads/accepts)
    #  * cleanup temporary files
    #  * terminate spawned threads
  end

  private

  # Runs one breadth-first crawl from @url.
  #
  # Every queued URL carries its own level, so the depth limit is exact and
  # does not depend on a fixed-size counter table. All crawl state is local to
  # this call, which makes each pass start from a clean slate.
  #
  # Returns the URLs that were fetched successfully, in visiting order (also
  # stored in @links).
  def crawl_links
    # Levels 1 .. deep - 1 are fetched; the start URL is fetched regardless.
    last_level = [@deep.to_i - 1, 1].max
    seen = Set.new([@url])
    pending = [[@url, 1]]
    @links = []

    until pending.empty? || stop?
      url, level = pending.shift
      page = fetch_page(url)
      next if page.nil?

      @links << url
      # Links found on the deepest level would never be fetched, so skip them.
      next if level >= last_level
      # Non-HTML responses (downloads, images, ...) expose no link list.
      next unless page.respond_to?(:links_with)

      page.links_with(:href => /^https?/).each do |link|
        href = link.href
        # Set#add? returns nil for an already known URL.
        pending << [href, level + 1] if seen.add?(href)
      end
    end

    @links
  end

  # Fetches a single URL.
  #
  # A page that cannot be fetched (HTTP error status, network failure,
  # redirect limit, ...) must not abort the whole crawl, so the error is
  # logged and the page is skipped.
  #
  # Returns the fetched page, or nil when the request failed.
  def fetch_page(url)
    @agent.get(url)
  rescue StandardError => e
    @logger.warn("Crawler could not fetch page", :url => url, :error => e.message) if @logger
    nil
  end

  # Builds an event from +fields+, decorates it and pushes it onto +queue+.
  def push_event(queue, fields)
    event = LogStash::Event.new(fields)
    decorate(event)
    queue << event
  end

end # class LogStash::Inputs::Crawler