require "logstash/inputs/base"
require "logstash/namespace"
require "stud/interval"
require "set"
require "uri"
require "nokogiri"
require "open-uri"

class LogStash::Inputs::Crawler < LogStash::Inputs::Base

config_name "crawler"

# If undefined, Logstash will complain, even if codec is unused.
default :codec, "plain"

# The URL to start crawling from.
config :url, :validate => :string, :required => true

# Interval (in seconds) to sleep between crawls, via Stud.stoppable_sleep.
config :interval, :validate => :number, :default => 86400
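
# A minimal example pipeline configuration for this input. The URL and interval
# values below are placeholders for illustration, not values shipped with the
# plugin (the default interval is 86400 seconds):
#
#   input {
#     crawler {
#       url      => "http://example.com"
#       interval => 3600
#     }
#   }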

public
def register
  @seen_pages = Set.new # Keep track of what we've seen
end # def register

def run(queue)
  # we can abort the loop if stop? becomes true
  while !stop?

    # Emit one event per crawled page, containing the page's URL.
    crawl_site(@url) do |page, uri|
      event = LogStash::Event.new("link" => uri.to_s)
      decorate(event)
      queue << event
    end

    # Emit a summary event with the total number of pages explored so far.
    evento = LogStash::Event.new("paginas_exploradas" => @seen_pages.length)
    decorate(evento)
    queue << evento

    Stud.stoppable_sleep(@interval) { stop? }
  end # loop
end # def run

def stop
  # nothing to do in this case so it is not necessary to define stop
  # examples of common "stop" tasks:
  #  * close sockets (unblocking blocking reads/accepts)
  #  * cleanup temporary files
  #  * terminate spawned threads
end
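
# A hedged sketch only: if a future version of this plugin held an open socket
# or a spawned worker thread, stop could look roughly like the commented code
# below. The @server and @worker instance variables are hypothetical and do not
# exist in this plugin.
#
#   def stop
#     @server.close rescue nil   # unblock any thread stuck in accept/read
#     @worker.wakeup if @worker  # let a sleeping worker thread notice stop?
#   end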

def crawl_site(starting_at, &each_page)
  files = %w[png jpeg jpg gif svg txt js css zip gz asp PNG JPEG JPG GIF SVG TXT JS CSS ZIP GZ ASP]
  starting_uri = URI.parse(starting_at)

  crawl_page = ->(page_uri) do              # A re-usable mini-function
    return if stop?                         # Bail out promptly on shutdown
    unless @seen_pages.include?(page_uri)
      @seen_pages << page_uri               # Record that we've seen this
      begin
        doc = Nokogiri::HTML(page_uri.open) # Fetch and parse the page (open-uri)
        each_page.call(doc, page_uri)       # Yield page and URI to the block

        # Find all the links on the page
        hrefs = doc.css('a[href]').map { |a| a['href'] }

        # Make these URIs, throwing out problem ones like mailto:
        uris = hrefs.map { |href| URI.join(page_uri, href) rescue nil }.compact.uniq

        # Pare it down to only those pages that are on the same site
        uris.select! { |uri| uri.host == starting_uri.host }

        # Throw out links to files (this could be more efficient with a regex)
        uris.reject! { |uri| files.any? { |ext| uri.path.end_with?(".#{ext}") } }

        # Remove #foo fragments so that sub-page links aren't differentiated
        uris.each { |uri| uri.fragment = nil }

        # Recursively crawl the child URIs
        uris.each { |uri| crawl_page.call(uri) }

      rescue OpenURI::HTTPError # Guard against 404s and other HTTP errors
        @logger.warn("Skipping invalid link", :url => page_uri.to_s)
      end
    end
  end
  crawl_page.call(starting_uri)   # Kick it all off!
end
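
# Design note: crawl_page calls itself recursively, so a very deep link graph
# grows the Ruby call stack. A hedged alternative sketch (not part of this
# plugin) would drive the crawl from an explicit work list instead, e.g.:
#
#   pending = [starting_uri]
#   until pending.empty? || stop?
#     crawl_one_page(pending.shift)   # crawl_one_page is hypothetical; it would
#   end                               # push newly discovered links onto pending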

end # class LogStash::Inputs::Crawler