class LogStash::Inputs::Crawler

Public Instance Methods

get_page_content(url) click to toggle source
# File lib/logstash/inputs/crawler.rb, line 87
# Fetch the body of +url+ over HTTP(S), following redirects.
#
# +url+   - String URL to fetch.
# +limit+ - maximum number of redirects to follow before giving up
#           (new, defaults to 10 so existing callers are unaffected).
#
# Returns the response body String on success, or nil when the response
# is neither a success nor a redirect (or the redirect limit is reached).
def get_page_content url, limit = 10
      uri = URI(url)
      request = Net::HTTP::Get.new(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      # Need to enable use of SSL if the URL protocol is HTTPS
      http.use_ssl = (uri.scheme == "https")
      response = http.request(request)
      # Check if URL needs to be forwarded because of redirect.
      # NOTE: the original `when Net::HTTPMovedPermanently || Net::HTTPRedirection`
      # evaluated the `||` eagerly, so only 301 responses were followed;
      # Net::HTTPRedirection covers 301/302/303/307/308.
      case response
              when Net::HTTPSuccess
                      return response.body
              when Net::HTTPRedirection
                      # Guard against redirect loops; stop after +limit+ hops.
                      get_page_content(response['location'], limit - 1) if limit > 0
      end
end
get_urls_for_page(url,queue) click to toggle source
# File lib/logstash/inputs/crawler.rb, line 57
 # Recursively crawl +url+: emit one LogStash::Event per newly discovered
 # absolute http/https link, then descend into that link.
 #
 # +url+   - String page URL to scan for links.
 # +queue+ - Logstash event queue; discovered pages are pushed onto it.
 #
 # Returns @urls, the accumulated list of visited URLs (bounded by @url_max).
 def get_urls_for_page(url,queue)
      page_content = get_page_content url
      # get_page_content returns nil on non-success responses; the original
      # would then crash on nil.scan. Treat it as "no links here".
      return @urls if page_content.nil?
      # Regex to get all absolute http/https "links" in the page
      urlsa = page_content.scan(/\<a href\=(\"(http|https)\:.*?\")/)
      urlsa.each { |u|
              sanitized_url = u.first.gsub(/\"/, '').strip
              # Skip already-seen URLs and stop adding once the cap is hit.
              next if @urls.include?(sanitized_url) || @urls.length > @url_max
              @urls.push(sanitized_url)
              # If an unexpected error happens while fetching/emitting this URL,
              # move on to the next one. The fetch itself is now inside the
              # rescue (the original left @agent.get outside it, so a fetch
              # failure aborted the whole crawl), and we rescue StandardError
              # rather than Exception so shutdown signals still propagate.
              begin
                      pagina = @agent.get(sanitized_url)
                      evento = LogStash::Event.new("link" => sanitized_url , "contenido" => pagina.body)
                      decorate(evento)
                      queue << evento
                      get_urls_for_page(sanitized_url,queue)
              rescue StandardError
                      next
              end
      }
      return @urls
 end
register() click to toggle source
# File lib/logstash/inputs/crawler.rb, line 25
# Plugin lifecycle hook: set up crawl state before run(queue) starts.
def register
 # URLs already emitted — used for de-duplication, capped via @url_max.
 @urls = []
 # Mechanize client used to fetch page bodies for events.
 # NOTE(review): VERIFY_NONE accepts any TLS certificate; presumably this
 # is to allow self-signed crawl targets — consider making it configurable.
 @agent = Mechanize.new.tap do |agent|
   agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
 end
end
run(queue) click to toggle source
# File lib/logstash/inputs/crawler.rb, line 32
# Main plugin loop: crawl, then sleep @interval seconds, until shutdown.
# The sleep is interruptible so a stop request takes effect promptly.
def run(queue)
  until stop?
    start_crawl(queue)
    Stud.stoppable_sleep(@interval) { stop? }
  end
end
start_crawl(queue) click to toggle source
# File lib/logstash/inputs/crawler.rb, line 49
# Run one crawl pass from the configured root @url, pushing events onto
# +queue+. A crawl-level failure is reported but must not kill the run loop.
def start_crawl(queue)
     get_urls_for_page(@url,queue)
# Rescue StandardError, not Exception: rescuing Exception would also
# swallow SignalException/SystemExit and block Logstash shutdown.
rescue StandardError
     puts "FALLO DE CRAWL"
end
stop() click to toggle source
# File lib/logstash/inputs/crawler.rb, line 41
def stop
 # Nothing to clean up for this plugin, so stop is intentionally a no-op.
 # Examples of common "stop" tasks:
 #  * close sockets (unblocking blocking reads/accepts)
 #  * cleanup temporary files
 #  * terminate spawned threads
end