class LogStash::Inputs::Crawler
Public Instance Methods
get_page_content(url)
click to toggle source
# File lib/logstash/inputs/crawler.rb, line 87
# Downloads the page at +url+ with a plain Net::HTTP GET request and returns
# its body, following HTTP redirects along the way.
#
# @param url [String, URI] absolute URL of the page to fetch
# @param limit [Integer] maximum number of redirects that may be followed
# @return [String, nil] the response body for a 2xx response; nil when the
#   server answers with any other non-redirect status, when a redirect has no
#   Location header, or when the redirect limit is exhausted
def get_page_content(url, limit = 10)
  uri = URI(url)
  request = Net::HTTP::Get.new(uri)
  http = Net::HTTP.new(uri.host, uri.port)
  # SSL has to be switched on explicitly when the URL scheme is HTTPS
  http.use_ssl = (uri.scheme == "https")
  response = http.request(request)

  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPRedirection
    # Net::HTTPRedirection is the parent class of every 3xx response
    # (301, 302, 303, 307, 308, ...), so a single branch covers them all.
    location = response['location']
    # 304 Not Modified carries no Location; the limit stops redirect loops.
    return if location.nil? || limit <= 0

    # Location may be relative, so resolve it against the current URL.
    get_page_content(URI.join(uri.to_s, location).to_s, limit - 1)
  end
end
get_urls_for_page(url,queue)
click to toggle source
# File lib/logstash/inputs/crawler.rb, line 57
# Crawls the links found on +url+ recursively. Every newly discovered link is
# recorded in @urls, fetched through @agent, wrapped in a LogStash::Event
# ("link" => the URL, "contenido" => the fetched body), decorated, pushed onto
# +queue+, and then crawled in turn.
#
# @param url [String] URL of the page whose links are extracted
# @param queue [#<<] destination that receives one event per fetched link
# @return [Array<String>] @urls, the list of every link recorded so far
# @raise [NoMethodError] when get_page_content returns nil for +url+ itself
def get_urls_for_page(url, queue)
  page_content = get_page_content(url)
  # Each match is [quoted_url, scheme]; only absolute http(s) links written
  # exactly as <a href="..."> are picked up.
  matches = page_content.scan(/\<a href\=(\"(http|https)\:.*?\")/)

  matches.each do |match|
    link = match.first.delete('"').strip
    # NOTE(review): a link is still accepted while @urls holds exactly
    # @url_max entries, so the list can reach @url_max + 1 - confirm intent.
    next if @urls.include?(link) || @urls.length > @url_max

    @urls.push(link)
    begin
      page = @agent.get(link)
      event = LogStash::Event.new("link" => link, "contenido" => page.body)
      decorate(event)
      queue << event
      get_urls_for_page(link, queue)
    rescue StandardError
      # Best effort: a link that cannot be fetched or crawled is skipped so
      # the remaining links on this page are still processed.
    end
  end

  @urls
end
register()
click to toggle source
# File lib/logstash/inputs/crawler.rb, line 25
# Prepares the state used by the crawl: an empty list of recorded URLs and a
# Mechanize agent for fetching pages.
def register
  @urls = []
  @agent = Mechanize.new
  # NOTE(review): VERIFY_NONE switches off TLS certificate verification for
  # every request made through this agent - confirm this is intentional.
  http_client = @agent.agent.http
  http_client.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
run(queue)
click to toggle source
# File lib/logstash/inputs/crawler.rb, line 32
# Main loop: runs one crawl, then waits @interval, and repeats until stop?
# reports true.
#
# @param queue [Object] passed through unchanged to start_crawl
def run(queue)
  until stop?
    start_crawl(queue)
    # The block hands stop? to the sleep helper, presumably so the wait can
    # be cut short when a stop is requested - see the Stud documentation.
    Stud.stoppable_sleep(@interval) { stop? }
  end
end
start_crawl(queue)
click to toggle source
# File lib/logstash/inputs/crawler.rb, line 49
# Runs one crawl starting from @url. A failed crawl is reported on stdout
# instead of being raised to the caller.
#
# @param queue [Object] passed through unchanged to get_urls_for_page
# @return [Object, nil] whatever get_urls_for_page returns, or nil when the
#   crawl failed
def start_crawl(queue)
  get_urls_for_page(@url, queue)
rescue StandardError
  # Only ordinary errors are absorbed; Interrupt, SystemExit and other
  # non-StandardError exceptions still propagate so the process can be stopped.
  puts "FALLO DE CRAWL"
end
stop()
click to toggle source
# File lib/logstash/inputs/crawler.rb, line 41
# Stop hook. This plugin has nothing to release, so the body is intentionally
# empty and the method could be left undefined.
def stop
  # Typical work for a "stop" hook, none of which is needed here:
  #   - closing sockets (to unblock blocking reads/accepts)
  #   - removing temporary files
  #   - terminating spawned threads
end