class SocialCrawler::SocialCrawler
Public Class Methods
new()
click to toggle source
# File lib/socialcrawler.rb, line 27
#
# Builds @map, the table that pairs each social-network key with the URL
# fragment used to recognise a link to that network.
def initialize
  @map = {
    :twitter     => 'twitter.com/',
    :facebook    => 'facebook.com/',
    :google_plus => 'plus.google.com/',
    :instagram   => 'www.instagram.com',
    :you_tube    => 'youtube.com/user',
    :pinterest   => 'pinterest.com/',
    :linked_in   => 'linkedin.com/',
    :flickr      => 'flickr.com/'
  }
end
Public Instance Methods
_put(hash, symbol, value, log=nil)
click to toggle source
# File lib/socialcrawler.rb, line 40
#
# Stores value under symbol in hash. When the key is already present the
# new value is appended to the existing one, separated by a space, and the
# combined value is reported through the logger.
#
# hash   - Hash being filled in (mutated in place).
# symbol - key to store the value under.
# value  - value to store or append.
# log    - logger responding to #info; a STDOUT Logger is created when nil.
def _put(hash, symbol, value, log=nil)
  logger = log.nil? ? Logger.new(STDOUT) : log

  unless hash.key?(symbol)
    hash[symbol] = value
    return value
  end

  combined = "#{hash[symbol]} #{value}"
  hash[symbol] = combined
  logger.info("Multiple values for #{symbol} value #{combined}")
end
crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
click to toggle source
# File lib/socialcrawler.rb, line 109
#
# Crawls every domain listed in domain_list_filename and writes the
# collected rows to output_list_filename.
#
# Previously saved output and status are loaded first and written back to
# the freshly truncated files before the crawl loop appends new rows.
#
# domain_list_filename - CSV file read by crawl_loop.
# output_list_filename - CSV file the output rows are written to.
# status_filename      - optional CSV file for per-URL status rows. When nil,
#                        no status file is read or written.
# log                  - logger responding to #info; a STDOUT Logger is
#                        created when none is given.
def crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
  log ||= Logger.new(STDOUT)
  log.info("Crawler started")
  status = load_status_cache(status_filename, log)
  data = load_output_cache(output_list_filename, log)
  CSV.open(output_list_filename, "wb") do |output|
    write_data(data, output)
    if status_filename.nil?
      # CSV.open(nil, ...) would raise, so without a status file the status
      # rows go into a throwaway Array, which accepts the same `<<` calls.
      crawl_loop(data, domain_list_filename, log, output, status, [])
    else
      CSV.open(status_filename, "wb") do |status_line|
        write_status(status, status_line)
        crawl_loop(data, domain_list_filename, log, output, status, status_line)
      end
    end
  end
end
crawl_loop(data, domain_list_filename, log, output, status, status_line)
click to toggle source
# File lib/socialcrawler.rb, line 126
#
# Reads domain_list_filename as CSV and crawls the URL in the first column
# of every row, skipping URLs that already have an entry in status. Each
# crawl result is passed to set_data and set_status.
def crawl_loop(data, domain_list_filename, log, output, status, status_line)
  CSV.foreach(domain_list_filename) do |row|
    target = row[0]
    # Already has a status entry, so it is not crawled again.
    next if status.key?(target)

    outcome = crawl_url(target, log)
    set_data(outcome, target, data, output)
    set_status(outcome, target, status, status_line)
  end
end
crawl_url(url, log=nil)
click to toggle source
# File lib/socialcrawler.rb, line 62
#
# Fetches url, parses it as HTML and collects the page title plus any
# social-network links found by page_to_result.
#
# url - address of the page to fetch.
# log - logger responding to #info; a STDOUT Logger is created when none
#       is given.
#
# Returns a Hash whose default value is :NOT_FOUND. It always carries :url,
# :success and :message. On success :message is '' and :title is set; on
# failure :success is false and :message holds the error text.
#
# Only StandardError is turned into a failed result, so Interrupt and
# SystemExit still propagate and a running crawl can be stopped.
def crawl_url(url, log=nil)
  log ||= Logger.new(STDOUT)
  log.info("Crawling #{url}")
  result = Hash.new(:NOT_FOUND)
  result[:url] = url
  begin
    # NOTE(review): this relies on open-uri extending Kernel#open to URLs,
    # which Ruby 3.0 removed (URI.open is the replacement) - confirm the
    # supported Ruby versions before switching. Kernel#open also runs a
    # command when the string starts with "|", so url must be trusted.
    page = Nokogiri::HTML(open(url))
    # css returns a (possibly empty) node set, never nil.
    result[:title] = page.css('title').text.strip
    page_to_result(page, result, log)
    result[:success] = true
    result[:message] = ''
  rescue StandardError => e
    result[:success] = false
    result[:message] = e.message
  end
  result
end
load_output_cache(output_list_filename, log=nil)
click to toggle source
# File lib/socialcrawler.rb, line 96
#
# Loads previously written output rows from output_list_filename.
#
# output_list_filename - path of the CSV output file; it may not exist yet.
# log                  - logger responding to #info; a STDOUT Logger is
#                        created when none is given.
#
# Returns a Hash filled by set_output_cache_data, one call per CSV row.
# The Hash is empty when the file does not exist.
def load_output_cache(output_list_filename, log=nil)
  log ||= Logger.new(STDOUT)
  data = {}
  log.info("Loading previous status from #{output_list_filename}")
  return data unless File.exist?(output_list_filename)

  CSV.foreach(output_list_filename) do |row|
    set_output_cache_data(data, row)
  end
  # Reported once after the whole file is read, not once per row.
  log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
  data
end
load_status_cache(status_filename, log=nil)
click to toggle source
# File lib/socialcrawler.rb, line 84
#
# Loads previously written status rows from status_filename.
#
# status_filename - path of the CSV status file, or nil when no status
#                   file is used.
# log             - logger responding to #info; a STDOUT Logger is created
#                   when none is given and something needs to be logged.
#
# Returns a Hash filled by set_status_cache_data, one call per CSV row.
# The Hash is empty when status_filename is nil or the file does not exist.
def load_status_cache(status_filename, log=nil)
  status = {}
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  return status if status_filename.nil? || !File.exist?(status_filename)

  log ||= Logger.new(STDOUT)
  log.info("Loading previous status from #{status_filename}")
  CSV.foreach(status_filename) do |row|
    set_status_cache_data(status, row)
  end
  log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
  status
end
page_to_result(page, result, log)
click to toggle source
# File lib/socialcrawler.rb, line 50
#
# Scans every anchor of page that carries an href and, for each entry of
# @map whose URL fragment occurs in that href, records the href in result
# under the entry's key via _put.
#
# page   - parsed document responding to #css.
# result - Hash that receives the matching links (mutated in place).
# log    - logger handed through to _put.
def page_to_result(page, result, log)
  page.css('a[href]').each do |anchor|
    href = anchor['href']
    @map.each do |network, fragment|
      next unless href.index(fragment)

      _put(result, network, href, log)
    end
  end
end
Private Instance Methods
set_data(result, url, data, output)
click to toggle source
# File lib/socialcrawler.rb, line 152
#
# Records a successful crawl: stores result under url in data and appends
# the row [url, title, twitter, facebook, google_plus] to output.
# Does nothing unless result[:success] is exactly true.
def set_data(result, url, data, output)
  return unless result[:success] == true

  data[url] = result
  columns = [:title, :twitter, :facebook, :google_plus].map { |field| result[field] }
  output << [url] + columns
end
set_output_cache_data(data, row)
click to toggle source
# File lib/socialcrawler.rb, line 168
#
# Converts one cached output row into an entry of data, keyed by the row's
# first column. Rows with fewer than five columns are ignored; columns
# beyond the fifth are dropped.
def set_output_cache_data(data, row)
  return if row.count < 5

  fields = [:url, :title, :twitter, :facebook, :google_plus]
  data[row[0]] = fields.zip(row).to_h
end
set_status(result, url, status, status_line)
click to toggle source
# File lib/socialcrawler.rb, line 159
#
# Records the outcome of crawling url: stores a { :url, :result, :message }
# entry in status and appends the row [url, success, message] to
# status_line.
def set_status(result, url, status, status_line)
  succeeded = result[:success]
  note = result[:message]
  status[url] = { url: url, result: succeeded, message: note }
  status_line << [url, succeeded, note]
end
set_status_cache_data(status, row)
click to toggle source
# File lib/socialcrawler.rb, line 180
#
# Converts one cached status row into an entry of status, keyed by the
# row's first column. Rows with fewer than three columns are ignored.
def set_status_cache_data(status, row)
  return if row.count < 3

  url, outcome, note = row[0], row[1], row[2]
  status[url] = { url: url, result: outcome, message: note }
end
write_data(data, output)
click to toggle source
# File lib/socialcrawler.rb, line 140
#
# Appends one row [url, title, twitter, facebook, google_plus] to output
# for every entry of data.
def write_data(data, output)
  data.each do |url, entry|
    output << [url, *entry.values_at(:title, :twitter, :facebook, :google_plus)]
  end
end
write_status(status, status_line)
click to toggle source
# File lib/socialcrawler.rb, line 146
#
# Appends one row [url, result, message] to status_line for every entry of
# status.
#
# Status entries built by set_status and set_status_cache_data keep the
# outcome under :result, so that key is written. An entry that only has
# :success is still honoured.
def write_status(status, status_line)
  status.each do |url, entry|
    outcome = entry.key?(:result) ? entry[:result] : entry[:success]
    status_line << [url, outcome, entry[:message]]
  end
end