class SocialCrawler::SocialCrawler

Public Class Methods

new()
# File lib/socialcrawler.rb, line 27
# Builds the lookup table of social networks the crawler recognises.
#
# Each key names a network; each value is the substring a link's href must
# contain to be attributed to that network (page_to_result performs the
# substring test against every entry).
def initialize
  @map = {
    twitter: 'twitter.com/',
    facebook: 'facebook.com/',
    google_plus: 'plus.google.com/',
    # Host without the "www." prefix: the former 'www.instagram.com' needle
    # missed links written as instagram.com/<user>. Every href that matched
    # the old needle still matches this one.
    instagram: 'instagram.com',
    you_tube: 'youtube.com/user',
    pinterest: 'pinterest.com/',
    linked_in: 'linkedin.com/',
    flickr: 'flickr.com/'
  }
end

Public Instance Methods

_put(hash, symbol, value, log=nil)
# File lib/socialcrawler.rb, line 40
# Records +value+ under +symbol+ in +hash+ without discarding earlier values.
#
# The first value for a key is stored as-is. Any later value is appended to
# the existing one, separated by a single space, and the collision is
# reported through +log+ (a Logger on STDOUT when none is supplied).
def _put(hash, symbol, value, log=nil)
  logger = log.nil? ? Logger.new(STDOUT) : log
  return hash[symbol] = value unless hash.key?(symbol)

  merged = "#{hash[symbol]} #{value}"
  hash[symbol] = merged
  logger.info("Multiple values for #{symbol} value #{merged}")
end
crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
# File lib/socialcrawler.rb, line 109
# Crawls every domain listed in +domain_list_filename+ and writes the
# results as CSV to +output_list_filename+.
#
# Previously saved output and status rows are loaded first and written back
# before crawling starts, because both files are reopened in "wb" mode
# (which truncates them).
#
# domain_list_filename - path of a CSV file whose first column is the URL.
# output_list_filename - path of the CSV results file (read, then rewritten).
# status_filename      - optional path of the CSV status file; when nil no
#                        status file is written.
# log                  - object responding to #info; defaults to a Logger
#                        on STDOUT.
def crawl(domain_list_filename, output_list_filename, status_filename=nil, log=nil)
  log = Logger.new(STDOUT) if log.nil?
  log.info("Crawler started")

  status = load_status_cache(status_filename, log)
  data = load_output_cache(output_list_filename, log)

  CSV.open(output_list_filename, "wb") do |output|
    write_data(data, output)
    if status_filename.nil?
      # CSV.open(nil, ...) raises, so the default argument used to crash.
      # Status rows are only ever appended with <<, so a throwaway Array is
      # enough to stand in for the status file.
      crawl_loop(data, domain_list_filename, log, output, status, [])
    else
      CSV.open(status_filename, "wb") do |status_line|
        write_status(status, status_line)
        crawl_loop(data, domain_list_filename, log, output, status, status_line)
      end
    end
  end
end
crawl_loop(data, domain_list_filename, log, output, status, status_line)
# File lib/socialcrawler.rb, line 126
# Walks the domain list and crawls every URL that has no status entry yet.
#
# The first column of each CSV row is taken as the URL. URLs already present
# as keys of +status+ are skipped; every other URL is crawled and the result
# is handed to set_data and set_status.
def crawl_loop(data, domain_list_filename, log, output, status, status_line)
  CSV.foreach(domain_list_filename) do |fields|
    target = fields[0]
    next if status.has_key?(target)

    outcome = crawl_url(target, log)
    set_data(outcome, target, data, output)
    set_status(outcome, target, status, status_line)
  end
end
crawl_url(url, log=nil)
# File lib/socialcrawler.rb, line 62
# Fetches +url+, parses it as HTML and collects the page title plus any
# social-network links found by page_to_result.
#
# url - location to open.
# log - object responding to #info; defaults to a Logger on STDOUT.
#
# Returns a Hash whose default value is :NOT_FOUND. It always carries :url,
# :success and :message; on success :message is '' and :title is set, on
# failure :message is the error text.
#
# Only StandardError is rescued, so Interrupt (Ctrl-C), SystemExit and
# similar signals propagate instead of being recorded as a failed crawl.
def crawl_url(url, log=nil)
  log = Logger.new(STDOUT) if log.nil?
  log.info("Crawling #{url}")
  result = Hash.new(:NOT_FOUND)
  begin
    # Block form so the underlying handle is closed once parsing is done.
    # NOTE(review): Kernel#open presumably relies on open-uri being loaded
    # to fetch URLs, and it treats input starting with "|" as a command --
    # confirm the domain list is trusted or switch to URI.open.
    page = open(url) { |io| Nokogiri::HTML(io) }
    title = page.css('title')
    result[:title] = title.text.strip unless title.nil?
    page_to_result(page, result, log)
    result[:url] = url
    result[:success] = true
    result[:message] = ''
  rescue StandardError => e
    result[:url] = url
    result[:success] = false
    result[:message] = "#{e}"
  end
  result
end
load_output_cache(output_list_filename, log=nil)
# File lib/socialcrawler.rb, line 96
# Loads previously written crawl results from +output_list_filename+.
#
# output_list_filename - path of the CSV results file; it may not exist yet.
# log                  - object responding to #info; defaults to a Logger
#                        on STDOUT (the nil default used to raise
#                        NoMethodError on the first log call).
#
# Returns a Hash filled by set_output_cache_data, one call per CSV row;
# empty when the file does not exist.
def load_output_cache(output_list_filename, log=nil)
  log = Logger.new(STDOUT) if log.nil?
  data = {}
  log.info("Loading previous status from #{output_list_filename}")
  return data unless File.exist?(output_list_filename)

  CSV.foreach(output_list_filename) do |row|
    set_output_cache_data(data, row)
  end
  # Logged once after the loop; it used to be emitted for every row.
  log.info("Loading previous status from #{output_list_filename} finished, #{data.keys.length} loaded.")
  data
end
load_status_cache(status_filename, log=nil)
# File lib/socialcrawler.rb, line 84
# Loads previously written crawl statuses from +status_filename+.
#
# status_filename - path of the CSV status file, or nil when none is used.
# log             - object responding to #info; defaults to a Logger on
#                   STDOUT (the nil default used to raise NoMethodError).
#
# Returns a Hash filled by set_status_cache_data, one call per CSV row;
# empty when no filename is given or the file does not exist.
def load_status_cache(status_filename, log=nil)
  status = {}
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  return status if status_filename.nil? || !File.exist?(status_filename)

  log = Logger.new(STDOUT) if log.nil?
  log.info("Loading previous status from #{status_filename}")
  CSV.foreach(status_filename) do |row|
    set_status_cache_data(status, row)
  end
  log.info("Loading previous status from #{status_filename} finished, #{status.keys.length} loaded.")
  status
end
page_to_result(page, result, log)
# File lib/socialcrawler.rb, line 50
# Scans every anchor that has an href and records the links that point at a
# known social network.
#
# Each href is compared with every entry of @map; whenever the href contains
# the entry's substring, the href is stored in +result+ under that network's
# key via _put.
def page_to_result(page, result, log)
  page.css('a[href]').each do |anchor|
    href = anchor['href']
    @map.each do |network, needle|
      _put(result, network, href, log) if href.index(needle)
    end
  end
end

Private Instance Methods

set_data(result, url, data, output)
# File lib/socialcrawler.rb, line 152
# Stores a successful crawl result and appends it to the output.
#
# Does nothing unless result[:success] is exactly true. Otherwise the result
# is kept in +data+ under +url+ and a row of url, title, twitter, facebook
# and google_plus is appended to +output+.
def set_data(result, url, data, output)
  return unless result[:success] == true

  data[url] = result
  row = [url, result[:title], result[:twitter], result[:facebook], result[:google_plus]]
  output << row
end
set_output_cache_data(data, row)
# File lib/socialcrawler.rb, line 168
# Turns one row of the output file back into a cached result entry.
#
# Rows with fewer than five columns are ignored. Otherwise the first five
# columns are stored in +data+, keyed by the first column, as :url, :title,
# :twitter, :facebook and :google_plus.
def set_output_cache_data(data, row)
  return if row.count < 5

  key = row[0]
  data[key] = {
    url: key,
    title: row[1],
    twitter: row[2],
    facebook: row[3],
    google_plus: row[4]
  }
end
set_status(result, url, status, status_line)
# File lib/socialcrawler.rb, line 159
# Records the outcome of crawling +url+.
#
# The outcome is stored in +status+ under +url+ as :url, :result and
# :message, and a matching row of url, success flag and message is appended
# to +status_line+.
def set_status(result, url, status, status_line)
  succeeded = result[:success]
  note = result[:message]
  status[url] = { url: url, result: succeeded, message: note }
  status_line << [url, succeeded, note]
end
set_status_cache_data(status, row)
# File lib/socialcrawler.rb, line 180
# Turns one row of the status file back into a cached status entry.
#
# Rows with fewer than three columns are ignored. Otherwise the first three
# columns are stored in +status+, keyed by the first column, as :url,
# :result and :message.
def set_status_cache_data(status, row)
  return if row.count < 3

  key = row[0]
  status[key] = { url: key, result: row[1], message: row[2] }
end
write_data(data, output)
# File lib/socialcrawler.rb, line 140
# Appends every cached result to +output+, one row per entry.
#
# Each row consists of the entry's key followed by its :title, :twitter,
# :facebook and :google_plus values.
def write_data(data, output)
  data.each_pair do |url, entry|
    output << [url, *entry.values_at(:title, :twitter, :facebook, :google_plus)]
  end
end
write_status(status, status_line)
# File lib/socialcrawler.rb, line 146
# Appends every cached status entry to +status_line+, one row per entry.
#
# Each row consists of the entry's key, its success flag and its message.
# The flag is read from :result, the key under which set_status and
# set_status_cache_data store it; reading :success here used to write an
# empty column for every entry. :success is still honoured as a fallback
# for entries that lack :result.
def write_status(status, status_line)
  status.each do |url, entry|
    outcome = entry.key?(:result) ? entry[:result] : entry[:success]
    status_line << [url, outcome, entry[:message]]
  end
end