class LinkedinCrawler

Public Class Methods

new(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
# File lib/linkedincrawler.rb, line 10
def initialize(search_terms, retry_limit, requests, requests_google, requests_google2, solver_details, cm_hash)
  @search_terms = search_terms
  @output = Array.new
  
  @retry_limit = retry_limit
  @retry_count = 0
  
  @requests = requests
  @requests_google = requests_google
  @requests_google2 = requests_google2
  @solver_details = solver_details

  # Handle crawler manager info
  @cm_hash = cm_hash
  @cm_url = cm_hash[:crawler_manager_url] if cm_hash
  @selector_id = cm_hash[:selector_id] if cm_hash
end
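
A minimal setup sketch, with placeholder values, is shown below. The real shape of the request and solver arguments depends on GeneralScraper and TranslatePage; only the :crawler_manager_url and :selector_id keys of cm_hash are read here, and the URL and selector id shown are hypothetical.

# Illustrative values only; the request and solver arguments are stored as-is
crawler = LinkedinCrawler.new(
  ["ruby developer", "chicago"],   # search_terms
  3,                               # retry_limit
  nil,                             # requests (placeholder)
  nil,                             # requests_google (placeholder)
  nil,                             # requests_google2 (placeholder)
  nil,                             # solver_details (placeholder)
  {crawler_manager_url: "http://harvester.example.com",  # hypothetical
   selector_id: "li_selector_1"}                         # hypothetical
)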

Public Instance Methods

check_right_page(profile_url)

Check that the URL points to an actual LinkedIn profile page (not a Google page, a directory listing, or a search page) and that the retry limit has not been exceeded

# File lib/linkedincrawler.rb, line 66
def check_right_page(profile_url)
  return !profile_url.include?("www.google") &&
         profile_url.include?(".linkedin.") &&
         !profile_url.include?("linkedin.com/pub/dir") &&
         !profile_url.include?("/search") &&
         @retry_count < @retry_limit
end
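
For illustration, while @retry_count is still below @retry_limit the checks behave as follows (URLs are hypothetical):

crawler.check_right_page("https://www.linkedin.com/in/jane-doe")        # => true
crawler.check_right_page("https://www.linkedin.com/pub/dir/Jane/Doe")   # => false (directory listing)
crawler.check_right_page("https://www.google.com/search?q=jane+doe")    # => false (Google page)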
gen_json()

Return the collected output as pretty-printed JSON

# File lib/linkedincrawler.rb, line 130
def gen_json
  JSON.pretty_generate(@output)
end
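
When running without a crawler manager, the accumulated output can be written straight to disk; the filename is illustrative:

File.write("linkedin_profiles.json", crawler.gen_json)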
get_pages(urls)

Fetch each candidate profile page, then parse and report it

# File lib/linkedincrawler.rb, line 56
def get_pages(urls)
  profiles = urls.select{|u| check_right_page(u)}
  t = TranslatePage.new(profiles, @requests)
  parsed_profiles = t.translate
  parsed_profiles.each do |profile|
    parse_and_report(profile[:url], profile[:html])
  end
end
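
Tying it together: a typical local run (no crawler manager) might look like the sketch below, assuming the crawler was constructed as in the example above:

urls = crawler.google_queries   # collect candidate LinkedIn URLs from Google
crawler.get_pages(urls)         # fetch, parse, and report each valid profile page
puts crawler.gen_json           # print whatever was saved to the local output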
google_queries()

Run the search queries on Google and return the LinkedIn profile URLs found

# File lib/linkedincrawler.rb, line 39
def google_queries
  begin
    # Run Google search
    g = GeneralScraper.new("site:linkedin.com/pub -site:linkedin.com/pub/dir/", @search_terms, @requests_google, @solver_details, @cm_hash)
    urls = g.getURLs

    # Look for new LI urls
    g2 = GeneralScraper.new("site:linkedin.com/in", @search_terms, @requests_google2, @solver_details, @cm_hash)
    urls = JSON.parse(urls) + JSON.parse(g2.getURLs)
  rescue => e
    report_status("Error running Google Crawler from LinkedIn Crawler: " + e.to_s)
    # Drop into an interactive pry session so the failure can be inspected manually
    binding.pry
  end
  return urls
end
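
Assuming both GeneralScraper#getURLs calls return JSON-encoded arrays of URL strings, the combined value returned here is a plain Ruby array, e.g.:

urls = crawler.google_queries
# => ["https://www.linkedin.com/pub/jane-doe/1a/2b3/4c5",
#     "https://www.linkedin.com/in/john-doe",
#     ...]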
parse_and_report(profile_url, profile_html)

Parse a profile page and report the results

# File lib/linkedincrawler.rb, line 86
def parse_and_report(profile_url, profile_html)
  # Parse profile
  l = LinkedinParser.new(profile_html, profile_url, {timestamp: Time.now, search_terms: @search_terms})
  parsed_profile = JSON.parse(l.results_by_job)

  # Check if it failed or succeeded
  if profile_parsing_failed?(parsed_profile)
    report_status("Profile parsing failed for "+profile_url.to_s+". Moving on.")
    report_results(parsed_profile, profile_url)
  else # It succeeded!
    report_results(parsed_profile, profile_url)
  end
end
profile_parsing_failed?(parsed_profile)

Check whether profile parsing failed (a nil result, an empty result, or a result flagged parsing_failed)

# File lib/linkedincrawler.rb, line 81
def profile_parsing_failed?(parsed_profile)
  return (parsed_profile == nil) || parsed_profile.empty? || parsed_profile.first["parsing_failed"]
end
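
A nil result, an empty array, or an explicit parsing_failed flag all count as failure; anything else counts as success. The "name" key below is a placeholder for real parser output:

crawler.profile_parsing_failed?(nil)                           # => true
crawler.profile_parsing_failed?([])                            # => true
crawler.profile_parsing_failed?([{"parsing_failed" => true}])  # => true
crawler.profile_parsing_failed?([{"name" => "Jane Doe"}])      # => nil (falsy, treated as success)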
report_incremental(results, link)

Report results back to Harvester incrementally

# File lib/linkedincrawler.rb, line 110
def report_incremental(results, link)
  curl_url = @cm_url + "/relay_results"
  @retry_count = 0
  c = Curl::Easy.http_post(curl_url,
                           Curl::PostField.content('selector_id', @selector_id),
                           Curl::PostField.content('status_message', "Collected " + link),
                           Curl::PostField.content('results', JSON.pretty_generate(results)))
end
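
For readers unfamiliar with curb, this is an ordinary form POST; a roughly equivalent sketch using Ruby's standard Net::HTTP (URL, selector id, and payload are all illustrative) would be:

require 'net/http'
require 'uri'
require 'json'

uri = URI("http://harvester.example.com/relay_results")  # hypothetical crawler manager endpoint
Net::HTTP.post_form(uri,
                    "selector_id"    => "li_selector_1",
                    "status_message" => "Collected https://www.linkedin.com/in/jane-doe",
                    "results"        => JSON.pretty_generate([{"name" => "Jane Doe"}]))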
report_results(results, link)

Report results to the crawler manager if one is configured; otherwise save them locally

# File lib/linkedincrawler.rb, line 101
def report_results(results, link)
  if @cm_url
    report_incremental(results, link)
  else
    save_and_continue(results)
  end
end
report_status(status_msg)

Report a status message to the Harvester (a no-op when no crawler manager URL is configured)

# File lib/linkedincrawler.rb, line 120
def report_status(status_msg)
  if @cm_url
    curl_url = @cm_url + "/update_status"
    c = Curl::Easy.http_post(curl_url,
                             Curl::PostField.content('selector_id', @selector_id),
                             Curl::PostField.content('status_message', status_msg))
  end
end
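
Because the method does nothing when no crawler manager URL was supplied, it is safe to call unconditionally, for example:

crawler.report_status("Starting LinkedIn crawl for: ruby developer, chicago")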
save_and_continue(parsed_profile)

Add the parsed profile to output, reset the retry count, and continue

# File lib/linkedincrawler.rb, line 75
def save_and_continue(parsed_profile)
  @output += parsed_profile if parsed_profile != nil && !parsed_profile.empty?
  @retry_count = 0
end
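
Without a crawler manager, report_results falls through to this method and parsed profiles simply accumulate in @output; the hashes below are placeholders for whatever LinkedinParser emits:

crawler.save_and_continue([{"name" => "Jane Doe", "title" => "Engineer"}])  # appended to @output
crawler.save_and_continue(nil)                                              # ignored, retry count still reset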