class IndeedCrawler

Public Class Methods

new(search_query, location, proxy_list, wait_time, browser_num, cm_hash) click to toggle source
# File lib/indeedcrawler.rb, line 9
# Set up a crawler for one search.
#
# search_query - search terms, stored as given (later methods treat a falsy
#                value as "no query").
# location     - location filter, stored as given (falsy means "no location").
# proxy_list, wait_time, browser_num - forwarded unchanged to RequestManager.new.
# cm_hash      - optional Hash with :crawler_manager_url and :selector_id;
#                when falsy, neither @cm_url nor @selector_id is assigned.
def initialize(search_query, location, proxy_list, wait_time, browser_num, cm_hash)
  # What to search for
  @search_query = search_query
  @location = location

  # All page loads go through the request manager
  @requests = RequestManager.new(proxy_list, wait_time, browser_num)

  # Accumulators for collected links and parsed results
  @all_resume_links = []
  @output = []

  # Crawler manager details are optional
  return unless cm_hash

  @cm_url = cm_hash[:crawler_manager_url]
  @selector_id = cm_hash[:selector_id]
end

Public Instance Methods

add_location(url) click to toggle source

Append the URL-encoded location to the URL as the "l" query parameter

# File lib/indeedcrawler.rb, line 32
# Append the location to the search URL as the "l" query parameter.
#
# url - String URL built so far.
#
# Returns a new String; the url argument is not mutated.
def add_location(url)
  # Choose the separator from the URL itself rather than from @search_query:
  # a URL that already carries parameters (e.g. "...?co=US") needs "&" even
  # when no search query was appended before the location.
  separator = url.end_with?("?", "&") ? "" : "&"
  "#{url}#{separator}l=#{URI.encode_www_form_component(@location)}"
end
add_query(url) click to toggle source

Append the URL-encoded search query to the URL as the "q" query parameter

# File lib/indeedcrawler.rb, line 27
# Append the search query to the URL as the "q" query parameter.
#
# url - String URL built so far.
#
# Returns a new String ending in "&q=<form-encoded @search_query>".
def add_query(url)
  encoded_query = URI.encode_www_form([@search_query])
  url + "&q=" + encoded_query
end
collect_it_all() click to toggle source

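Run the whole crawl: build the search URL, collect the profile links, parse every resume, then close the browsers and report completion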

# File lib/indeedcrawler.rb, line 124
# Run the whole crawl for the configured query and location.
#
# Builds the search URL, loads the first results page, hands it to
# get_page_links, parses the resumes, closes the browsers and finally
# reports a completion status.
#
# Returns whatever report_status returns. Errors raised while collecting or
# parsing still propagate to the caller, but only after the browsers have
# been closed.
def collect_it_all
  # Generate URL
  url = "http://indeed.com/resumes?co=US"
  url = add_query(url) if @search_query
  url = add_location(url) if @location

  begin
    # Get first page and navigate the rest
    page_body = load_restart_page(url, 0)
    get_page_links(Nokogiri::HTML(page_body))

    # Get and parse all results
    parse_resumes
  ensure
    # Close the browsers even when the crawl fails part-way through, so a
    # crashed run does not leave them open.
    @requests.close_all_browsers
  end

  report_status("Finished collecting data for selector "+@search_query.to_s+" "+@location.to_s)
end
get_json() click to toggle source

Get the JSON of results

# File lib/indeedcrawler.rb, line 119
# Serialize the results accumulated in @output.
#
# Returns a pretty-printed JSON String.
def get_json
  JSON.pretty_generate(@output)
end
load_next_page(html) click to toggle source

Load the next page

# File lib/indeedcrawler.rb, line 52
# Follow the "next" link of a results page and collect the links on it.
#
# html - parsed results page; must respond to css(selector).
#
# Returns the result of get_page_links for the next page, or nil when the
# page has no "a.next" link (i.e. there is nothing further to load).
def load_next_page(html)
  next_link = html.css("a.next").first
  # Without this guard the last results page raised NoMethodError on nil.
  return unless next_link

  next_html = load_restart_page("http://indeed.com/resumes" + next_link['href'], 0)
  get_page_links(Nokogiri::HTML(next_html))
end
load_restart_page(url, count) click to toggle source

Load the page and return the result; if loading fails, restart the browser and retry while count is below 2, otherwise return nil

# File lib/indeedcrawler.rb, line 58
# Fetch a page through the request manager, restarting the browser and
# trying again when the fetch raises.
#
# url   - URL to load.
# count - number of retries already used; callers in this file pass 0.
#
# Returns whatever @requests.get_page returns, or nil once count has
# reached 2 and the fetch still fails.
def load_restart_page(url, count)
  @requests.get_page(url)
rescue
  # Out of retries: give up quietly and let the caller deal with nil.
  return unless count < 2

  @requests.restart_browser
  count += 1
  retry
end
parse_resumes() click to toggle source

Download and parse all resumes

# File lib/indeedcrawler.rb, line 70
# Load every collected resume link, parse it and report the results.
#
# A failure while parsing or reporting one resume is sent to report_status
# and does not stop the remaining links from being processed.
def parse_resumes
  @all_resume_links.each do |resume_link|
    page = load_restart_page(resume_link, 0)

    begin
      # The parser hands back JSON text; decode it before reporting
      parser = IndeedParser.new(page, resume_link, {time_scraped: Time.now})
      report_results(JSON.parse(parser.get_results_by_job), resume_link)
    rescue => error
      report_status("Error in parsing " + resume_link + ": " + error.to_s)
    end
  end
end
report_batch(results) click to toggle source

Report all results in one JSON

# File lib/indeedcrawler.rb, line 95
# Add each result to @output so everything can be emitted together later.
#
# results - enumerable of parsed results.
def report_batch(results)
  results.each { |entry| @output << entry }
end
report_incremental(results, link) click to toggle source

Report results back to Harvester incrementally

# File lib/indeedcrawler.rb, line 102
# Post the results for a single link to the crawler manager.
#
# results - parsed results; sent as pretty-printed JSON in the "results" field.
# link    - link the results came from; used in the status message.
#
# Returns the value of Curl::Easy.http_post.
def report_incremental(results, link)
  endpoint = @cm_url + "/relay_results"
  fields = [
    Curl::PostField.content('selector_id', @selector_id),
    Curl::PostField.content('status_message', "Collected " + link),
    Curl::PostField.content('results', JSON.pretty_generate(results))
  ]
  Curl::Easy.http_post(endpoint, *fields)
end
report_results(results, link) click to toggle source

Figure out how to report results

# File lib/indeedcrawler.rb, line 86
# Route results to the right reporter: incrementally to the crawler manager
# when @cm_url is set, otherwise into the local batch.
#
# Returns the value of the reporter that was called.
def report_results(results, link)
  return report_incremental(results, link) if @cm_url

  report_batch(results)
end
report_status(status_msg) click to toggle source

Report Harvester status message

# File lib/indeedcrawler.rb, line 111
# Post a status message to the crawler manager.
#
# status_msg - String message to send.
#
# Returns the value of Curl::Easy.http_post, or nil when no crawler manager
# URL is configured.
def report_status(status_msg)
  # No crawler manager configured means there is nowhere to post to;
  # previously this raised NoMethodError on nil.
  return unless @cm_url

  Curl::Easy.http_post(@cm_url + "/update_status",
                       Curl::PostField.content('selector_id', @selector_id),
                       Curl::PostField.content('status_message', status_msg))
end