class DuckScraper

Public Class Methods

new(working_dir, input_file, output_file, options) click to toggle source
# File lib/linsc/duck.rb, line 21
# Sets up a scraper run.
#
# working_dir, input_file and output_file are stored as given; only
# options[:noproxy] is read from options.
#
# The output header list is taken from get_headers(input_file) with the
# "LinkedIn Profile" column removed and the "Linkedin Import Status" and
# "Urls" columns appended when they are not already present.
#
# When output_file already exists, @start is set to the number of data rows
# it contains so a later run can resume; otherwise the file is created via
# create_file and @start is left unset.
def initialize(working_dir, input_file, output_file, options)
  @working_dir = working_dir
  @input_file = input_file
  @output_file = output_file
  @noproxy = options[:noproxy]

  @headers = get_headers(@input_file)
  @headers.delete("LinkedIn Profile")
  ["Linkedin Import Status", "Urls"].each do |column|
    @headers << column unless @headers.include?(column)
  end

  # Number of lines parsed from the input minus one for the header line.
  @input_length = CSV.read(@input_file).length - 1

  if File.exist?(@output_file)
    @start = CSV.read(@output_file, headers: true).length
    puts "resuming from row #{@start + 1}"
  else
    create_file(@output_file)
  end

  @cooldown = 5
  unless @noproxy
    @proxies = ProxyHandler.new(@cooldown)
  end
end

Public Instance Methods

append_ddg_row(row, status, urls) click to toggle source
# File lib/linsc/duck.rb, line 111
# Appends the "Linkedin Import Status" and "Urls" fields to row (in that
# order), builds an output row from it with create_row using @headers, and
# appends that row to @output_file.
#
# Returns whatever append_to_csv returns.
def append_ddg_row(row, status, urls)
  extra_fields = [
    ["Linkedin Import Status", status],
    ["Urls", urls]
  ]
  extra_fields.each { |pair| row << pair }

  append_to_csv(@output_file, create_row(row, @headers))
end
create_query(row) click to toggle source
# File lib/linsc/duck.rb, line 185
# Builds the search query string for one input row.
#
# The query is "linkedin" followed by the row's first name, last name, job
# title and employer, each lower-cased, passed through String#alnum and
# stripped. Any occurrence of the row's "Email" value inside one of those
# fields is replaced by a space first.
#
# A missing or empty "Email" is skipped: gsub! with nil raises TypeError, and
# gsub! with "" inserts a space between every character of the field.
# Fields that are nil are left out of the query instead of raising.
#
# NOTE(review): as in the original, the e-mail removal is done in place
# (gsub!), so the strings held by row are modified and the change is visible
# to whoever uses row afterwards. Confirm this is intended.
#
# row - anything indexable by column name (e.g. a CSV::Row).
#
# Returns the query String.
def create_query(row)
  email = row["Email"].to_s
  fields = ["First Name", "Last Name", "Employer 1 Title",
            "Employer Organization Name 1"]

  query_parts = fields.map { |field| row[field] }.compact.map do |part|
    part.gsub!(email, ' ') unless email.empty?
    part.downcase.alnum.strip
  end

  "linkedin #{query_parts.join(' ')}"
end
find_profiles() click to toggle source
# File lib/linsc/duck.rb, line 40
# Walks the input CSV row by row and writes one output row per input row
# through append_ddg_row.
#
# For each row, in order:
# * rows already covered by an existing output file (@start) are skipped;
# * a row whose "LinkedIn Profile" value contains 'linkedin' is written out
#   with status "Using existing url" and no search is made;
# * a row failing sufficient_data? is written out as "Insufficient Data";
# * otherwise the query from create_query is submitted through the
#   DuckDuckGo HTML search form and the urls returned by find_results are
#   written out, comma-separated.
#
# Any StandardError raised while handling a row triggers a retry of that row
# until the attempt budget is used up; after that the exception itself is
# written as the row's status.
def find_profiles
  count = 0

  CSV.foreach(@input_file, headers: true) do |input_row|
    count += 1
    # Resume support: @start is the number of data rows already in the output file.
    next if @start && @start >= count
    # Attempt budget for this row: @proxies.length when proxies are in use, else 3.
    if @proxies
      tries = @proxies.length
    else
      tries = 3
    end
    puts "ddg #{count}/#{@input_length}"
    begin
      # The column is removed from the row before it is written out. After a
      # `retry` it is already gone, so lp is nil on every later attempt.
      lp = input_row["LinkedIn Profile"]
      input_row.delete("LinkedIn Profile")
      if lp && lp.include?('linkedin')
        puts "Existing Linkedin url found, skipping DDG"
        append_ddg_row(input_row, "Using existing url", lp)
        next
      end
      unless sufficient_data?(input_row)
        puts "Insufficient data, skipping"
        append_ddg_row(input_row, "Insufficient Data", nil)
        next
      end
      # A certificate store and a fresh Mechanize agent are built on every attempt.
      cert_file = Pathname.new(File.dirname __dir__).realdirpath + '../data/cacert.pem'
      cert_store = OpenSSL::X509::Store.new
      cert_store.add_file(cert_file.to_s)
      agent = Mechanize.new
      agent.cert_store = cert_store

      unless @noproxy
        proxy = @proxies.get_proxy
        agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
        agent.user_agent = proxy.user_agent
        puts "proxy: #{proxy.ip}"
      end
      # Without proxies, wait @cooldown seconds before each request.
      sleep(@cooldown) if @noproxy
      query_string = create_query(input_row)
      puts "query string: #{query_string}"
      ddg_page = agent.get('https://www.duckduckgo.com/html')
      # form_with returns nil when the form is absent; the resulting
      # NoMethodError on the next line is handled by the rescue below.
      search_form = ddg_page.form_with(id: 'search_form_homepage')
      search_form.q = query_string
      results_page = agent.submit(search_form)
      urls = find_results(results_page, input_row)
      if urls.length > 0
        puts "Success! #{urls.length} possible urls found"
        append_ddg_row(input_row, "DDG results found", urls.join(', '))
      else
        puts "no results found"
        append_ddg_row(input_row, "No DDG results found", nil)
      end
      # `proxy` is nil when @noproxy is set (the assignment above never ran).
      # NOTE(review): presumably `good` reports a successful request to the
      # proxy handler - confirm against ProxyHandler.
      proxy.good if proxy

    rescue StandardError => msg
      tries -= 1
      if tries > 0
        puts "\n\n"
        puts msg.backtrace
        puts 'RETRYING'
        puts "\n\n"
        # NOTE(review): presumably `used` marks the proxy as spent or failed
        # - confirm against ProxyHandler.
        proxy.used if proxy
        # Re-runs the begin block for the same input_row.
        retry
      else
        # Attempts exhausted: the exception object is passed as the status value.
        append_ddg_row(input_row, msg, nil)
        puts msg.backtrace
      end
    end
  end
end
find_results(page, row) click to toggle source
# File lib/linsc/duck.rb, line 135
# Extracts candidate LinkedIn profile urls for row from a search results page.
#
# A result is kept when all of the following hold:
# * it has an "a.result__a" link;
# * its "a.large" text does not contain "profiles | LinkedIn";
# * the link's href contains "linkedin" and either "/in/" or "/pub/";
# * name_check accepts the link text against the row's first + last name
#   (with the row's e-mail removed from the name).
#
# Results whose "a.result__snippet" text contains both the first two words of
# the row's title and the first word of the employer are placed at the front
# of the returned list; other kept results are appended at the end.
#
# A nil or empty "Email" is skipped: gsub with nil raises TypeError, and gsub
# with "" inserts a space between every character, after which no name could
# match. A link without an href is skipped instead of raising.
#
# page - object answering css(selector) with the result nodes.
# row  - anything indexable by column name (e.g. a CSV::Row).
#
# Returns an Array of url Strings (possibly empty).
def find_results(page, row)
  matches = []

  email = row["Email"].to_s
  raw_name = "#{row['First Name']} #{row['Last Name']}"
  raw_name = raw_name.gsub(email, ' ') unless email.empty?
  full_name = raw_name.alnum.strip

  # Row-derived values do not change per result, so compute them once.
  short_title = row["Employer 1 Title"].to_s.alnum.split.first(2).join(' ').downcase
  short_employer = row["Employer Organization Name 1"].to_s.alnum.split.first.to_s.downcase

  results = page.css("#links .results_links_deep")
  # Kept from the original for page objects whose css returns nil.
  return matches unless results

  results.each do |result|
    link = result.at_css("a.result__a")
    next unless link

    url = link['href'].to_s
    next if result.css("a.large").text.include?("profiles | LinkedIn")
    next unless url.include?("linkedin") && (url.include?("/in/") || url.include?("/pub/"))

    url_text = result.css("a.result__a").text.alnum
    next unless name_check(url_text, full_name)

    bio = result.css("a.result__snippet").text.alnum.downcase
    if bio.include?(short_title) && bio.include?(short_employer)
      matches.unshift(url)
    else
      matches.push(url)
    end
  end

  matches
end
name_check(lin_name, csv_name) click to toggle source
# File lib/linsc/duck.rb, line 173
# Reports whether every whitespace-separated word of csv_name also appears,
# ignoring case, among the words of lin_name.
#
# Returns true or false; an empty csv_name yields true.
def name_check(lin_name, csv_name)
  wanted = csv_name.downcase.split(" ")
  available = lin_name.downcase.split(" ")
  # Nothing left over means every wanted word was found.
  (wanted - available).empty?
end
sufficient_data?(row) click to toggle source
# File lib/linsc/duck.rb, line 118
# Reports whether row carries enough data to build a search query: first
# name, last name, employer name and job title must all be present and still
# non-blank after String#alnum and strip.
#
# Returns true or false.
def sufficient_data?(row)
  required = ["First Name", "Last Name",
              "Employer Organization Name 1", "Employer 1 Title"]

  # count (not all?) so every field is inspected, as before.
  usable = required.count do |field|
    value = row[field]
    value && value.alnum.strip != ""
  end

  usable == 4
end