class InstagramScraper

Public Class Methods

new(brands, options = {})
# File lib/instagram_scraper.rb, line 8
def initialize(brands, options = {})
  @brands = brands
  @min_likes = options[:min_likes] || 500
  @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
  @proxies = options[:proxies] || []
  @data = []
end
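
For illustration only (the brand handles, file name, and proxy addresses below are placeholders), a call might look like:

scraper = InstagramScraper.new(
  %w[somebrand otherbrand],
  min_likes: 1_000,
  output_file: "./brand_posts.csv",
  proxies: ["203.0.113.10:8080", "203.0.113.11:3128"]
)

Every option falls back to the default shown above when omitted.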

Public Instance Methods

perform()
# File lib/instagram_scraper.rb, line 16
def perform
  scrape_brands
  store_data_in_csv unless @data.empty?
end
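
Continuing the illustrative scraper above, perform drives the whole run and only writes the CSV when at least one post cleared the likes threshold:

scraper.perform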

Private Instance Methods

build_query_params(query_hash, brand_id, end_cursor)
# File lib/instagram_scraper.rb, line 63
def build_query_params(query_hash, brand_id, end_cursor)
  {
    query_hash: query_hash,
    variables: {
      id: brand_id,
      first: 50,
      after: end_cursor,
    }.to_json,
  }
end
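
An illustrative call with placeholder values; note that :variables is serialized to JSON here and URL-encoded later by scrape_posts_data:

build_query_params("0123456789abcdef0123456789abcdef", "1234567890", "")
# => {
#      query_hash: "0123456789abcdef0123456789abcdef",
#      variables: "{\"id\":\"1234567890\",\"first\":50,\"after\":\"\"}",
#    }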

parse_post_data(post_data)
# File lib/instagram_scraper.rb, line 94
def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  publisher = post_data["owner"]["username"]
  likes = post_data["edge_liked_by"]["count"]
  return if likes < @min_likes

  {
    publisher: publisher,
    publisher_url: "#{BASE_URL}/#{publisher}",
    post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
    likes: likes,
    comments: post_data["edge_media_to_comment"]["count"],
    date: Time.zone.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
    caption: post_data["edge_media_to_caption"]["edges"]&.dig(0, "node", "text")&.gsub(/\n/, " "),
  }
end
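
A rough sketch of the hash returned for a qualifying post, with invented values and assuming BASE_URL is https://www.instagram.com; posts below @min_likes yield nil instead:

# {
#   publisher:     "some_fan_account",
#   publisher_url: "https://www.instagram.com/some_fan_account",
#   post_url:      "https://www.instagram.com/p/AbCdEfG",
#   likes:         1234,
#   comments:      56,
#   date:          "24/12/2019",
#   caption:       "Example caption with newlines flattened to spaces",
# }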

scrape_brand_data(brand)
# File lib/instagram_scraper.rb, line 33
def scrape_brand_data(brand)
  brand_url = "#{BASE_URL}/#{brand}"
  brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
  {
    id: brand_data["id"],
    brand: brand_data["full_name"],
    brand_url: brand_url,
  }
end
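
With a placeholder handle, and again assuming BASE_URL is https://www.instagram.com, the method fetches the profile's ?__a=1 JSON endpoint and returns roughly:

scrape_brand_data("somebrand")
# => { id: "1234567890", brand: "Some Brand", brand_url: "https://www.instagram.com/somebrand" }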

scrape_brand_posts(brand_data, end_cursor = "")
# File lib/instagram_scraper.rb, line 43
def scrape_brand_posts(brand_data, end_cursor = "")
  query_hash = scrape_query_hash
  while end_cursor
    query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
    posts_data = scrape_posts_data(query_params)
    end_cursor = posts_data["page_info"]["end_cursor"]
    posts_data["edges"].each do |post_data|
      post = parse_post_data(post_data["node"])
      @data << brand_data.slice(:brand, :brand_url).merge(post) if post
    end
    puts("Scraped #{@data.count} posts") unless @data.empty?
  end
end
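
Pagination is driven entirely by the cursor in the GraphQL response (field names as used above; example values invented):

# posts_data["page_info"]["end_cursor"] # => "QVFEaBc..." on intermediate pages, nil on the last one
# posts_data["edges"]                   # => [{ "node" => { ... } }, ...], each node handed to parse_post_data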

scrape_brands()
# File lib/instagram_scraper.rb, line 23
def scrape_brands
  @brands.each do |brand|
    brand_data = scrape_brand_data(brand)
  rescue OpenURI::HTTPError
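    # skip brands whose profile page could not be fetched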
    next
  else
    scrape_brand_posts(brand_data)
  end
end

scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
# File lib/instagram_scraper.rb, line 74
def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
  agent = Mechanize.new
  url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
  while posts_data.empty?
    proxy = @proxies[proxy_index]
    raise "No more proxies available" unless proxy

    ip, port = proxy.split(":")
    agent.set_proxy(ip, port.to_i)
    begin
      posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
    rescue StandardError
      proxy_index += 1
    end
  end
  posts_data
end
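
Each entry in @proxies is expected as an "ip:port" string (the addresses below are placeholders), since the method splits on ":" before handing the pair to Mechanize#set_proxy:

InstagramScraper.new(%w[somebrand], proxies: ["203.0.113.10:8080", "203.0.113.11:3128"])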

scrape_query_hash()
# File lib/instagram_scraper.rb, line 57
def scrape_query_hash
  # TODO: scrape bundle name
  bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
  URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
end
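
QUERY_ID_PATTERN is defined elsewhere in the gem and not shown here; a purely hypothetical pattern of this shape would capture a 32-character hex query hash from the fetched bundle:

# hypothetical stand-in for the real constant
QUERY_ID_PATTERN = /queryId:"([0-9a-f]{32})"/
'queryId:"0123456789abcdef0123456789abcdef"'.match(QUERY_ID_PATTERN)[1]
# => "0123456789abcdef0123456789abcdef"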

store_data_in_csv()
# File lib/instagram_scraper.rb, line 110
def store_data_in_csv
  headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
  CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
    @data.each { |post| csv << post.values }
  end
end
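
With the keys produced above (brand and brand_url merged in front of the post fields), the header row and a sample data row would look roughly like this (values are placeholders):

# Brand,Brand url,Publisher,Publisher url,Post url,Likes,Comments,Date,Caption
# Some Brand,https://www.instagram.com/somebrand,some_fan_account,https://www.instagram.com/some_fan_account,https://www.instagram.com/p/AbCdEfG,1234,56,24/12/2019,Example caption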