class TwitterCrawler

Public Class Methods

new(search_term, operator, cm_hash)
# File lib/twittercrawler.rb, line 10
def initialize(search_term, operator, cm_hash)
  @search_term = search_term
  @operator = operator
  @output = Array.new

  # Handle crawler manager info
  @cm_url = cm_hash[:crawler_manager_url] if cm_hash
  @selector_id = cm_hash[:selector_id] if cm_hash
end
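
A usage sketch; the crawler manager URL and selector ID below are illustrative stand-ins, not values the gem ships with:

# Standalone: results are batched locally and read back with gen_json
crawler = TwitterCrawler.new("ruby", "since:2017-01-01", nil)

# Reporting to a crawler manager (URL and selector ID are hypothetical)
managed = TwitterCrawler.new("ruby", nil,
                             crawler_manager_url: "http://localhost:3000",
                             selector_id: "42")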

Public Instance Methods

gen_json()

Generate pretty-printed JSON from the collected results

# File lib/twittercrawler.rb, line 107
def gen_json
  JSON.pretty_generate(@output)
end
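
After a crawl, gen_json serializes whatever has accumulated in @output; the field name below is hypothetical, since the actual keys come from TwitterParser:

crawler = TwitterCrawler.new("ruby", nil, nil)
puts crawler.gen_json
# [
#   {
#     "tweet_link": "/someuser/status/123456789"
#   }
# ]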
gen_query()

Generate the URL-encoded search query, appending the advanced-search operator when one was given

# File lib/twittercrawler.rb, line 21
def gen_query
  # NOTE: URI.encode was deprecated and removed in Ruby 3.0;
  # URI.encode_www_form_component is the modern replacement
  if @operator
    return URI.encode(@search_term + " " + @operator)
  else
    return URI.encode(@search_term)
  end
end
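
For example, with both a search term and an operator, the space-joined query is percent-encoded as a whole (a sketch of the expected output):

TwitterCrawler.new("hello world", "since:2017-01-01", nil).gen_query
# => "hello%20world%20since:2017-01-01"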
gen_query_url(start_tweet, end_tweet)

Generate the search-timeline query URL for Twitter

# File lib/twittercrawler.rb, line 38
def gen_query_url(start_tweet, end_tweet)
  # Base query URL
  query_url = "https://twitter.com/i/search/timeline?f=tweets&vertical=news&q="+gen_query+"&src=typd&include_available_features=1&include_entities=1&lang=en"

  # Append the pagination cursor when resuming from a previous batch
  if start_tweet && end_tweet
    query_url += "&max_position=TWEET-"+start_tweet+"-"+end_tweet
  end
  return query_url
end
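
On the first call both cursors are nil and the base URL is returned; on later calls the max_position parameter carries a TWEET-&lt;oldest&gt;-&lt;newest&gt; window. A sketch:

crawler = TwitterCrawler.new("ruby", nil, nil)
crawler.gen_query_url(nil, nil)
# => "https://twitter.com/i/search/timeline?...&q=ruby&...&lang=en"
crawler.gen_query_url("100", "200")
# => "...&lang=en&max_position=TWEET-100-200"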
get_tweet_id(tweet)

Get the ID for a tweet

# File lib/twittercrawler.rb, line 68
def get_tweet_id(tweet)
  return tweet[:tweet_link].split("/").last
end
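
The ID is the final path segment of the tweet's status link; the hash below is an illustrative parsed tweet:

crawler = TwitterCrawler.new("ruby", nil, nil)
crawler.get_tweet_id({tweet_link: "/someuser/status/123456789"})
# => "123456789"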
get_tweet_range(parsed_tweets, end_tweet)

Get the start and end tweet IDs for the next page of results

# File lib/twittercrawler.rb, line 73
def get_tweet_range(parsed_tweets, end_tweet)
  if end_tweet # Keep the end tweet the same
    return get_tweet_id(parsed_tweets.last), end_tweet
  else # First page: set the end tweet from the newest result
    return get_tweet_id(parsed_tweets.last), get_tweet_id(parsed_tweets.first)
  end
end
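
Results arrive newest-first, so parsed_tweets.first is the newest tweet on a page and parsed_tweets.last the oldest: the end cursor is pinned on the first page while the start cursor keeps moving back. An illustrative walk-through:

crawler = TwitterCrawler.new("ruby", nil, nil)
page1 = [{tweet_link: "/u/status/300"}, {tweet_link: "/u/status/201"}]
start_tweet, end_tweet = crawler.get_tweet_range(page1, nil)
# => ["201", "300"]   (end cursor pinned to the newest tweet seen)

page2 = [{tweet_link: "/u/status/200"}, {tweet_link: "/u/status/101"}]
crawler.get_tweet_range(page2, end_tweet)
# => ["101", "300"]   (start cursor moves back, end cursor unchanged)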
parse_tweets(tweets)

Parse the tweet HTML nodes into structured results via TwitterParser

# File lib/twittercrawler.rb, line 30
def parse_tweets(tweets)
  return tweets.map do |tweet|
    parser = TwitterParser.new(tweet.to_html)
    parser.parse_tweet
  end
end
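
A sketch of the input this expects: the Nokogiri node set that query_tweets extracts with .css(".tweet"); each node is re-serialized to HTML and handed to TwitterParser, whose return shape this gem defines:

crawler = TwitterCrawler.new("ruby", nil, nil)
doc = Nokogiri::HTML.parse('<div class="tweet">...</div>')
crawler.parse_tweets(doc.css(".tweet"))
# => one TwitterParser#parse_tweet result per ".tweet" node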
query_tweets(start_tweet, end_tweet)

Query the Twitter search timeline, reporting each batch of results and recursing until a page comes back empty

# File lib/twittercrawler.rb, line 50
def query_tweets(start_tweet, end_tweet)
  # Run query and parse results
  c = Curl::Easy.perform(gen_query_url(start_tweet, end_tweet))
  curl_items = JSON.parse(c.body_str)
  tweets = Nokogiri::HTML.parse(curl_items["items_html"]).css(".tweet") if curl_items["items_html"]
  return if tweets.nil? # No items_html means there is nothing left to parse

  # Save results
  parsed_tweets = parse_tweets(tweets)
  report_results(parsed_tweets, parsed_tweets.length.to_s + " tweets")

  # Recurse until a page comes back empty
  if !parsed_tweets.empty?
    start_tweet, end_tweet = get_tweet_range(parsed_tweets, end_tweet)
    query_tweets(start_tweet, end_tweet)
  end
end
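
A full crawl starts with both cursors unset and runs until no results remain; in batch mode the output is then read back with gen_json:

crawler = TwitterCrawler.new("ruby", nil, nil)
crawler.query_tweets(nil, nil)  # crawl from the newest tweet backwards
puts crawler.gen_json           # batched results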
report_batch(results)

Collect results into the output buffer so they can be reported as one JSON document

# File lib/twittercrawler.rb, line 91
def report_batch(results)
  @output.concat(results)
end
report_incremental(results, link)

Report results back to Harvester incrementally

# File lib/twittercrawler.rb, line 98
def report_incremental(results, link)
  curl_url = @cm_url+"/relay_results"
  Curl::Easy.http_post(curl_url,
                       Curl::PostField.content('selector_id', @selector_id),
                       Curl::PostField.content('status_message', "Collected " + link),
                       Curl::PostField.content('results', JSON.pretty_generate(results)))
end
report_results(results, link)

Report results incrementally to the crawler manager when one is configured, otherwise batch them locally

# File lib/twittercrawler.rb, line 82
def report_results(results, link)
  if @cm_url
    report_incremental(results, link)
  else
    report_batch(results)
  end
end
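
The reporting mode is decided entirely by the cm_hash passed to the constructor; a sketch of both paths (URL and selector ID are hypothetical):

# No crawler manager: results accumulate in @output for gen_json
TwitterCrawler.new("ruby", nil, nil)
  .report_results([{tweet_link: "/u/status/1"}], "1 tweets")

# With a crawler manager: each batch is POSTed to <cm_url>/relay_results
TwitterCrawler.new("ruby", nil,
                   crawler_manager_url: "http://localhost:3000",
                   selector_id: "42")
  .report_results([{tweet_link: "/u/status/1"}], "1 tweets")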