class TwitterCrawler
Public Class Methods
new(search_term, operator, cm_hash)
Create a new TwitterCrawler with a search term, an optional advanced-search operator, and an optional hash of crawler manager info.

# File lib/twittercrawler.rb, line 10
def initialize(search_term, operator, cm_hash)
  @search_term = search_term
  @operator = operator
  @output = Array.new

  # Handle crawler manager info
  @cm_url = cm_hash[:crawler_manager_url] if cm_hash
  @selector_id = cm_hash[:selector_id] if cm_hash
end
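A minimal instantiation might look like the following sketch; the search term, operator, and crawler manager values are invented for illustration:

  # With a crawler manager (hash keys as read in initialize):
  crawler = TwitterCrawler.new("open data", "filter:links",
                               crawler_manager_url: "http://harvester.example.com",
                               selector_id: "twitter-1")

  # Or standalone, batching results locally:
  crawler = TwitterCrawler.new("open data", "filter:links", nil)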
Public Instance Methods
gen_json()
Generate JSON for output
# File lib/twittercrawler.rb, line 107
def gen_json
  JSON.pretty_generate(@output)
end
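After a batch-mode crawl (no crawler manager configured), the accumulated output can be serialized and saved; the file name here is illustrative:

  crawler.query_tweets(nil, nil)
  File.write("tweets.json", crawler.gen_json)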
gen_query()
Generate the URL-encoded search query, appending the advanced-search operator when one is set
# File lib/twittercrawler.rb, line 21
def gen_query
  if @operator
    return URI.encode(@search_term + " " + @operator)
  else
    return URI.encode(@search_term)
  end
end
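As a sketch of the encoding, on Rubies old enough to still ship URI.encode (it was removed in Ruby 3.0), a two-word term plus an invented operator would come out roughly as:

  crawler = TwitterCrawler.new("ruby gems", "min_retweets:5", nil)
  crawler.gen_query
  # => "ruby%20gems%20min_retweets:5"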
gen_query_url(start_tweet, end_tweet)
Generate the query URL for Twitter's timeline search endpoint
# File lib/twittercrawler.rb, line 38
def gen_query_url(start_tweet, end_tweet)
  # Base query url
  query_url = "https://twitter.com/i/search/timeline?f=tweets&vertical=news&q=" + gen_query + "&src=typd&include_available_features=1&include_entities=1&lang=en"

  # Gen query URL
  if start_tweet && end_tweet
    query_url += "&max_position=TWEET-" + start_tweet + "-" + end_tweet
  end

  return query_url
end
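When both cursors are present, the URL gains a max_position parameter that tells Twitter where to resume the timeline. With invented tweet IDs (output abbreviated):

  crawler.gen_query_url("1001", "1050")
  # => "https://twitter.com/i/search/timeline?f=tweets&...&max_position=TWEET-1001-1050"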
get_tweet_id(tweet)
Get the ID for a tweet
# File lib/twittercrawler.rb, line 68
def get_tweet_id(tweet)
  return tweet[:tweet_link].split("/").last
end
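The ID is simply the last path segment of the status link, so a parsed tweet hash along these lines (values invented) yields:

  tweet = { tweet_link: "https://twitter.com/someuser/status/9876543210" }
  crawler.get_tweet_id(tweet)
  # => "9876543210"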
get_tweet_range(parsed_tweets, end_tweet)
Get the start and end tweet IDs for the next page of results
# File lib/twittercrawler.rb, line 73
def get_tweet_range(parsed_tweets, end_tweet)
  if end_tweet
    # Keep latest tweet the same
    return get_tweet_id(parsed_tweets.last), end_tweet
  else
    # Get updated start tweet
    return get_tweet_id(parsed_tweets.last), get_tweet_id(parsed_tweets.first)
  end
end
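In effect, the end of the range stays pinned to the newest tweet seen on the first page, while the start slides back to the oldest tweet of each successive page. A sketch with invented IDs, assuming Twitter returns newest tweets first:

  page = [
    { tweet_link: "https://twitter.com/u/status/300" },  # newest
    { tweet_link: "https://twitter.com/u/status/201" }   # oldest
  ]
  crawler.get_tweet_range(page, nil)    # => ["201", "300"]  first page
  crawler.get_tweet_range(page, "300")  # => ["201", "300"]  end stays pinned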
parse_tweets(tweets)
Parse the tweet HTML nodes into structured tweet data
# File lib/twittercrawler.rb, line 30
def parse_tweets(tweets)
  return tweets.map do |tweet|
    parser = TwitterParser.new(tweet.to_html)
    parser.parse_tweet
  end
end
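Each element of tweets is a Nokogiri node matched by the .tweet selector, as in query_tweets below. A rough sketch of the flow, assuming TwitterParser#parse_tweet returns a hash per tweet:

  nodes  = Nokogiri::HTML.parse(items_html).css(".tweet")
  parsed = crawler.parse_tweets(nodes)
  # => an Array of parsed tweet hashes (including :tweet_link,
  #    judging by get_tweet_id above)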
query_tweets(start_tweet, end_tweet)
Query tweets, recursing through pages of results until none remain
# File lib/twittercrawler.rb, line 50
def query_tweets(start_tweet, end_tweet)
  # Run Query and parse results
  c = Curl::Easy.perform(gen_query_url(start_tweet, end_tweet))
  curl_items = JSON.parse(c.body_str)
  tweets = Nokogiri::HTML.parse(curl_items["items_html"]).css(".tweet") if curl_items["items_html"]

  # Save results
  parsed_tweets = parse_tweets(tweets)
  report_results(parsed_tweets, parsed_tweets.length.to_s + " tweets")

  # Recurse when needed
  if !parsed_tweets.empty?
    start_tweet, end_tweet = get_tweet_range(parsed_tweets, end_tweet)
    query_tweets(start_tweet, end_tweet)
  end
end
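A full crawl starts with both cursors unset; each recursive call narrows the max_position window until a page comes back empty. A minimal end-to-end sketch with an invented search term:

  require 'twittercrawler'

  crawler = TwitterCrawler.new("open data", nil, nil)
  crawler.query_tweets(nil, nil)  # no pagination cursors on the first request
  puts crawler.gen_json           # batch mode: results accumulated locally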
report_batch(results)
Buffer all results locally for reporting as one JSON document
# File lib/twittercrawler.rb, line 91
def report_batch(results)
  results.each do |result|
    @output.push(result)
  end
end
report_incremental(results, link)
Report results back to Harvester incrementally
# File lib/twittercrawler.rb, line 98
def report_incremental(results, link)
  curl_url = @cm_url + "/relay_results"
  c = Curl::Easy.http_post(curl_url,
        Curl::PostField.content('selector_id', @selector_id),
        Curl::PostField.content('status_message', "Collected " + link),
        Curl::PostField.content('results', JSON.pretty_generate(results)))
end
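Incremental reporting is switched on simply by constructing the crawler with crawler manager info; the endpoint host below is hypothetical:

  crawler = TwitterCrawler.new("open data", nil,
                               crawler_manager_url: "http://harvester.example.com",
                               selector_id: "42")
  crawler.query_tweets(nil, nil)
  # each page of results is POSTed to
  # http://harvester.example.com/relay_results as it is collected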
report_results(results, link)
Decide how to report results: incrementally to the crawler manager when one is configured, otherwise batched locally
# File lib/twittercrawler.rb, line 82
def report_results(results, link)
  if @cm_url
    report_incremental(results, link)
  else
    report_batch(results)
  end
end