class Birdwatcher::Modules::Urls::Crawl
Constants
- PAGE_TITLE_REGEX
Public Class Methods
info()
click to toggle source
# File lib/birdwatcher/modules/urls/crawl.rb, line 61 def self.info <<-INFO The URL Crawler module crawls shared URLs and enriches them with additional information: * HTTP status code (200, 404, 500, etc.) * Content type (application/html, application/pdf, etc) * Page title (if HTML document) Page titles can be included in the Word Cloud generated with the #{'statuses/word_cloud'.bold} module. #{'CAUTION:'.bold} Depending on the users in the workspace, it might not be safe to blindly request shared URLs. Consider using the #{'PROXY_ADDR'.bold} and #{'PROXY_PORT'.bold} module options. INFO end
Public Instance Methods
run()
click to toggle source
# File lib/birdwatcher/modules/urls/crawl.rb, line 79
#
# Crawl uncrawled URLs in the current workspace and enrich each record
# with its HTTP status, content type and (for HTML documents) page title.
# Work is distributed over a thread pool sized by the THREADS option.
#
# Returns false when the workspace holds no matching URLs; otherwise
# returns the result of shutting down the thread pool after all URLs
# have been processed.
def run
  if option_setting("RETRY_FAILED")
    # Also pick up URLs that were crawled before but never received a
    # status, i.e. the previous crawl attempt failed.
    urls = current_workspace.urls_dataset
      .where("crawled_at IS NULL or (crawled_at IS NOT NULL AND http_status IS NULL)")
      .order(Sequel.desc(:posted_at))
  else
    urls = current_workspace.urls_dataset
      .where(:crawled_at => nil)
      .order(Sequel.desc(:posted_at))
  end
  if urls.empty?
    error("There are currently no URLs in this workspace")
    return false
  end
  threads = thread_pool(option_setting("THREADS").to_i)
  # Hoisted: the per-request timeout is loop-invariant, so read the
  # option once instead of once per URL.
  timeout = option_setting("TIMEOUT").to_i
  http_client = Birdwatcher::HttpClient.new(
    :timeout        => timeout,
    :retries        => option_setting("RETRIES").to_i,
    :user_agent     => option_setting("USER_AGENT"),
    :http_proxyaddr => option_setting("PROXY_ADDR"),
    :http_proxyport => (option_setting("PROXY_PORT") ? option_setting("PROXY_PORT").to_i : nil),
    :http_proxyuser => option_setting("PROXY_USER"),
    :http_proxypass => option_setting("PROXY_PASS")
  )
  urls.each do |url|
    threads.process do
      begin
        # Hard upper bound: twice the HTTP timeout covers the HEAD
        # request plus the optional follow-up GET for the page title.
        # (Timeout.timeout is the idiomatic call form; Timeout::timeout
        # is the discouraged :: method-call style.)
        Timeout.timeout(timeout * 2) do
          response = http_client.do_head(url.url)
          url.final_url    = response.url
          url.http_status  = response.status
          url.content_type = response.headers["content-type"]
          if response.headers.key?("content-type") && response.headers["content-type"].include?("text/html")
            url.title = extract_page_title(http_client.do_get(response.url).body)
          end
          url.crawled_at = Time.now
          url.save
          info("Crawled #{url.url.bold} (#{response.status} - #{response.headers["content-type"]})")
        end
      rescue => e
        # Mark as crawled but leave http_status NULL so a later run with
        # RETRY_FAILED enabled can pick this URL up again.
        url.crawled_at = Time.now
        url.save
        error("Crawling failed for #{url.url.bold} (#{e.class})")
      end
    end
  end
  threads.shutdown
end
Private Instance Methods
extract_page_title(body)
click to toggle source
# File lib/birdwatcher/modules/urls/crawl.rb, line 130
#
# Pull the page title out of an HTML document body using the module's
# PAGE_TITLE_REGEX constant.
#
# body - raw HTML response body as a String.
#
# Returns the HTML-unescaped title String, or nil when the regex does
# not match.
def extract_page_title(body)
  captures = body.scan(PAGE_TITLE_REGEX).first
  return if captures.nil?
  CGI.unescapeHTML(captures.first)
end