class Socializer::Scraper::Extractor

Attributes

collectors[W]
url[R]

Public Class Methods

available_collectors() click to toggle source
# File lib/socializer/scraper/extractor.rb, line 53
# Lists the collector kinds the receiver supports.
#
# Looks through the receiver's instance methods for names ending in
# "_collector" and returns each such name with that suffix removed.
#
# @return [Array<Symbol>] collector kinds, e.g. :link for #link_collector
def available_collectors
  collector_names = instance_methods.map(&:to_s).select do |method_name|
    method_name.end_with?("_collector")
  end

  collector_names.map { |method_name| method_name.gsub(/_collector$/, '').to_sym }
end
new(options = {}) click to toggle source
# File lib/socializer/scraper/extractor.rb, line 10
# Builds an extractor from an options hash.
#
# @param options [Hash] recognised keys (both optional):
#   :url        - passed to the url= writer; nil when the key is absent
#   :collectors - passed to the collectors= writer; [] when the key is absent
def initialize(options = {})
  target_url        = options.fetch(:url) { nil }
  chosen_collectors = options.fetch(:collectors) { [] }

  self.url        = target_url
  self.collectors = chosen_collectors
end

Public Instance Methods

collectors() click to toggle source
# File lib/socializer/scraper/extractor.rb, line 26
# Collectors to run against each crawled page.
#
# Returns the explicitly configured list when it contains at least one truthy
# entry; otherwise falls back to every collector the class advertises through
# available_collectors.
#
# @return [Array] the configured collectors, or the class-level defaults
def collectors
  # @collectors can be nil (for example when the writer was handed nil);
  # treat that as "nothing configured" instead of raising NoMethodError.
  return @collectors if @collectors && @collectors.any?

  self.class.available_collectors
end
run(*patterns) { |page, collector, found| ... } click to toggle source
# File lib/socializer/scraper/extractor.rb, line 30
# Crawls via perform and gathers what every collector finds on each page.
#
# @param patterns [Array] forwarded to perform; a trailing Hash is treated as
#   options. The :page_wise option is consumed here and the remaining options
#   are passed on.
# @yield [page, collector, found] called for each page/collector pair when a
#   block is given
# @return the collected data after hash_map / hashify_or_collect post-processing
def run(*patterns, &block)
  options   = patterns.extract_options!
  page_wise = options.delete(:page_wise)
  # Re-append the remaining options so perform can extract them again.
  patterns.push(options)

  data = {}

  perform(*patterns) do |page|
    collectors.each do |collector|
      found = send("#{collector}_collector")
      yield(page, collector, found) if block_given?

      if page_wise
        # Key the findings by the URL of the page they came from.
        (data[collector] ||= {})[@current_url] = found
      else
        (data[collector] ||= []).push(found)
      end
    end
  end

  data.hash_map { |kind, list| [kind, list.hashify_or_collect] }
end
url=(url) click to toggle source

Sets the URL to crawl for this Extractor instance.

@param url [String] URL or domain name to crawl. @return [String] url

# File lib/socializer/scraper/extractor.rb, line 19
# Sets the URL to crawl.
#
# A nil (or false) value is ignored and leaves any stored URL untouched.
# The candidate is validated before it is stored, so a rejected URL never
# replaces a previously accepted one.
#
# @param url [String, nil] the URL to crawl
# @raise [URI::InvalidURIError] when the parsed URI does not answer true to
#   url? (URI.parse may also raise it for unparseable input)
def url=(url)
  return unless url

  candidate = URI.parse(url)
  message = "Please, provide a URL that starts with HTTP or HTTPS"
  raise URI::InvalidURIError, message unless candidate.url?

  @url = candidate
end

Protected Instance Methods

page_html() click to toggle source
# File lib/socializer/scraper/extractor.rb, line 64
# Parsed HTML for the page currently held in @page.
#
# The document is built from @page.body on first use and cached in @html;
# later calls return the cached value until @html is cleared.
#
# @return the cached @html, or the result of Nokogiri::HTML(@page.body)
def page_html
  return @html if @html

  @html = Nokogiri::HTML(@page.body)
end

Private Instance Methods

perform(*patterns) { |page| ... } click to toggle source
# File lib/socializer/scraper/extractor.rb, line 74
# Crawls @url with Anemone and yields every fetched page to the caller.
#
# @param patterns [Array] patterns handed to links_matching to decide which
#   links the crawl follows; a trailing Hash is extracted and passed to
#   Anemone.crawl as options. When no pattern is given, /.*/ is used.
# @yield [page] each page reported by Anemone's on_every_page hook
# @raise [URI::InvalidURIError] when no URL has been set, or the stored URL
#   does not answer true to url?
def perform(*patterns, &block)
  # Validate first: @url is nil when no URL was ever configured, and calling
  # url? on nil would raise NoMethodError instead of the intended error.
  message = "Please, provide a URL that starts with HTTP or HTTPS"
  raise URI::InvalidURIError, message unless @url && @url.url?

  options = patterns.extract_options!
  patterns.push(/.*/) if patterns.empty?

  Anemone.crawl(@url, options) do |anemone|
    anemone.threads = 4
    anemone.verbose = true
    anemone.obey_robots_txt = true
    anemone.accept_cookies = true
    anemone.user_agent = "Googlebot"
    anemone.storage = Anemone::Storage.MongoDB
    anemone.focus_crawl { |page| links_matching(page.links, patterns) }
    anemone.on_every_page do |page|
      # Reset @html so page_html re-parses for the new page instead of
      # returning the previous page's memoized document.
      @page, @html, @current_url = page, nil, page.url
      yield(page)
    end
  end
end