class Socializer::Scraper::Extractor
Attributes
collectors [W]  (write-only: the list of collectors to run)
url [R]  (read-only: the parsed URL to crawl)
Public Class Methods
available_collectors()
# File lib/socializer/scraper/extractor.rb, line 53
def available_collectors
  self.instance_methods.select do |name|
    name.to_s.end_with?("_collector")
  end.map do |name|
    name.to_s.gsub(/_collector$/, '').to_sym
  end
end
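Collectors are discovered purely by naming convention. A hypothetical subclass for illustration (emails_collector is not part of the gem, only an example of the *_collector pattern the method scans for):

class EmailExtractor < Socializer::Scraper::Extractor
  def emails_collector
    # Naive example: scan the page text for email-like strings.
    page_html.text.scan(/[\w.+-]+@[\w-]+\.[\w.]+/)
  end
end

EmailExtractor.available_collectors # => [:emails]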
new(options = {})
# File lib/socializer/scraper/extractor.rb, line 10
def initialize(options = {})
  self.url        = options.fetch(:url, nil)
  self.collectors = options.fetch(:collectors, [])
end
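A minimal construction sketch; the URL and the collector list are illustrative values, not defaults:

extractor = Socializer::Scraper::Extractor.new(
  url:        "http://example.com",
  collectors: [:emails]
)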
Public Instance Methods
collectors()
# File lib/socializer/scraper/extractor.rb, line 26
def collectors
  @collectors.any? ? @collectors : self.class.available_collectors
end
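Using the hypothetical EmailExtractor above: when no collectors were supplied, the reader falls back to every *_collector method the class defines.

extractor = EmailExtractor.new(url: "http://example.com")
extractor.collectors             # => [:emails]

extractor.collectors = [:phones]
extractor.collectors             # => [:phones]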
run(*patterns) { |page, collector, found| ... }
# File lib/socializer/scraper/extractor.rb, line 30
def run(*patterns, &block)
  data, options = {}, patterns.extract_options!
  page_wise = options.delete(:page_wise)
  patterns  = patterns.push(options)

  perform(*patterns) do |page|
    collectors.each do |collector|
      found = send("#{collector}_collector")
      yield(page, collector, found) if block_given?

      if page_wise
        data[collector] ||= {}
        data[collector][@current_url] = found
      else
        data[collector] ||= []
        data[collector].push found
      end
    end
  end

  # hash_map and hashify_or_collect are helpers defined elsewhere in the gem.
  data.hash_map { |kind, list| [kind, list.hashify_or_collect] }
end
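A usage sketch (the pattern and option values are illustrative): crawl only URLs matching /blog/ and group each collector's findings per page.

data = extractor.run(/blog/, page_wise: true) do |page, collector, found|
  puts "#{page.url}: #{collector} found #{found.inspect}"
end
# The exact shape of `data` depends on the gem's hashify_or_collect helper;
# with :page_wise it is keyed by collector, then by page URL.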
url=(url)
Set the URL to crawl for this Extractor instance.

@param url [String] URL or domain name to crawl.
@return [String] url
# File lib/socializer/scraper/extractor.rb, line 19
def url=(url)
  return unless url
  @url = URI.parse(url)

  # URI#url? is a helper defined by the gem (true only for HTTP/HTTPS URIs).
  message = "Please, provide a URL that starts with HTTP or HTTPS"
  raise URI::InvalidURIError, message unless @url.url?
end
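A behavior sketch, assuming the gem's URI#url? helper returns true only for HTTP(S) URLs:

extractor.url = "http://example.com"  # parsed and stored as a URI
extractor.url = "example.com"         # raises URI::InvalidURIError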
Protected Instance Methods
page_html()
# File lib/socializer/scraper/extractor.rb, line 64
def page_html
  @html ||= Nokogiri::HTML(@page.body)
end
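Collectors can build on this memoized document. A hypothetical collector (not part of the gem) might look like:

def titles_collector
  page_html.search("h1, h2").map { |node| node.text.strip }
end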
page_links()
# File lib/socializer/scraper/extractor.rb, line 68
def page_links
  # accumulate is a helper defined elsewhere in the gem.
  page_html.search("a").map { |a| a.attr("href") }.accumulate
end
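For orientation, a roughly equivalent plain-Ruby expression, assuming the gem's accumulate helper compacts the collected hrefs:

page_html.search("a").map { |a| a.attr("href") }.compact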
Private Instance Methods
links_matching(links, patterns = [])
# File lib/socializer/scraper/extractor.rb, line 96
def links_matching(links, patterns = [])
  return links if patterns.empty?

  links.select do |link|
    patterns.detect { |p| link.to_s =~ p }
  end
end
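An illustrative call (links_matching is private, so this is shown out of context; the URLs are made up):

links = [URI("http://example.com/blog/1"), URI("http://example.com/about")]
links_matching(links, [/blog/]) # => only the /blog/ link
links_matching(links)           # => both links; no patterns means no filter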
perform(*patterns) { |page| ... }
# File lib/socializer/scraper/extractor.rb, line 74
def perform(*patterns, &block)
  options = patterns.extract_options!

  message = "Please, provide a URL that starts with HTTP or HTTPS"
  raise URI::InvalidURIError, message unless @url.url?

  patterns.push(/.*/) if patterns.empty?

  Anemone.crawl(@url, options) do |anemone|
    # Fixed crawl settings: 4 threads, robots.txt honored, cookies accepted,
    # MongoDB-backed page storage.
    anemone.threads         = 4
    anemone.verbose         = true
    anemone.obey_robots_txt = true
    anemone.accept_cookies  = true
    anemone.user_agent      = "Googlebot"
    anemone.storage         = Anemone::Storage.MongoDB

    anemone.focus_crawl { |page| links_matching(page.links, patterns) }
    anemone.on_every_page do |page|
      @page, @html, @current_url = page, nil, page.url
      yield(page)
    end
  end
end
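Because leftover options travel from run through perform into Anemone.crawl, standard Anemone settings such as :depth_limit can be passed at the top level. Note that the hard-coded Anemone::Storage.MongoDB backend needs a reachable MongoDB instance. A sketch:

extractor.run(/blog/, depth_limit: 2)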