class TheScrap::Scrap

Attributes

base_url[RW]
data_proc[RW]
debug[RW]
debug?[RW]
detail_info[RW]
encoding[RW]
html_proc[RW]
item_frag[RW]
result_proc[RW]
url[RW]
verbose[RW]
verbose?[RW]

Public Class Methods

new() click to toggle source
# File lib/the_scrap/scrap.rb, line 26
# Builds an empty scraper: no attribute selectors registered, debugging
# switched off, and every processing hook list starting out empty.
# The encoding is intentionally left unset here.
def initialize
  @debug = false
  # @encoding = 'utf-8'  (kept disabled, as before)
  @attrs = {}

  # Each list gets its own fresh array; nothing is shared between them.
  @more_info, @result_proc, @detail_info, @data_proc = Array.new(4) { [] }
  @html_proc = []
end

Public Instance Methods

method_missing( method_id, *arguments, &block ) click to toggle source
# File lib/the_scrap/scrap.rb, line 54
# Handles dynamic attribute setters. A call whose name contains
# "attr_<name>=" stores its first argument in @attrs under the string
# key "<name>" and returns that argument.
#
# Any other unknown method is handed to +super+, so a misspelled call
# raises NoMethodError instead of silently returning nil.
def method_missing( method_id, *arguments, &block )
  matched = /attr_(.*)=/.match(method_id.to_s)
  return super unless matched

  @attrs[matched[1]] = arguments.first
end

# Keeps respond_to? in agreement with method_missing: dynamic attribute
# setters are reported as supported, everything else defers to +super+.
def respond_to_missing?( method_id, include_private = false )
  return true if method_id.to_s =~ /attr_(.*)=/

  super
end
retryable( options = {} ) { || ... } click to toggle source
# File lib/the_scrap/scrap.rb, line 37
# Runs the given block and returns its value, re-running it when it
# raises one of the configured exception classes.
#
# Options:
#   :tries - total number of attempts (default 1, i.e. no retry).
#   :on    - exception class, or array of classes, that triggers a retry.
#            Defaults to StandardError so that SystemExit, Interrupt and
#            other non-StandardError exceptions pass straight through;
#            pass :on => Exception explicitly to retry on everything.
#   :sleep - seconds to pause before each retry (default 2).
#
# Once the attempts are used up the last exception is re-raised.
# Exceptions that do not match :on are never rescued.
def retryable( options = {} )
  opts = { :tries => 1, :on => StandardError, :sleep => 2 }.merge(options)

  # Array() lets :on be a single class or a list of classes.
  retry_exceptions = Array(opts[:on])
  retries = opts[:tries]
  delay = opts[:sleep]

  begin
    return yield
  rescue *retry_exceptions
    raise unless (retries -= 1) > 0

    sleep delay if delay && delay > 0
    retry
  end
end

Protected Instance Methods

get_attrs( url, doc, item_info ) click to toggle source

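Fills item_info with one entry per registered attribute, extracted from doc. A plain selector string yields the stripped text of all matching nodes. An array of [selector, mode] works on the first matching node: :inner_html yields its inner HTML, :join yields the text of all matches joined with commas, :array yields the text of all matches as an array, and any other mode is read as an attribute name (href and src values are resolved against base_url, or url when base_url is not set). The special form [:frag_attr, name] reads the attribute name from doc itself.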

# File lib/the_scrap/scrap.rb, line 63
# Extracts every registered attribute from +doc+ into +item_info+.
#
# url       - address of the page being processed; used as the base for
#             relative links when base_url is not set.
# doc       - object answering +css(selector)+ and +[](name)+.
# item_info - hash that receives one entry per attribute, keyed by the
#             attribute name used in @attrs.
#
# Each @attrs value is either a selector string (stripped text of all
# matches) or an array [selector, mode]:
#   [:frag_attr, name]       - attribute +name+ of +doc+ itself
#   [selector, :inner_html]  - inner HTML of the first match
#   [selector, :join]        - text of all matches joined with ","
#   [selector, :array]       - text of all matches as an array
#   [selector, attr_name]    - stripped attribute of the first match;
#                              href/src are turned into absolute URLs
#
# A key is left out of +item_info+ when the selector matches nothing,
# when the array has no second element, or when the first match does not
# carry the requested attribute.
def get_attrs( url, doc, item_info )
  @attrs.keys.each do |k|
    option = @attrs[k]

    unless option.is_a? Array
      item_info[k] = doc.css(option).text.strip
      next
    end

    selector, mode = option

    if selector == :frag_attr
      item_info[k] = doc[mode]
      next
    end

    nodes = doc.css(selector)
    node = nodes.first
    next unless node

    case mode
    when :inner_html
      item_info[k] = node.inner_html
    when :join
      item_info[k] = nodes.map { |i| i.text }.join(',')
    when :array
      item_info[k] = nodes.map { |i| i.text }
    else
      # No mode given: there is no attribute to read.
      next if mode.nil?

      value = node[mode]
      # Attribute absent on the node (e.g. <img> without src).
      next if value.nil?

      unless [:href, :src].include? mode.to_sym
        item_info[k] = value.strip
        next
      end

      # URI.join rejects URLs containing raw spaces, so encode them first.
      src = value.strip.gsub(" ", "%20")
      item_info[k] =
        begin
          URI.join(base_url || url, src).to_s
        rescue
          # Best effort: keep the raw value if it cannot be resolved.
          src.to_s
        end
    end
  end
end