class TheScrap::DetailObj

Public Instance Methods

do_scrap( url, item_info ) click to toggle source
# File lib/the_scrap/detail_obj.rb, line 10
# Downloads the page at +url+, parses it and enriches +item_info+ with it.
#
# Steps, in order:
# 1. read the raw body and pass it through every +html_proc+ filter,
# 2. parse the result with Nokogiri using +encoding+,
# 3. hand the document to +get_attrs+ together with +item_info+,
# 4. call +scrap+ on every nested detail scraper listed in +detail_info+,
# 5. run the +data_proc+ hooks, then the +result_proc+ hooks.
#
# @param url [String] location of the page to fetch
# @param item_info [Object] record being built; it is indexed with
#   +item_info[key]+, so presumably a Hash -- confirm against callers
# @return [Object] the very same +item_info+ object that was passed in
def do_scrap( url, item_info )
  # The block form closes the underlying handle as soon as the body is read.
  # URI.open is preferred when open-uri provides it, because a bare
  # Kernel#open no longer accepts URLs since Ruby 3.0; otherwise the
  # original Kernel#open call is kept.
  html =
    if defined?(URI) && URI.respond_to?(:open)
      URI.open(url, &:read)
    else
      open(url, &:read)
    end

  # Each filter receives the current markup and returns the replacement.
  html_proc.each do |filter|
    html = filter.call(html)
  end

  doc = Nokogiri::HTML(html, nil, encoding)
  get_attrs(url, doc, item_info)

  # Nested detail pages: each entry is indexed as [scraper, key], and
  # item_info[key] is passed to that scraper as its URL, so this can
  # recurse into lower levels.
  detail_info.each do |detail|
    detail[0].scrap(item_info[detail[1]], item_info)
  end

  # Data hooks see the single record.
  data_proc.each do |hook|
    hook.call(url, item_info)
  end

  # Result hooks receive the record wrapped in a one-element array; this is
  # where storage of the detail record can be customised separately.
  result_proc.each do |hook|
    hook.call(url, [item_info])
  end

  pp item_info if debug?
  item_info
end
scrap( url, item_info ) click to toggle source
# File lib/the_scrap/detail_obj.rb, line 4
# Public entry point: runs +do_scrap+ inside +retryable+, configured with
# 3 tries on Timeout::Error (presumably re-running the fetch when it times
# out -- +retryable+ is defined elsewhere, confirm its exact semantics).
#
# @param url [String] location of the page to fetch
# @param item_info [Object] record forwarded unchanged to +do_scrap+
# @return whatever +retryable+ returns for the block
def scrap( url, item_info )
  retryable(tries: 3, on: Timeout::Error) { do_scrap(url, item_info) }
end