class TheScrap::ListObj

Attributes

get_next_url[RW]
get_page_count[RW]
has_many_pages[RW]
item_filters[RW]
next_page_css[RW]
pager_method[RW]
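
The attributes above configure pagination and per-item filtering for a list page. A minimal configuration sketch, assuming ListObj also exposes url and item_frag accessors (both are referenced by #scrap and #scrap_list below); the URL and selectors are placeholders:

list = TheScrap::ListObj.new
list.url            = "http://example.com/news"  # hypothetical list URL
list.item_frag      = "div.news-item"            # hypothetical CSS selector matching one list item
list.has_many_pages = true
list.pager_method   = :next_page                 # follow a "next page" link
list.next_page_css  = "a.next"                   # hypothetical selector for that link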

Public Class Methods

new()
Calls superclass method
# File lib/the_scrap/list_obj.rb, line 11
def initialize()
  super
  @item_filters = []
end
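
item_filters starts out empty; filters are plain callables appended to this array. #scrap passes each extracted item hash to every filter and skips the item as soon as one returns a falsy value. A short sketch; the "title" and "url" keys are hypothetical and depend on which attributes you extract:

list = TheScrap::ListObj.new
list.item_filters << ->(item_info) { item_info["title"] =~ /ruby/i }  # keep only items whose title mentions "ruby"
list.item_filters << ->(item_info) { !item_info["url"].to_s.empty? }  # and that carry a non-empty URL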

Public Instance Methods

scrap( url )
# File lib/the_scrap/list_obj.rb, line 16
def scrap( url )
  items = []

  html = open(url)
  html_proc.each do |dp|
    html = dp.call(html)
  end

  doc = Nokogiri::HTML(html,nil,encoding)
  doc.css(item_frag).each do |item|

    item_info = {}
    get_attrs(url,item,item_info)

    #skip this item if any registered filter returns a falsy value
    need_skip = false
    item_filters.each do |filter|
      unless filter.call(item_info)
        need_skip = true
        break
      end
    end
    next if need_skip

    #follow detail pages: detail[0] is the detail scraper, detail[1] the item_info key holding its URL
    detail_info.each do |detail|
      detail[0].scrap(item_info[detail[1]],item_info)
    end

    #run registered per-item data processors
    data_proc.each do |dp|
      dp.call(url,item_info)
    end

    items << item_info

    pp item_info if debug?
    break if debug?
  end

  result_proc.each do |rp|
    rp.call(url,items)
  end

  return doc,items
end
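
#scrap returns both the parsed Nokogiri document and the array of extracted item hashes; #scrap_list relies on the returned document to locate the next page. The html_proc, data_proc, result_proc and detail_info hooks iterated above are defined elsewhere in the gem; assuming data_proc and result_proc are writable arrays of callables (as item_filters is), a direct call might look like this, with the URL and keys as placeholders:

list.data_proc   << ->(url, item_info) { item_info["scraped_at"] = Time.now.to_s }  # per-item post-processing (hypothetical key)
list.result_proc << ->(url, items)     { puts "#{url}: #{items.size} items" }       # per-page post-processing

doc, items = list.scrap("http://example.com/news")  # hypothetical URL
items.each { |item_info| puts item_info.inspect }
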
scrap_list()
# File lib/the_scrap/list_obj.rb, line 63
def scrap_list
  doc,items = retryable(:tries => 3, :on => Timeout::Error) do
    scrap(url)
  end

  return unless @has_many_pages

  #TODO Refactor it
  next_page_url = nil
  prev_page_url = nil
  if @pager_method == :next_page #pagination via a "next page" link
    while node = doc.css(next_page_css).first
      next_page_url = URI.join(next_page_url||url,node['href']).to_s
      break if prev_page_url == next_page_url

      puts "url: #{next_page_url}" if verbose?
      doc,items = retryable(:tries => 3, :on => Timeout::Error) do
        scrap(next_page_url)
      end

      prev_page_url = next_page_url
      break if items.count == 0
      break if debug?
    end
  elsif pager_method == :total_pages #total page count is available; pages start at 1
    page_cnt = get_page_count.call(doc)
    (2..page_cnt).each do |idx|
      next_page_url = get_next_url.call(url,idx)
      puts next_page_url if verbose?
      doc,items = retryable(:tries => 3, :on => Timeout::Error) do
        scrap(next_page_url)
      end
      break if items.count == 0
      break if debug?
    end
  elsif pager_method == :total_records
    #TODO
    #total record count is available; could be implemented like :total_pages by computing the page count from the record count first
  end
end
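
With pager_method = :next_page only next_page_css needs to be set; #scrap_list keeps following the link until it disappears, repeats, or yields no items (or debug mode stops it). With pager_method = :total_pages the caller supplies two callables instead: get_page_count extracts the total number of pages from the first page's Nokogiri document, and get_next_url builds the URL for page idx (pages count from 1, and #scrap_list starts requesting at page 2). A sketch, with the pager selector and URL scheme as placeholder assumptions:

list.has_many_pages = true
list.pager_method   = :total_pages
list.get_page_count = ->(doc) { doc.css("div.pager a").last.text.to_i }  # hypothetical pager markup
list.get_next_url   = ->(url, idx) { "#{url}?page=#{idx}" }              # hypothetical URL scheme
list.scrap_list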