class TheScrap::ListObj
Attributes
get_next_url[RW]
get_page_count[RW]
has_many_pages[RW]
item_filters[RW]
next_page_css[RW]
pager_method[RW]
Public Class Methods
new()
click to toggle source
Calls superclass method
# File lib/the_scrap/list_obj.rb, line 11
# Builds a list scraper: runs the superclass initializer first, then starts
# with an empty list of item filters.
def initialize
  super
  # Callables consulted by #scrap; an item is kept only if every one of them
  # returns a truthy value.
  @item_filters = []
end
Public Instance Methods
scrap( url )
click to toggle source
# File lib/the_scrap/list_obj.rb, line 16
# Fetches one list page, extracts every node matching +item_frag+ and runs
# the configured hooks on each extracted item.
#
# url - location of the list page to fetch.
#
# Returns a two-element array: the parsed Nokogiri document and the array of
# item hashes that passed every filter.
def scrap( url )
  items = []

  # open-uri stopped patching Kernel#open in Ruby 3.0 (deprecated in 2.7), so
  # prefer URI.open when it is available and fall back to the historical
  # call on older Rubies.
  html = URI.respond_to?(:open) ? URI.open(url) : open(url)

  # Each pre-processor receives the current payload and returns the next one.
  html_proc.each do |dp|
    html = dp.call(html)
  end

  doc = Nokogiri::HTML(html, nil, encoding)
  doc.css(item_frag).each do |item|
    item_info = {}
    get_attrs(url, item, item_info)

    # Drop the item as soon as one filter rejects it.
    next unless item_filters.all? { |filter| filter.call(item_info) }

    # Follow detail pages: each entry pairs a scraper (index 0) with the key
    # in item_info (index 1) whose value is handed to that scraper.
    detail_info.each do |detail|
      detail[0].scrap(item_info[detail[1]], item_info)
    end

    # Per-item processing hooks.
    data_proc.each do |dp|
      dp.call(url, item_info)
    end

    items << item_info

    # In debug mode print the first kept item and stop.
    if debug?
      pp item_info
      break
    end
  end

  # Page-level hooks receive every item kept from this page.
  result_proc.each do |rp|
    rp.call(url, items)
  end

  [doc, items]
end
scrap_list()
click to toggle source
# File lib/the_scrap/list_obj.rb, line 63
# Scrapes the list page at +url+ and, when +has_many_pages+ is set, goes on
# to scrape the following pages using the strategy named by +pager_method+.
def scrap_list
  # Every page goes through the same wrapper: retryable is asked for 3 tries
  # on Timeout::Error around a single #scrap call.
  fetch_page = lambda do |page_url|
    retryable(:tries => 3, :on => Timeout::Error) { scrap(page_url) }
  end

  doc, items = fetch_page.call(url)
  return unless @has_many_pages

  # TODO: Refactor it
  if @pager_method == :next_page
    # Strategy: each page carries a link to the next page.
    upcoming_url = nil
    visited_url  = nil

    loop do
      link = doc.css(next_page_css).first
      break unless link

      # Resolve the href against the page it was found on.
      upcoming_url = URI.join(upcoming_url || url, link['href']).to_s
      # Stop when the link resolves to the URL that was just scraped.
      break if visited_url == upcoming_url

      puts "url: #{upcoming_url}" if verbose?
      doc, items = fetch_page.call(upcoming_url)
      visited_url = upcoming_url

      break if items.empty? || debug?
    end
  elsif pager_method == :total_pages
    # Strategy: the total page count can be read from the first page.
    # Pages are numbered starting at 1, and page 1 has already been scraped.
    page_cnt = get_page_count.call(doc)

    (2..page_cnt).each do |page_no|
      page_url = get_next_url.call(url, page_no)
      puts page_url if verbose?
      doc, items = fetch_page.call(page_url)

      break if items.empty? || debug?
    end
  elsif pager_method == :total_records
    # TODO: strategy where the total record count is available. This can also
    # be handled by the total-pages strategy above if the caller first
    # derives the page count from the record count.
  end
end