class NHKore::BingScraper
Author: Jonathan Bradley Whited. Since: 0.2.0.
Attributes
regex[R]
site[R]
Public Class Methods
build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
click to toggle source
# File lib/nhkore/search_scraper.rb, line 98
# Builds the Bing search URL that restricts results to +site+.
#
# @param site [#to_s] value placed after +site:+ in the +q+ query parameter
# @param count [Integer] value sent as the +count+ query parameter
# @param kargs [Hash] accepted so callers can pass extra options; not used here
# @return [String] a new, unfrozen URL string
def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
  query = URI.encode_www_form(q: "site:#{site}",count: count)

  # Unary plus keeps the result mutable even when string literals are frozen,
  # matching the previous ''.dup-based implementation.
  +"https://www.bing.com/search?#{query}"
end
new(site,regex: nil,url: nil,**kargs)
click to toggle source
Calls superclass method
NHKore::SearchScraper::new
# File lib/nhkore/search_scraper.rb, line 74
# Sets up a scraper for one of the two supported site keys.
#
# @param site [Symbol] +:futsuu+ or +:yasashii+; replaced by the matching
#   +*_SITE+ constant before being stored in +@site+
# @param regex [Regexp,nil] link filter; when nil, the matching +*_REGEX+
#   constant is used
# @param url [String,nil] URL to scrape; when nil, built with +build_url+
# @param kargs [Hash] passed to +build_url+ (only when +url+ is nil) and,
#   minus +:count+, to the superclass constructor
# @raise [ArgumentError] if +site+ is not a supported key, or if +regex+ is
#   still nil after the default was applied
def initialize(site,regex: nil,url: nil,**kargs)
  if site == :futsuu
    site = FUTSUU_SITE
    regex = FUTSUU_REGEX if regex.nil?
  elsif site == :yasashii
    site = YASASHII_SITE
    regex = YASASHII_REGEX if regex.nil?
  else
    raise ArgumentError,"invalid site[#{site}]"
  end

  raise ArgumentError,"empty regex[#{regex}]" if regex.nil?

  @site = site
  @regex = regex

  url = self.class.build_url(site,**kargs) if url.nil?

  # +count+ is specific to this class; don't pass it on to Open-URI.
  open_uri_args = kargs.reject { |key,_value| key == :count }

  super(url,**open_uri_args)
end
Public Instance Methods
scrape(slinks,page=NextPage.new())
click to toggle source
# File lib/nhkore/search_scraper.rb, line 110
# Scrapes the HTML page first and, if that produced no links, the RSS feed.
#
# @param slinks collection handed to +scrape_html+ / +scrape_rss+
# @param page the current page, handed to both scrapers
# @return the next-page object returned by +scrape_html+ (the same object is
#   passed to +scrape_rss+ when the RSS fallback runs)
def scrape(slinks,page=NextPage.new())
  next_page,html_link_count = scrape_html(slinks,page)

  # Fall back to RSS only when the HTML scrape yielded nothing.
  scrape_rss(slinks,page,next_page) if html_link_count <= 0

  next_page
end
scrape_html(slinks,page,next_page=NextPage.new())
click to toggle source
# File lib/nhkore/search_scraper.rb, line 120
# Walks every <a> element of the HTML document, collecting links that match
# +regex+ and tracking the closest following results page.
#
# @param slinks receives a +SearchLink+ (via +add_link+) for each matching href
# @param page current page; its +count+ is the lower bound for paging links
# @param next_page updated in place with the smallest +first=+ value that is
#   greater than +page.count+, together with its joined URL
# @return [Array] +[next_page, number_of_links_added]+
def scrape_html(slinks,page,next_page=NextPage.new())
  found = 0

  html_doc.css('a').each do |anchor|
    link = Util.unspace_web_str(anchor['href'].to_s).downcase

    next if ignore_link?(link)

    paging = link.match(/first=(\d+)/)

    if paging
      offset = paging[1].to_i

      # Only keep a paging link that is past the current page and nearer
      # than any candidate found so far (a negative count means "none yet").
      next unless offset > page.count
      next unless next_page.count < 0 || offset < next_page.count

      next_page.count = offset
      next_page.url = join_url(link)
    elsif link =~ regex
      slinks.add_link(SearchLink.new(link))
      found += 1
    end
  end

  [next_page,found]
end
scrape_rss(slinks,page,next_page=NextPage.new())
click to toggle source
# File lib/nhkore/search_scraper.rb, line 149
# Re-opens the current URL with +format=rss+ and collects matching links from
# the feed items. Does nothing when +@is_file+ is set.
#
# @param slinks receives a +SearchLink+ (via +add_link+) for each matching link
# @param page current page; supplies +count+, +url+ and the previous +rss_links+
# @param next_page filled in only if it is empty, the feed has items, and the
#   feed's links differ from +page.rss_links+
# @return [Array] +[next_page, number_of_links_added]+
def scrape_rss(slinks,page,next_page=NextPage.new())
  return [next_page,0] if @is_file

  rss_uri = URI(@url)
  Util.replace_uri_query!(rss_uri,format: 'rss')
  self.open(rss_uri)

  feed = rss_doc
  seen_links = []
  matched = 0

  feed.items.each do |item|
    link = Util.unspace_web_str(item.link.to_s).downcase

    # Record every link (even ignored ones) for the repeat check below.
    seen_links << link

    next if ignore_link?(link) || link !~ regex

    slinks.add_link(SearchLink.new(link))
    matched += 1
  end

  # For RSS, Bing will keep returning the same links over and over
  # if it's the last page or the "first=" query is the wrong count.
  # Therefore, we have to test the previous RSS links (+page.rss_links+).
  if next_page.empty? && feed.items.length >= 1 && page.rss_links != seen_links
    next_page.count = (page.count < 0) ? 0 : page.count
    next_page.count += feed.items.length
    next_page.rss_links = seen_links

    next_uri = URI(page.url.nil? ? @url : page.url)
    Util.replace_uri_query!(next_uri,first: next_page.count)
    next_page.url = next_uri
  end

  [next_page,matched]
end