class NHKore::Scraper
@author Jonathan Bradley Whited @since 0.2.0
Constants
- DEFAULT_HEADER
Attributes
kargs[R]
max_redirects[RW]
max_retries[RW]
redirect_rule[RW]
str_or_io[RW]
url[RW]
Public Class Methods
new(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3, redirect_rule: :strict,str_or_io: nil,**kargs)
click to toggle source
max_redirects
defaults to 3 for safety (it guards against an infinite-redirect-loop attack); nil or a negative value is treated as 10,000.
All URL options: ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
Pass in +header: {}+ for the default HTTP header fields to be set.
@param eat_cookie [true,false] true to set the HTTP header field 'cookie', which can be an expensive
(time-consuming) operation since it opens the URL again, but necessary for some URLs.
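@param redirect_rule
[nil,:lenient,:strict] nil performs no redirect checks; :lenient requires a redirect to keep the original URL's scheme; :strict additionally requires the redirect to stay on the original URL's domain.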
Calls superclass method
# File lib/nhkore/scraper.rb, line 53
#
# Stores the scraper options and immediately opens +url+ (or adopts +str_or_io+).
#
# @param url [String,URI] URL (or file path when +is_file+ is true) to open
# @param eat_cookie [true,false] stored in +@eat_cookie+; when true, #open calls
#   +fetch_cookie+ before opening a URL
# @param header [Hash,nil] when not nil (and +is_file+ is false), merged over
#   DEFAULT_HEADER and then merged into +kargs+; pass in +{}+ to use only the defaults
# @param is_file [true,false] true if +url+ is a local file
# @param max_redirects [Integer,nil] redirect limit used by #open_url
# @param max_retries [Integer,nil] retry limit used by #open_url
# @param redirect_rule [nil,:lenient,:strict] redirect policy used by #open_url
# @param str_or_io [String,IO,nil] already-fetched content; when not nil, nothing is opened
# @param kargs [Hash] extra options passed along when opening the URL/file
def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
               redirect_rule: :strict,str_or_io: nil,**kargs)
  super()

  if !header.nil? && !is_file
    # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
    # If this isn't enough, look at googler for more header fields to set:
    # - https://github.com/jarun/googler
    # If necessary, can use Faraday, HTTParty, or RestClient gem and
    # pass in to str_or_io.
    header = DEFAULT_HEADER.merge(header)
    kargs.merge!(header)
  end

  @eat_cookie = eat_cookie
  @is_file = is_file
  @kargs = kargs
  @max_redirects = max_redirects
  @max_retries = max_retries
  @redirect_rule = redirect_rule

  self.open(url,str_or_io,is_file: is_file)
end
Public Instance Methods
html_doc()
click to toggle source
# File lib/nhkore/scraper.rb, line 99
#
# Parses the current content (+@str_or_io+) as HTML.
#
# @return the result of +Nokogiri::HTML()+ for +@str_or_io+
def html_doc
  return Nokogiri::HTML(@str_or_io)
end
join_url(relative_url)
click to toggle source
# File lib/nhkore/scraper.rb, line 103
#
# Joins +relative_url+ onto the currently opened URL (+@url+).
#
# @param relative_url [String,URI] URL to resolve against +@url+
# @return [URI,nil] the joined URI, or nil if this scraper is for a file
def join_url(relative_url)
  # For a file, don't know what to do.
  # It would be unsafe to return something else;
  #   for example, it could return a lot of "../../../" to your root dir.
  return nil if @is_file

  return URI.join(@url,relative_url)
end
open(url,str_or_io=nil,is_file: @is_file)
click to toggle source
# File lib/nhkore/scraper.rb, line 112
#
# Records +url+ and +str_or_io+, and if no content was supplied, opens
# +url+ as a file or as a URL depending on +is_file+.
#
# @param url [String,URI] URL or file path
# @param str_or_io [String,IO,nil] content to adopt as-is; when nil, +url+ is opened
# @param is_file [true,false] true to open +url+ with #open_file instead of #open_url
# @return [self]
def open(url,str_or_io=nil,is_file: @is_file)
  @is_file = is_file
  @str_or_io = str_or_io
  @url = url

  if str_or_io.nil?
    if @is_file
      open_file(url)
    else
      # The cookie must be fetched before the URL is opened.
      fetch_cookie(url) if @eat_cookie
      open_url(url)
    end
  end

  return self
end
open_file(file)
click to toggle source
# File lib/nhkore/scraper.rb, line 129
#
# Opens a local file for reading as UTF-8 text and stores the IO in +@str_or_io+.
# The IO is left open; this method does not close it.
#
# @param file [String] path of the file to open
# @return [self]
def open_file(file)
  @is_file = true
  @url = file

  # NHK's website tends to always use UTF-8.
  @str_or_io = File.open(file,'rt:UTF-8',**@kargs)

  return self
end
open_url(url)
click to toggle source
# File lib/nhkore/scraper.rb, line 139
#
# Opens +url+ with OpenURI, following redirects manually so that they can be
# limited and checked against +@redirect_rule+, and retrying on socket errors.
#
# A nil or negative +@max_redirects+ / +@max_retries+ is treated as 10,000.
#
# Redirect rules:
# - +:lenient+: the redirect's scheme must equal the original URL's scheme.
# - +:strict+:  same as +:lenient+, and +Util.domain()+ of the redirect's host
#   must equal that of the original URL's host.
# - anything else (e.g., nil): no checks.
#
# @param url [String,URI] URL to open
# @return [self]
# @raise [OpenURI::HTTPRedirect] if the redirect limit is exceeded or a redirect
#   violates +@redirect_rule+
# @raise [OpenURI::HTTPError] on any other HTTP error
# @raise [SocketError] if the retry limit is exceeded
def open_url(url)
  max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
  max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries

  # The original URL; every redirect is compared against it.
  top_uri = URI(url)
  top_domain = Util.domain(top_uri.host)

  begin
    # Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
    # Use URI() instead of URI.parse() because url can be a URI (not just a string).
    @str_or_io = URI(url).open(redirect: false,**@kargs)
    @url = url
  rescue OpenURI::HTTPRedirect => redirect
    redirect_uri = redirect.uri

    if (max_redirects -= 1) < 0
      raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
    end

    case @redirect_rule
    when :lenient,:strict
      if redirect_uri.scheme != top_uri.scheme
        raise redirect.exception("redirect scheme[#{redirect_uri.scheme}] does not match original " \
          "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}")
      end

      if @redirect_rule == :strict
        redirect_domain = Util.domain(redirect_uri.host)

        if redirect_domain != top_domain
          raise redirect.exception("redirect domain[#{redirect_domain}] does not match original " \
            "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}")
        end
      end
    end

    # Re-run the begin block with the redirect target.
    url = redirect_uri

    retry
  # Must come after HTTPRedirect since a subclass of HTTPError.
  rescue OpenURI::HTTPError => e
    raise e.exception("HTTP error[#{e}] at URL[#{url}]")
  rescue SocketError => e
    if (max_retries -= 1) < 0
      raise e.exception("Socket error[#{e}] at URL[#{url}]")
    end

    retry
  end

  return self
end
read()
click to toggle source
# File lib/nhkore/scraper.rb, line 192
#
# Returns the current content. If +@str_or_io+ responds to +read+, it is
# first replaced with the result of calling +read+ on it, so later calls
# return that same result instead of reading again.
#
# @return the content held in +@str_or_io+ after any read
def read
  @str_or_io = @str_or_io.read if @str_or_io.respond_to?(:read)

  return @str_or_io
end
reopen()
click to toggle source
# File lib/nhkore/scraper.rb, line 198
#
# Opens the current URL (+@url+) again via #open, discarding the current content.
#
# @return [self]
def reopen
  return self.open(@url)
end
rss_doc()
click to toggle source
# File lib/nhkore/scraper.rb, line 202
#
# Parses the current content (+@str_or_io+) as RSS, without validation.
#
# @return the result of +RSS::Parser.parse()+ for +@str_or_io+
def rss_doc
  # Loaded here so that 'rss' is only required when this method is called.
  require 'rss'

  return RSS::Parser.parse(@str_or_io,validate: false)
end