module AwesomeBot
Validate awesome project URLs
Check links
Command line interface
Get and filter links
Get link status
Output helpers
Project version and constants
Process white list
Command line interface
Constants
- MARKDOWN_LINK_REGEX
This matches, from left to right: a literal `[`; the link title, i.e. anything up to the next closing bracket; a literal `]`; a literal `(`; the link destination (optionally enclosed in a single pair of angle brackets); and a literal `)`.
- PROJECT
- PROJECT_DESCRIPTION
- PROJECT_URL
- RESULTS_PREFIX
- STATUS_400s
- STATUS_ERROR
- STATUS_OK
- STATUS_OTHER
- STATUS_REDIRECT
- VERSION
Public Class Methods
check(content, options=nil, number_of_threads=1) { |"Links to check: #{links.count}"| ... }
click to toggle source
# File lib/awesome_bot/check.rb, line 9
# Finds the links in +content+, checks their status and returns the Result.
#
# content           - text scanned for links.
# options           - nil, or a Hash read with the string keys 'whitelist',
#                     'allowdupe', 'timeout', 'delay' and 'baseurl'.
# number_of_threads - forwarded to statuses.
#
# When a block is given, progress text is yielded to it piece by piece.
def check(content, options=nil, number_of_threads=1)
  # Without options only the dupe flag has a non-nil default.
  settings = options.nil? ? { 'allowdupe' => false } : options
  white_listed = settings['whitelist']
  skip_dupe = settings['allowdupe']
  timeout = settings['timeout']
  base = settings['baseurl']
  delay = settings['delay']
  delay = 0 if delay.nil?

  r = Result.new(links_filter(links_find(content, base)), white_listed)
  r.skip_dupe = skip_dupe
  r.dupes = r.links.select { |e| r.links.count(e) > 1 }

  unique_links = r.links.uniq
  if block_given?
    yield "Links to check: #{r.links.count}"
    yield ", #{r.links_white_listed.count} white listed" if r.white_listing
    yield ", #{unique_links.count} unique" if unique_links.count != r.links.count
    yield "\n"
    total = pad_list unique_links
    unique_links.each_with_index do |u, j|
      yield " #{pad_text j + 1, total}. #{u} \n"
    end
    yield 'Checking URLs: ' if r.links.count > 0
  end

  head = false
  # Shared by both status passes: forwards one status marker per checked link.
  report = proc { |s| yield log_status s if block_given? }

  r.status = statuses(unique_links, number_of_threads, timeout, head, delay, &report)
  yield "\n" if block_given?

  return r unless r.white_listing && r.links_white_listed.count != 0

  yield 'Checking white listed URLs: ' if block_given?
  # White listed links are checked with a nil timeout.
  r.white_listed = statuses(r.links_white_listed.uniq, number_of_threads, nil, head, delay, &report)
  yield "\n" if block_given?

  r
end
cli()
click to toggle source
# File lib/awesome_bot/cli.rb, line 10
# Command line entry point: parses ARGV, runs cli_process on every file,
# prints a summary when more than one file was processed, and exits with
# status 1 if any file's result is not STATUS_OK.
def cli()
  require 'optparse'

  ARGV << '-h' if ARGV.empty?

  # Each entry is [arguments for opts.on, key under which the value is stored].
  switches = [
    [['-f', '--files [files]', Array, 'Comma separated files to check'], 'files'],
    [['-a', '--allow [errors]', Array, 'Status code errors to allow'], 'errors'],
    [['--allow-dupe', TrueClass, 'Duplicate URLs are allowed'], 'allow_dupe'],
    [['--allow-ssl', TrueClass, 'SSL errors are allowed'], 'allow_ssl'],
    [['--allow-redirect', TrueClass, 'Redirected URLs are allowed'], 'allow_redirect'],
    [['--allow-timeout', TrueClass, 'URLs that time out are allowed'], 'allow_timeout'],
    [['--base-url [base url]', String, 'Base URL to use for relative links'], 'base_url'],
    [['-d', '--request-delay [seconds]', Float, 'Set request delay'], 'delay'],
    [['-t', '--set-timeout [seconds]', Integer, 'Set connection timeout (default: 30)'], 'timeout'],
    [['--skip-save-results', TrueClass, 'Skip saving results'], 'no_results'],
    [['-w', '--white-list [urls]', Array, 'Comma separated URLs to white list'], 'white_list']
  ]

  options = {}
  ARGV.options do |opts|
    opts.banner = "Usage: #{PROJECT} [file or files] \n"\
                  " #{PROJECT} [options]"

    switches.each do |arguments, key|
      opts.on(*arguments) { |val| options[key] = val }
    end

    opts.on('-v', '--version', String, 'Display version') { |_val| puts "#{PROJECT} version #{VERSION}" }

    opts.on_tail("--help") do
      puts opts
      exit
    end

    opts.parse!
  end

  # Without -f, every remaining argument that does not start with "--" is a file.
  files = options['files']
  files = ARGV.reject { |a| a =~ /^--.*/ } if files.nil?

  summary = files.each_with_object({}) do |f, results|
    results[f] = cli_process(f, options)
  end

  if summary.count > 1
    puts "\nSummary"
    width = summary.keys.map(&:size).max
    template = "%#{width}.#{width}s"
    summary.each do |name, result|
      puts "#{template % name}: #{result}"
    end
  end

  exit 1 unless summary.values.all? { |result| result == STATUS_OK }
end
cli_process(filename, options)
click to toggle source
# File lib/awesome_bot/cli.rb, line 71
# Checks the links of a single file and prints a report to standard output.
#
# filename - path of the file to read.
# options  - Hash of parsed command line options, read with the string keys
#            'base_url', 'errors', 'allow_dupe', 'allow_redirect', 'allow_ssl',
#            'allow_timeout', 'delay', 'white_list', 'timeout' and 'no_results'.
#
# Returns STATUS_OK when no issues were found, the string 'Issues' otherwise,
# or the rescued error object when the file could not be read.
def cli_process(filename, options)
  begin
    untrusted = File.read filename
    # Round trip through UTF-16, replacing invalid sequences with ''.
    content = untrusted.encode('UTF-16', :invalid => :replace, :replace => '').encode('UTF-8')
  rescue => error
    puts "File open error: #{error}"
    return error
  end

  puts "> Checking links in #{filename}"

  # Echo each option that is in effect.
  base = options['base_url']
  puts "> Will check relative links with base URL #{base}" unless base.nil?

  errors = options['errors']
  puts "> Will allow errors: #{errors.join ','}" unless errors.nil?

  skip_dupe = options['allow_dupe']
  puts '> Will allow duplicate links' if skip_dupe == true

  allow_redirects = options['allow_redirect']
  puts '> Will allow redirects' if allow_redirects == true

  allow_ssl = options['allow_ssl']
  puts '> Will allow SSL errors' if allow_ssl == true

  allow_timeouts = options['allow_timeout']
  puts '> Will allow network timeouts' if allow_timeouts == true

  delay = options['delay']
  puts "> Will delay each request by #{delay} second#{delay==1? '': 's'}" unless delay.nil?

  white_listed = options['white_list']

  timeout = options['timeout']
  puts "> Connection timeout = #{timeout}s" unless timeout.nil?

  puts "> White list links matching: #{white_listed.join ', '} " unless white_listed.nil?

  no_results = options['no_results']
  if no_results == true
    puts '> Will not save results'
  else
    no_results = false
  end

  # Re-key the options under the names that check reads.
  options = { 'allowdupe' => skip_dupe, 'delay' => delay, 'timeout' => timeout, 'whitelist' => white_listed, 'baseurl' => base }

  # A request delay forces a single thread; otherwise 10 threads are used.
  threads = delay == nil ? 10 : 1
  r = check(content, options, threads) do |o|
    print o
  end

  digits = number_of_digits content
  unless r.white_listed.nil?
    puts "\n> White listed:"
    o = order_by_loc r.white_listed, content
    o.each_with_index do |x, k|
      temp, _ = output(x, k, pad_list(o), digits)
      puts temp
    end
  end

  allow_redirects = false if allow_redirects.nil?
  allow_ssl = false if allow_ssl.nil?
  allow_timeouts = false if allow_timeouts.nil?

  # From here on, options holds the allowances passed to the result's success checks.
  options = { 'errors' => errors, 'redirect' => allow_redirects, 'ssl' => allow_ssl, 'timeout' => allow_timeouts }

  if r.success(options) == true
    puts 'No issues :-)'
    write_results(filename, r, no_results)
    write_markdown_results(filename, nil, no_results)
    return STATUS_OK
  else
    # Collects one hash per reported issue for the filtered results file.
    filtered_issues = []

    puts "\nIssues :-("

    print "> Links \n"
    if r.success_links(options)
      puts " All OK #{STATUS_OK}"
    else
      o = order_by_loc r.statuses_issues(options), content
      o.each_with_index do |x, k|
        temp, h = output(x, k, pad_list(o), digits)
        filtered_issues.push h
        puts temp
      end
    end

    unless skip_dupe
      print "> Dupes \n"
      if r.success_dupe
        puts " None #{STATUS_OK}"
      else
        # Wrap every duplicated URL in a hash so order_by_loc can add its 'loc'.
        dupe_hash = r.dupes.uniq.map do |x|
          temp = {}
          temp['url'] = x
          temp
        end
        o = order_by_loc dupe_hash, content
        # The list is sorted by 'loc', so the last entry has the widest line number.
        largest = o.last['loc'].to_s.size
        o.each_with_index do |d, index|
          loc = d['loc']
          url = d['url']
          error = 'Dupe'
          hash = { 'loc'=> loc, 'link'=> url, 'error'=> error }
          filtered_issues.push hash
          print " #{pad_text index + 1, pad_list(r.dupes.uniq)}. "
          print loc_formatted loc, largest
          puts " #{url}"
        end
      end
    end

    write_results(filename, r, no_results)
    filtered = write_results_filtered(filename, filtered_issues, no_results)
    write_markdown_results(filename, filtered, no_results)
    return 'Issues'
  end
end
filter_filename(f)
click to toggle source
# File lib/awesome_bot/write.rb, line 8
# Returns a copy of +f+ in which every '/' has been replaced by '-'.
def filter_filename(f)
  f.tr('/', '-')
end
get_relative_links(content, base)
click to toggle source
# File lib/awesome_bot/links.rb, line 58
# Collects the first capture of every MARKDOWN_LINK_REGEX match in +content+,
# drops those containing 'http' or '#', and prefixes the rest with +base+.
#
# Returns an Array of Strings.
def get_relative_links(content, base)
  destinations = content.scan(MARKDOWN_LINK_REGEX).map(&:first)
  relative = destinations.reject { |dest| dest.include?('http') || dest.include?('#') }

  relative.map do |dest|
    # Keep only the leading run of non-whitespace characters, unless the
    # destination is entirely whitespace.
    trimmed = dest =~ /\S/ ? dest.match(/^\S*/) : dest
    "#{base}#{trimmed}"
  end
end
links_filter(list)
click to toggle source
# File lib/awesome_bot/links.rb, line 13
# Cleans up a list of extracted link strings.
#
# Entries shorter than 9 characters are dropped. For the rest, commas are
# percent-encoded, anything from the first single quote on is removed, and
# trailing markup or punctuation picked up during extraction is stripped.
#
# list - Array of Strings.
#
# Returns a new Array of Strings.
def links_filter(list)
  list.reject { |x| x.length < 9 }
      .map { |x| x.gsub(',', '%2c').gsub(/'.*/, '') }
      .map do |x|
        closing = x.count(')')
        opening = x.count('(')

        if x.include?(')]')
          x.gsub(/\)\].*/, '')
        elsif closing == 1 && opening == 1
          # One balanced pair is treated as part of the URL.
          x
        elsif closing == 2 && opening == 1
          x.gsub(/\)\).*/, ')')
        elsif closing > 0
          # Wikipedia links keep their parentheses when one was opened.
          if x.include?('wikipedia') && opening > 0
            x
          else
            x.gsub(/\).*/, '')
          end
        elsif x.include?('[') # adoc
          x.gsub(/\[.*/, '')
        elsif x.end_with?('.', ':')
          x[0..-2]
        elsif x.end_with?('%2c')
          x[0..-4]
        else
          x
        end
      end
end
links_find(content, url_base=nil)
click to toggle source
# File lib/awesome_bot/links.rb, line 49
# Extracts links from +content+ with URI.extract, restricted by /http()s?/.
#
# When +url_base+ is given, the relative links found by get_relative_links
# are placed in front of the extracted ones.
#
# Returns an Array of Strings.
def links_find(content, url_base=nil)
  require 'uri'

  absolute = URI.extract(content, /http()s?/)
  if url_base.nil?
    absolute
  else
    get_relative_links(content, url_base) + absolute
  end
end
loc(x, content)
click to toggle source
# File lib/awesome_bot/output.rb, line 9
# Returns the 1-based number of the first line of +content+ that includes
# +x+. When no line includes it, the total number of lines is returned.
def loc(x, content)
  lines = content.split "\n"
  position = lines.index { |line| line.include? x }
  position.nil? ? lines.count : position + 1
end
loc_formatted(loc, largest=3)
click to toggle source
# File lib/awesome_bot/output.rb, line 19
# Returns +loc+ padded through pad_text to +largest+ digits and wrapped
# as "[L…]".
def loc_formatted(loc, largest=3)
  "[L#{pad_text(loc, largest)}]"
end
log_status(s)
click to toggle source
# File lib/awesome_bot/output.rb, line 24
# Maps a numeric status +s+ to one of the STATUS_* markers: redirects first,
# then exactly 200, then 400–499, and STATUS_OTHER for everything else.
def log_status(s)
  return STATUS_REDIRECT if status_is_redirected? s
  return STATUS_OK if s == 200
  return STATUS_400s if s > 399 && s < 500

  STATUS_OTHER
end
net_status(url, timeout=30, head)
click to toggle source
# File lib/awesome_bot/net.rb, line 6
# Requests +url+ and returns its status code and response headers.
#
# url     - String URL; user info embedded in the URL is sent as basic auth.
# timeout - connection open timeout in seconds. A nil value falls back to
#           30, so callers that forward an unset option get the default.
# head    - when truthy a HEAD request is sent, otherwise a GET.
#
# Returns [code, headers]: an Integer status code (200 when the response
# carries no code) and a Hash of header values forced to UTF-8. A Location
# header without a scheme is rewritten against the requested scheme, host
# and port.
#
# Errors raised by URI.parse or Net::HTTP are not rescued here.
def net_status(url, timeout=30, head)
  require 'net/http'
  require 'openssl'
  require 'uri'

  # An explicit nil would otherwise disable the open timeout entirely.
  timeout = 30 if timeout.nil?

  uri = URI.parse url
  Net::HTTP.start(uri.host, uri.port, :use_ssl => uri.scheme == 'https', :open_timeout => timeout) do |http|
    ua = {'User-Agent' => 'awesome_bot'}
    request_class = head ? Net::HTTP::Head : Net::HTTP::Get
    request = request_class.new(uri, ua)

    if uri.userinfo
      auth_user, auth_pass = uri.userinfo.split(/:/)
      request.basic_auth auth_user, auth_pass
    end

    response = http.request request

    code = response.code.nil? ? 200 : response.code.to_i

    headers = {}
    response.each do |k, v|
      headers[k] = v.force_encoding("utf-8")
    end

    # handle incomplete redirect
    loc = headers['location']
    unless loc.nil?
      loc_uri = URI.parse loc
      if loc_uri.scheme.nil?
        # Keep a non-default port so the rebuilt URL targets the same server.
        authority = uri.port == uri.default_port ? uri.host : "#{uri.host}:#{uri.port}"
        headers['location'] = "#{uri.scheme}://#{authority}#{loc}"
      end
    end

    return [code, headers]
  end
end
number_of_digits(content)
click to toggle source
# File lib/awesome_bot/output.rb, line 36
# Returns pad_list applied to the lines of +content+ (split on "\n").
def number_of_digits(content)
  pad_list(content.split("\n"))
end
order_by_loc(list, content)
click to toggle source
# File lib/awesome_bot/output.rb, line 41
# Stores under 'loc' the result of loc for each entry's 'url' (the entries of
# +list+ are modified in place) and returns a new Array sorted by that value.
def order_by_loc(list, content)
  list.each { |entry| entry['loc'] = loc(entry['url'], content) }
  list.sort_by { |entry| entry['loc'] }
end
output(x, index, total, largest)
click to toggle source
# File lib/awesome_bot/output.rb, line 50
# Builds the report line and the result hash for one checked link.
#
# x       - Hash read with the keys 'status', 'loc', 'url', 'error' and
#           'headers'.
# index   - zero-based position of the entry; printed as index + 1.
# total   - digit count used to pad the position.
# largest - digit count used to pad the line number.
#
# Returns [line, hash], where hash has the keys 'loc', 'status', 'link',
# 'redirect' and 'error'.
def output(x, index, total, largest)
  s = x['status']
  loc = x['loc']
  link = x['url']

  # A failed request shows its error text in place of a status code.
  failed = s == STATUS_ERROR
  status = failed ? '' : s
  error = failed ? x['error'] : ''
  redirect = status_is_redirected?(s) ? x['headers']['location'] : ''

  hash = { 'loc'=> loc, 'status'=> s, 'link'=> link, 'redirect'=> redirect, 'error'=> error }

  parts = [
    " #{pad_text index + 1, total}. ",
    "#{loc_formatted loc, largest} ",
    "#{status} ",
    "#{link} ",
    "#{error}",
    "#{output_redirect x} \n"
  ]
  [parts.join, hash]
end
output_redirect(x)
click to toggle source
# File lib/awesome_bot/output.rb, line 78
# Returns the redirect suffix for a report line: STATUS_REDIRECT followed by
# the 'location' header when the entry's status is a redirect, otherwise ''.
def output_redirect(x)
  return '' unless status_is_redirected? x['status']

  " #{STATUS_REDIRECT} #{x['headers']['location']}"
end
pad_list(list)
click to toggle source
# File lib/awesome_bot/output.rb, line 86
# Returns how many characters the element count of +list+ takes when
# written as a decimal number.
def pad_list(list)
  size_text = list.count.to_s
  size_text.length
end
pad_text(number, digits)
click to toggle source
# File lib/awesome_bot/output.rb, line 90
# Returns +number+ as a decimal string, zero-padded on the left to at least
# +digits+ characters.
def pad_text(number, digits)
  template = "%0#{digits}d"
  Kernel.format(template, number)
end
status_is_redirected?(status)
click to toggle source
# File lib/awesome_bot/net.rb, line 48
# Returns true when +status+ is greater than 299 and less than 400.
def status_is_redirected?(status)
  return false if status <= 299

  status < 400
end
statuses(links, threads, timeout, head=false, delay=0) { |status, u, headers| ... }
click to toggle source
# File lib/awesome_bot/net.rb, line 52
# Checks every link with net_status and collects one result hash per link.
#
# links   - Array of URLs.
# threads - passed to Parallel.each as in_threads.
# timeout - forwarded to net_status.
# head    - forwarded to net_status.
# delay   - seconds slept before each request.
#
# When a block is given it is called with (status, url, headers) for each
# link before its result is stored.
#
# Returns an Array of Hashes with the keys 'url', 'status', 'error' and
# 'headers'. A request that raises is recorded with STATUS_ERROR, empty
# headers and the rescued error.
def statuses(links, threads, timeout, head=false, delay=0)
  require 'parallel'

  results = []
  Parallel.each(links, in_threads: threads) do |u|
    sleep delay

    status, headers, error =
      begin
        code, response_headers = net_status u, timeout, head
        [code, response_headers, nil]
      rescue => e
        [STATUS_ERROR, {}, e]
      end

    yield status, u, headers if block_given?
    results.push('url' => u, 'status' => status, 'error' => error, 'headers' => headers)
  end # Parallel

  results
end
white_list(list, item)
click to toggle source
# File lib/awesome_bot/white_list.rb, line 4
# Returns true when +item+ includes at least one entry of +list+,
# false otherwise.
def white_list(list, item)
  list.any? { |entry| item.include? entry }
end
write_markdown_results(filename, filtered, silent)
click to toggle source
# File lib/awesome_bot/write.rb, line 12
# Writes a JSON payload describing the issues as a markdown table.
#
# filename - name of the checked file; used to build the output file name.
# filtered - path of the filtered results JSON file, or nil when there are
#            no issues to report.
# silent   - when true nothing is written.
#
# Returns false when silent is true, true after the file was written.
def write_markdown_results(filename, filtered, silent)
  return false if silent == true

  if filtered.nil?
    payload = { 'error' => false }
  else
    issues = JSON.parse(File.read(filtered))
    num = issues.count
    plural = num == 1 ? '' : 's'
    title = "Found #{num} link issue#{plural}"

    header = "#### Link issue#{plural} by [`awesome_bot`](https://github.com/dkhamsing/awesome_bot)\n\n" \
             " Line | Status | Link\n" \
             "| ---: | :----: | --- |\n"

    # One table row per issue, ordered by line number.
    rows = issues.sort_by { |h| h['loc'] }.map do |i|
      error = i['error']
      loc = i['loc']
      link = i['link']
      s = i['status']
      r = i['redirect']

      if error == 'Dupe'
        row = "#{loc} | Dupe | #{link} "
      else
        # A status of -1 is rendered as 'Error'; other statuses link to
        # httpstatuses.com.
        status = s == -1 ? 'Error' : "[#{s}](https://httpstatuses.com/#{s})"
        row = "#{loc} | #{status} | #{link} "
        row += "<br> #{error}" unless error == ''
        row += "redirects to<br>#{r}" unless r == ''
      end
      row + "\n"
    end

    payload = { 'error' => true, 'title' => title, 'message'=> header + rows.join }
  end

  results_file = "#{RESULTS_PREFIX}-#{filter_filename filename}-markdown-table.json"
  File.open(results_file, 'w') { |out| out.write JSON.pretty_generate(payload) }
  puts "Wrote markdown table results to #{results_file}"
  true
end
write_results(f, r, silent)
click to toggle source
# File lib/awesome_bot/write.rb, line 62
# Has the result +r+ write itself to a JSON file named after +f+.
#
# f      - name of the checked file; used to build the output file name.
# r      - result object; only its write method is called.
# silent - when true nothing is written.
#
# Returns false when silent is true, true otherwise.
def write_results(f, r, silent)
  return false if silent == true

  results_file = "#{RESULTS_PREFIX}-#{filter_filename f}.json"
  r.write results_file
  puts "\nWrote results to #{results_file}"
  true
end
write_results_filtered(file, filtered, silent)
click to toggle source
# File lib/awesome_bot/write.rb, line 73
# Writes +filtered+ as pretty-printed JSON to a file named after +file+.
#
# file     - name of the checked file; used to build the output file name.
# filtered - object serialized with JSON.pretty_generate.
# silent   - when true nothing is written.
#
# Returns nil when silent is true, otherwise the path of the written file.
def write_results_filtered(file, filtered, silent)
  return nil if silent == true

  results_file = "#{RESULTS_PREFIX}-#{filter_filename file}-filtered.json"
  File.open(results_file, 'w') { |out| out.write JSON.pretty_generate(filtered) }
  puts "Wrote filtered results to #{results_file}"
  results_file
end