class Hertools::WebsiteParser

Summary: helps to retrieve information about a website (its title and favicon) from a URL.

Public Instance Methods

crawl_title_and_favicon_file(url, options = {}) click to toggle source

Summary: gets the title and the favicon file of a webpage from its URL. Arguments: url: the URL of the webpage; options: a hash that may contain the following keys:

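html_parser: one of %w[httparty nokogiri net_http]; any other value falls back to net_http
root_path: an existing directory in which the results are stored; falls back to the current working directory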
# File lib/hertools/website_parser.rb, line 19
# Summary: get the title and favicon file by the url of one webpage.
#
# The page is downloaded with the parser selected through +options+, its title
# and favicon url are extracted, and both are stored in the directory
# "<root_path>/<domain_name>_<md5 of url>/": the title in "website_info.txt"
# and the favicon in "<md5 of favicon url>.<extension>".
#
# Arguments
#   url:     the url of a webpage
#   options: hash handed to parse_options (:html_parser, :root_path)
#
# Returns true when both files were written, false otherwise. Every
# StandardError is rescued, printed and reported as false.
def crawl_title_and_favicon_file(url, options = {})
  puts '>>> Parsing the arguments <<<'
  unless parse_url(url)
    puts 'Failed because the bad url!'
    return false
  end
  unless parse_options(options)
    puts 'Failed because the bad options!'
    return false
  end

  puts '>>> Analysing the http response <<<'
  res = fetch_page_content

  if res.nil? || res.to_s.empty?
    puts 'No content!'
    @title = @domain_name
    @favicon_url = default_favicon_url
    puts "Use the default favicon url: #{@favicon_url}."
  else
    @title = extract_title(res)
    puts "Title: #{@title}"
    @favicon_url = extract_favicon_url(res)
  end

  # From here on @title and @favicon_url are always non-empty: each of them
  # falls back to a default (domain name / "<index_url>/favicon.ico").
  save_title_and_favicon
rescue StandardError => e
  puts e
  puts 'Failed because the unexpected exception!'
  false
end

# Downloads @url with the configured parser and prints the http status code.
# Returns a Nokogiri document (or nil when it could not be loaded) for the
# nokogiri parser, and the response body as a String for the other parsers.
private def fetch_page_content
  case @html_parser
  when 'nokogiri'
    response = HTTParty.head(@url)
    content = begin
                # Block form so the remote IO is closed once the page is parsed.
                open_remote(@url) { |io| Nokogiri::HTML(io, nil, 'UTF-8') }
              rescue StandardError => e
                puts e
                nil
              end
  when 'httparty'
    response = HTTParty.get(@url)
    content = response.body.to_s
  else
    response = Net::HTTP.get_response(URI(@url))
    # The body is nil for bodiless responses; scrub replaces invalid UTF-8
    # bytes so the regular expressions used later cannot raise on them.
    content = response.body.to_s.dup.force_encoding('UTF-8').scrub
  end
  puts "HttpCode: #{response.code}"
  content
end

# Opens +url+ through open-uri and yields the remote IO to the block.
# URI.parse(...).open is used instead of URI.open because URI.open hands any
# string that does not look like "scheme://..." to Kernel#open, which must
# never happen with text taken from a downloaded page.
private def open_remote(url, &block)
  URI.parse(url).open(&block)
end

# Returns the page title with whitespace collapsed, or the domain name when
# the page has no usable title.
private def extract_title(res)
  raw_title = if nokogiri?
                res.xpath('//head/title')[0]&.content.to_s
              else
                # Accepts attributes on the tag and titles spanning several lines.
                HTMLEntities.new.decode(res[%r{<title[^>]*>(.*?)</title>}mi, 1].to_s)
              end
  title = raw_title.gsub(/\s+/, ' ').strip
  return title unless title.empty?

  puts 'Not found the title!'
  puts 'Use the domain name as the title.'
  @domain_name
end

# Returns the absolute favicon url declared by the page, or the default
# "<index_url>/favicon.ico" when none is declared or it cannot be resolved.
private def extract_favicon_url(res)
  href = find_favicon_href(res)
  if href.empty?
    puts 'Not found the favicon url!'
  else
    puts "FaviconUrl: #{href}"
    resolved = resolve_favicon_url(href)
    if resolved
      puts "Fixed favicon url: #{resolved}" unless resolved == href
      return resolved
    end
  end
  fallback = default_favicon_url
  puts "Use the default favicon url: #{fallback}."
  fallback
end

# Returns the raw href of the first <link rel="icon"> (or "shortcut icon")
# element, or an empty string when the page declares none.
private def find_favicon_href(res)
  if nokogiri?
    link = res.xpath('//head/link[@rel="icon" or @rel="shortcut icon"]').first
    link ? link[:href].to_s.strip : ''
  else
    # Inspect one tag at a time so the href of a neighbouring <link> can
    # never be captured by mistake.
    icon_tag = res.scan(/<link\b[^>]*>/i).find { |tag| tag.match?(/\brel=["'](?:shortcut )?icon["']/i) }
    HTMLEntities.new.decode(icon_tag.to_s[/\bhref=["']([^"']+)/i, 1].to_s).strip
  end
end

# Resolves +href+ (absolute, protocol-relative, root-relative or relative)
# against the page url. Returns nil when the result is not a valid
# http/https url.
private def resolve_favicon_url(href)
  resolved = begin
               URI.join(@url, href)
             rescue URI::Error
               # The page url itself may not be strictly parseable; retry
               # against the site root.
               URI.join("#{@index_url}/", href)
             end
  resolved.to_s if %w[http https].include?(resolved.scheme)
rescue URI::Error
  nil
end

# Conventional favicon location at the root of the site.
private def default_favicon_url
  "#{@index_url}/favicon.ico"
end

# Builds the local favicon file name: md5 of the favicon url plus the
# extension found in the url path (".ico" when there is no clean extension).
private def favicon_file_name
  extension = File.extname(URI.parse(@favicon_url).path.to_s)
  extension = '.ico' unless extension.match?(/\A\.[A-Za-z0-9]+\z/)
  "#{Digest::MD5.hexdigest(@favicon_url)}#{extension}"
end

# Writes the title and the favicon into the target directory, creating the
# directory when needed. Returns true on success, false when the directory
# does not exist after the creation attempt.
private def save_title_and_favicon
  identifier = Digest::MD5.hexdigest(@url)
  file_directory_path = "#{@root_path}/#{@domain_name}_#{identifier}"
  puts "FileDirectory: #{file_directory_path}"
  Dir.mkdir(file_directory_path) unless File.directory?(file_directory_path)
  unless File.directory?(file_directory_path)
    puts 'Failed to create the directory!'
    return false
  end

  info_file_path = "#{file_directory_path}/website_info.txt"
  puts "InfoFilePath: #{info_file_path}"
  File.open(info_file_path, 'wb') { |f| f << "Title: #{@title}" }

  favicon_file_path = "#{file_directory_path}/#{favicon_file_name}"
  puts "FaviconFilePath: #{favicon_file_path}"
  # Download first so a failed request does not leave an empty file behind.
  favicon_data = open_remote(@favicon_url, &:read)
  File.open(favicon_file_path, 'wb') { |f| f << favicon_data }
  puts 'Finished!'
  true
end

Private Instance Methods

nokogiri?() click to toggle source
# File lib/hertools/website_parser.rb, line 169
# Tells whether 'nokogiri' is the selected html parser.
# A truthy answer is kept in @judge_nokogiri and reused on later calls; a
# false answer is stored as well but is computed again on the next call.
def nokogiri?
  return @judge_nokogiri if @judge_nokogiri

  @judge_nokogiri = (@html_parser == 'nokogiri')
end
parse_options(options) click to toggle source
# File lib/hertools/website_parser.rb, line 153
# Reads the supported options and stores them in @html_parser and @root_path.
#
#   :html_parser  'httparty' or 'nokogiri'; anything else selects 'net_http'
#   :root_path    used when it is an existing directory, otherwise the
#                 current working directory is used; a trailing '/' is removed
#
# The memoized nokogiri? answer is reset whenever the selected parser changes.
# Returns true, or false (after printing the error) when a StandardError is
# raised.
def parse_options(options)
  settings = Hash(options)

  requested_parser = settings.fetch(:html_parser, 'net_http')
  @old_html_parser = @html_parser
  @html_parser = case requested_parser
                 when 'httparty', 'nokogiri' then requested_parser
                 else 'net_http'
                 end
  rejudge_nokogiri unless @old_html_parser == @html_parser
  puts "HtmlParser: #{@html_parser}"

  candidate_root = settings.fetch(:root_path) { Dir.pwd }
  candidate_root = Dir.pwd unless File.directory?(candidate_root)
  @root_path = candidate_root.chomp('/')
  puts "RootPath: #{@root_path}"

  true
rescue StandardError => error
  puts error
  false
end
parse_url(url) click to toggle source
# File lib/hertools/website_parser.rb, line 135
# Validates +url+ and splits it into the pieces used by the crawler.
#
# Surrounding whitespace is removed and the value must start with "http://"
# or "https://" followed by a host. On success the following are set:
#   @url          the stripped url
#   @index_url    "<protocol>://<host>" without path, query or fragment
#   @protocol     "http" or "https"
#   @domain_name  the host part (everything up to the first "/", "?" or "#")
#
# Returns true for an accepted url, false otherwise (including when a
# StandardError is raised, which is printed).
def parse_url(url)
  @url = String(url).strip
  puts "Url: #{@url}"
  # Anchored at the start so a string that merely contains a url is rejected;
  # the host stops at "?" and "#" so a query or fragment never leaks into
  # @domain_name / @index_url.
  @url_match = @url.match(%r{\A(https?)://([^/?#]+)})
  return false if @url_match.nil?

  @index_url = @url_match[0]
  puts "IndexUrl: #{@index_url}"
  @protocol = @url_match[1]
  puts "Protocol: #{@protocol}"
  @domain_name = @url_match[2]
  puts "DomainName: #{@domain_name}"
  true
rescue StandardError => e
  puts e
  false
end
rejudge_nokogiri() click to toggle source
# File lib/hertools/website_parser.rb, line 173
# Forgets the answer memoized by nokogiri? so it is computed again on the
# next call. Returns nil.
def rejudge_nokogiri
  instance_variable_set(:@judge_nokogiri, nil)
end