module Pulse::Downloader::WebPageParser

Public Instance Methods

fetch_file_paths(custom_path_root=nil) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 4
# Collects the parsed results for +url+, optionally walking its sub-folders.
#
# When +traverse_folders+ is truthy, every folder URL returned by
# +fetch_folders+ is fetched and parsed, and the per-folder results are
# concatenated into a single flat list. Otherwise only +url+ itself is
# fetched and parsed.
#
# @param custom_path_root [String, nil] forwarded unchanged to the helpers
# @return [Array] results of +fetch_and_parse_response+, flattened one level
#   in the traversal case
def fetch_file_paths(custom_path_root=nil)
  return fetch_and_parse_response(url, custom_path_root) unless traverse_folders

  # flat_map (not each) so the parsed results are returned; each would hand
  # back the folder list itself and silently drop everything that was parsed.
  fetch_folders(url, custom_path_root, nil).flat_map do |folder_url|
    fetch_and_parse_response(folder_url, custom_path_root)
  end
end

Private Instance Methods

add_base_url(str, custom_path_root=nil) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 134
# Turns a site-relative path into an absolute https URL on +base_url+.
#
# The string is returned untouched when a +custom_path_root+ is given, when
# it already carries an http:// or https:// scheme, or when it already
# mentions +base_url+.
#
# @param str [String] link or path to normalise
# @param custom_path_root [String, nil] when truthy, disables prefixing
# @return [String]
def add_base_url(str, custom_path_root=nil)
  return str if custom_path_root

  # Plain http:// links are absolute too; prefixing them would produce
  # something like "https://hosthttp://other/...".
  already_absolute = str.include?('https://') || str.include?('http://')
  return str if already_absolute || str.include?(base_url)

  "https://#{base_url}#{str}"
end
append_two_paths(inner_string, link) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 76
# Joins +inner_string+ and +link+ with a single "/" separator.
#
# @param inner_string [String, nil] leading path segment; nil means "none"
# @param link [String] trailing path segment
# @return [String] +link+ alone when +inner_string+ is nil, otherwise
#   "inner_string/link"
def append_two_paths(inner_string, link)
  if inner_string.nil?
    link
  else
    "#{inner_string}/#{link}"
  end
end
extract_all_urls(response, custom_path_root, type) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 81
# Scans the text form of the parsed response body for absolute URLs.
#
# The parsed document is converted to a string and split on whitespace;
# tokens starting with "http:" or "https:" are kept when they contain
# +type+ or, if one is given, +custom_path_root+. Each surviving token is
# passed through +add_base_url+.
#
# @param response [#body] HTTP response whose body is parsed
# @param custom_path_root [String, nil] extra substring that also qualifies a link
# @param type [String] substring (e.g. a file extension) a link must contain
# @return [Array<String>]
def extract_all_urls(response, custom_path_root, type)
  parse_html(response.body)
    .to_s
    .split(/\s+/)
    .select { |token| token.match?(/\Ahttps?:/) }
    .select do |link|
      # Explicit parentheses: without them Ruby reads
      # `link.include?(type || ...)` and never looks at custom_path_root.
      # The nil guard keeps include? from raising when no root is given.
      link.include?(type) || (!custom_path_root.nil? && link.include?(custom_path_root))
    end
    .map { |link| add_base_url(link, custom_path_root) }
end
extract_embedded_images(response, type) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 101
# Collects the +src+ of every <img> element whose value contains +type+.
#
# Returns an empty list straight away unless +scrape_images+ is truthy.
# Elements without a +src+ are skipped, and each match is passed through
# +add_base_url+.
#
# @param response [#body] HTTP response whose body is parsed
# @param type [String] substring the image source must contain
# @return [Array<String>]
def extract_embedded_images(response, type)
  return [] unless scrape_images

  image_nodes = parse_html(response.body).css('img').to_a
  sources = image_nodes.map { |node| node["src"] }.compact
  matching = sources.select { |src| src.include?(type) }

  matching.map { |src| add_base_url(src) }
end
extract_file_urls(response, custom_path_root, type) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 56
# Gathers every candidate file URL from a response.
#
# Combines, in order, the results of +extract_all_urls+,
# +extract_download_links+ and +extract_embedded_images+, runs the combined
# list through +remove_artefacts+ and removes duplicates. A nil or empty
# body yields an empty list without calling any extractor.
#
# @param response [#body] HTTP response to inspect
# @param custom_path_root [String, nil] forwarded to +extract_all_urls+
# @param type [String] forwarded to all three extractors
# @return [Array]
def extract_file_urls(response, custom_path_root, type)
  return [] if response.body.nil? || response.body.empty?

  candidates = extract_all_urls(response, custom_path_root, type)
  candidates += extract_download_links(response, type)
  candidates += extract_embedded_images(response, type)

  remove_artefacts(candidates).uniq
end
extract_hrefs(response, custom_path_root, inner_string) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 66
# Extracts folder-like link targets from the <a> elements of a response.
#
# Each usable +href+ is prefixed with "/", targets containing a "." are
# discarded, and the rest are combined with +inner_string+ via
# +append_two_paths+ and then passed through +add_base_url+.
#
# @param response [#body] HTTP response whose body is parsed
# @param custom_path_root [String, nil] forwarded to +add_base_url+
# @param inner_string [String, nil] optional leading path segment
# @return [Array<String>]
def extract_hrefs(response, custom_path_root, inner_string)
  parse_html(response.body)
    .css('a')
    .map { |anchor| anchor['href'] }
    # Anchors without a target would otherwise turn into the bogus path "/".
    .reject { |href| href.nil? || href.empty? }
    .map { |href| "/#{href}" }
    # Anything with a dot is treated as a file; this also drops the "../"
    # parent link, which always contains dots once prefixed.
    .reject { |link| link.include?('.') }
    .map { |link| append_two_paths(inner_string, link) }
    .map { |link| add_base_url(link, custom_path_root) }
end
fetch_and_parse_response(folder_url, custom_path_root) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 28
# Fetches +folder_url+ and parses the response for the configured +file_type+.
#
# @param folder_url [String] URL handed to +get_response+
# @param custom_path_root [String, nil] forwarded to +parse_response+
# @return whatever +parse_response+ returns
def fetch_and_parse_response(folder_url, custom_path_root)
  response = get_response(folder_url)
  parse_response(response, custom_path_root, file_type)
end
fetch_folders(folder_url, custom_path_root, inner_string) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 16
# Recursively discovers folder URLs reachable from +folder_url+.
#
# The links found on the page are merged into +@folder_urls+, and every
# link not already recorded is then visited in turn. Nested calls pass nil
# as the custom path root, as the original code did.
#
# NOTE(review): assumes +folder_urls+ is a reader for +@folder_urls+ that
# returns an Array — confirm against the including class.
#
# @param folder_url [String] page to scan for sub-folders
# @param custom_path_root [String, nil] forwarded to +extract_hrefs+ for this page only
# @param inner_string [String, nil] forwarded to +extract_hrefs+
# @return [Array] the accumulated +folder_urls+
def fetch_folders(folder_url, custom_path_root, inner_string)
  current_paths = extract_hrefs(get_response(folder_url), custom_path_root, inner_string)

  # Work out what is new BEFORE recording it: descending only into unseen
  # paths stops pages that link to each other from recursing forever.
  unseen_paths = (current_paths - folder_urls).uniq.compact

  @folder_urls = folder_urls.union(current_paths).uniq.compact

  unseen_paths.each do |path|
    fetch_folders(path, nil, inner_string)
  end

  folder_urls
end
get_response(folder_url) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 32
# Performs an HTTP GET on +folder_url+ and records timing around it.
#
# +@start_time+ and +@end_time+ are set from +get_micro_second_time+
# immediately before and after the request. When +report_time+ is truthy,
# +print_time+ is called once the request has finished.
#
# @param folder_url [String] URL to request
# @return the object returned by +HTTParty.get+
def get_response(folder_url)
  @start_time = get_micro_second_time

  HTTParty.get(folder_url, verify: verify_ssl, headers: headers).tap do
    @end_time = get_micro_second_time
    print_time if report_time
  end
end
parse_html(raw_html) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 130
# Parses a raw HTML string by handing it to +Nokogiri::HTML+.
#
# Callers in this file query the result with +css+ and convert it with
# +to_s+; presumably it is a Nokogiri HTML document — confirm against the
# Nokogiri version in use.
#
# @param raw_html [String] markup to parse
# @return the object produced by +Nokogiri::HTML+
def parse_html(raw_html)
  Nokogiri::HTML(raw_html)
end
parse_response(response, custom_path_root, file_type) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 46
# Extracts file URLs from +response+ for one or several file types.
#
# An Array +file_type+ is processed element by element and the results are
# flattened into one list; any other value is passed to
# +extract_file_urls+ as a single type.
#
# @param response [#body] HTTP response to inspect
# @param custom_path_root [String, nil] forwarded to +extract_file_urls+
# @param file_type [String, Array<String>] type or list of types to look for
# @return [Array]
def parse_response(response, custom_path_root, file_type)
  return extract_file_urls(response, custom_path_root, file_type) unless file_type.is_a?(Array)

  file_type.flat_map { |type| extract_file_urls(response, custom_path_root, type) }
end
remove_artefacts(urls) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 113
# Cleans a list of URLs: first strips stray escape characters, then drops
# base64/inline-image entries.
#
# @param urls [Array<String>]
# @return [Array<String>] result of +remove_base64+ applied to the cleaned list
def remove_artefacts(urls)
  remove_base64(remove_extra_escape_characters(urls))
end
remove_base64(urls) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 124
# Drops every entry that contains ":image/" or "base64".
#
# @param urls [Array<String>]
# @return [Array<String>] entries containing neither marker, in original order
def remove_base64(urls)
  urls.reject do |candidate|
    [':image/', 'base64'].any? { |marker| candidate.include?(marker) }
  end
end
remove_extra_escape_characters(urls) click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 118
# Removes every occurrence of the two-character sequence `">` from each URL.
#
# @param urls [Array<String>]
# @return [Array<String>] new strings; the inputs are not modified
def remove_extra_escape_characters(urls)
  urls.map { |entry| entry.gsub("\">", '') }
end