module Pulse::Downloader::WebPageParser
Public Instance Methods
fetch_file_paths(custom_path_root=nil)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 4
#
# Collects the file URLs reachable from +url+.
#
# When +traverse_folders+ is truthy, every folder returned by
# +fetch_folders+ is fetched and parsed and the per-folder results are
# merged into one flat list. Otherwise only +url+ itself is parsed.
#
# @param custom_path_root [String, nil] forwarded unchanged to the helpers
# @return [Array] whatever +fetch_and_parse_response+ yields, flattened
def fetch_file_paths(custom_path_root = nil)
  return fetch_and_parse_response(url, custom_path_root) unless traverse_folders

  # flat_map rather than each: each returns its receiver (the folder list),
  # which silently threw away every parsed file URL.
  fetch_folders(url, custom_path_root, nil).flat_map do |folder_url|
    fetch_and_parse_response(folder_url, custom_path_root)
  end
end
Private Instance Methods
add_base_url(str, custom_path_root=nil)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 134
#
# Turns a site-relative link into an absolute https URL on +base_url+.
#
# The link is returned untouched when a +custom_path_root+ is given, when
# it already carries an http/https scheme, or when it already mentions
# +base_url+.
#
# @param str [String] link as found in the page
# @param custom_path_root [String, nil] any truthy value disables rewriting
# @return [String]
def add_base_url(str, custom_path_root = nil)
  return str if custom_path_root
  # Plain http:// links are absolute too; prefixing them produced
  # "https://<base>http://..." garbage.
  return str if str.include?('https://') || str.include?('http://') || str.include?(base_url)

  "https://#{base_url}#{str}"
end
append_two_paths(inner_string, link)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 76
#
# Joins +inner_string+ and +link+ with a single "/" separator.
# A nil +inner_string+ means there is nothing to prepend, so +link+ is
# handed back as-is.
#
# @param inner_string [String, nil] leading path segment
# @param link [String] trailing path segment
# @return [String]
def append_two_paths(inner_string, link)
  inner_string.nil? ? link : "#{inner_string}/#{link}"
end
extract_all_urls(response, custom_path_root, type)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 81
#
# Scans the serialized document for whitespace-delimited tokens that start
# with "http:" or "https:" and keeps those that mention +type+ or, when one
# is given, +custom_path_root+.
#
# @param response [#body] HTTP response whose body is parsed
# @param custom_path_root [String, nil] optional extra substring to match
# @param type [String] substring (e.g. a file extension) a link must contain
# @return [Array<String>] matching links passed through +add_base_url+
def extract_all_urls(response, custom_path_root, type)
  parse_html(response.body)
    .to_s
    .split(/\s+/)
    .select { |token| token.match?(/^https?:/) }
    .select do |link|
      # Explicit parentheses: `link.include? type || ...` bound the `||` to
      # the argument, so custom_path_root was never actually consulted.
      link.include?(type) || (custom_path_root && link.include?(custom_path_root))
    end
    .map { |link| add_base_url(link, custom_path_root) }
end
extract_download_links(response, type)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 91
#
# Gathers the href of every <a> element in the response body, keeps the
# ones containing +type+, and runs each through +add_base_url+.
#
# @param response [#body] HTTP response whose body is parsed
# @param type [String] substring a link must contain
# @return [Array<String>]
def extract_download_links(response, type)
  anchors = parse_html(response.body).css('a').to_a
  hrefs = anchors.map { |anchor| anchor['href'] }.compact
  matching = hrefs.select { |href| href.include?(type) }
  matching.map { |href| add_base_url(href) }
end
extract_embedded_images(response, type)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 101
#
# Gathers the src of every <img> element in the response body, keeps the
# ones containing +type+, and runs each through +add_base_url+.
# Yields an empty list when +scrape_images+ is falsy.
#
# @param response [#body] HTTP response whose body is parsed
# @param type [String] substring a source must contain
# @return [Array<String>]
def extract_embedded_images(response, type)
  return [] unless scrape_images

  images = parse_html(response.body).css('img').to_a
  sources = images.map { |image| image["src"] }.compact
  matching = sources.select { |source| source.include?(type) }
  matching.map { |source| add_base_url(source) }
end
extract_file_urls(response, custom_path_root, type)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 56
#
# Combines the results of the three extractors (raw URLs, anchor links,
# embedded images), cleans them with +remove_artefacts+ and de-duplicates.
# A nil or empty body short-circuits to an empty list.
#
# @param response [#body] HTTP response to inspect
# @param custom_path_root [String, nil] forwarded to +extract_all_urls+
# @param type [String] substring a link must contain
# @return [Array]
def extract_file_urls(response, custom_path_root, type)
  body = response.body
  return [] if body.nil? || body.empty?

  candidates = extract_all_urls(response, custom_path_root, type) +
               extract_download_links(response, type) +
               extract_embedded_images(response, type)
  remove_artefacts(candidates).uniq
end
extract_hrefs(response, custom_path_root, inner_string)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 66
#
# Lists the folder-like links found in the <a> elements of the response
# body. Each href is prefixed with "/", anything containing a "." is
# dropped, and the survivors go through +append_two_paths+ and
# +add_base_url+.
#
# @param response [#body] HTTP response whose body is parsed
# @param custom_path_root [String, nil] forwarded to +add_base_url+
# @param inner_string [String, nil] forwarded to +append_two_paths+
# @return [Array]
def extract_hrefs(response, custom_path_root, inner_string)
  parse_html(response.body)
    .css('a')
    .map { |anchor| anchor['href'] }
    .compact # anchors without an href used to turn into a bogus "/" folder
    .map { |href| "/#{href}" }
    # The dot filter removes files and also the parent link ("/../"), so no
    # separate parent-directory check is needed.
    .reject { |link| link.include?('.') }
    .map { |link| append_two_paths(inner_string, link) }
    .map { |link| add_base_url(link, custom_path_root) }
end
fetch_and_parse_response(folder_url, custom_path_root)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 28
#
# Fetches +folder_url+ and hands the response to +parse_response+ together
# with the configured +file_type+.
#
# @param folder_url [String] address to request
# @param custom_path_root [String, nil] forwarded to +parse_response+
# @return whatever +parse_response+ returns
def fetch_and_parse_response(folder_url, custom_path_root)
  response = get_response(folder_url)
  parse_response(response, custom_path_root, file_type)
end
fetch_folders(folder_url, custom_path_root, inner_string)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 16
#
# Recursively discovers folder links starting at +folder_url+ and
# accumulates them in +@folder_urls+.
#
# @param folder_url [String] page to fetch and scan for folder links
# @param custom_path_root [String, nil] used for this level only; nested
#   calls pass nil, as before
# @param inner_string [String, nil] forwarded to +extract_hrefs+
# @return the accumulated +folder_urls+
def fetch_folders(folder_url, custom_path_root, inner_string)
  current_paths = extract_hrefs(get_response(folder_url), custom_path_root, inner_string)

  # Only descend into folders not recorded yet. Without this check two
  # pages linking to each other recursed until the stack overflowed, and
  # shared sub-folders were re-fetched once per referrer.
  unvisited = (current_paths.compact - folder_urls).uniq
  @folder_urls = folder_urls.union(unvisited).uniq.compact

  unvisited.each do |path|
    fetch_folders(path, nil, inner_string)
  end

  folder_urls
end
get_response(folder_url)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 32
#
# Performs a GET on +folder_url+ through HTTParty, passing the configured
# +verify_ssl+ and +headers+. The timestamps taken before and after the
# request are stored in +@start_time+ / +@end_time+, and +print_time+ is
# invoked when +report_time+ is truthy.
#
# @param folder_url [String] address to request
# @return the HTTParty response
def get_response(folder_url)
  @start_time = get_micro_second_time

  HTTParty.get(folder_url, verify: verify_ssl, headers: headers).tap do
    @end_time = get_micro_second_time
    print_time if report_time
  end
end
parse_html(raw_html)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 130
#
# Builds a Nokogiri HTML document from a raw markup string.
#
# @param raw_html [String] markup to parse
# @return the object returned by Nokogiri's HTML entry point
def parse_html(raw_html)
  Nokogiri.HTML(raw_html)
end
parse_response(response, custom_path_root, file_type)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 46
#
# Extracts file URLs from +response+ for one file type or for each entry
# of an Array of file types (results concatenated in order).
#
# @param response [#body] HTTP response to inspect
# @param custom_path_root [String, nil] forwarded to +extract_file_urls+
# @param file_type [String, Array<String>] type(s) to look for
# @return [Array]
def parse_response(response, custom_path_root, file_type)
  return extract_file_urls(response, custom_path_root, file_type) unless file_type.is_a?(Array)

  file_type.flat_map { |type| extract_file_urls(response, custom_path_root, type) }
end
remove_artefacts(urls)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 113
#
# Cleans a list of URLs: first strips stray escape sequences, then drops
# base64 / inline-image entries.
#
# @param urls [Array<String>]
# @return [Array<String>]
def remove_artefacts(urls)
  remove_base64(remove_extra_escape_characters(urls))
end
remove_base64(urls)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 124
#
# Filters out entries that contain ":image/" or "base64".
#
# @param urls [Array<String>]
# @return [Array<String>] the remaining entries, original order kept
def remove_base64(urls)
  urls.reject do |candidate|
    [':image/', 'base64'].any? { |marker| candidate.include?(marker) }
  end
end
remove_extra_escape_characters(urls)
click to toggle source
# File lib/pulse/downloader/web_page_parser.rb, line 118
#
# Returns a copy of +urls+ in which every occurrence of the two-character
# sequence `">` has been deleted from each entry.
#
# @param urls [Array<String>]
# @return [Array<String>]
def remove_extra_escape_characters(urls)
  urls.map { |raw_url| raw_url.gsub("\">", '') }
end