class GitFastClone::Runner
Spawns one thread per submodule, and updates them in parallel. They will be cached in the reference directory (see DEFAULT_REFERENCE_REPO_DIR), and their index will be incrementally updated. This prevents a large amount of data copying.
Constants
- DEFAULT_GIT_ALLOW_PROTOCOL
- DEFAULT_REFERENCE_REPO_DIR
Attributes
Public Class Methods
# File lib/git-fastclone.rb, line 90
# Builds a runner with its default configuration; option parsing may later
# override several of these values.
def initialize
  # Plain settings and output flags.
  self.options = {}
  self.abs_clone_path = Dir.pwd
  self.using_local_repo = false
  self.verbose = false
  self.print_git_errors = false
  self.color = false
  self.flock_timeout_secs = 0

  # Prefetch reference repos for submodules we've seen before.
  # Keep our own reference accounting of module dependencies.
  self.prefetch_submodules = true

  # Thread-level locking for reference repos: one Mutex per key, created on first use.
  # TODO: Add flock-based locking if we want to avoid conflicting with
  # ourselves.
  self.reference_mutex = Hash.new { |locks, repo_name| locks[repo_name] = Mutex.new }

  # Only update each reference repo once per run (unknown keys start out false).
  # TODO: May want to update this so we don't duplicate work with other copies
  # of ourself. Perhaps a last-updated-time and a timeout per reference repo.
  self.reference_updated = Hash.new { |updated, repo_name| updated[repo_name] = false }
end
Public Instance Methods
# File lib/git-fastclone.rb, line 382
# Reports whether the given git output contains an authentication failure.
#
# @param error [#to_s] git output or error text; nil is treated as empty
# @return [Boolean] true when some line begins with "fatal: Authentication failed"
def auth_error?(error)
  # `^` matches at the start of every line in Ruby, so no leading `.*` or /m
  # flag is needed to find the message after earlier output lines.
  error.to_s.match?(/^fatal: Authentication failed/)
end
To avoid corruption of the cache, if we failed to update or check out we remove the cache directory entirely. This may cause the current clone to fail, but if the underlying error from git is transient it will not affect future clones.
# File lib/git-fastclone.rb, line 408
# Removes the cache directory for a reference repo and forgets that the repo
# was updated during this run.
#
# @param dir [String] path of the cache directory to remove
# @param url [String] repository URL, used to derive the bookkeeping key
# @return [Object, nil] whatever was stored in reference_updated for the repo
def clear_cache(dir, url)
  puts "[WARN] Removing the fastclone cache at #{dir}"
  # `force` is a positional parameter of remove_entry_secure; pass it as such
  # instead of relying on a `force: true` hash happening to be truthy.
  FileUtils.remove_entry_secure(dir, true)
  reference_updated.delete(reference_repo_name(url))
end
# File lib/git-fastclone.rb, line 218
# Announces the cleanup, then force-removes every given path recursively.
#
# @param dest_files [Array<String>, String] paths to delete
def clear_clone_dest(dest_files)
  notice = 'Non-empty clone directory found, clearing its content now.'
  puts notice
  FileUtils.rm_rf(dest_files)
end
# File lib/git-fastclone.rb, line 208
# On a retry (attempt_number > 0), clears whatever is inside the clone
# destination, including dotfiles.
#
# The directory is listed directly rather than globbed, so a destination path
# containing glob metacharacters (such as `[`, `{`, `*` or `?`) is still matched
# literally.
#
# @param attempt_number [Integer] zero on the first attempt
# @param clone_dest [String] path of the clone destination
def clear_clone_dest_if_needed(attempt_number, clone_dest)
  return unless attempt_number.positive?
  # Nothing to clear when the destination is not a directory (yet).
  return unless Dir.exist?(clone_dest)

  entry_names = (Dir.entries(clone_dest) - %w[. ..]).sort
  dest_files = entry_names.map { |name| File.join(clone_dest, name) }
  return if dest_files.empty?

  clear_clone_dest(dest_files)
end
Checkout to SOURCE_DIR. Update all submodules recursively. Use reference repos everywhere for speed.
# File lib/git-fastclone.rb, line 225
# Clones url into src_dir (relative to abs_clone_path) using the reference
# mirror, optionally checks out rev, updates submodules, and prints how long
# the checkout took.
#
# @param url [#to_s] repository URL or local path
# @param rev [#to_s, nil] revision to check out; skipped when nil/false
# @param src_dir [String] destination directory, relative to abs_clone_path
# @param config [#to_s, nil] value passed to `git clone --config`, if any
# @raise [RuntimeError] when the destination exists and is not empty
def clone(url, rev, src_dir, config)
  clone_dest = File.join(abs_clone_path, src_dir).to_s
  # Use the monotonic clock for the duration so wall-clock adjustments during
  # the clone cannot skew (or negate) the reported time.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  if Dir.exist?(clone_dest) && !Dir.empty?(clone_dest)
    raise "Can't clone into an existing non-empty path: #{clone_dest}"
  end

  with_git_mirror(url) do |mirror, attempt_number|
    # A retry may find leftovers from the failed attempt in the destination.
    clear_clone_dest_if_needed(attempt_number, clone_dest)

    clone_commands = ['git', 'clone', verbose ? '--verbose' : '--quiet']
    clone_commands << '--reference' << mirror.to_s << url.to_s << clone_dest
    clone_commands << '--config' << config.to_s unless config.nil?
    fail_on_error(*clone_commands, quiet: !verbose, print_on_failure: print_git_errors)
  end

  # Only checkout if we're changing branches to a non-default branch
  if rev
    fail_on_error('git', 'checkout', '--quiet', rev.to_s, quiet: !verbose,
                                                          print_on_failure: print_git_errors,
                                                          chdir: clone_dest)
  end

  update_submodules(src_dir, url)

  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
  msg = "Checkout of #{src_dir} took #{elapsed}s"
  if color
    puts msg.green
  else
    puts msg
  end
end
# File lib/git-fastclone.rb, line 178
# Parses the command line and prepares the reference directory.
#
# Prints the usage and exits with status 129 when no repository argument is
# left after option parsing. Raises when the destination directory already
# exists.
#
# @return [Array] the triple [url, path, options]
def parse_inputs
  parse_options

  source = ARGV[0]
  unless source
    warn usage
    exit(129)
  end

  # An existing directory means we are cloning from a local repository.
  url =
    if Dir.exist?(source)
      self.using_local_repo = true
      File.expand_path(source)
    else
      source
    end

  path = ARGV[1] || path_from_git_url(url)

  if Dir.exist?(path)
    msg = "Clone destination #{File.join(abs_clone_path, path)} already exists!"
    raise(color ? msg.red : msg)
  end

  self.reference_dir = ENV['REFERENCE_REPO_DIR'] || DEFAULT_REFERENCE_REPO_DIR
  FileUtils.mkdir_p(reference_dir)

  [url, path, options]
end
# File lib/git-fastclone.rb, line 136
# Parses command-line switches out of ARGV (destructively), recording them in
# `options` and in the runner's flag attributes.
#
# @raise [OptionParser::ParseError] on unknown switches or invalid arguments,
#   including a non-integer --lock-timeout value
def parse_options
  # One option --branch=<branch> We're not as brittle as clone. That branch
  # can be a sha or tag and we're still okay.
  OptionParser.new do |opts|
    opts.banner = usage
    options[:branch] = nil

    opts.on('-b', '--branch BRANCH', 'Checkout this branch rather than the default') do |branch|
      options[:branch] = branch
    end

    opts.on('-v', '--verbose', 'Verbose mode') do
      puts '--print_git_errors is redundant when using --verbose' if print_git_errors
      self.verbose = true
    end

    opts.on('--print_git_errors', 'Print git output if a command fails') do
      puts '--print_git_errors is redundant when using --verbose' if verbose
      self.print_git_errors = true
    end

    opts.on('-c', '--color', 'Display colored output') do
      self.color = true
    end

    opts.on('--config CONFIG', 'Git config applied to the cloned repo') do |config|
      options[:config] = config
    end

    # Let OptionParser coerce and validate the value: with String#to_i a typo
    # such as "--lock-timeout soon" silently became 0, i.e. "wait forever".
    opts.on('--lock-timeout N', Integer,
            'Timeout in seconds to acquire a lock on any reference repo.',
            'Default is 0 which waits indefinitely.') do |timeout_secs|
      self.flock_timeout_secs = timeout_secs
    end

    opts.on('--pre-clone-hook script_file',
            'An optional file that should be invoked before cloning mirror repo',
            'No-op when a file is missing') do |script_file|
      options[:pre_clone_hook] = script_file
    end
  end.parse!
end
Grab the children in the event of a prefetch
# File lib/git-fastclone.rb, line 350
# Starts one background thread per URL listed in submodule_file, each updating
# that URL's reference repo with fail_hard set to false.
#
# @param submodule_file [String] path of a file with one submodule URL per line
# @param attempt_number [Integer] forwarded to update_reference_repo
def prefetch(submodule_file, attempt_number)
  File.readlines(submodule_file).each do |line|
    submodule_url = line.strip
    # A blank line carries no URL; don't spawn an update for an empty string.
    next if submodule_url.empty?

    # We don't join these threads explicitly
    Thread.new { update_reference_repo(submodule_url, false, attempt_number) }
  end
end
# File lib/git-fastclone.rb, line 400
# Prints the error text under an [INFO] header, with every line of the error
# prefixed by "> ".
#
# @param error [#to_s] the error text to display
def print_formatted_error(error)
  rows = error.to_s.split("\n")
  indented_error = rows.reduce('') { |text, row| text + "> #{row}\n" }
  puts "[INFO] Encountered a retriable error:\n#{indented_error}\n"
end
# File lib/git-fastclone.rb, line 386
# Reports whether the given git output matches one of the known error
# messages after which the clone is attempted again.
#
# @param error [#to_s] git output or error text; nil is treated as empty
# @return [Boolean] true when any of the patterns below matches
def retriable_error?(error)
  error_strings = [
    /^fatal: missing blob object/,
    /^fatal: remote did not send all necessary objects/,
    /^fatal: packed object [a-z0-9]+ \(stored in .*?\) is corrupt/,
    /^fatal: pack has \d+ unresolved delta/,
    /^error: unable to read sha1 file of /,
    /^fatal: did not receive expected object/,
    /^fatal: unable to read tree [a-z0-9]+\n^warning: Clone succeeded, but checkout failed/,
    /^fatal: Authentication failed/
  ]
  # `^` anchors at every line start, so the union can be matched directly
  # without wrapping it in `.*…` with the /m flag.
  Regexp.union(error_strings).match?(error.to_s)
end
# File lib/git-fastclone.rb, line 120
# Entry point: parses the command line, prints the version banner and the
# clone summary, then performs the clone.
def run
  url, path, options = parse_inputs

  require_relative 'git-fastclone/version'
  banner = "git-fastclone #{GitFastCloneVersion::VERSION}"
  puts(color ? banner.yellow : banner)

  source_name = path_from_git_url(url)
  destination = File.join(abs_clone_path, path)
  puts "Cloning #{source_name} to #{destination}"

  # Respect a GIT_ALLOW_PROTOCOL that is already set in the environment.
  ENV['GIT_ALLOW_PROTOCOL'] ||= DEFAULT_GIT_ALLOW_PROTOCOL
  clone(url, options[:branch], path, options[:config])
end
Creates or updates the mirror repo then stores an indication that this repo has been updated on this run of fastclone
# File lib/git-fastclone.rb, line 359
# Creates the mirror if it does not exist yet, brings it up to date, and
# records that repo_name has been updated during this run.
#
# @param url [#to_s] repository URL
# @param mirror [#to_s] path of the mirror directory
# @param repo_name [String] key used in reference_updated
# @param fail_hard [Boolean] re-raise a git failure when truthy, swallow it otherwise
# @param attempt_number [Integer] forwarded to the pre-clone hook
def store_updated_repo(url, mirror, repo_name, fail_hard, attempt_number)
  trigger_pre_clone_hook_if_needed(url, mirror, attempt_number)

  # If pre_clone_hook correctly creates a mirror directory, we don't want to clone, but just update it
  unless Dir.exist?(mirror)
    mirror_clone = ['git', 'clone', verbose ? '--verbose' : '--quiet', '--mirror', url.to_s, mirror.to_s]
    fail_on_error(*mirror_clone, quiet: !verbose, print_on_failure: print_git_errors)
  end

  remote_update = %w[git remote]
  remote_update << '--verbose' if verbose
  remote_update.push('update', '--prune')
  fail_on_error(*remote_update, quiet: !verbose, print_on_failure: print_git_errors, chdir: mirror)

  reference_updated[repo_name] = true
rescue RunnerExecutionRuntimeError => e
  # A failed update or checkout may leave the cache corrupted, so the cache
  # directory is removed entirely. This may cause the current clone to fail, but
  # if the underlying git error is transient it will not affect future clones.
  #
  # Authentication failures are the one exception: they are transient, usually
  # caused by a remote server outage or a local credentials config problem.
  clear_cache(mirror, url) unless auth_error?(e.output)
  raise e if fail_hard
end
# File lib/git-fastclone.rb, line 283
# Starts a thread that updates one submodule from its reference mirror and
# then recurses into that submodule's own submodules; the thread is appended
# to `threads` so the caller can join it.
#
# @param submodule_url [String] URL of the submodule
# @param submodule_path [#to_s] path of the submodule inside the parent repo
# @param threads [Array<Thread>] collection the new thread is appended to
# @param pwd [String] parent repo directory, relative to abs_clone_path
def thread_update_submodule(submodule_url, submodule_path, threads, pwd)
  worker = Thread.new do
    with_git_mirror(submodule_url) do |mirror, _attempt|
      update_cmd = %w[git submodule]
      update_cmd << '--quiet' unless verbose
      update_cmd.push('update', '--reference', mirror.to_s, submodule_path.to_s)
      fail_on_error(*update_cmd, quiet: !verbose, print_on_failure: print_git_errors,
                                 chdir: File.join(abs_clone_path, pwd))
    end

    update_submodules(File.join(pwd, submodule_path), submodule_url)
  end

  threads << worker
end
Fail_hard indicates whether the update is considered a failure of the overall checkout or not. When we pre-fetch based off of cached information, fail_hard is false. When we fetch based off info in a repository directly, fail_hard is true.
# File lib/git-fastclone.rb, line 333
# Under the reference repo lock: optionally kicks off prefetching of the
# repo's recorded submodules, then updates the mirror unless it was already
# updated during this run.
#
# @param url [String] repository URL
# @param fail_hard [Boolean] forwarded to store_updated_repo
# @param attempt_number [Integer] forwarded to prefetch and store_updated_repo
def update_reference_repo(url, fail_hard, attempt_number)
  repo_name = reference_repo_name(url)
  mirror = reference_repo_dir(url, reference_dir, using_local_repo)

  with_reference_repo_lock(url) do
    # File in which the submodule URLs of this repo are recorded.
    submodule_file = reference_repo_submodule_file(url, reference_dir, using_local_repo)

    # With prefetch on, grab the children right away to frontload network requests.
    if File.exist?(submodule_file) && prefetch_submodules
      prefetch(submodule_file, attempt_number)
    end

    # Nothing more to do when this repo was already updated during this run.
    next if reference_updated[repo_name]

    store_updated_repo(url, mirror, repo_name, fail_hard, attempt_number)
  end
end
# File lib/git-fastclone.rb, line 318
# Records the submodule URLs of a repo in its dependency file, one URL per
# line, while holding the reference repo lock. Does nothing when the list is
# empty or prefetch_submodules is nil.
#
# @param url [String] URL of the parent repository
# @param submodule_url_list [Array<String>] URLs of its submodules
def update_submodule_reference(url, submodule_url_list)
  return if submodule_url_list.empty?
  return if prefetch_submodules.nil?

  with_reference_repo_lock(url) do
    dependency_file = reference_repo_submodule_file(url, reference_dir, using_local_repo)

    # Write the dependency file using submodule list
    File.open(dependency_file, 'w') do |handle|
      submodule_url_list.each do |submodule_url|
        handle.write("#{submodule_url}\n")
      end
    end
  end
end
# File lib/git-fastclone.rb, line 261
# When the checkout at pwd has a .gitmodules file, initialises its submodules,
# updates each one on its own thread, records their URLs, and waits for all
# threads to finish.
#
# @param pwd [String] repo directory, relative to abs_clone_path
# @param url [String] URL of the repo whose submodules are updated
def update_submodules(pwd, url)
  repo_root = File.join(abs_clone_path, pwd)
  return unless File.exist?(File.join(repo_root, '.gitmodules'))

  puts 'Updating submodules...' if verbose

  threads = []
  init_output = fail_on_error('git', 'submodule', 'init', quiet: !verbose,
                                                          print_on_failure: print_git_errors,
                                                          chdir: repo_root)

  # Each output line is parsed into a submodule path and URL.
  submodule_url_list = init_output.split("\n").map do |line|
    submodule_path, submodule_url = parse_update_info(line)
    thread_update_submodule(submodule_url, submodule_path, threads, pwd)
    submodule_url
  end

  update_submodule_reference(url, submodule_url_list)
  threads.each(&:join)
end
# File lib/git-fastclone.rb, line 449 def usage 'Usage: git fastclone [options] <git-url> [path]' end
This command will create and bring the mirror up-to-date on-demand, blocking any code passed in while the mirror is brought up-to-date
In future we may need to synchronize with flock here if we run multiple builds at once against the same reference repos. One build per slave at the moment means we only need to synchronize our own threads in case a single submodule url is included twice via multiple dependency paths
# File lib/git-fastclone.rb, line 421 def with_git_mirror(url) retries_allowed ||= 1 attempt_number ||= 0 update_reference_repo(url, true, attempt_number) dir = reference_repo_dir(url, reference_dir, using_local_repo) # Sometimes remote updates involve re-packing objects on a different thread # We grab the reference repo lock here just to make sure whatever thread # ended up doing the update is done with its housekeeping. # This makes sure we have control and unlock when the block returns: with_reference_repo_lock(url) do yield dir, attempt_number end rescue RunnerExecutionRuntimeError => e if retriable_error?(e.output) print_formatted_error(e.output) clear_cache(dir, url) if attempt_number < retries_allowed attempt_number += 1 retry end end raise e end
# File lib/git-fastclone.rb, line 296
# Runs the block while holding both the exclusive file lock and the
# thread-level lock of the reference repo for url.
#
# @param url [String] repository URL
# @return [Object] the value returned by with_reference_repo_thread_lock
# @raise [Timeout::Error] when the file lock is not acquired within
#   flock_timeout_secs (a timeout of 0 waits indefinitely)
def with_reference_repo_lock(url, &block)
  # Sane POSIX implementations remove exclusive flocks when a process is terminated or killed
  # We block here indefinitely. Waiting for other git-fastclone processes to release the lock.
  # With the default timeout of 0 we will wait forever, this can be overridden on the command line.
  lockfile = reference_repo_lock_file(url, reference_dir, using_local_repo)
  Timeout.timeout(flock_timeout_secs) { lockfile.flock(File::LOCK_EX) }
  with_reference_repo_thread_lock(url, &block)
ensure
  # Not strictly necessary to do this unlock as an ensure. If ever exception is caught outside this
  # primitive, ensure protection may come in handy.
  #
  # `lockfile` is still nil when opening the lock file itself failed; skip the
  # cleanup then so that failure is not replaced by a NoMethodError.
  if lockfile
    lockfile.flock(File::LOCK_UN)
    lockfile.close
  end
end
# File lib/git-fastclone.rb, line 310
# Runs the block while holding the per-repository Mutex for url.
#
# @param url [String] repository URL
def with_reference_repo_thread_lock(url, &block)
  # We also need thread level locking because pre-fetch means multiple threads can
  # attempt to update the same repository from a single git-fastclone process
  # file locks in posix are tracked per process, not per userland thread.
  # This gives us the equivalent of pthread_mutex around these accesses.
  repo_mutex = reference_mutex[reference_repo_name(url)]
  repo_mutex.synchronize(&block)
end
Private Instance Methods
# File lib/git-fastclone.rb, line 453
# Runs the configured pre-clone hook with the URL, mirror path and attempt
# number as arguments. Skipped when the mirror directory already exists, when
# no hook is configured, or when the hook file is missing.
#
# @param url [#to_s] repository URL
# @param mirror [#to_s] path of the mirror directory
# @param attempt_number [#to_s] current attempt number
def trigger_pre_clone_hook_if_needed(url, mirror, attempt_number)
  return if Dir.exist?(mirror)
  return unless options.include?(:pre_clone_hook)

  hook_command = options[:pre_clone_hook]
  if File.exist?(File.expand_path(hook_command))
    popen2e_wrapper(hook_command, url.to_s, mirror.to_s, attempt_number.to_s, quiet: !verbose)
  elsif verbose
    puts 'pre_clone_hook script is missing'
  end
end