module Elasticsearch::Git::Repository

Public Class Methods

repositories_count() click to toggle source

For Overwrite

# File lib/elasticsearch/git/repository.rb, line 326
# Number of repositories to index.
#
# Default placeholder value; including classes are expected to
# override this ("For Overwrite" per the original docs).
def self.repositories_count
  10
end

Public Instance Methods

as_indexed_json(options = {}) click to toggle source

Representation of the repository as indexed JSON. Warning: the resulting hash can be extremely large.

# File lib/elasticsearch/git/repository.rb, line 227
# Representation of the repository as indexed JSON.
# NOTE: the resulting hash can be extremely large for big repositories.
#
# options - unused here, accepted for interface compatibility.
#
# Returns a Hash with :blobs and :commits arrays.
def as_indexed_json(options = {})
  {
    blobs: index_blobs_array,
    commits: index_commits_array
  }
end
can_index_blob?(blob) click to toggle source

Index text-like files whose size is less than 1 MB

# File lib/elasticsearch/git/repository.rb, line 118
# A blob is indexable when it is text-like and its size (when known)
# is below one megabyte (1048576 bytes).
#
# Returns a falsy value (false or nil) otherwise; a nil size yields nil,
# matching the original short-circuit behaviour.
def can_index_blob?(blob)
  return false unless blob.text?

  blob.size && blob.size.to_i < 1048576
end
client_for_indexing() click to toggle source
# File lib/elasticsearch/git/repository.rb, line 348
# Lazily builds and memoizes the Elasticsearch client used for all
# index/delete requests. Request logging is enabled on the client.
def client_for_indexing
  return @client_for_indexing if @client_for_indexing

  @client_for_indexing = Elasticsearch::Client.new(log: true)
end
delete_from_index_blob(blob) click to toggle source
# File lib/elasticsearch/git/repository.rb, line 122
# Remove a blob's document from the index.
#
# blob - a blob-like object responding to #text? and #path.
#
# Non-text blobs are skipped (nothing was indexed for them).
# Returns true when the document was already absent; on other errors
# logs a warning and returns the logger result.
def delete_from_index_blob(blob)
  return unless blob.text?

  client_for_indexing.delete \
    index: "#{self.class.index_name}",
    type: "repository",
    id: "#{repository_id}_#{blob.path}"
rescue Elasticsearch::Transport::Transport::Errors::NotFound
  true
rescue StandardError => ex
  # Was `rescue Exception`, which also swallowed SignalException/SystemExit/
  # NoMemoryError — narrowed to StandardError.
  logger.warn "Error with remove file from index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
end
index_blob(blob, target_sha) click to toggle source
# File lib/elasticsearch/git/repository.rb, line 85
# Index a single text-like blob under id "<repository_id>_<path>".
#
# blob       - a blob-like object (#text?, #size, #id, #data, #path, #language).
# target_sha - sha of the commit the blob content belongs to.
#
# Blobs rejected by #can_index_blob? are skipped. Transient indexing
# failures are retried up to 10 times with increasing sleeps before a
# warning is logged.
def index_blob(blob, target_sha)
  return unless can_index_blob?(blob)

  tries = 0
  begin
    client_for_indexing.index \
      index: "#{self.class.index_name}",
      type: "repository",
      id: "#{repository_id}_#{blob.path}",
      body: {
        blob: {
          type: "blob",
          oid: blob.id,
          rid: repository_id,
          content: blob.data,
          commit_sha: target_sha,
          path: blob.path,
          # Fall back to plain text when linguist cannot detect a language.
          language: blob.language ? blob.language.name : "Text"
        }
      }
  rescue StandardError => ex
    # Was `rescue Exception` — that also caught SignalException/SystemExit,
    # making the process hard to kill during the retry loop.
    # Retry 10 times send request
    if tries < 10
      tries += 1
      sleep tries * 10 * rand(10)
      retry
    else
      logger.warn "Can't index #{repository_id}_#{blob.path}. Reason: #{ex.message}"
    end
  end
end
index_blobs(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid) click to toggle source

Indexing all text-like blobs in repository

All data is stored in the global index. Repository data can be selected by the ‘rid’ field; if desired, this field can be used to store a ‘project’ id.

blob {

id - unique id of blob across all repositories
oid - blob id in repository
content - blob content
commit_sha - last actual commit sha

}

For search from blobs use type ‘blob’

# File lib/elasticsearch/git/repository.rb, line 64
# Index all text-like blobs changed between two revisions.
#
# Walks the diff between from_rev and to_rev: deleted files are removed
# from the index, everything else is (re)indexed at the target sha.
# Submodule entries (git mode 160000) are skipped.
def index_blobs(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
  from, to = parse_revs(from_rev, to_rev)
  diff = repository_for_indexing.diff(from, to)

  submodule_mode = "160000"

  diff.deltas.reverse.each_with_index do |delta, step|
    if delta.status == :deleted
      next if delta.old_file[:mode].to_s(8) == submodule_mode
      delete_from_index_blob(LiteBlob.new(repository_for_indexing, delta.old_file))
    else
      next if delta.new_file[:mode].to_s(8) == submodule_mode
      index_blob(LiteBlob.new(repository_for_indexing, delta.new_file), to)
    end

    # Trigger GC every 100 deltas to keep memory bounded on huge diffs.
    ObjectSpace.garbage_collect if step % 100 == 0
  end
end
index_blobs_array() click to toggle source

Indexing blob from current index

# File lib/elasticsearch/git/repository.rb, line 235
# Build the array of blob documents for the current index state.
#
# Bare repositories are walked via the HEAD commit's tree; non-bare
# repositories are read from the git index. Only text blobs are kept.
def index_blobs_array
  blobs = []

  head_sha = repository_for_indexing.head.target.oid

  if repository_for_indexing.bare?
    root_tree = repository_for_indexing.lookup(head_sha).tree
    blobs.push(recurse_blobs_index_hash(root_tree))
  else
    repository_for_indexing.index.each do |entry|
      lite = LiteBlob.new(repository_for_indexing, entry)
      next unless lite.text?

      blobs.push(
        {
          type: 'blob',
          id: "#{head_sha}_#{lite.path}",
          rid: repository_id,
          oid: lite.id,
          content: lite.data,
          commit_sha: head_sha
        }
      )
    end
  end

  blobs
end
index_commit(commit) click to toggle source
# File lib/elasticsearch/git/repository.rb, line 178
# Index a single commit under id "<repository_id>_<commit sha>".
#
# commit - a Rugged-style commit (#oid, #author, #committer, #message).
#
# Transient indexing failures are retried up to 10 times with increasing
# sleeps before a warning is logged.
def index_commit(commit)
  tries = 0
  begin
    client_for_indexing.index \
      index: "#{self.class.index_name}",
      type: "repository",
      id: "#{repository_id}_#{commit.oid}",
      body: {
        commit: {
          type: "commit",
          rid: repository_id,
          sha: commit.oid,
          author: commit.author,
          committer: commit.committer,
          message: encode!(commit.message)
        }
      }
  rescue StandardError => ex
    # Was `rescue Exception` — that also caught SignalException/SystemExit,
    # making the process hard to kill during the retry loop.
    # Retry 10 times send request
    if tries < 10
      tries += 1
      sleep tries * 10 * rand(10)
      retry
    else
      logger.warn "Can't index #{repository_id}_#{commit.oid}. Reason: #{ex.message}"
    end
  end
end
index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid) click to toggle source

Indexing all commits in repository

All data is stored in the global index. Repository data can be filtered by the ‘rid’ field; if desired, this field can be used to store a ‘project’ id.

commit {

sha - commit sha
author {
  name - commit author name
  email - commit author email
  time - commit time
}
committer {
  name - committer name
  email - committer email
  time - commit time
}
message - commit message

}

For search from commits use type ‘commit’

# File lib/elasticsearch/git/repository.rb, line 159
# Index every commit reachable in the given revision range.
#
# from_rev / to_rev - revision bounds; a nil "from" indexes full history.
#
# Returns the number of commits indexed, or 0 when `git log` fails.
def index_commits(from_rev: nil, to_rev: repository_for_indexing.last_commit.oid)
  from, to = parse_revs(from_rev, to_rev)
  range = [from, to].compact.join('..')

  # Pass arguments as an argv array (no shell) so crafted revision/ref
  # names cannot inject shell commands; the original interpolated the
  # range into a shell string.
  cmd = ['git', 'log', '--format=%H']
  cmd << range unless range.empty?
  out, err, status = Open3.capture3(*cmd, chdir: repository_for_indexing.path)

  if status.success? && err.blank?
    #TODO use rugged walker!!!
    commit_oids = out.split("\n")

    commit_oids.each_with_index do |commit, step|
      index_commit(repository_for_indexing.lookup(commit))
      # GC every 100 commits to keep memory bounded on large histories.
      ObjectSpace.garbage_collect if step % 100 == 0
    end
    return commit_oids.count
  end

  0
end
index_commits_array() click to toggle source

Lookup all object ids for commit objects

# File lib/elasticsearch/git/repository.rb, line 288
# Walk every object id in the repository and collect a document hash
# for each commit object.
#
# Returns an Array of commit hashes (:type, :sha, :author, :committer,
# :message).
def index_commits_array
  commits = []

  repository_for_indexing.each_id do |oid|
    object = repository_for_indexing.lookup(oid)
    next unless object.type == :commit

    commits.push(
      {
        type: 'commit',
        sha: object.oid,
        author: object.author,
        committer: object.committer,
        message: encode!(object.message)
      }
    )
  end

  commits
end
index_new_branch?(from) click to toggle source
# File lib/elasticsearch/git/repository.rb, line 221
# A push creating a new branch reports an all-zero 40-character "from"
# sha; detect that sentinel value.
def index_new_branch?(from)
  from == ('0' * 40)
end
logger() click to toggle source
# File lib/elasticsearch/git/repository.rb, line 367
# Memoized logger writing to standard output.
def logger
  @logger = Logger.new(STDOUT) unless @logger
  @logger
end
parse_revs(from_rev, to_rev) click to toggle source
# File lib/elasticsearch/git/repository.rb, line 207
# Resolve the effective (from, to) revision pair for an index run.
#
# For an existing branch the given from_rev is used unchanged. For a new
# branch (all-zero from sha): indexing up to the current HEAD means full
# history (nil from), otherwise we start from the merge base with HEAD.
def parse_revs(from_rev, to_rev)
  return from_rev, to_rev unless index_new_branch?(from_rev)

  if to_rev == repository_for_indexing.last_commit.oid
    [nil, to_rev]
  else
    [merge_base(to_rev), to_rev]
  end
end
path_to_repo() click to toggle source
# File lib/elasticsearch/git/repository.rb, line 331
# Filesystem path of the repository being indexed.
#
# Raises NotImplementedError when no path was configured, either by
# overriding this method or via #repository_for_indexing.
def path_to_repo
  raise NotImplementedError, 'Please, define "path_to_repo" method, or set "path_to_repo" via "repository_for_indexing" method' if @path_to_repo.blank?

  @path_to_repo
end
recurse_blobs_index_hash(tree, path = "") click to toggle source
# File lib/elasticsearch/git/repository.rb, line 262
# Recursively collect blob documents for a tree and all nested subtrees.
#
# tree - a Rugged tree object.
# path - accumulated directory prefix ("" for the root tree).
#
# Returns a flat Array of blob hashes for all text blobs under the tree.
def recurse_blobs_index_hash(tree, path = "")
  result = []

  tree.each_blob do |blob|
    blob[:path] = path + blob[:name]
    b = LiteBlob.new(repository_for_indexing, blob)
    result.push(
      {
        type: 'blob',
        id: "#{repository_for_indexing.head.target.oid}_#{path}#{blob[:name]}",
        rid: repository_id,
        oid: b.id,
        content: b.data,
        commit_sha: repository_for_indexing.head.target.oid
      }
    ) if b.text?
  end

  tree.each_tree do |nested_tree|
    # Bug fix: the recursion previously passed only "#{nested_tree[:name]}/",
    # dropping the accumulated parent prefix — blobs nested two or more
    # directories deep were indexed under a truncated path/id.
    result.push(recurse_blobs_index_hash(repository_for_indexing.lookup(nested_tree[:oid]), "#{path}#{nested_tree[:name]}/"))
  end

  result.flatten
end
repository_for_indexing(repo_path = "") click to toggle source
# File lib/elasticsearch/git/repository.rb, line 340
# Memoized Rugged repository handle used for all indexing work.
#
# repo_path - path used on first call when @path_to_repo is not already
#             set; ignored on subsequent calls.
def repository_for_indexing(repo_path = "")
  unless defined?(@rugged_repo_indexer)
    @path_to_repo ||= repo_path
    set_repository_id
    @rugged_repo_indexer = Rugged::Repository.new(@path_to_repo)
  end

  @rugged_repo_indexer
end
repository_id() click to toggle source

For Overwrite

# File lib/elasticsearch/git/repository.rb, line 321
def repository_id
  @repository_id
end
set_repository_id(id = nil) click to toggle source

The repository id is used to distinguish data from different repositories. Update this value if needed.

# File lib/elasticsearch/git/repository.rb, line 316
# Set the repository identifier stored on every indexed document.
#
# id - explicit identifier; defaults to the repository path when nil.
def set_repository_id(id = nil)
  @repository_id = id || path_to_repo
end

Private Instance Methods

merge_base(to_rev) click to toggle source
# File lib/elasticsearch/git/repository.rb, line 373
# Find the merge base between +to_rev+ and the current HEAD commit of
# the repository being indexed.
def merge_base(to_rev)
  repository_for_indexing.merge_base(to_rev, repository_for_indexing.last_commit.oid)
end