class Ikra::Translator::CommandTranslator

Attributes

environment_builder[R]
kernel_launcher_stack[R]
object_tracer[R]
program_builder[R]
root_command[R]

Public Class Methods

new(root_command:) click to toggle source
# File lib/translator/commands/command_translator.rb, line 56
# Prepares a translator for the command tree rooted at +root_command+.
#
# Starts with an empty kernel launcher stack and a fresh
# [EnvironmentBuilder]. The [ProgramBuilder] is created with that same
# environment builder and the root command.
def initialize(root_command:)
    @root_command = root_command
    @kernel_launcher_stack = []
    @environment_builder = EnvironmentBuilder.new

    # The program builder shares the environment builder created above
    @program_builder = ProgramBuilder.new(
        environment_builder: @environment_builder,
        root_command: @root_command)
end
next_unique_id() click to toggle source
# File lib/translator/commands/command_translator.rb, line 14
# Increments the class-wide counter and returns its new value.
#
# The counter is read before it is written here, so it has to be
# initialized somewhere else before the first call.
def self.next_unique_id
    @@unique_id += 1
end
translate_command(command) click to toggle source

Entry point for translator. Returns a [ProgramBuilder], which contains all required information for compiling and executing the CUDA program.

# File lib/translator/commands/command_translator.rb, line 44
# Translates +command+ with a fresh translator instance and returns the
# translator's program builder once the translation has run.
def self.translate_command(command)
    translator = new(root_command: command)
    translator.start_translation
    translator.program_builder
end

Public Instance Methods

build_command_translation_result( execution: "", result:, command:) click to toggle source
# File lib/translator/commands/command_translator.rb, line 226
# Wraps the translation of +command+ in a [CommandTranslationResult].
#
# If the command is not marked as `keep`, +execution+ and +result+ are passed
# through unchanged. Otherwise, the result expression is first stored in a
# temporary variable (named after the command's unique ID), that variable
# becomes the result, and the cached result is registered with the current
# kernel builder, the current kernel launcher and the environment builder.
def build_command_translation_result(
    execution: "", result:, command:)

    type = command.result_type
    id = command.unique_id

    unless command.keep
        # Nothing to cache: hand execution and result through as they are
        return CommandTranslationResult.new(
            execution: execution,
            result: result,
            command: command)
    end

    # Store result in a temporary variable, so that it can be cached
    # TODO: Remove DEBUG
    kept_result = "#{Constants::TEMP_RESULT_IDENTIFIER}#{id}"
    kept_execution = "#{execution}\n        #{type.to_c_type} #{kept_result} = #{result};"

    kernel_builder.add_cached_result(id.to_s, type)
    kernel_launcher.add_cached_result(id.to_s, type)
    environment_builder.add_previous_result_type(id, type)

    CommandTranslationResult.new(
        execution: kept_execution,
        result: kept_result,
        command: command)
end
kernel_builder() click to toggle source
# File lib/translator/commands/command_translator.rb, line 98
# Kernel builder of the launcher that is currently on top of the kernel
# launcher stack.
def kernel_builder
    current_launcher = kernel_launcher_stack.last
    current_launcher.kernel_builder
end
kernel_launcher() click to toggle source
# File lib/translator/commands/command_translator.rb, line 94
# Launcher that is currently on top of the kernel launcher stack (nil if the
# stack is empty).
def kernel_launcher
    kernel_launcher_stack.last
end
pop_kernel_launcher(command_translation_result) click to toggle source

Pops a KernelLauncher from the kernel launcher stack. This method is called when all blocks (parallel sections) for that kernel have been translated, i.e., the kernel is fully built.

# File lib/translator/commands/command_translator.rb, line 128
# Pops the topmost kernel launcher from the kernel launcher stack, completes
# its kernel builder with the given translation result (block invocation,
# execution and result type) and registers the launcher with the program
# builder.
#
# Returns the popped launcher. Raises [AssertionError] if the stack is empty.
def pop_kernel_launcher(command_translation_result)
    previous_launcher = kernel_launcher_stack.pop

    # This check has to come before the launcher is used: popping an empty
    # stack yields nil, and calling a method on nil would fail with a
    # NoMethodError instead of this error.
    if previous_launcher.nil?
        raise AssertionError.new("Attempt to pop kernel launcher, but stack is empty")
    end

    finished_builder = previous_launcher.kernel_builder
    finished_builder.block_invocation = command_translation_result.result
    finished_builder.execution = command_translation_result.execution
    finished_builder.result_type = command_translation_result.result_type

    program_builder.add_kernel_launcher(previous_launcher)

    return previous_launcher
end
push_kernel_launcher(kernel_builder: nil, kernel_launcher: nil) click to toggle source
# File lib/translator/commands/command_translator.rb, line 112
# Pushes a kernel launcher onto the kernel launcher stack.
#
# - Only +kernel_launcher+ given: that launcher is pushed as is.
# - Only +kernel_builder+ given: a new [KernelLauncher] wrapping it is pushed.
# - Neither given: a new [KernelLauncher] with a new [KernelBuilder] is pushed.
# - Both given: raises [ArgumentError].
def push_kernel_launcher(kernel_builder: nil, kernel_launcher: nil)
    unless kernel_builder.nil? || kernel_launcher.nil?
        raise ArgumentError.new("kernel_builder and kernel_laucher given but only expected one")
    end

    new_launcher =
        if kernel_launcher.nil?
            # No launcher given: wrap the given builder (default: a new one)
            KernelLauncher.new(kernel_builder.nil? ? KernelBuilder.new : kernel_builder)
        else
            kernel_launcher
        end

    @kernel_launcher_stack.push(new_launcher)
end
start_translation() click to toggle source
# File lib/translator/commands/command_translator.rb, line 68
# Runs the complete translation of the root command:
#
# 1. Traces all objects reachable from the root command.
# 2. Pushes a kernel launcher, lets the root command accept this translator
#    as a visitor, and pops the launcher with the translation result (the
#    visit might push/pop additional launchers).
# 3. Registers the SoA arrays of the object tracer with the environment
#    builder.
def start_translation
    Log.info("CommandTranslator: Starting translation...")

    # Object tracing
    @object_tracer = TypeInference::ObjectTracer.new(root_command)
    object_tracer.trace_all

    # Translation, wrapped in its own kernel launcher
    push_kernel_launcher
    root_translation = root_command.accept(self)
    pop_kernel_launcher(root_translation)

    # Environment set-up for SoA arrays
    object_tracer.register_soa_arrays(environment_builder)
end
translate_entire_input(command) click to toggle source
# File lib/translator/commands/command_translator.rb, line 145
# Translates every input of +command+ and bundles the individual results in
# an [EntireInputTranslationResult].
#
# The position of an input is passed on as its parameter offset, i.e., every
# input is assumed to consume exactly one parameter.
def translate_entire_input(command)
    translated_inputs = command.input.map.with_index do |single_input, position|
        single_input.translate_input(
            parent_command: command,
            command_translator: self,
            start_eat_params_offset: position)
    end

    EntireInputTranslationResult.new(translated_inputs)
end
translate_input(input) click to toggle source

Processes a [Symbolic::Input] object, which contains a reference to a command object and information about how elements are accessed. If elements are only accessed according to the current thread ID, this input can be fused. Otherwise, a new kernel will be built.

# File lib/translator/commands/command_translator.rb, line 161
# Translates a single input of a command and returns a
# [CommandTranslationResult] for it.
#
# If the input's command already has a previous (kept) result, that result is
# registered with the environment builder and read from the environment. For
# the `:tid` pattern the access to the current thread's cell is returned
# right away.
#
# Otherwise the behavior depends on the access pattern:
# - `:tid`: the command is translated inside the current kernel.
# - `:entire`: the command is translated in a kernel of its own (unless its
#   result is cached), and the result array is passed to the current kernel
#   as a parameter.
# - anything else: raises [NotImplementedError].
def translate_input(input)
    command = input.command
    pattern = input.pattern
    source_result = ""

    if command.has_previous_result?
        # Read previously computed (cached) value
        Log.info("Reusing kept result for command #{command.unique_id}: #{command.gpu_result_pointer}")

        environment_builder.add_previous_result(
            command.unique_id, command.gpu_result_pointer)
        environment_builder.add_previous_result_type(
            command.unique_id, command.result_type)

        # Only thread-wise access indexes into the cached array
        element_suffix = pattern == :tid ? "[_tid_]" : ""

        kernel_launcher.configure_grid(command.size)
        source_result = CommandTranslationResult.new(
            execution: "",
            result: "((#{command.result_type.to_c_type} *)(_env_->prev_#{command.unique_id}))#{element_suffix}",
            command: command)

        return source_result if pattern == :tid
    end

    case pattern
    when :tid
        # Stay in current kernel
        command.accept(self)
    when :entire
        if command.has_previous_result?
            kernel_launcher.use_cached_result(
                command.unique_id, command.result_type)
            kernel_var = "prev_#{command.unique_id}"
        else
            # Translate the command in a kernel of its own
            push_kernel_launcher

            source_result = command.accept(self)
            kernel_var = kernel_launcher.kernel_result_var_name

            pop_kernel_launcher(source_result)
        end

        # The current kernel receives the previous result as a parameter
        kernel_launcher.add_previous_kernel_parameter(Variable.new(
            name: kernel_var,
            type: source_result.result_type))

        # This is a root command for this kernel, determine grid/block dimensions
        kernel_launcher.configure_grid(command.size, block_size: command.block_size)

        CommandTranslationResult.new(
            result: kernel_var,
            command: command)
    else
        raise NotImplementedError.new("Unknown input pattern: #{pattern}")
    end
end
visit_array_combine_command(command) click to toggle source
# File lib/translator/commands/array_combine_command.rb, line 4
# Translates an ArrayCombineCommand: translates all inputs and the block of
# the command, adds the block (and its auxiliary methods) to the current
# kernel builder and returns a translation result whose result expression
# invokes the block function with the environment and the input results.
def visit_array_combine_command(command)
    Log.info("Translating ArrayCombineCommand [#{command.unique_id}]")

    super

    # Dependent computations (receiver and arguments)
    translated_input = translate_entire_input(command)

    # Variables accessed by the block are looked up in the environment under
    # the unique ID of this command.
    scoped_environment = @environment_builder[command.unique_id]

    translated_block = Translator.translate_block(
        block_def_node: command.block_def_node,
        environment_builder: scoped_environment,
        lexical_variables: command.lexical_externals,
        command_id: command.unique_id,
        entire_input_translation: translated_input)

    kernel_builder.add_methods(translated_block.aux_methods)
    kernel_builder.add_block(translated_block.block_source)

    # Block invocation: environment first, then one argument per input
    invocation = "#{translated_block.function_name}(#{["_env_", *translated_input.result].join(", ")})"

    translation = build_command_translation_result(
        execution: translated_input.execution,
        result: invocation,
        command: command)

    Log.info("DONE translating ArrayCombineCommand [#{command.unique_id}]")

    translation
end
visit_array_command(command) click to toggle source

— Actual Visitor parts start here —

# File lib/translator/commands/command_translator.rb, line 105
# Allocates a slot for the result pointer in the environment if the result of
# +command+ should be kept and there is no previous result yet. Does nothing
# (and returns nil) otherwise.
def visit_array_command(command)
    return unless command.keep && !command.has_previous_result?

    environment_builder.allocate_previous_pointer(command.unique_id)
end
visit_array_identity_command(command) click to toggle source
# File lib/translator/commands/array_identity_command.rb, line 4
# Translates an ArrayIdentityCommand: configures the grid of the current
# kernel launcher, adds the (converted) base array to the environment and
# returns a translation result that reads the current thread's element of
# that base array.
def visit_array_identity_command(command)
    Log.info("Translating ArrayIdentityCommand [#{command.unique_id}]")

    super

    # Root command of this kernel: its size determines grid/block dimensions
    kernel_launcher.configure_grid(command.size, block_size: command.block_size)

    # A union type is needed unless the base type is a singleton type
    union_type_needed = !command.base_type.is_singleton?
    converted_array = object_tracer.convert_base_array(
        command.input.first.command, union_type_needed)
    environment_builder.add_base_array(command.unique_id, converted_array)

    element_access = "#{Constants::ENV_IDENTIFIER}->#{EnvironmentBuilder.base_identifier(command.unique_id)}[_tid_]"
    translation = build_command_translation_result(
        result: element_access,
        command: command)

    Log.info("DONE translating ArrayIdentityCommand [#{command.unique_id}]")

    translation
end
visit_array_index_command(command) click to toggle source

Translate the block of an `Array.pnew` section.

# File lib/translator/commands/array_index_command.rb, line 5
# Translates an ArrayIndexCommand. The result expression of the translation
# is the index of the current thread:
#
# - one dimension: the thread ID itself (`_tid_`),
# - more dimensions: an inline initialization of the command's (struct)
#   result type with one index expression per dimension. The struct type is
#   added to the program builder, so that its definition gets generated.
def visit_array_index_command(command)
    Log.info("Translating ArrayIndexCommand [#{command.unique_id}]")

    super

    # This is a root command, determine grid/block dimensions. Configuring the
    # grid once is sufficient: size and block size do not change below.
    kernel_launcher.configure_grid(command.size, block_size: command.block_size)

    num_dims = command.dimensions.size

    if num_dims > 1
        # Index in dimension d: divide the thread ID by the number of elements
        # in all later dimensions, then wrap around at the size of dimension d.
        index_generators = (0...num_dims).map do |dim_index|
            index_div = command.dimensions.drop(dim_index + 1).reduce(1, :*)
            index_mod = command.dimensions[dim_index]

            if dim_index > 0
                "(_tid_ / #{index_div}) % #{index_mod}"
            else
                # No modulo required for first dimension
                "_tid_ / #{index_div}"
            end
        end

        # Retrieve type that was generated earlier
        zipped_type_singleton = command.result_type.singleton_type
        result = zipped_type_singleton.generate_inline_initialization(index_generators)

        # Add struct type to program builder, so that we can generate the source code
        # for its definition.
        program_builder.structs.add(zipped_type_singleton)
    else
        result = "_tid_"
    end

    command_translation = CommandTranslationResult.new(
        result: result,
        command: command)

    Log.info("DONE translating ArrayIndexCommand [#{command.unique_id}]")

    return command_translation
end
visit_array_reduce_command(command) click to toggle source
# File lib/translator/commands/array_reduce_command.rb, line 4
# Translates an ArrayReduceCommand.
#
# The block of the command is translated and added to the current kernel
# builder, which also receives an additional "odd" parameter. Every kernel
# launch reduces its input with half as many threads as there are elements
# (rounded up); "odd" tells the kernel whether the number of elements is odd.
#
# - If the input size is an integer, all launches are generated statically:
#   additional launchers (sharing the kernel builder) are pushed and popped
#   while more threads than +block_size+ are required, and the current
#   launcher performs the last launch.
# - Otherwise (the size is an expression), one regular launcher is generated
#   for the first launch, followed by a [WhileLoopKernelLauncher] that keeps
#   reducing at runtime while more than one element is left.
#
# Raises [AssertionError] unless the command has exactly one input. Returns a
# [CommandTranslationResult] whose execution is the instantiated
# `reduce_body.cpp` template.
def visit_array_reduce_command(command)
    Log.info("Translating ArrayReduceCommand [#{command.unique_id}]")

    super

    if command.input.size != 1
        raise AssertionError.new("Expected exactly one input for ArrayReduceCommand")
    end

    # Process dependent computation (receiver)
    input = translate_entire_input(command)

    block_size = command.block_size

    # All variables accessed by this block should be prefixed with the unique ID
    # of the command in the environment.
    env_builder = @environment_builder[command.unique_id]

    block_translation_result = Translator.translate_block(
        block_def_node: command.block_def_node,
        environment_builder: env_builder,
        lexical_variables: command.lexical_externals,
        command_id: command.unique_id,
        entire_input_translation: input)

    kernel_builder.add_methods(block_translation_result.aux_methods)
    kernel_builder.add_block(block_translation_result.block_source)

    # Add "odd" parameter to the kernel which is needed for reduction
    kernel_builder.add_additional_parameters(Constants::ODD_TYPE + " " + Constants::ODD_IDENTIFIER)

    # Number of elements that will be reduced
    num_threads = command.input_size

    # `Integer` instead of `Fixnum`: the `Fixnum` constant no longer exists in
    # current Ruby versions, and every former Fixnum is an Integer.
    if num_threads.is_a?(Integer)
        # Easy case: Number of required reductions known statically

        odd = (num_threads % 2 == 1).to_s

        # Number of threads needed for reduction
        num_threads = num_threads.fdiv(2).ceil

        previous_result_kernel_var = input.result.first
        first_launch = true

        # While more kernel launches than one are needed to finish reduction
        while num_threads >= block_size + 1
            # Launch new kernel (with same kernel builder)
            push_kernel_launcher(kernel_builder: kernel_builder)
            # Configure kernel with correct arguments and grid
            kernel_launcher.add_additional_arguments(odd)
            kernel_launcher.configure_grid(num_threads, block_size: block_size)

            # First launch of kernel is supposed to allocate new memory, so only reuse memory after first launch
            if first_launch
                first_launch = false
            else
                kernel_launcher.reuse_memory!(previous_result_kernel_var)
            end

            previous_result_kernel_var = kernel_launcher.kernel_result_var_name

            pop_kernel_launcher(input.command_translation_result(0))

            # Update number of threads needed
            num_threads = num_threads.fdiv(block_size).ceil
            odd = (num_threads % 2 == 1).to_s
            num_threads = num_threads.fdiv(2).ceil
        end

        # Configuration for last launch of kernel
        kernel_launcher.add_additional_arguments(odd)
        kernel_launcher.configure_grid(num_threads, block_size: block_size)

        # The last launch may only reuse memory if there was a launch before it
        reuse_memory = !first_launch
    else
        # More difficult case: Have to generate loop for reductions

        # Add one regular kernel launcher for setting up the memory etc.
        odd_first = "(#{num_threads} % 2 == 1)"
        num_threads_first = "((int) ceil(#{num_threads} / 2.0))"
        push_kernel_launcher(kernel_builder: kernel_builder)
        kernel_launcher.add_additional_arguments(odd_first)
        kernel_launcher.configure_grid(num_threads_first, block_size: block_size)
        previous_result_kernel_var = kernel_launcher.kernel_result_var_name
        pop_kernel_launcher(input.command_translation_result(0))

        # Add loop
        # Set up state (variables that are updated inside the loop)
        # 1. Calculate number of elements from previous computation
        # 2. Check if odd number
        # 3. Calculate number of threads that we need
        loop_setup = "int _num_elements = ceil(#{num_threads_first} / (double) #{block_size});\nbool _next_odd = _num_elements % 2 == 1;\nint _next_threads = ceil(_num_elements / 2.0);\n"

        # Update loop state after iteration. All three variables are declared
        # in `loop_setup`, so they are only assigned here: declaring
        # `_next_odd` a second time would introduce another variable (or clash
        # with the first one, depending on where the launcher emits this code)
        # instead of updating the loop state. "Odd" means remainder 1, exactly
        # as in `loop_setup`.
        update_loop = "_num_elements = ceil(_next_threads / (double) #{block_size});\n_next_odd = _num_elements % 2 == 1;\n_next_threads = ceil(_num_elements / 2.0);\n"

        # NOTE(review): the loop launcher is not popped in this method;
        # presumably the caller's pop finalizes it -- confirm.
        push_kernel_launcher(kernel_launcher: WhileLoopKernelLauncher.new(
            kernel_builder: kernel_builder,
            condition: "_num_elements > 1",
            before_loop: loop_setup,
            post_iteration: update_loop))

        kernel_launcher.add_additional_arguments("_next_odd")
        kernel_launcher.configure_grid("_next_threads", block_size: block_size)

        # The loop always runs after the regular launch above
        reuse_memory = true
    end

    if reuse_memory
        kernel_launcher.reuse_memory!(previous_result_kernel_var)
    end

    command_execution = Translator.read_file(file_name: "reduce_body.cpp", replacements: {
        "previous_result" => input.result.first,
        "block_name" => block_translation_result.function_name,
        "arguments" => Constants::ENV_IDENTIFIER,
        "block_size" => block_size.to_s,
        "temp_result" => Constants::TEMP_RESULT_IDENTIFIER,
        "odd" => Constants::ODD_IDENTIFIER,
        "type" => command.result_type.to_c_type,
        "num_threads" => Constants::NUM_THREADS_IDENTIFIER})

    command_translation = CommandTranslationResult.new(
        execution: command_execution,
        result:  Constants::TEMP_RESULT_IDENTIFIER,
        command: command)

    Log.info("DONE translating ArrayReduceCommand [#{command.unique_id}]")

    return command_translation
end
visit_array_stencil_command(command) click to toggle source
# File lib/translator/commands/array_stencil_command.rb, line 4
# Translates an ArrayStencilCommand.
#
# The generated code first computes the index of the current thread in every
# dimension. If applying any stencil offset would leave the array bounds in
# some dimension, the command's out-of-range value is used. Otherwise, the
# block is invoked with one element of the previous result per offset,
# followed by the results of the remaining inputs.
#
# Dimension sizes given as strings are not compile-time constants; they are
# passed to the kernel as additional `int` parameters.
def visit_array_stencil_command(command)
    Log.info("Translating ArrayStencilCommand [#{command.unique_id}]")

    super

    num_dims = command.dimensions.size

    # Dependent computations (receiver and arguments)
    translated_input = translate_entire_input(command)

    # One block parameter per stencil offset
    num_parameters = command.offsets.size

    # Variables accessed by the block are looked up in the environment under
    # the unique ID of this command.
    scoped_environment = @environment_builder[command.unique_id]

    translated_block = Translator.translate_block(
        block_def_node: command.block_def_node,
        environment_builder: scoped_environment,
        lexical_variables: command.lexical_externals,
        command_id: command.unique_id,
        entire_input_translation: translated_input)

    kernel_builder.add_methods(translated_block.aux_methods)
    kernel_builder.add_block(translated_block.block_source)

    # One statement per dimension, storing the thread's index in that dimension
    compute_indices = (0...num_dims).map do |dim_index|
        elements_behind = command.dimensions.drop(dim_index + 1).reduce(1, :*)

        index_expr =
            if dim_index > 0
                "(_tid_ / #{elements_behind}) % #{command.dimensions[dim_index]}"
            else
                # No modulo required for first dimension
                "_tid_ / #{elements_behind}"
            end

        "int temp_stencil_dim_#{dim_index} = #{index_expr};"
    end.join("\n")

    # Condition that holds if all offsets stay within bounds in all dimensions
    out_of_bounds_check = (0...num_dims).map do |dim_index|
        lowest_offset, highest_offset =
            command.offsets.map { |offset| offset[dim_index] }.minmax

        extent = command.dimensions[dim_index]

        if extent.is_a?(String)
            # Not a compile-time constant: pass the dimension size as an
            # argument to the kernel.
            extent_expr = "dim_size_#{dim_index}"
            kernel_builder.add_additional_parameters("int #{extent_expr}")
            kernel_launcher.add_additional_arguments(extent)
        else
            extent_expr = extent
        end

        "temp_stencil_dim_#{dim_index} + #{lowest_offset} >= 0 && temp_stencil_dim_#{dim_index} + #{highest_offset} < #{extent_expr}"
    end.join(" && ")

    # Expression for the array that holds the result of the previous computation
    previous_result = translated_input.result(0)

    arguments = ["_env_"]

    # One element of the previous result per offset. The flat index is
    # reconstructed from the per-dimension indices, last dimension first.
    num_parameters.times do |param_index|
        stride = 1

        index_terms = (num_dims - 1).downto(0).map do |dim_index|
            term = "(temp_stencil_dim_#{dim_index} + #{command.offsets[param_index][dim_index]}) * #{stride}"

            extent = command.dimensions[dim_index]

            if extent.is_a?(String)
                Log.warn("Cannot handle multi-dimensional stencil computations in host sections yet.")
            else
                stride *= extent
            end

            term
        end

        arguments.push("#{previous_result}[#{index_terms.join(" + ")}]")
    end

    # Remaining inputs (e.g., index) are appended as additional arguments
    arguments.push(*translated_input.result(1..-1))
    stencil_computation = "#{translated_block.function_name}(#{arguments.join(", ")})"

    temp_var_name = "temp_stencil_#{CommandTranslator.next_unique_id}"

    # The template falls back to the out-of-range value if at least one index
    # is out of bounds and executes the block otherwise.
    stencil_execution = Translator.read_file(file_name: "stencil_body.cpp", replacements: {
        "execution" => translated_input.execution,
        "temp_var" => temp_var_name,
        "result_type" => command.result_type.to_c_type,
        "compute_indices" => compute_indices,
        "out_of_bounds_check" => out_of_bounds_check,
        "out_of_bounds_fallback" => command.out_of_range_value.to_s,
        "stencil_computation" => stencil_computation})

    translation = build_command_translation_result(
        execution: stencil_execution,
        result: temp_var_name,
        command: command)

    Log.info("DONE translating ArrayStencilCommand [#{command.unique_id}]")

    translation
end
visit_array_zip_command(command) click to toggle source
# File lib/translator/commands/array_zip_command.rb, line 4
# Translates an ArrayZipCommand: translates all inputs and combines their
# results in an inline initialization of the command's (struct) result type.
# The struct type is added to the program builder, so that the source code
# for its definition can be generated.
def visit_array_zip_command(command)
    Log.info("Translating ArrayZipCommand [#{command.unique_id}]")

    super

    # Dependent computations (receiver and arguments)
    translated_input = translate_entire_input(command)

    # Ikra struct type holding one field per zipped input
    struct_type = command.result_type.singleton_type
    program_builder.structs.add(struct_type)

    translation = CommandTranslationResult.new(
        execution: translated_input.execution,
        result: struct_type.generate_inline_initialization(translated_input.result),
        command: command)

    Log.info("DONE translating ArrayZipCommand [#{command.unique_id}]")

    translation
end