class HsdsTransformer::BaseTransformer
Constants
- SUPPORTED_HSDS_MODELS
Attributes
include_custom[R]
mapping[R]
Public Class Methods
new(args)
click to toggle source
TODO: validate that incoming data is valid-ish (for example, that IDs are unique).
# File lib/hsds_transformer/base_transformer.rb, line 15
# Builds a transformer from an options hash.
#
# args - Hash read with symbol keys:
#        :mapping        - mapping location, handed to parse_mapping
#        :include_custom - stored as-is (exposed through the include_custom reader)
#        :zip_output     - stored as-is; transform zips the output when it is truthy
#        The whole hash is also forwarded to set_file_paths.
def initialize(args)
  @mapping        = parse_mapping(args[:mapping])
  @include_custom = args[:include_custom]
  @zip_output     = args[:zip_output]

  # Every supported HSDS model starts out with its own empty collection,
  # held in an instance variable named after the model.
  SUPPORTED_HSDS_MODELS.each { |model_name| instance_variable_set("@" + model_name, []) }

  set_file_paths(args)
end
run(args)
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 10
# Convenience entry point: builds a transformer from +args+ and runs it right away.
# Returns whatever #transform returns.
def self.run(args)
  transformer = new(args)
  transformer.transform
end
Public Instance Methods
apply_custom_transformation()
click to toggle source
This is defined in custom transformer if there is one
# File lib/hsds_transformer/base_transformer.rb, line 66 def apply_custom_transformation end
transform()
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 29
# Runs the whole pipeline: transforms every mapped input file, applies the
# HSDS-specific and custom post-processing, makes sure the output directories
# exist, writes the output files and, when @zip_output is truthy, zips them.
#
# Returns self.
def transform
  # Initial transformation into HSDS
  mapping.each do |input_file_name, file_mapping|
    transform_file(input_file_name, file_mapping)
  end

  # HSDS additional formatting
  singletonize_languages
  apply_custom_transformation

  # Make the directories the output files are written into.
  # Dir.exist? is used because Dir.exists? was deprecated and removed in Ruby 3.2.
  Dir.mkdir(output_datapackage_path) unless Dir.exist?(output_datapackage_path)
  Dir.mkdir(output_data_path) unless Dir.exist?(output_data_path)

  # Write the data to CSV files
  write_output_files

  zip_output if @zip_output

  self
end
transform_file(input_file_name, file_mapping)
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 52
# Reads one input CSV (first line treated as headers) and folds every row into
# the per-model collections.
#
# input_file_name - file name, appended directly to @input_path
# file_mapping    - mapping entry for this file; only its "columns" key is read here
def transform_file(input_file_name, file_mapping)
  csv_path = @input_path + input_file_name
  column_mapping = file_mapping["columns"]

  # Rows are handled one at a time so that a single row can produce several
  # objects, including more than one object from the same row.
  CSV.foreach(csv_path, headers: true) do |row|
    collect_into_ivars(hsds_objects_from_row(row, column_mapping))
  end
end
Private Instance Methods
collect_into_ivars(collected_data)
click to toggle source
Now let's pop each object into its respective instance variable collection to be written to the right file
# File lib/hsds_transformer/base_transformer.rb, line 111
# Appends each object found in +collected_data+ to the collection kept for its
# model, so that it is later written to that model's file.
#
# collected_data - Hash keyed by model name; missing and empty entries are skipped
def collect_into_ivars(collected_data)
  SUPPORTED_HSDS_MODELS.each do |model|
    record = collected_data[model]
    next unless record
    next if record.empty?

    collection_ivar(model) << record
  end
end
collection_ivar(model)
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 117
# Returns the collection stored in the instance variable named after +model+
# (for example "@organizations" for "organizations").
#
# model - model name as a String or Symbol; interpolation is used so that a
#         Symbol works too instead of raising a TypeError on concatenation
def collection_ivar(model)
  instance_variable_get("@#{model}")
end
hsds_objects_from_row(input, org_mapping)
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 71
# Converts one input row into a hash of output objects keyed by model name.
#
# input       - the row; iterated as (input field name, value) pairs
# org_mapping - Hash from input field name to one output field definition or an
#               Array of them; each definition is read through the keys
#               "model", "field", "append" and "map"
#
# Returns a Hash shaped like { "organizations" => { field => value, ... }, ... }.
def hsds_objects_from_row(input, org_mapping)
  collected_data = {}

  input.each do |column_name, raw_value|
    definitions = org_mapping[column_name]
    # A lone definition is wrapped in an array to stay backwards compatible
    definitions = [definitions] unless definitions.is_a?(Array)

    # Columns without a mapping contribute nil here and are dropped by compact
    definitions.compact.each do |definition|
      model = definition["model"]
      field = definition["field"]
      record = (collected_data[model] ||= {})

      if definition["append"]
        # All fields marked "append" are concatenated into a single output string
        combined = record[field] || ""
        combined += raw_value.to_s unless null_type(raw_value)
        record[field] = combined
      else
        # An optional "map" translates the raw value before it is stored
        translation = definition["map"]
        value = translation ? translation[raw_value] : raw_value
        record[field] = null_type(value) ? nil : value
      end
    end
  end

  collected_data
end
null_type(string)
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 106
# Tells whether a value stands for "no value": either nil or the word "null"
# in any letter case, ignoring surrounding whitespace.
#
# string - usually a String or nil. Other values (for instance a number or a
#          boolean coming out of a mapping's "map") are compared through to_s
#          instead of raising NoMethodError.
#
# Returns true or false.
def null_type(string)
  string.nil? || string.to_s.downcase.strip == "null"
end
parse_mapping(mapping_path)
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 199
# Loads the YAML mapping from a URL or from a local file.
#
# mapping_path - an "http://" or "https://" URL, or a path to a local file.
#                Only a real URL scheme selects the network branch, so a local
#                file whose name merely begins with "http" is read from disk.
#
# Returns the parsed YAML document.
def parse_mapping(mapping_path)
  if mapping_path.to_s.start_with?("http://", "https://")
    # NOTE(review): this parses YAML downloaded from a remote host with
    # YAML.load. On Psych versions before 4 that can instantiate arbitrary
    # objects; consider YAML.safe_load if mappings may come from untrusted hosts.
    YAML.load Net::HTTP.get(URI(mapping_path))
  else
    YAML.load File.read(mapping_path)
  end
end
parse_path(path_chunks)
click to toggle source
Returns for example: ['tmp', 'input/data']
# File lib/hsds_transformer/base_transformer.rb, line 161
# Splits the second element of +path_chunks+ into its first directory and the
# rest of the path.
#
# path_chunks - Array whose element at index 1 is a "/"-separated path,
#               e.g. ["", "/tmp/input/data"]
#
# Returns [base_dir, remaining_path], for example ['tmp', 'input/data'].
# A path with no further segments gives "" as remaining_path, and a path with
# no segments at all (such as "/") gives [nil, ""] instead of raising.
def parse_path(path_chunks)
  # The segment before the first "/" is skipped (empty for a path that starts with "/")
  _leading, base_dir, *remaining_chunks = path_chunks[1].split("/")
  [base_dir, remaining_chunks.join("/")]
end
singletonize_languages()
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 122
# Rewrites @languages so that every row carries a single language: a row whose
# "language" value is a comma-separated list becomes one row per language, with
# the other fields copied over.
#
# Language names are stripped of surrounding whitespace, and blank entries
# (caused by a trailing or doubled comma) are dropped rather than turned into
# rows with an empty language. Rows without a comma are kept untouched.
#
# Returns the new @languages array.
def singletonize_languages
  @languages = @languages.flat_map do |language_row|
    raw_value = language_row["language"].to_s
    langs = raw_value.split(",").map(&:strip).reject(&:empty?)

    if raw_value.include?(",") && !langs.empty?
      # merge returns a fresh hash, so the source row is never mutated
      langs.map { |lang| language_row.merge("language" => lang) }
    else
      [language_row]
    end
  end
end
write_csv(path, headers, data)
click to toggle source
This also dedupes data by calling `uniq` on each collection before writing
# File lib/hsds_transformer/base_transformer.rb, line 189
# Writes +data+ to a CSV file at +path+, with +headers+ as the first line.
# Nothing is written when +data+ is empty. Duplicate rows are removed with
# `uniq`, and rows whose values are all nil or blank are skipped.
#
# path    - destination file path
# headers - Array of column names; also fixes the order of the values in a row
# data    - Array of rows, each responding to keys and values
def write_csv(path, headers, data)
  return if data.empty?

  CSV.open(path, 'wb') do |csv|
    csv << headers
    data.uniq.each do |row|
      # to_s keeps the blank check from raising on non-String values such as numbers
      next if row.values.all? { |value| value.to_s.strip.empty? }

      csv << CSV::Row.new(row.keys, row.values).values_at(*headers)
    end
  end
end
write_datapackage_json()
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 144
# Writes the datapackage JSON file for the output.
#
# The descriptor is inferred from the written data only when the output path
# lies inside the current working directory; the datapackage-rb library's
# "safe" file path requirements rule out anything else. Otherwise the contents
# of the default datapackage JSON file are written instead.
def write_datapackage_json
  package = DataPackage::Package.new

  # Splitting on Dir.pwd leaves an empty first chunk exactly when the output
  # path starts with the current directory.
  path_chunks = output_datapackage_path.split(Dir.pwd)

  content_to_write =
    if path_chunks[0] == ""
      base_dir, remaining_path = parse_path(path_chunks)
      package.infer(directory: "#{remaining_path}/data", base_path: base_dir).to_json
    else
      File.read(default_datapackage_json_path)
    end

  File.open(output_datapackage_file_path, "wb") { |file| file.write(content_to_write) }
end
write_output_files()
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 136
# Writes one CSV per supported HSDS model, then the datapackage JSON file.
# Each model's destination is read from its @output_<model>_path instance variable.
def write_output_files
  SUPPORTED_HSDS_MODELS.each do |model|
    output_path = instance_variable_get("@output_#{model}_path")
    rows = collection_ivar(model)
    # The header list is derived from the first collected row of the model
    write_csv(output_path, headers(rows.first, model), rows)
  end

  write_datapackage_json
end
zip_output()
click to toggle source
# File lib/hsds_transformer/base_transformer.rb, line 169
# Packs the datapackage JSON file and everything found under the output data
# directory into the zip archive named by zipfile_name. An archive already
# present at that path is deleted first.
#
# Data files are stored under "data/" by base name only.
def zip_output
  input_data_files = Dir.glob(File.join(output_data_path, "**/*"))

  # File.exist? is used because File.exists? was deprecated and removed in Ruby 3.2.
  File.delete(zipfile_name) if File.exist?(zipfile_name)

  Zip::File.open(zipfile_name, Zip::File::CREATE) do |zipfile|
    # Add datapackage.json
    zipfile.add("datapackage.json", output_datapackage_file_path)

    # Add data files
    input_data_files.each do |file_path|
      zipfile.add("data/#{File.basename(file_path)}", file_path)
    end
  end
end