class RBHive::SchemaDefinition
Constants
- INFINITY
- NAN
- TYPES
Attributes
schema[R]
Public Class Methods
new(schema, example_row)
click to toggle source
# File lib/rbhive/schema_definition.rb
# Builds a schema definition from a schema object and one sample result row.
#
# schema      - stored as-is; its fieldSchemas are read by column_names and
#               column_type_map.
# example_row - one tab-delimited result row as a String, or nil/false when
#               no row is available (treated as having no columns).
def initialize(schema, example_row)
  @schema = schema
  # A negative limit makes split keep trailing empty fields. Without it a row
  # whose last columns are empty strings reports fewer columns than it has,
  # and column_names relies on this length to detect columns missing from the
  # schema.
  @example_row = example_row ? example_row.split("\t", -1) : []
end
Public Instance Methods
coerce_column(column_name, value)
click to toggle source
# File lib/rbhive/schema_definition.rb
# Converts one raw column value according to the column's type.
#
# column_name - key into column_type_map.
# value       - the raw value for that column (normally a String).
#
# Returns, in order of precedence:
#   * INFINITY / -INFINITY / NAN for the literal markers "Infinity",
#     "-Infinity" and "NaN" on any non-string column;
#   * the result of coerce_complex_value for types whose name starts with
#     "array";
#   * value converted with the method named in TYPES for that type;
#   * value unchanged when TYPES has no entry for the type.
def coerce_column(column_name, value)
  type = column_type_map[column_name]

  unless type == :string
    return INFINITY if value == "Infinity"
    # Without this case "-Infinity" would fall through to the numeric
    # conversion below and come out as zero.
    # NOTE(review): assumes INFINITY is a Float so that unary minus yields
    # negative infinity - confirm against the constant's definition.
    return -INFINITY if value == "-Infinity"
    return NAN if value == "NaN"
  end

  return coerce_complex_value(value) if type.to_s.start_with?("array")

  conversion_method = TYPES[type]
  conversion_method ? value.send(conversion_method) : value
end
coerce_complex_value(value)
click to toggle source
# File lib/rbhive/schema_definition.rb
# Parses the textual form of a complex (array) column as JSON.
#
# value - String holding JSON text, or nil.
#
# Returns nil when value is nil, empty, or the bare word "null" in any
# letter case; otherwise the result of JSON.parse(value).
# Raises whatever JSON.parse raises for text that is not valid JSON.
def coerce_complex_value(value)
  return nil if value.nil? || value.empty?
  # Only lowercase `null` is valid JSON, so any other casing would make
  # JSON.parse raise. Mapping every casing to nil cannot hide a real value.
  # NOTE(review): upper-case "NULL" is presumably how some Hive result paths
  # render a null column - confirm against the Hive version in use.
  return nil if value.downcase == "null"

  JSON.parse(value)
end
coerce_row(row)
click to toggle source
# File lib/rbhive/schema_definition.rb
# Turns one tab-delimited result row into a Hash keyed by column name.
#
# row - String with one field per column, separated by tabs.
#
# Returns a Hash mapping each entry of column_names to the result of
# coerce_column for the field at the same position. Columns with no
# corresponding field are coerced from nil; fields beyond the last column
# name are ignored.
def coerce_row(row)
  # The negative limit keeps trailing empty fields, so an empty last column
  # is passed on as "" exactly like an empty column in the middle of the row
  # (a plain split would drop it and the column would be coerced from nil).
  fields = row.split("\t", -1)

  column_names.zip(fields).each_with_object({}) do |(column_name, value), coerced|
    coerced[column_name] = coerce_column(column_name, value)
  end
end
coerce_row_to_array(row)
click to toggle source
# File lib/rbhive/schema_definition.rb
# Lists the values of a row in column order.
#
# row - object indexed with each column name via [] (for example the Hash
#       built by coerce_row).
#
# Returns an Array with one element per entry of column_names, in that order.
def coerce_row_to_array(row)
  column_names.each_with_object([]) do |name, values|
    values << row[name]
  end
end
column_names()
click to toggle source
# File lib/rbhive/schema_definition.rb
# Returns the column names as an Array of Symbols, memoized after first use.
#
# Names come from @schema.fieldSchemas in schema order, with two adjustments:
#
# * Duplicate names. Hive can return identical column names (consider
#   SELECT a.foo, b.foo ...), which would trample each other once rows are
#   mapped into a Hash. Every occurrence of a repeated name gets a numeric
#   suffix in order of appearance: foo, foo => :foo_1, :foo_2. Names that
#   occur once are returned unchanged.
#
# * Columns missing from the schema. Hive doesn't return schema data for
#   partitions on SELECT * queries, so when the example row has more fields
#   than the schema, the extra columns are named :_p1, :_p2, ... to avoid
#   collisions.
def column_names
  @column_names ||= begin
    raw_names = @schema.fieldSchemas.map { |field| field.name }

    # First count how often each name occurs, then number only the repeats.
    occurrences = Hash.new(0)
    raw_names.each { |name| occurrences[name] += 1 }

    seen = Hash.new(0)
    names = raw_names.map do |name|
      if occurrences[name] > 1
        seen[name] += 1
        :"#{name}_#{seen[name]}"
      else
        name.to_sym
      end
    end

    # upto is a no-op when the schema already covers the whole example row.
    missing = @example_row.length - names.length
    1.upto(missing) { |index| names << :"_p#{index}" }

    names
  end
end
column_type_map()
click to toggle source
# File lib/rbhive/schema_definition.rb
# Returns a Hash mapping each column name (Symbol) to its type (Symbol),
# memoized after first use.
#
# Columns are paired with @schema.fieldSchemas by position rather than by
# name: column_names keeps schema order but renames duplicated columns
# (foo => :foo_1, :foo_2), so a lookup by name would never find them and
# would misreport their type.
#
# If the column isn't in the schema (eg partitions in SELECT * queries, which
# column_names appends after the schema columns) it is assumed to be a
# :string.
def column_type_map
  @column_type_map ||=
    column_names.zip(@schema.fieldSchemas).each_with_object({}) do |(name, definition), types|
      types[name] = definition ? definition.type.to_sym : :string
    end
end