class RubyXL::Parser

Public Class Methods

new(opts = {}) click to toggle source

:data_only allows only the sheet data to be parsed, so as to speed up parsing However, using this option will result in date-formatted cells being interpreted as numbers

# File lib/rubyXL/parser.rb, line 15
def initialize(opts = {})
  @data_only = opts.is_a?(TrueClass) || opts[:data_only]
  @skip_filename_check = opts[:skip_filename_check]
end
parse(file_path, opts = {}) click to toggle source
# File lib/rubyXL/parser.rb, line 9
def self.parse(file_path, opts = {})
  self.new(opts).parse(file_path)
end

Private Class Methods

attr_float(node, attr_name) click to toggle source
# File lib/rubyXL/parser.rb, line 295
def self.attr_float(node, attr_name) 
  attr = node.attributes[attr_name]
  attr && Float(attr.value)
end
attr_int(node, attr_name) click to toggle source
# File lib/rubyXL/parser.rb, line 290
def self.attr_int(node, attr_name) 
  attr = node.attributes[attr_name]
  attr && Integer(attr.value)
end
attr_string(node, attr_name) click to toggle source
# File lib/rubyXL/parser.rb, line 300
def self.attr_string(node, attr_name) 
  attr = node.attributes[attr_name]
  attr && attr.value
end

Public Instance Methods

data_only() click to toggle source
# File lib/rubyXL/parser.rb, line 20
def data_only
  @data_only = true
  self
end
parse(file_path, opts = {}) click to toggle source
# File lib/rubyXL/parser.rb, line 25
def parse(file_path, opts = {})

  # options handling

  wb = Workbook.new([], file_path)

  raise 'Not .xlsx or .xlsm excel file' unless @skip_filename_check ||
                                          %w{.xlsx .xlsm}.include?(File.extname(file_path))

  dir_path = File.join(File.dirname(file_path), Dir::Tmpname.make_tmpname(['rubyXL', '.tmp'], nil))

  MyZip.new.unzip(file_path, dir_path)

  files = {}

  workbook_file = Nokogiri::XML.parse(File.open(File.join(dir_path,'xl','workbook.xml'),'r'))
  rels_doc = Nokogiri::XML.parse(File.open(File.join(dir_path, 'xl', '_rels', 'workbook.xml.rels'), 'r'))

  if(File.exist?(File.join(dir_path,'xl','sharedStrings.xml')))
    shared_string_file = Nokogiri::XML.parse(File.open(File.join(dir_path,'xl','sharedStrings.xml'),'r'))
  end

  unless @data_only
    wb.media = RubyXL::GenericStorage.new(File.join('xl', 'media')).binary.load_dir(dir_path)
    wb.external_links = RubyXL::GenericStorage.new(File.join('xl', 'externalLinks')).load_dir(dir_path)
    wb.external_links_rels = RubyXL::GenericStorage.new(File.join('xl', 'externalLinks', '_rels')).load_dir(dir_path)
    wb.drawings = RubyXL::GenericStorage.new(File.join('xl', 'drawings')).load_dir(dir_path)
    wb.drawings_rels = RubyXL::GenericStorage.new(File.join('xl', 'drawings', '_rels')).load_dir(dir_path)
    wb.charts = RubyXL::GenericStorage.new(File.join('xl', 'charts')).load_dir(dir_path)
    wb.chart_rels = RubyXL::GenericStorage.new(File.join('xl', 'charts', '_rels')).load_dir(dir_path)
    wb.printer_settings = RubyXL::GenericStorage.new(File.join('xl', 'printerSettings')).binary.load_dir(dir_path)
    wb.worksheet_rels = RubyXL::GenericStorage.new(File.join('xl', 'worksheets', '_rels')).load_dir(dir_path)
    wb.macros = RubyXL::GenericStorage.new('xl').binary.load_file(dir_path, 'vbaProject.bin')
    wb.theme = RubyXL::GenericStorage.new(File.join('xl', 'theme')).load_file(dir_path, 'theme1.xml')

    core_file = Nokogiri::XML.parse(File.open(File.join(dir_path, 'docProps', 'core.xml'), 'r'))
    wb.creator = core_file.css('dc|creator').children.to_s
    wb.modifier = core_file.css('cp|last_modified_by').children.to_s
    wb.created_at = core_file.css('dcterms|created').children.to_s
    wb.modified_at = core_file.css('dcterms|modified').children.to_s

    app_file = Nokogiri::XML.parse(File.open(File.join(dir_path, 'docProps', 'app.xml'), 'r'))
    wb.company = app_file.css('Company').children.to_s
    wb.application = app_file.css('Application').children.to_s
    wb.appversion = app_file.css('AppVersion').children.to_s
  end

  styles_xml = Nokogiri::XML.parse(File.open(File.join(dir_path, 'xl', 'styles.xml'), 'r'))

  defined_names = workbook_file.css('definedNames definedName')
  wb.defined_names = defined_names.collect { |node| RubyXL::DefinedName.parse(node) }

  wb.date1904 = workbook_file.css('workbookPr').attribute('date1904').to_s == '1'

  wb.shared_strings_XML = shared_string_file.to_s

  unless shared_string_file.nil?
    sst = shared_string_file.css('sst')

    # According to http://msdn.microsoft.com/en-us/library/office/gg278314.aspx,
    # these attributes may be either both missing, or both present. Need to validate.
    wb.shared_strings.count_attr = sst.attribute('count').value.to_i
    wb.shared_strings.unique_count_attr = sst.attribute('uniqueCount').value.to_i

    # Note that the strings may contain text formatting, such as changing font color/properties
    # in the middle of the string. We do not support that in this gem... at least yet!
    # If you save the file, this formatting will be destoyed.
    shared_string_file.css('si').each_with_index { |node, i|
      wb.shared_strings.add(node.css('t').inject(''){ |s, c| s + c.text }, i)
    }

  end

  fills = styles_xml.css('fills fill')
  wb.fills = fills.collect { |node| RubyXL::Fill.parse(node) }

  colors = styles_xml.css('colors').first

  if colors then
    colors.element_children.each { |color_type_node|
      wb.colors[color_type_node.name] ||= []
      color_type_node.element_children.each { |color_node|
        wb.colors[color_type_node.name] << RubyXL::Color.parse(color_node)
      }
    }
  end

  borders = styles_xml.css('borders border')
  wb.borders = borders.collect { |node| RubyXL::Border.parse(node) }

  fonts = styles_xml.css('fonts font')
  wb.fonts = fonts.collect { |node| RubyXL::Font.parse(node) }

  cell_styles = styles_xml.css('cellStyles cellStyle')
  wb.cell_styles = cell_styles.collect { |node| RubyXL::CellStyle.parse(node) }

  num_fmts = styles_xml.css('numFmts numFmt')
  wb.num_fmts = num_fmts.collect { |node| RubyXL::NumFmt.parse(node) }

  csxfs = styles_xml.css('cellStyleXfs xf')
  wb.cell_style_xfs = csxfs.collect { |node| RubyXL::XF.parse(node) }

  cxfs = styles_xml.css('cellXfs xf')
  wb.cell_xfs = cxfs.collect { |node| RubyXL::XF.parse(node) }

  #fills out count information for each font, fill, and border
  wb.cell_xfs.each { |style|
    id = style.font_id
    wb.fonts[id].count += 1 #unless id.nil?

    id = style.fill_id
    wb.fills[id].count += 1 #unless id.nil?

    id = style.border_id
    wb.borders[id].count += 1 #unless id.nil?
  }

  # Not sure why they were getting sheet names from god knows where.
  # There *may* have been a good reason behind it, so not tossing this code out entirely yet.
  # sheet_names = app_file.css('TitlesOfParts vt|vector vt|lpstr').children

  workbook_file.css('sheets sheet').each_with_index { |sheet_node, i|
    sheet_rid = sheet_node.attributes['id'].value 
    sheet_file_path = rels_doc.css("Relationships Relationship[Id=#{sheet_rid}]").first.attributes['Target']
    worksheet_xml = Nokogiri::XML.parse(File.read(File.join(dir_path, 'xl', sheet_file_path)))
    parse_worksheet(wb, i, worksheet_xml, sheet_node.attributes['name'].value,
                           sheet_node.attributes['sheetId'].value )
  }

  FileUtils.remove_entry_secure(dir_path)

  return wb
end

Private Instance Methods

parse_worksheet(wb, i, worksheet_xml, worksheet_name, sheet_id) click to toggle source

Parse the incoming worksheet_xml into a new Worksheet object

# File lib/rubyXL/parser.rb, line 162
    def parse_worksheet(wb, i, worksheet_xml, worksheet_name, sheet_id)
      worksheet = Worksheet.new(wb, worksheet_name)
      wb.worksheets[i] = worksheet # Due to "validate_workbook" issues. Should remove that validation eventually.
      worksheet.sheet_id = sheet_id

      dimensions_node = worksheet_xml.css('dimension').first
      return nil if dimensions_node.nil? # Temporary plug for Issue #76

      dimensions = RubyXL::Reference.new(dimensions_node.attribute('ref').value)
      cols = dimensions.last_col

      # Create empty arrays for workcells. Using +downto()+ here so memory for +sheet_data[]+ is
      # allocated on the first iteration (in case of +upto()+, +sheet_data[]+ would end up being
      # reallocated on every iteration).
      dimensions.last_row.downto(0) { |i| worksheet.sheet_data[i] = Array.new(cols + 1) }

      namespaces = worksheet_xml.root.namespaces

      if @data_only
        row_xpath = '/xmlns:worksheet/xmlns:sheetData/xmlns:row[xmlns:c[xmlns:v]]'
        cell_xpath = './xmlns:c[xmlns:v[text()]]'
      else
        row_xpath = '/xmlns:worksheet/xmlns:sheetData/xmlns:row'
        cell_xpath = './xmlns:c'

        sheet_views_nodes = worksheet_xml.xpath('/xmlns:worksheet/xmlns:sheetViews/xmlns:sheetView', namespaces)
        worksheet.sheet_views = sheet_views_nodes.collect { |node| RubyXL::SheetView.parse(node) }

        col_node_set = worksheet_xml.xpath('/xmlns:worksheet/xmlns:cols/xmlns:col',namespaces)
        worksheet.column_ranges = col_node_set.collect { |col_node| RubyXL::ColumnRange.parse(col_node) }

        merged_cells_nodeset = worksheet_xml.xpath('/xmlns:worksheet/xmlns:mergeCells/xmlns:mergeCell', namespaces)
        worksheet.merged_cells = merged_cells_nodeset.collect { |child| RubyXL::Reference.new(child.attributes['ref'].text) }

#        worksheet.pane = worksheet.sheet_view[:pane]

        data_validations = worksheet_xml.xpath('/xmlns:worksheet/xmlns:dataValidations/xmlns:dataValidation', namespaces)
        worksheet.validations = data_validations.collect { |node| RubyXL::DataValidation.parse(node) }

# Currently  not working #TODO#
#        ext_list_node = worksheet_xml.xpath('/xmlns:worksheet/xmlns:extLst', namespaces)

        legacy_drawing_nodes = worksheet_xml.xpath('/xmlns:worksheet/xmlns:legacyDrawing', namespaces)
        worksheet.legacy_drawings = legacy_drawing_nodes.collect { |node| RubyXL::LegacyDrawing.parse(node) }

        drawing_nodes = worksheet_xml.xpath('/xmlns:worksheet/xmlns:drawing', namespaces)
        worksheet.drawings = drawing_nodes.collect { |n| n.attributes['id'] }
      end

      worksheet_xml.xpath(row_xpath, namespaces).each { |row|
        unless @data_only
          ##row styles##
          row_attributes = row.attributes
          row_style = row_attributes['s'] && Integer(row_attributes['s'].value) || 0
          row_num = Integer(row_attributes['r'].content)

          worksheet.row_styles[row_num] = { :style => row_style  }

          if !row_attributes['ht'].nil?  && (!row_attributes['ht'].content.nil? || row_attributes['ht'].content.strip != "" )
            worksheet.change_row_height(row_num - 1, Float(row_attributes['ht'].value))
          end
          ##end row styles##
        end

        row.search(cell_xpath).each { |value|
          #attributes is from the excel cell(c) and is basically location information and style and type
          value_attributes = value.attributes
          # r attribute contains the location like A1
          cell_index = RubyXL::Reference.ref2ind(value_attributes['r'].content)
          style_index = 0
          # t is optional and contains the type of the cell
          data_type = value_attributes['t'].content if value_attributes['t']
          element_hash ={}

          value.children.each { |node| element_hash["#{node.name()}_element"] = node }

          # v is the value element that is part of the cell
          if element_hash["v_element"]
            v_element_content = element_hash["v_element"].content
          else
            v_element_content=""
          end

          if v_element_content == "" # no data
            cell_data = nil
          elsif data_type == RubyXL::Cell::SHARED_STRING
            str_index = Integer(v_element_content)
            cell_data = wb.shared_strings[str_index].to_s
          elsif data_type == RubyXL::Cell::RAW_STRING
            cell_data = v_element_content
          elsif data_type == RubyXL::Cell::ERROR
            cell_data = v_element_content
          else# (value.css('v').to_s != "") && (value.css('v').children.to_s != "") #is number
            data_type = ''
            if(v_element_content =~ /\./ or v_element_content =~ /\d+e\-?\d+/i) #is float
              cell_data = Float(v_element_content)
            else
              cell_data = Integer(v_element_content)
            end
          end

          # f is the formula element
          cell_formula = nil
          fmla_css = element_hash["f_element"]
          if fmla_css && fmla_css.content
            fmla_css_content= fmla_css.content
            if(fmla_css_content != "")
              cell_formula = fmla_css_content
              cell_formula_attr = {}
              fmla_css_attributes = fmla_css.attributes
              cell_formula_attr['t'] = fmla_css_attributes['t'].content if fmla_css_attributes['t']
              cell_formula_attr['ref'] = fmla_css_attributes['ref'].content if fmla_css_attributes['ref']
              cell_formula_attr['si'] = fmla_css_attributes['si'].content if fmla_css_attributes['si']
            end
          end

          style_index = value['s'].to_i #nil goes to 0 (default)

          worksheet.sheet_data[cell_index[0]][cell_index[1]] =
            Cell.new(worksheet,cell_index[0],cell_index[1],cell_data,cell_formula,
              data_type,style_index,cell_formula_attr)
          cell = worksheet.sheet_data[cell_index[0]][cell_index[1]]
        }
      }

      worksheet
    end