class NameParser

TODO: move this class into the TextUtils namespace/module.

Public Instance Methods

parse( chunks ) click to toggle source
# File lib/textutils/parser/name_parser.rb, line 9
# Splits raw name chunks into a flat list of name strings.
#
# Every chunk is first split on '|'. Each resulting part is then scanned
# left to right: text outside of square brackets becomes one entry (with
# surrounding whitespace stripped), and the text inside a [...] pair becomes
# its own entry (enclosing brackets removed, inner text kept as-is).
#
# Note: bracketed text is NOT interpreted as a language tag yet - "[nl]" in
# "Leuven[nl]" simply yields the extra entry "nl".
#
# Robustness: an empty bracket pair ("[]") and entries that are empty after
# processing are skipped; an unclosed bracket ("[Vienna") takes the rest of
# the part as its content instead of raising.
#
# chunks - Array of Strings (nil or blank elements are skipped)
#
# Returns a new Array of Strings (possibly empty).
def parse( chunks )
  ## todo/fix: (re)use nameparser - for now "simple" inline version
  ##  todo: handle language tags and multi-language names, e.g.
  ##   Leuven[nl]|Louvain[fr] Löwen[de]
  ##   Antwerpen[nl]|Anvers[fr] [Antwerp]
  ##   Brussel[nl]•Bruxelles[fr]   -> official bi-lingual name
  ##   etc.

  values = []
  chunks.each do |chunk|
    next if chunk.nil? || chunk.blank?  ## skip nil or empty/blank chunks

    chunk.split( '|' ).each do |part|
      s = StringScanner.new( part )
      s.skip( /[ \t]+/ )   # skip leading whitespace

      until s.eos?
        if s.check( /\[/ )
          ## consume the bracket group; the closing ] is optional so that an
          ##  unclosed bracket consumes the rest of the part (and the loop
          ##  always makes progress) instead of returning nil
          raw   = s.scan( /\[[^\]]*\]?/ )
          value = raw[1..-1].chomp( ']' )   # strip enclosing [] e.g. [Bavaria] => Bavaria
        else
          ## consume everything up to the next opening bracket (or the end)
          value = s.scan( /[^\[]+/ ).strip
        end
        values << value  unless value.empty?   # never record empty names

        s.skip( /[ \t]+/ )  # skip whitespace between entries
        logger.debug( "[NameParser] eos?: #{s.eos?}, rest: >#{s.rest}<" )
      end
    end
  end

  logger.debug( "[NameParser] values=#{values.inspect}")

  ## for now every value is used as a name unchanged
  ## todo: split by bullet ? (official multilang name) e.g. Brussel • Bruxelles
  ## todo: process variants w/ () e.g. Krems (a. d. Donau) etc. ??
  names = values.dup

  logger.debug( "[NameParser] names=#{names.inspect}")

  names
end