class NameTokenizer
TODO: move this class into the TextUtils namespace/module.
Public Instance Methods
tokenize( value )
click to toggle source
# File lib/textutils/parser/name_tokenizer.rb, line 14
#
# Splits a name line into a flat list of name tokens.
#
# The value is first split on | (pipe); each resulting part is then scanned
# for "inline" bracketed translations, e.g.
#
#   "München [Munich] | Monaco"  =>  ["München", "[Munich]", "Monaco"]
#
# Bracketed tokens keep their surrounding brackets. Plain tokens have their
# trailing whitespace removed. Empty parts contribute no token.
#
# A part with an opening [ but no closing ] is reported via logger.warn and
# the remainder of that part (from the [ onwards) is returned as one token.
#
# value - String to tokenize (must respond to #split).
#
# Returns an Array of String tokens, in order of appearance.
#
# NOTE(review): relies on a `logger` method (responding to debug and warn)
# and on StringScanner (strscan) being made available by the enclosing
# class/file - neither is visible in this excerpt; confirm.
def tokenize( value )   ## rename to/use split - why? why not??
  names = []

  # 1) split by | (pipe) -- the pattern also swallows blanks/tabs around the pipe
  parts = value.split( /[ \t]*\|[ \t]*/ )

  # 2) split "inline" translations e.g. München [Munich]
  ##  todo: add support for Munich [en] e.g. trailing lang tag
  ##  todo: add support for bullet (official bi-lingual names w/ tags ??) - see brussels - why, why not??
  parts.each do |part|
    s = StringScanner.new( part )
    s.skip( /[ \t]+/ )   # skip leading whitespaces

    until s.eos?
      if s.check( /\[/ )
        ## scan everything until the end of bracket (e.g.]); empty brackets [] are accepted too
        name = s.scan( /\[[^\]]*\]/ )
        if name.nil?
          ## starting w/ [ but no closing ] found: the scanner did not advance,
          ##  so consume the rest of the part here - otherwise this loop never ends
          logger.warn( "[NameTokenizer] missing closing bracket in >#{part}<" )
          name = s.rest.rstrip
          s.terminate
        end
      else
        ## scan everything until the begin of bracket (e.g.[);
        ##  always matches here because the next char exists and is not [
        name = s.scan( /[^\[]+/ ).rstrip   ## remove trailing spaces (if present)
      end
      names << name

      s.skip( /[ \t]+/ )   # skip whitespaces between tokens
      logger.debug( "[NameTokenizer] eos?: #{s.eos?}, rest: >#{s.rest}<" )
    end
  end # each part

  logger.debug( "[NameTokenizer] names=#{names.inspect}" )
  names
end