class TextUtils::TitleMapper
Attributes
known_titles[R]
Public Class Methods
new( records, tag )
click to toggle source
# File lib/textutils/title_mapper.rb, line 25
#
# Sets up a mapper: remembers the tag and builds the title lookup table
# (exposed read-only as known_titles) from the given records.
#
# records - collection passed on to build_title_table_for
# tag     - tag name handed to the find/map helpers, e.g. "brewery" or "team"
def initialize( records, tag )
  ## todo: rename tag to attrib or attrib_name - why ?? why not ???
  @tag          = tag   # e.g. tag name used for @@brewery@@ @@team@@ etc.
  @known_titles = build_title_table_for( records )   # build mapping lookup table
end
Public Instance Methods
find_key!( line )
click to toggle source
# File lib/textutils/title_mapper.rb, line 42
#
# Looks up a single mapped key in line using this mapper's tag.
# Thin wrapper around find_key_for!: line is edited in place and the
# result of find_key_for! (the key found, or nil) is passed through.
def find_key!( line )
  find_key_for!( @tag, line )
end
find_keys!( line )
click to toggle source
# File lib/textutils/title_mapper.rb, line 46
#
# Collects every mapped key in line. NB: keys (plural!) - returns an array.
#
# Calls find_key_for! repeatedly with a numbered tag (tag1, tag2, ...)
# until a lookup comes back blank; line is edited in place by each lookup.
# Returns the keys in the order they were found (empty array if none).
def find_keys!( line )
  found = []
  n     = 1

  # assignment inside the condition is intended: fetch the next key, stop on blank
  while (key = find_key_for!( "#{@tag}#{n}", line )).present?
    found << key
    n += 1
  end

  found
end
map_titles!( line )
click to toggle source
# File lib/textutils/title_mapper.rb, line 33
#
# Runs every entry of the known-titles table against line, letting
# map_title_for! replace a matching title in place.
# Each table entry is a [key, values] pair.
#
## rename to just map! - why?? why not???
def map_titles!( line )
  @known_titles.each do |key, values|
    map_title_for!( @tag, line, key, values )
  end
end
Private Instance Methods
build_title_table_for( records )
click to toggle source
# File lib/textutils/title_mapper.rb, line 61
#
# Builds the known-titles lookup table (with synonyms) from records, e.g.
#
#   [[ 'wolfsburg', [ 'VfL Wolfsburg' ]],
#    [ 'augsburg',  [ 'FC Augsburg', 'Augi2', 'Augi3' ]],
#    [ 'stuttgart', [ 'VfB Stuttgart' ]] ]
#
# records - each record must respond to key, title and synonyms
#           (synonyms is a '|'-separated string or blank); code is optional.
#
# Returns an array of [key, titles] pairs. Within a pair the titles are
# sorted longest first, passed through TextUtils.title_esc_regex, and
# followed by the record's code (as is) when one is defined.
#
#### fix/todo:
###   reorder - sort by largest strings etc.
##    do NOT use lookup w/ array per key; use 1:1 one key per lookup
##     -> lets us sort by find largest first
def build_title_table_for( records )
  known_titles = []

  records.each_with_index do |rec, index|
    title_candidates = [rec.title]
    title_candidates += rec.synonyms.split( '|' ) if rec.synonyms.present?

    ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
    #   make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
    titles = []
    title_candidates.each do |t|
      titles << t
      next unless t =~ /\(.+\)/

      # remove subtitle, then strip leading and trailing whitespace too
      extra_title = t.gsub( /\(.+\)/, '' ).strip

      # a title that is nothing but a subtitle leaves an empty string here;
      # an empty pattern would match at any word boundary, so skip it
      titles << extra_title unless extra_title.empty?
    end

    ## NB: sort here by length (largest goes first - best match)
    #  exclude code and key (key should always go last)
    titles = titles.sort { |left, right| right.length <=> left.length }

    ## escape for regex plus allow subs for special chars/accents
    titles = titles.map { |title| TextUtils.title_esc_regex( title ) }

    ## NB: only include code field - if defined
    titles << rec.code if rec.respond_to?( :code ) && rec.code.present?

    known_titles << [rec.key, titles]

    logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"
  end

  known_titles
end
find_key_for!( tag, line )
click to toggle source
# File lib/textutils/title_mapper.rb, line 139
#
# Finds the first @@oo<key>oo@@ marker in line, replaces it in place with
# "[TAG]" (tag upcased) and returns the captured key.
# Returns nil and leaves line untouched when no marker is present.
#
# Note: tag only shapes the replacement text and the debug output;
# the marker pattern itself does not depend on it.
def find_key_for!( tag, line )
  # everything between @@oo and oo@@ (non-greedy +? over all chars but @, that is [^@])
  marker = /@@oo([^@]+?)oo@@/

  upcase_tag   = tag.upcase
  downcase_tag = tag.downcase

  found = marker.match( line )
  return nil if found.nil?

  key = found[1]
  logger.debug " #{downcase_tag}: >#{key}<"

  line.sub!( marker, "[#{upcase_tag}]" )
  key
end
map_title_for!( tag, line, key, values )
click to toggle source
# File lib/textutils/title_mapper.rb, line 118
#
# Tries each pattern in values against line; the first one that matches is
# replaced in place by the marker "@@oo<key>oo@@ " and the search stops.
#
# tag    - only used (downcased) in the debug output
# line   - string to search; modified in place on a match
# key    - key written into the marker
# values - regex source fragments, tried in the given order
#
# Returns true if a replacement was made, false otherwise.
def map_title_for!( tag, line, key, values )
  downcase_tag = tag.downcase

  # any? stops at the first block result that is truthy (do NOT continue after a match)
  values.any? do |value|
    ## nb: \b does NOT include space or newline for word boundary (only alphanums e.g. a-z0-9)
    ##  (thus add it, allows match for Benfica Lis. for example - note . at the end)
    ##  check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
    # wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
    pattern = /\b#{value}(\b| |\t|$)/
    next false unless line =~ pattern

    logger.debug " match for #{downcase_tag} >#{key}< >#{value}<"

    # the oo padding makes sure @@oo{key}oo@@ doesn't match itself
    # with another key e.g. wacker, wac, etc.
    line.sub!( pattern, "@@oo#{key}oo@@ " )   # NB: add one space char at end
    true
  end
end