class TextUtils::TitleMapper2
Constants
- MappingStruct
Fields – key: e.g. augsburg; title: e.g. FC Augsburg; length (of the title, not of the pattern): e.g. 11. Open question: should dots be left uncounted (e.g. U.S.A. => 3 instead of 6)? Why? Why not?
Attributes
known_titles[R]
Public Class Methods
new( records, tag )
click to toggle source
# File lib/textutils/title_mapper2.rb, line 24
#
# Creates a mapper for the given records.
#
# records - collection handed to build_title_table_for, which turns it
#           into the title lookup table exposed as known_titles
# tag     - name used for the markers written into mapped lines,
#           e.g. the tag name used for @@brewery@@, @@team@@ etc.
def initialize( records, tag )
  ## todo: rename tag to attrib or attrib_name - why ?? why not ???
  @tag          = tag
  @known_titles = build_title_table_for( records )   ## mapping lookup table
end
Public Instance Methods
find_key!( line )
click to toggle source
# File lib/textutils/title_mapper2.rb, line 38
#
# Looks up a single key marker in +line+ using this mapper's tag.
# Simply delegates to find_key_for! and returns whatever it returns;
# +line+ is handed through unchanged, so any in-place modification
# done by find_key_for! is visible to the caller.
def find_key!( line )
  find_key_for!( @tag, line )
end
find_keys!( line )
click to toggle source
# File lib/textutils/title_mapper2.rb, line 42
#
# NB: keys (plural!) - returns an array.
#
# Calls find_key_for! repeatedly with a numbered tag (tag1, tag2, ...)
# and collects every key returned, stopping at the first result that is
# not present.
def find_keys!( line )
  keys    = []
  counter = 1

  loop do
    key = find_key_for!( "#{@tag}#{counter}", line )
    break unless key.present?

    keys << key
    counter += 1
  end

  keys
end
map_titles!( line )
click to toggle source
# File lib/textutils/title_mapper2.rb, line 32
#
# Keeps calling map_title_for! on +line+ with this mapper's tag and
# known titles until it reports that nothing was found anymore.
# The call is always made at least once. Returns nil.
#
## todo: rename to just map! - why?? why not???
def map_titles!( line )
  loop do
    break unless map_title_for!( @tag, line, @known_titles )
  end
end
Private Instance Methods
build_title_table_for( records )
click to toggle source
# File lib/textutils/title_mapper2.rb, line 57
#
# Builds the table of known titles (incl. synonyms) e.g.
#
#  [[ 'wolfsburg', 'VfL Wolfsburg'],
#   [ 'augsburg',  'FC Augsburg'],
#   [ 'augsburg',  'Augi2'],
#   [ 'augsburg',  'Augi3' ],
#   [ 'stuttgart', 'VfB Stuttgart']]
#
# records - collection of objects responding to key, title and synonyms
#           (synonyms is a '|'-separated string); code is optional
#
# Every entry is a MappingStruct with key, title, length and pattern set.
# Returns the entries sorted by length, longest first (best match first).
#
# Blank titles are skipped: an empty title would end up as an empty
# pattern, and an empty pattern matches at every word boundary.
def build_title_table_for( records )
  known_titles = []

  records.each_with_index do |rec,index|
    title_candidates = [rec.title]
    title_candidates += rec.synonyms.split('|')   if rec.synonyms.present?

    ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
    #   make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
    titles = []
    title_candidates.each do |t|
      # skip nil/blank candidates (e.g. from a leading '|' in synonyms)
      next if t.nil? || t.strip.empty?

      titles << t
      next unless t =~ /\(.+\)/

      # remove the subtitle; squeeze the double space an inline (...) leaves
      # behind and strip leading and trailing whitespace too
      extra_title = t.gsub( /\(.+\)/, '' ).squeeze( ' ' ).strip
      # a title consisting of nothing but a subtitle has no short form
      titles << extra_title   unless extra_title.empty?
    end

    titles.each do |t|
      m = MappingStruct.new
      m.key     = rec.key
      m.title   = t
      m.length  = t.length
      ## note: escape for regex plus allow subs for special chars/accents
      m.pattern = TextUtils.title_esc_regex( t )
      known_titles << m
    end

    logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"

    ## NB: only include code field - if defined
    if rec.respond_to?(:code) && rec.code.present?
      m = MappingStruct.new
      m.key     = rec.key
      m.title   = rec.code
      m.length  = rec.code.length
      m.pattern = rec.code   ## note: use code for now as is (no variants allowed for now)
      known_titles << m
    end
  end

  ## note: sort here by length (largest goes first - best match)
  known_titles.sort { |left,right| right.length <=> left.length }
end
find_key_for!( tag, line )
click to toggle source
# File lib/textutils/title_mapper2.rb, line 148
#
# Finds the first key marker (@@oo<key>oo@@) in +line+, replaces it in
# place with "[<TAG>]" (tag upcased) and returns the key.
#
# tag  - String; upcased for the replacement, downcased for the debug log
# line - String that gets modified in place (nil is treated as no match)
#
# Returns the key found between the markers or nil if there is no marker.
def find_key_for!( tag, line )
  # e.g. everything in @@ .... @@
  #  (use non-greedy +? plus all chars but not @, that is [^@])
  regex = /@@oo([^@]+?)oo@@/

  upcase_tag   = tag.upcase
  downcase_tag = tag.downcase

  match = regex.match( line )
  return nil unless match

  # keep the capture in a local; do not rely on $1 surviving the sub! below
  value = match[1]
  logger.debug " #{downcase_tag}: >#{value}<"

  # block form inserts the replacement verbatim (no \0, \1 expansion)
  line.sub!( regex ) { "[#{upcase_tag}]" }
  value
end
map_title_for!( tag, line, mappings )
click to toggle source
# File lib/textutils/title_mapper2.rb, line 124
#
# Replaces the first known title found in +line+ with its key marker.
#
# Walks +mappings+ in the given order; for the first mapping whose pattern
# matches, that occurrence gets replaced in place with "@@oo<key>oo@@ "
# and the search stops.
#
# tag      - String; only used (downcased) in the debug log message
# line     - String that gets modified in place
# mappings - list of objects responding to key and pattern; pattern is
#            interpolated into the regex as is
#
# Returns true if a title got replaced, false otherwise.
def map_title_for!( tag, line, mappings )
  downcase_tag = tag.downcase

  mappings.each do |mapping|
    key   = mapping.key
    value = mapping.pattern

    ## nb: \b does NOT include space or newline for word boundary (only alphanums e.g. a-z0-9)
    ## (thus add it, allows match for Benfica Lis. for example - note . at the end)
    ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
    # wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
    regex = /\b#{value}(\b| |\t|$)/
    next unless line =~ regex

    logger.debug " match for #{downcase_tag} >#{key}< >#{value}<"
    # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
    # block form inserts the marker verbatim (a key with \0 or \1 stays as is)
    line.sub!( regex ) { "@@oo#{key}oo@@ " }    # NB: add one space char at end
    return true    # break out after first match (do NOT continue)
  end

  false
end