module FkStr
Public Class Methods
articles_and_others()
click to toggle source
# File lib/fk_str/dictionary.rb, line 57
# Reader for the module-level @@articles_and_others class variable.
#
# @return [Object] whatever is currently stored in @@articles_and_others
def self.articles_and_others
  @@articles_and_others
end
countries_acronyms()
click to toggle source
# File lib/fk_str/dictionary.rb, line 73
# Reader for the module-level @@countries_acronyms class variable.
#
# @return [Object] whatever is currently stored in @@countries_acronyms
def self.countries_acronyms
  @@countries_acronyms
end
downcase(w)
click to toggle source
# File lib/fk_str.rb, line 99
# Lowercases +w+ using the replacement pairs stored in @@letters_by_letter.
#
# @param w [String] text to lowercase; returned untouched when nil or empty
# @return [String] copy of +w+ where, for every pair, pair[1] was replaced by pair[0]
def self.downcase(w)
  return w if w.to_s == ''

  # Only the pairs for base letters that actually occur in the word are
  # collected, to keep the number of substitutions small.
  base_letters = self.remove_accents(w).downcase.gsub(/[^a-z]/, '').split('').uniq
  pairs = base_letters.each_with_object([]) do |letter, acc|
    @@letters_by_letter[letter].each { |pair| acc << pair }
  end

  # Lowercase everything: swap each pair's second element for its first.
  pairs.inject(w) { |word, pair| word.gsub(pair[1], pair[0]) }
end
extract_dates(str, reference_date=Time.now, reverse_month_day=false)
click to toggle source
# File lib/fk_str.rb, line 213
# Extracts the calendar dates mentioned in a piece of free text.
#
# @param str [String, Time, Date, DateTime, nil] text to scan; a Time/Date/DateTime
#   is converted directly to a single midnight Time
# @param reference_date [Time] supplies the year window (year-30 .. year+20) used to
#   expand short years and to accept 4-digit years, and the default year for dates
#   that carry none
# @param reverse_month_day [Boolean] when true every "N/N" pair is swapped before parsing
# @return [Array<Time>] unique dates in ascending order; [] when +str+ is nil or when
#   any StandardError is raised while parsing
def self.extract_dates str, reference_date=Time.now, reverse_month_day=false
  return [] if str.nil?
  return [Time.new(str.year, str.month, str.day)] if str.kind_of?(Time) or str.kind_of?(Date) or str.kind_of?(DateTime)
  # NOTE(review): o_str is assigned but never read in this method.
  o_str = str
  # Candidate years: 30 before to 20 after the reference year.
  years = []
  (-30..20).each { |y| years << reference_date.year+y }
  begin
    # --- Normalisation: drop ordinals, accents, case and digit/letter mixes. ---
    str = str.gsub /[0-9]{1,}(º|ª)/, ' '
    str = self.remove_accents str
    str = str.downcase
    str = str.gsub /[0-9]{1,}+[a-z]{1,}+[0-9]{1,}/, ''
    str = str.gsub /[0-9]{1,}+[a-z]{1,}/, ' '
    str = str.gsub /[a-z]{1,}+[0-9]{1,}/, ' '
    str = str.gsub(/[^a-z|^0-9|^\/|^\-|^\.|^:]/i, ' ')
    # Remove clock times ("10:30", "10h30", "10h"), percentages and "palco N".
    str = str.gsub(/[0-9]{1,}:[0-9]{1,}|:[0-9]{1,}|[0-9]{1,}h[0-9]{1,}|[0-9]{1,}%|[0-9]{1,}h |[0-9]{1,}h$|palco [0-9]{1,}/i, '')
    # Inside every digit-to-digit span, turn '.' into '/'.
    str.scan(/[0-9]{1,}+.+[0-9]{1,}/).each { |d| str = str.gsub(d, d.gsub('.', '/')) }
    if reverse_month_day
      # Swap the two numbers of every "N/N" pair.
      str.scan(/[0-9]{1,}\/[0-9]{1,}/).each do |d|
        str = str.gsub(d, d.split('/')[1] + '/' + d.split('/')[0])
      end
    end
    # Rewrite "<month> ... <number>" as "<number> <month>" when no number followed by
    # a month name is found inside the match.
    # presumably each @@months_strs entry is [month pattern, month number] — verify
    # against the dictionary.
    @@months_strs.each do |mc|
      str.scan(/#{mc.first}.*[0-9]{1,2}+[1-9]{2,4}/).each do |md|
        if md.scan(/[0-9]{1,2}/).size < 4 and md.scan(/[0-9]{4,}/).size < (md.scan(/[0-9]{2,2}/).size-1)
          continue = true
          @@months_strs.each do |smc|
            md.scan(/[0-9].*#{smc.first}/).each do |d|
              continue = false
            end
          end
          if continue
            m = md.scan(/[0-9]{1,2}/).first
            str = str.gsub(/#{mc.first}.+#{m}/, "#{m} #{mc.first}").gsub(',', '')
          end
        end
      end
    end
    # --- Bring numeric dates to the d/m/y shape with '/' separators. ---
    # yyyy-m-d -> d/m/yyyy
    str.scan(/[0-9]{4,4}-[0-9]{1,2}-[0-9]{1,2}/).each do |y|
      str = str.gsub(y, y.split('-')[2] + '/' + y.split('-')[1] + '/' + y.split('-')[0])
    end
    # yyyy/m/d -> d/m/yyyy
    str.scan(/[0-9]{4,4}\/[0-9]{1,2}\/[0-9]{1,2}/).each do |y|
      str = str.gsub(y, y.split('/')[2] + '/' + y.split('/')[1] + '/' + y.split('/')[0])
    end
    # d-m-y -> d/m/y
    str.scan(/[0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4}/).each do |y|
      str = str.gsub(y, y.split('-')[0] + '/' + y.split('-')[1] + '/' + y.split('-')[2])
    end
    # Years shorter than 4 digits: 1-2 digit years are expanded with the reference
    # century (or the previous one) when the result is inside +years+; otherwise the
    # year part is dropped and only "d/m" is kept.
    str.scan(/[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{1,}/).each do |y|
      if y.split('/')[2].size < 4
        sr = y.split('/').first + '/' + y.split('/')[1]
        sy = y.split('/')[2]
        if sy.size < 3
          sy = '0' + sy if sy.size == 1
          if years.include? (reference_date.year.to_s[0..1]+sy).to_i
            sr += '/' + reference_date.year.to_s[0..1]+sy
          elsif years.include? ((reference_date.year-100).to_s[0..1]+sy).to_i
            sr += '/' + (reference_date.year-100).to_s[0..1]+sy
          end
        end
        str = str.gsub(y, sr)
      end
    end
    # Runs of five or more digits are discarded.
    str = str.gsub(/[0-9]{5,}/, '')
    dates = []
    # --- Extraction: repeat until no "<digit> ... <month>" span is left in str. ---
    continue = true
    while continue
      @@months_strs.each do |m|
        str.scan(/([0-9].*#{m.first})+([^0-9]|$)/).each do |d|
          # Text that precedes the month pattern; it holds the day numbers.
          days = d.first.split(/(#{m.first})+([^0-9]|$)/).first
          # Skip this span for now if the day part itself still contains a
          # "<digit> ... <month>" span.
          jump=false
          @@months_strs.each do |mc|
            if days.scan(/([0-9].*#{mc.first})+([^0-9]|$)/).size > 0
              jump = true
            end
          end
          if !jump
            year = nil
            str.scan(/#{days}#{m.first}.*[0-9]{4,4}/).each do |sc|
              sy = sc.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')
              # [lorem 9/jan/2012] = false
              # [2012 e 07/05/2012] = true
              # [2012] = true
              # NOTE(review): /[0,9]/ matches only '0', ',' or '9'; possibly /[0-9]/ was
              # intended — confirm.
              if sy.scan(/[0-9]{4,4}/).size > 1 or (sy.scan(/[0-9]{4,4}/).size == 1 and !sy.gsub(/[0-9]{4,4}/, '').match(/[0,9]/))
                # The unconditional break means only the first 4-digit group is examined.
                sy.scan(/[0-9]{4,4}/).each { |y| year=y.to_i if years.include? y.to_i; break; }
              end
            end
            #puts '[' + str + '] => ' + year.inspect
            # Consume the processed span so the outer while loop can terminate.
            str = str.gsub(/(#{days}#{m.first})+([^0-9]|$)/, '')
            #puts '[' + str + "\n\n"
            days.gsub(/[0-9]{4,4}/, '').scan(/[0-9]{1,2}/).each do |day|
              day = day.to_i
              if day > 0 and day < 32
                if year
                  dates<<Time.new(year, m[1], day)
                elsif m[1]<(reference_date.month-3)
                  # No year found and the month is more than three months before the
                  # reference month: the following year is used.
                  dates<<Time.new(reference_date.year+1, m[1], day)
                else
                  dates<<Time.new(reference_date.year, m[1], day)
                end
              end
            end
          end
        end
      end
      continue = false
      @@months_strs.each do |mt|
        if str.scan(/([0-9].*#{mt.first})+([^0-9]|$)/).size > 0
          continue = true
        end
      end
    end
    return dates.uniq.sort
  rescue => exc
    # Best effort: any StandardError during parsing yields an empty result.
    return []
  end
end
extract_time(str, date=nil, reference_time=Time.now)
click to toggle source
# File lib/fk_str.rb, line 361
# Combines a calendar date with the first "H:MM"-style clock time found in a text.
#
# @param str [#to_s, nil] text that may contain a time such as "21:45"; any
#   object is accepted and read through its string form
# @param date [#year, #month, #day, nil] day the resulting Time belongs to
# @param reference_time [#hour, #min] hour/minute used when +str+ holds no usable time
# @return [Time, nil] nil when +date+ is nil; otherwise +date+ at the extracted
#   time, or at +reference_time+'s hour and minute when no time is found or the
#   digits found do not form a time Time.new accepts (e.g. "99:99")
def self.extract_time(str, date = nil, reference_time = Time.now)
  return nil if date.nil?

  fallback = lambda do
    Time.new(date.year, date.month, date.day, reference_time.hour, reference_time.min)
  end

  # to_s makes nil and non-String inputs safe; the first match wins.
  clock = str.to_s[/[0-9]{1,2}:[0-9]{1,2}/]
  return fallback.call if clock.nil?

  hour, minute = clock.split(':')
  begin
    Time.new(date.year, date.month, date.day, hour, minute)
  rescue StandardError
    # Out-of-range hour/minute: fall back instead of failing.
    fallback.call
  end
end
is_eq(str, str_b, pct=1)
click to toggle source
# File lib/fk_str.rb, line 14
# Tells whether two strings share enough normalised terms to be considered equal.
#
# Both inputs are reduced to term lists with to_term(..., true); the lists are
# concatenated and the share of duplicated entries (as an integer percentage)
# is compared with +pct+.
#
# @param str [String, nil] first text
# @param str_b [String, nil] second text
# @param pct [Integer] minimum percentage of duplicated terms required
# @return [Boolean] true when the duplicated share is >= +pct+
def self.is_eq(str, str_b, pct = 1)
  terms = [str, str_b].flat_map do |value|
    list = self.to_term(value, true)
    # to_term hands back the blank input itself (nil or '') instead of a list;
    # treat that as "no terms" rather than failing on concat/uniq.
    list.is_a?(Array) ? list : []
  end

  total = terms.size
  # With nothing to compare the duplicated share is 0 (avoids dividing by zero).
  shared_pct = total.zero? ? 0 : 100 - (100 * terms.uniq.size / total)
  shared_pct >= pct
end
remove_accents(str)
click to toggle source
# File lib/fk_str.rb, line 64
# Replaces accented Latin letters with their unaccented ASCII counterparts.
#
# @param str [#to_s, nil] text to normalise; non-String values are read through to_s
# @return [String] new string with the listed characters replaced and every other
#   character left untouched; '' when +str+ is nil or empty
def self.remove_accents(str)
  text = str.to_s
  return '' if text == ''

  # Applied in this order; each pattern maps to a single replacement letter.
  [
    [/[ÁÃÂÀÄĂĀÅÆ]/, 'A'], [/[áãâàäăāåæ]/, 'a'],
    [/[ÉẼÊÈËĔĒ]/, 'E'], [/[éẽêèëĕē]/, 'e'],
    [/[ÍĨÎÌÏĬĪ]/, 'I'], [/[íĩîìïĭī]/, 'i'],
    [/[ÓÕÔÒÖŎŌŐÐ]/, 'O'], [/[óõôòöŏōőð]/, 'o'],
    [/[ÚŨÛÙÜŬŪǕ]/, 'U'], [/[úũûùüŭūǖ]/, 'u'],
    [/[ÇČ]/, 'C'], [/[çč]/, 'c'],
    [/Ğ/, 'G'], [/ğ/, 'g'],
    [/Ñ/, 'N'], [/ñ/, 'n'],
    [/Š/, 'S'], [/š/, 's'],
    [/[ȲŸÝỲ]/, 'Y'], [/[ȳÿýỳ]/, 'y'],
    [/Ž/, 'Z'], [/ž/, 'z']
  ].inject(text) { |result, (pattern, plain)| result.gsub(pattern, plain) }
end
remove_if_ends_with(str, texts, not_change_if_returns_with=nil, if_not_change_returns_with_last_removed=0)
click to toggle source
# File lib/fk_str.rb, line 144
# Repeatedly strips any of the given terms from the end of a string.
#
# Comparison is done on the accent-free, lower-cased form of both the string
# and the terms. The caller's arrays are not modified.
#
# @param str [String] text to trim; returned as-is when it is a single word
# @param texts [Array<String>] trailing terms to remove; blank entries are ignored
# @param not_change_if_returns_with [Array<String>, nil] results that must not be
#   returned; when the trimmed text equals one of them the original (or a
#   partially restored) text is returned instead
# @param if_not_change_returns_with_last_removed [Integer] when > 0 and the
#   trimmed text is one of the forbidden results, this many of the most
#   recently removed pieces are appended back
# @return [String] the trimmed text, or the original / partially restored text
def self.remove_if_ends_with str, texts, not_change_if_returns_with=nil, if_not_change_returns_with_last_removed=0
  return str if str.split(' ').size == 1
  str_o = str
  str = str.strip
  str_t = self.remove_accents(str).downcase
  # Normalise into fresh arrays. Blank terms are dropped here because an empty
  # term always "matches" the end of the string and would keep the loop below
  # running forever.
  texts = texts.map { |v| self.remove_accents(v).downcase }.reject { |v| v == '' }.uniq
  protected_results = nil
  protected_results = not_change_if_returns_with.map { |v| self.remove_accents(v).downcase } if !not_change_if_returns_with.nil?
  removed = []
  continue = true
  while continue
    continue = false
    texts.each do |t|
      # If the string ends with the term...
      if t == str_t[str_t.size-t.size..str_t.size].to_s
        # If the four characters right before the trailing term are not ' de ' or ' da '...
        if ![' de ', ' da '].include? str_t[str_t.size-t.size-4].to_s + str_t[str_t.size-t.size-3..str_t.size-t.size-2].to_s + str_t[str_t.size-t.size-1].to_s
          # If the term's first char is not a letter, or the char preceding the term is not a letter...
          if (!@@simple_downcase_letters.include? t[0] or !@@simple_downcase_letters.include? str_t[str_t.size-t.size-1]) and str_t.size > 1
            str_l = str
            str = str[0..str.size-t.size-1].strip
            str_t = self.remove_accents(str).downcase
            # Remember what was cut (including the whitespace stripped above).
            removed << str_l[str.size..str_l.size]
            continue = true
          end
        end
      end
    end
  end
  # If the result equals one of the values that must not be returned...
  if !protected_results.nil?
    if protected_results.include?(self.remove_accents(str).downcase)
      # If requested, give back the result plus the last x removed pieces...
      if if_not_change_returns_with_last_removed > 0
        removed = removed.reverse
        (1..if_not_change_returns_with_last_removed).each { |n| str += removed[n-1].to_s }
        return str.strip
      end
      return str_o
    end
  end
  return str
end
separators()
click to toggle source
# File lib/fk_str/dictionary.rb, line 96
# Reader for the module-level @@separators class variable.
#
# @return [Object] whatever is currently stored in @@separators
def self.separators
  @@separators
end
to_slug(str)
click to toggle source
# File lib/fk_str.rb, line 27
# Builds a lower-case, dash-separated slug from a piece of text.
#
# @param str [String, nil] text to convert; nil/empty input is returned unchanged
# @return [String, nil] accent-free slug made of a-z, 0-9 and '-'
def self.to_slug(str)
  return str if str.to_s == ''

  slug = self.remove_accents(str)
  slug = slug.gsub(/\s{1,}| {1,}/, ' ')         # collapse whitespace runs
  slug = slug.gsub(/[\+\/_\-|:@#\\,]/, ' ')     # these punctuation marks separate words
  slug = slug.gsub('&', 'e')                    # '&' is spelled out as "e"
  slug = slug.gsub(/[^a-zA-Z0-9 ]/, '')         # drop everything else
  slug = slug.downcase.gsub(/\s{1,}| {1,}/, ' ')
  slug.strip.gsub(' ', '-')
end
to_term(str, ar=false)
click to toggle source
# File lib/fk_str.rb, line 35
# Reduces a text to a coarse comparison key.
#
# The slug of +str+ is split into words; each word has repeated letters
# collapsed, the letter after a consonant listed in @@simple_downcase_consonants
# dropped when it is h/r/l/u, several letters folded onto one representative
# (m->n, l->r, z->s, g->j, e/y->i, o/w->u, c/q->k) and a trailing r/s/n removed.
# Words found in @@countries_acronyms or @@articles_and_others are skipped.
#
# @param str [String, nil] text to reduce; nil/empty input is returned unchanged
# @param ar [Boolean] true to get the list of unique terms, false to get them joined
# @return [Array<String>, String, nil]
def self.to_term(str, ar = false)
  return str if str.to_s == ''

  # Collapses every run of a repeated character down to one occurrence.
  squash_runs = lambda do |word|
    word.split('').uniq.inject(word) { |acc, ch| acc.gsub(/#{ch}{2,}/, ch) }
  end

  terms = []
  self.to_slug(str).split('-').each do |piece|
    piece = squash_runs.call(piece)
    @@simple_downcase_consonants.each { |c| piece = piece.gsub(/#{c}(h|r|l|u)/, c) }
    next if piece.empty? || @@countries_acronyms.include?(piece) || @@articles_and_others.include?(piece)

    piece = piece.tr('mlzgeyowcq', 'nrsjiiuukk')
    piece = squash_runs.call(piece).gsub(/(r|s|n)$/, '')
    terms << piece unless piece.empty?
  end

  terms = terms.uniq
  ar ? terms : terms.join
end
treat_encoding(str, debug=false)
click to toggle source
# File lib/fk_str.rb, line 8
# Runs treat_encoding_s over the lines of +str+ and joins the results with spaces.
#
# @param str [String] text to process
# @param debug [Boolean] forwarded to treat_encoding_s; when true only the
#   first line of +str+ is processed
# @return [String] processed lines, space-separated and stripped
def self.treat_encoding(str, debug = false)
  rows = str.lines
  rows = rows.first(1) if debug

  joined = rows.inject('') do |acc, row|
    acc + (' ' + self.treat_encoding_s(row, debug))
  end
  joined.strip
end
upcase(w)
click to toggle source
# File lib/fk_str.rb, line 79
# Uppercases +w+ using the replacement pairs stored in @@letters_by_letter.
#
# @param w [String] text to uppercase; returned untouched when nil or empty
# @return [String] copy of +w+ where, for every pair, pair[0] was replaced by pair[1]
def self.upcase(w)
  return w if w.to_s == ''

  # Only the pairs for base letters that actually occur in the word are
  # collected, to keep the number of substitutions small.
  base_letters = self.remove_accents(w).downcase.gsub(/[^a-z]/, '').split('').uniq
  pairs = base_letters.each_with_object([]) do |letter, acc|
    @@letters_by_letter[letter].each { |pair| acc << pair }
  end

  # Uppercase everything: swap each pair's first element for its second.
  pairs.inject(w) { |word, pair| word.gsub(pair[0], pair[1]) }
end
upcasewords(str)
click to toggle source
35 seconds 18 seconds 16 seconds
# File lib/fk_str.rb, line 122
# Applies upcaseword to every word of a text and capitalises its first character.
#
# @param str [String, nil] text to process; nil/empty input is returned unchanged
# @return [String, nil] text with single spaces between the processed words
def self.upcasewords(str)
  return str if str.to_s == ''

  # Normalise duplicated or invalid spacing before splitting into words.
  words = str.gsub(/\s{1,}| {1,}/, ' ').strip.split(' ')
  result = words.map { |word| upcaseword(word) }.join(' ')

  # Normalise spacing once more after the per-word treatment.
  result = result.gsub(/\s{1,}| {1,}/, ' ')

  # Upper-case the first letter.
  first_pairs = @@letters_by_letter[remove_accents(result[0]).downcase]
  first_pairs.each { |pair| result[0] = result[0].gsub(pair[0], pair[1]) } if first_pairs

  result
end
Private Class Methods
treat_encoding_i(str, tolerance=0, debug=false)
click to toggle source
# File lib/fk_str.rb, line 415
# Tries a fixed series of re-encodings until valid_encoding accepts the text.
#
# Before each attempt the current candidate is checked with valid_encoding and
# returned as soon as it passes. Attempts, in order: force_encoding to UTF-8
# (this relabels +str+ itself), then transcoding to UTF-8 from WINDOWS-1252,
# UTF-8, ISO-8859-2 and ISO-8859-3. A failed conversion keeps the previous
# candidate. If nothing passes with tolerance 0, the whole procedure is run
# once more with tolerance 1.
#
# @param str [String] text to repair
# @param tolerance [Integer] forwarded to valid_encoding
# @param debug [Boolean] prints the attempt labels and is forwarded to valid_encoding
# @return [String] the accepted text, or the last candidate when none was accepted
def self.treat_encoding_i(str, tolerance = 0, debug = false)
  attempts = [
    ['[try force_encoding UTF-8]', lambda { |s| s.force_encoding 'UTF-8' }],
    ['[try WINDOWS-1252]',         lambda { |s| s.encode 'UTF-8', 'WINDOWS-1252' }],
    ['[try UTF-8]',                lambda { |s| s.encode 'UTF-8', 'UTF-8' }],
    ['[try ISO-8859-2]',           lambda { |s| s.encode 'UTF-8', 'ISO-8859-2' }],
    ['[try ISO-8859-3]',           lambda { |s| s.encode 'UTF-8', 'ISO-8859-3' }]
  ]

  candidate = str
  attempts.each do |label, convert|
    checked = self.valid_encoding candidate, tolerance, debug
    return checked if checked

    puts label if debug
    begin
      candidate = convert.call(str)
    rescue => _ignored
      # Conversion failed: keep the previous candidate and move on.
    end
  end

  checked = self.valid_encoding candidate, tolerance, debug
  if tolerance == 0 and !checked
    candidate = self.treat_encoding_i str, 1, debug
  end
  candidate
end
treat_encoding_s(str, debug=false)
click to toggle source
# File lib/fk_str.rb, line 378
# Runs treat_encoding_i over a string in chunks.
#
# The text is split on spaces and handled 20 words at a time, the chunks being
# re-joined with single spaces. If that raises, the text is instead handled in
# slices of 200 characters concatenated without separators.
#
# @param str [String] text to process
# @param debug [Boolean] forwarded to treat_encoding_i
# @return [String] concatenation of the processed chunks
def self.treat_encoding_s(str, debug = false)
  out = ''
  str.split(' ').each_slice(20).with_index do |words, idx|
    fixed = self.treat_encoding_i(words.join(' '), 0, debug)
    out += idx == 0 ? fixed : ' ' + fixed
  end
  out
rescue => _ignored
  out = ''
  str.chars.each_slice(200) do |slice|
    out += self.treat_encoding_i(slice.join, 0, debug)
  end
  out
end
upcaseword(w)
click to toggle source
# File lib/fk_str.rb, line 483 def self.upcaseword w return w if w.to_s == '' if w.scan(/#{@@separators_regex.join('|')}/).size == 0 # Cria uma Array apenas com os caracteres necessários por questões de performance. letters = [] clean_word = self.remove_accents(w).downcase.gsub(/[^a-z]/, '') clean_word.split('').uniq.each { |lt| @@letters_by_letter[lt].each { |l| letters << l } } trf = 'tm' trf = 'tfu' if w.size > 5 or !@@articles_and_others.include? clean_word trf = 'tau' if !w.match(/^mr$|^jr$|^mr.$|^jr.$|^sr$|^sr.$/i) and ((w.size < 6 and clean_word.match(/[^aeiouwy]{4,}|[aeiouwy]{4,}|^[^aeiouwy]{2,3}$/)) or w.scan('.').size > 2) letters.each do |l| # Transforma tudo em minúsculo. w = w.gsub l[1], l[0] if trf == 'tm' || trf == 'tfu' # Maiúsculo na primeira letra caso não seja um artigo ou algo do gênero. w = w.gsub /^#{l[0]}/, l[1] if trf == 'tfu' # Transforma em maiúsculo: # * Sequência de 4 ou mais consoantes. # * Sequência de 4 ou mais vogais. # * Sequência exata de 2 ou 3 vogais. w = w.gsub l[0], l[1] if trf == 'tau' end else # Quebra termos entre caracteres separadores como "'", "(", etc. @@separators.each do |l| sw = w.split(l) if sw.size > 1 # Trata o termo isoladamente se não for uma letra única antes de "'" sw.each_with_index { |v, i| sw[i] = upcaseword v if !(["'"].include? l and v.size == 1 and i == 0) } if w[w.size-1] == l w = sw.join(l) + l else w = sw.join(l) end end end end return w end
valid_encoding(str, tolerance=0, debug=false)
click to toggle source
# File lib/fk_str.rb, line 398 def self.valid_encoding str, tolerance=0, debug=false str_v = str begin str_v.match 'á' str_v = str_v.gsub /\s{1,}|\n{1,}|\r{1,}/, '' @@legal_chars.each { |lc| str_v = str_v.gsub lc, '' } @@invalid_sequences.each { |is| raise 'invalid sequence: ' + is if str.match is } puts '[' + str_v + ']' if debug and str_v.size > 0 return false if str_v.size > tolerance str_v.split('').each { |c| str = str.gsub c, '' } if str_v.size > 0 return str rescue => exp #puts '[error] ' + exp.message if debug or !exp.message.match /incompatible encoding|invalid byte sequence|invalid sequence/i return false end end