class ConfidentialInfoRedactor::Redactor
This class redacts confidential tokens (emails, hyperlinks, dates, numbers, and caller-supplied tokens) from a text.
Constants
- EMAIL_REGEX
Rubular: rubular.com/r/mxcj2G0Jfa
- NUMBER_REGEX
Rubular: rubular.com/r/OI2wQZ0KSl
Attributes
date_text[R]
ignore_dates[R]
ignore_emails[R]
ignore_hyperlinks[R]
ignore_numbers[R]
language[R]
number_text[R]
token_text[R]
tokens[R]
Public Class Methods
new(**args)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 14
# Builds a redactor from optional keyword settings.
#
# Recognized keys (all optional):
#   :language          - language code handed to the date redactor (default 'en')
#   :tokens            - token(s) to redact; an Array is stored as-is, a single
#                        value is wrapped in an Array, a missing value becomes []
#   :number_text       - placeholder for numbers (default '<redacted number>')
#   :date_text         - placeholder for dates (default '<redacted date>')
#   :token_text        - placeholder for tokens, emails and hyperlinks (default '<redacted>')
#   :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks
#                      - truthy values make #redact skip the corresponding step
def initialize(**args)
  @language = args[:language] || 'en'
  # Kernel#Array returns a given Array unchanged and maps nil to [], so a
  # redactor created without :tokens no longer fails later when the token
  # list is iterated.
  @tokens = Array(args[:tokens])

  @number_text = args[:number_text] || '<redacted number>'
  @date_text   = args[:date_text]   || '<redacted date>'
  @token_text  = args[:token_text]  || '<redacted>'

  @ignore_emails     = args[:ignore_emails]
  @ignore_dates      = args[:ignore_dates]
  @ignore_numbers    = args[:ignore_numbers]
  @ignore_hyperlinks = args[:ignore_hyperlinks]
end
Public Instance Methods
dates(text)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 26
# Public entry point for date redaction only: hands +text+ to the private
# redact_dates helper and returns whatever that helper returns.
def dates(text)
  redact_dates(text)
end
emails(text)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 34
# Public entry point for email redaction only: hands +text+ to the private
# redact_emails helper and returns whatever that helper returns.
def emails(text)
  redact_emails(text)
end
hyperlinks(text)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 38
# Public entry point for hyperlink redaction only: hands +text+ to the private
# redact_hyperlinks helper and returns whatever that helper returns.
def hyperlinks(text)
  redact_hyperlinks(text)
end
numbers(text)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 30
# Public entry point for number redaction only: hands +text+ to the private
# redact_numbers helper and returns whatever that helper returns.
def numbers(text)
  redact_numbers(text)
end
proper_nouns(text)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 42
# Public entry point for token redaction only: hands +text+ to the private
# redact_tokens helper (which works from the configured +tokens+ list) and
# returns whatever that helper returns.
def proper_nouns(text)
  redact_tokens(text)
end
redact(text)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 46
# Runs the full redaction pipeline over +text+ and returns the result.
#
# Steps run in a fixed order: emails, hyperlinks, dates, numbers, then tokens.
# Each of the first four is skipped when its ignore_* flag is truthy; the
# token step always runs.
def redact(text)
  result = text
  result = redact_emails(result)     unless ignore_emails
  result = redact_hyperlinks(result) unless ignore_hyperlinks
  result = redact_dates(result)      unless ignore_dates
  result = redact_numbers(result)    unless ignore_numbers
  redact_tokens(result)
end
Private Instance Methods
redact_dates(txt)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 64
# Replaces dates in +txt+ with +date_text+ and tidies the spacing around each
# placeholder. Date detection itself is done by ConfidentialInfoRedactor::Date
# (not shown here); presumably it marks each date with the literal
# "<redacted date>", which is what the first substitution looks for.
def redact_dates(txt)
  marked = ConfidentialInfoRedactor::Date.new(language: language).replace(txt)
  # Swap the fixed marker for the configured placeholder.
  labelled = marked.gsub(/<redacted date>/, "#{date_text}")
  # Surround every placeholder with exactly one space on each side.
  padded = labelled.gsub(/\s*#{Regexp.escape(date_text)}\s*/, " #{date_text} ")
  # A placeholder that opens the string gets no leading space.
  left_trimmed = padded.gsub(/\A\s*#{Regexp.escape(date_text)}\s*/, "#{date_text} ")
  # Pull a following full stop back against the placeholder.
  left_trimmed.gsub(/#{Regexp.escape(date_text)}\s{1}\.{1}/, "#{date_text}.")
end
redact_emails(txt)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 72
# Returns a copy of +txt+ in which every match of EMAIL_REGEX is replaced
# by +token_text+.
def redact_emails(txt)
  placeholder = "#{token_text}"
  txt.gsub(EMAIL_REGEX, placeholder)
end
redact_hyperlinks(txt)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 60
# Replaces hyperlinks in +txt+ with +token_text+ and tidies the spacing around
# each placeholder. Link detection is done by ConfidentialInfoRedactor::Hyperlink
# (not shown here); presumably it marks each link with the literal "<redacted>",
# which is what the first substitution looks for.
def redact_hyperlinks(txt)
  marked = ConfidentialInfoRedactor::Hyperlink.new.replace(txt)
  # Swap the fixed marker for the configured placeholder.
  labelled = marked.gsub(/<redacted>/, "#{token_text}")
  # Surround every placeholder with exactly one space on each side.
  padded = labelled.gsub(/\s*#{Regexp.escape(token_text)}\s*/, " #{token_text} ")
  # Pull a following full stop, then a following comma, back against the placeholder.
  period_fixed = padded.gsub(/#{Regexp.escape(token_text)}\s{1}\.{1}/, "#{token_text}.")
  period_fixed.gsub(/#{Regexp.escape(token_text)}\s{1}\,{1}/, "#{token_text},")
end
redact_numbers(txt)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 68
# Replaces every match of NUMBER_REGEX in +txt+ with +number_text+, then
# applies a fixed sequence of spacing clean-ups around the placeholder.
def redact_numbers(txt)
  redacted = txt.gsub(NUMBER_REGEX, " #{number_text} ")

  # Ordered [pattern, replacement] pairs; later entries rely on the
  # single-space padding produced by the first one.
  spacing_fixes = [
    # exactly one space on each side of every placeholder
    [/\s*#{Regexp.escape(number_text)}\s*/, " #{number_text} "],
    # no leading space when the placeholder opens the string
    [/\A\s*#{Regexp.escape(number_text)}\s*/, "#{number_text} "],
    # re-attach a following full stop, comma or closing parenthesis
    [/#{Regexp.escape(number_text)}\s{1}\.{1}/, "#{number_text}."],
    [/#{Regexp.escape(number_text)}\s{1}\,{1}/, "#{number_text},"],
    [/#{Regexp.escape(number_text)}\s{1}\){1}/, "#{number_text})"],
    # re-attach a preceding opening parenthesis
    [/\(\s{1}#{Regexp.escape(number_text)}/, "(#{number_text}"],
    # no trailing whitespace character when the placeholder ends the string
    [/#{Regexp.escape(number_text)}\s\z/, "#{number_text}"]
  ]

  spacing_fixes.reduce(redacted) do |current, (pattern, replacement)|
    current.gsub(pattern, replacement)
  end
end
redact_tokens(txt)
click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 76
# Returns a stripped copy of +txt+ in which every configured token is
# replaced by +token_text+.
#
# Tokens are processed from most words to fewest, so a multi-word token such
# as "John Smith" is replaced before a shorter token such as "John" can split
# it. A token only matches when it is preceded by whitespace, a line start or
# a double quote, and followed by a non-word character or a line end.
#
# The argument itself is never modified, so frozen strings are accepted and
# the caller's text is left intact. A missing (nil) token list is treated as
# empty, in which case only the final strip is applied.
def redact_tokens(txt)
  # Array() maps nil to [] and leaves a real Array untouched.
  longest_first = Array(tokens).sort_by { |token| token.split.count }.reverse

  redacted = longest_first.reduce(txt) do |current, token|
    # Non-mutating gsub: each step builds a new string instead of editing +txt+.
    current.gsub(/(?<=\s|^|\")#{Regexp.escape(token)}(?=\W|$)/, "#{token_text}")
  end

  redacted.strip
end