class ConfidentialInfoRedactor::Redactor

This class redacts confidential tokens from a text: email addresses, hyperlinks, dates, numbers, and caller-supplied tokens.

Constants

EMAIL_REGEX

Rubular: rubular.com/r/mxcj2G0Jfa

NUMBER_REGEX

Rubular: rubular.com/r/OI2wQZ0KSl

Attributes

date_text[R]
ignore_dates[R]
ignore_emails[R]
ignore_numbers[R]
language[R]
number_text[R]
token_text[R]
tokens[R]

Public Class Methods

new(**args) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 14
# Builds a redactor from keyword options.
#
# Recognized keys (all optional):
#   :language     - stored in @language; falls back to 'en'
#   :tokens       - stored in @tokens as given (nil when omitted)
#   :number_text  - placeholder for numbers; falls back to '<redacted number>'
#   :date_text    - placeholder for dates; falls back to '<redacted date>'
#   :token_text   - placeholder for emails/tokens; falls back to '<redacted>'
#   :ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks -
#                   stored as given; consulted by #redact to skip a stage
#
# Text options use `||`, so an explicit nil or false also gets the fallback.
def initialize(**args)
  language, tokens, number_text, date_text, token_text =
    args.values_at(:language, :tokens, :number_text, :date_text, :token_text)

  @language = language || 'en'
  @tokens = tokens
  @number_text = number_text || '<redacted number>'
  @date_text = date_text || '<redacted date>'
  @token_text = token_text || '<redacted>'

  # The skip flags are kept exactly as supplied (nil when absent).
  @ignore_emails, @ignore_dates, @ignore_numbers, @ignore_hyperlinks =
    args.values_at(:ignore_emails, :ignore_dates, :ignore_numbers, :ignore_hyperlinks)
end

Public Instance Methods

dates(text) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 26
# Redacts dates in +text+ by delegating to the private +redact_dates+ helper
# and returning its result. Unlike +redact+, this method does not check
# +ignore_dates+; it always runs the date stage.
def dates(text)
  redact_dates(text)
end
emails(text) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 34
# Redacts email addresses in +text+ by delegating to the private
# +redact_emails+ helper and returning its result. Unlike +redact+, this
# method does not check +ignore_emails+; it always runs the email stage.
def emails(text)
  redact_emails(text)
end
numbers(text) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 30
# Redacts numbers in +text+ by delegating to the private +redact_numbers+
# helper and returning its result. Unlike +redact+, this method does not
# check +ignore_numbers+; it always runs the number stage.
def numbers(text)
  redact_numbers(text)
end
proper_nouns(text) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 42
# Redacts the configured +tokens+ in +text+ by delegating to the private
# +redact_tokens+ helper, which swaps each token for +token_text+ and returns
# the result with surrounding whitespace stripped.
def proper_nouns(text)
  redact_tokens(text)
end
redact(text) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 46
# Runs the full redaction pipeline over +text+ and returns the result.
#
# Stages run in a fixed order: emails, hyperlinks, dates, numbers, then the
# configured tokens. Each of the first four stages is skipped when its
# matching ignore_* option is truthy; the token stage always runs.
def redact(text)
  redacted_text = ignore_emails ? text : redact_emails(text)
  # Read the instance variable directly: the class's documented attribute
  # readers do not include ignore_hyperlinks, so calling it as a method would
  # raise NameError. NOTE(review): if a reader does exist elsewhere in the
  # file, this is equivalent.
  redacted_text = redact_hyperlinks(redacted_text) unless @ignore_hyperlinks
  redacted_text = redact_dates(redacted_text) unless ignore_dates
  redacted_text = redact_numbers(redacted_text) unless ignore_numbers
  redact_tokens(redacted_text)
end

Private Instance Methods

redact_dates(txt) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 64
# Replaces dates in +txt+ with +date_text+ and tidies the spacing around each
# placeholder.
#
# ConfidentialInfoRedactor::Date#replace does the detection; presumably it
# marks each date with the literal '<redacted date>', which is then swapped
# for the configured +date_text+ (confirm against that class).
def redact_dates(txt)
  marked = ConfidentialInfoRedactor::Date.new(language: language).replace(txt)
  escaped = Regexp.escape(date_text)

  # Swap the fixed marker for the configured placeholder.
  labelled = marked.gsub(/<redacted date>/, "#{date_text}")
  # Force exactly one space on each side of every placeholder.
  padded = labelled.gsub(/\s*#{escaped}\s*/, " #{date_text} ")
  # A placeholder at the very start of the text gets no leading space.
  anchored = padded.gsub(/\A\s*#{escaped}\s*/, "#{date_text} ")
  # Re-attach a full stop that the padding separated from the placeholder.
  anchored.gsub(/#{escaped}\s{1}\.{1}/, "#{date_text}.")
end
redact_emails(txt) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 72
# Returns a copy of +txt+ in which every EMAIL_REGEX match is replaced by
# +token_text+.
def redact_emails(txt)
  replacement = "#{token_text}"
  txt.gsub(EMAIL_REGEX, replacement)
end
redact_numbers(txt) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 68
# Replaces every NUMBER_REGEX match in +txt+ with +number_text+ and then
# normalizes spacing and adjacent punctuation around each placeholder.
#
# The rules are applied strictly in the order listed; later rules undo
# spacing introduced by earlier ones, so the order must not change.
def redact_numbers(txt)
  escaped = Regexp.escape(number_text)

  rules = [
    # Insert the placeholder, padded so it never fuses with neighbours.
    [NUMBER_REGEX, " #{number_text} "],
    # Collapse surrounding whitespace to exactly one space per side.
    [/\s*#{escaped}\s*/, " #{number_text} "],
    # No leading space when the placeholder opens the text.
    [/\A\s*#{escaped}\s*/, "#{number_text} "],
    # Re-attach trailing '.', ',' and ')' to the placeholder.
    [/#{escaped}\s{1}\.{1}/, "#{number_text}."],
    [/#{escaped}\s{1}\,{1}/, "#{number_text},"],
    [/#{escaped}\s{1}\){1}/, "#{number_text})"],
    # Re-attach a leading '('.
    [/\(\s{1}#{escaped}/, "(#{number_text}"],
    # Drop the single trailing whitespace when the placeholder ends the text.
    [/#{escaped}\s\z/, "#{number_text}"]
  ]

  rules.reduce(txt) { |acc, (pattern, replacement)| acc.gsub(pattern, replacement) }
end
redact_tokens(txt) click to toggle source
# File lib/confidential_info_redactor/redactor.rb, line 76
# Replaces each configured token found in +txt+ with +token_text+ and returns
# the result with leading/trailing whitespace stripped.
#
# Tokens with more whitespace-separated words are replaced first, so a
# multi-word token is redacted before any shorter token it contains. A token
# only matches when preceded by whitespace, a line start or a double quote,
# and followed by a non-word character or a line end.
#
# The input string is never modified: a new string is built instead, so
# frozen strings are accepted and the caller's text is left untouched. When
# no tokens are configured (+tokens+ is nil), only the strip is applied.
def redact_tokens(txt)
  by_word_count = Array(tokens).sort_by { |token| token.split.count }

  redacted = by_word_count.reverse_each.reduce(txt) do |acc, token|
    acc.gsub(/(?<=\s|^|\")#{Regexp.escape(token)}(?=\W|$)/, "#{token_text}")
  end

  redacted.strip
end