module Licensee::ContentHelper

Constants

DIGEST
END_OF_TERMS_REGEX
NORMALIZATIONS
REGEXES
START_REGEX
STRIP_METHODS
VARIETAL_WORDS

Legally equivalent words that should be ignored for comparison. See spdx.org/spdx-license-list/matching-guidelines

Public Class Methods

const_missing(const) click to toggle source

Keeps the legacy constant names backwards compatible to avoid a breaking change

Calls superclass method
# File lib/licensee/content_helper.rb, line 174
# Resolves a missing constant against REGEXES: the constant name is
# lower-cased, every "_regex" occurrence is removed, and the result is used
# as a symbol key. Falls through to the superclass implementation when no
# entry exists for that key.
def self.const_missing(const)
  lookup_name = const.to_s.downcase.gsub('_regex', '')
  regex = REGEXES[lookup_name.to_sym]
  return regex if regex

  super
end
format_percent(float) click to toggle source
# File lib/licensee/content_helper.rb, line 200
# Renders a number with exactly two decimal places followed by a literal
# percent sign, e.g. 98.5 => "98.50%".
def self.format_percent(float)
  format('%<float>.2f%%', float: float)
end
title_regex() click to toggle source
# File lib/licensee/content_helper.rb, line 204
# Builds (once) and returns a case-insensitive regex that matches a license
# title line: START_REGEX, an optional "(", an optional "the ", any known
# title, and the rest of that line.
def self.title_regex
  return @title_regex if @title_regex

  licenses = Licensee::License.all(hidden: true, psuedo: false)
  patterns = licenses.map(&:title_regex)

  # Title regex must include the version to support matching within
  # families, but for sake of normalization, we can be less strict:
  # also accept the version-less name whenever it differs from the title.
  versioned = licenses.reject do |license|
    license.title == license.name_without_version
  end
  versionless = versioned.map do |license|
    Regexp.new(Regexp.escape(license.name_without_version), 'i')
  end
  patterns.concat(versionless)

  @title_regex = /#{START_REGEX}\(?(?:the )?#{Regexp.union patterns}.*?$/i
end
wrap(text, line_width = 80) click to toggle source

Wrap text to the given line length

# File lib/licensee/content_helper.rb, line 180
# Re-flows text so that no line exceeds line_width characters.
#
# text       - the String to wrap; nil yields nil. The input is never
#              mutated, so frozen strings are accepted.
# line_width - maximum line length (default 80).
#
# Returns the wrapped, stripped String, or nil when text is nil.
def self.wrap(text, line_width = 80)
  return if text.nil?

  # Put each bullet marker on its own line, then join single line breaks
  # with a space. Non-bang gsub is used on purpose: the bang variants raise
  # FrozenError when handed a frozen string.
  unwrapped = text.gsub(REGEXES[:bullet]) { |m| "\n#{m}\n" }
                  .gsub(/([^\n])\n([^\n])/, '\1 \2')

  wrapped = unwrapped.split("\n").map do |line|
    # Short lines and lines matching REGEXES[:hrs] are passed through as-is.
    next line if line.length <= line_width || line =~ REGEXES[:hrs]

    line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
  end

  wrapped.join("\n").strip
end

Public Instance Methods

content_hash() click to toggle source

SHA1 of the normalized content

# File lib/licensee/content_helper.rb, line 139
# Hex digest (computed with DIGEST) of the normalized content; memoized.
def content_hash
  return @content_hash if @content_hash

  @content_hash = DIGEST.hexdigest(content_normalized)
end
content_normalized(wrap: nil) click to toggle source
# File lib/licensee/content_helper.rb, line 156
# Returns the normalized content, computing and caching it on first use.
#
# Normalization lower-cases the title/version-less content, applies every
# NORMALIZATIONS key plus :spelling and :bullets, then every STRIP_METHODS
# entry. Note that @_content is used as the working buffer.
#
# wrap - optional line width; when given, the cached text is returned
#        re-wrapped via Licensee::ContentHelper.wrap.
def content_normalized(wrap: nil)
  unless @content_normalized
    @_content = content_without_title_and_version.downcase

    [*NORMALIZATIONS.keys, :spelling, :bullets].each { |op| normalize(op) }
    STRIP_METHODS.each { |op| strip(op) }

    @content_normalized = _content
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end
content_without_title_and_version() click to toggle source

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.

# File lib/licensee/content_helper.rb, line 147
# Case-preserving content after the html, hrs, comments, markdown_headings,
# title and version strip operations have run, in that order. The working
# buffer is reset first so stripping starts from the raw content. Memoized.
def content_without_title_and_version
  return @content_without_title_and_version if @content_without_title_and_version

  @_content = nil
  %i[html hrs comments markdown_headings title version].each do |operation|
    strip(operation)
  end
  @content_without_title_and_version = _content
end
length() click to toggle source

Number of characters in the normalized content

# File lib/licensee/content_helper.rb, line 116
# Number of characters in the normalized content, or 0 when there is none.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end
length_delta(other) click to toggle source

Given another license or project file, calculates the difference in length

# File lib/licensee/content_helper.rb, line 123
# Absolute difference between this object's length and other's length.
def length_delta(other)
  difference = length - other.length
  difference.abs
end
similarity(other) click to toggle source

Given another license or project file, calculates the similarity as a percentage of words in common, minus a tiny penalty that increases with the size difference between licenses, so that false positives for long licenses are ruled out by this score alone.

# File lib/licensee/content_helper.rb, line 131
# Similarity score against another object that exposes #wordset.
#
# The score is 200 * (shared words) divided by the combined word count
# (field placeholders excluded) plus a penalty of one per ten characters
# of length difference (integer division).
def similarity(other)
  own_words = wordset_fieldless
  their_words = other.wordset

  shared = (own_words & their_words).size
  combined = own_words.size + their_words.size - fields_normalized_set.size
  size_penalty = length_delta(other) / 10

  (shared * 200.0) / (combined + size_penalty)
end
wordset() click to toggle source

A set of each word in the license, without duplicates

# File lib/licensee/content_helper.rb, line 111
# Set of the distinct words found in the normalized content, or nil when
# there is no normalized content. A word is a run of word characters or
# slashes, optionally carrying a possessive ('s, or ' after an s).
def wordset
  return @wordset if @wordset

  normalized = content_normalized
  return if normalized.nil?

  words = normalized.scan(%r{(?:[\w\/](?:'s|(?<=s)')?)+})
  @wordset = words.to_set
end

Private Instance Methods

_content() click to toggle source
# File lib/licensee/content_helper.rb, line 224
# Working copy of the content: the stringified, whitespace-trimmed value
# of #content, created on first access and cached in @_content.
def _content
  return @_content if @_content

  raw = content.to_s
  @_content = raw.dup.strip
end
fields_normalized() click to toggle source

Returns an array of strings of substitutable fields in normalized content

# File lib/licensee/content_helper.rb, line 330
# Flat array of every LicenseField::FIELD_REGEX match (or capture) found in
# the normalized content; memoized.
def fields_normalized
  return @fields_normalized if @fields_normalized

  matches = content_normalized.scan(LicenseField::FIELD_REGEX)
  @fields_normalized = matches.flatten
end
fields_normalized_set() click to toggle source
# File lib/licensee/content_helper.rb, line 335
# The entries of #fields_normalized as a Set (duplicates removed); memoized.
def fields_normalized_set
  return @fields_normalized_set if @fields_normalized_set

  @fields_normalized_set = fields_normalized.to_set
end
normalize(from_or_key, to = nil) click to toggle source
# File lib/licensee/content_helper.rb, line 303
# Applies one normalization to the working content.
#
# from_or_key - either a pattern (when `to` is given), a NORMALIZATIONS key,
#               or the suffix of a normalize_<name> method.
# to          - replacement passed to gsub; when present, from_or_key is
#               used directly as the pattern.
#
# Raises ArgumentError when from_or_key matches none of the above.
def normalize(from_or_key, to = nil)
  operation = to ? { from: from_or_key, to: to } : NORMALIZATIONS[from_or_key]
  return @_content = _content.gsub(operation[:from], operation[:to]) if operation

  handler = "normalize_#{from_or_key}"
  return send(handler) if respond_to?(handler, true)

  raise ArgumentError, "#{from_or_key} is an invalid normalization"
end
normalize_bullets() click to toggle source
# File lib/licensee/content_helper.rb, line 320
# Rewrites every REGEXES[:bullet] match to "\n\n* " and then removes the
# whitespace between a closing and an opening parenthesis.
def normalize_bullets
  bullet_marker = REGEXES[:bullet]
  normalize(bullet_marker, "\n\n* ")

  spaced_parens = /\)\s+\(/
  normalize(spaced_parens, ')(')
end
normalize_spelling() click to toggle source
# File lib/licensee/content_helper.rb, line 316
# Replaces each whole-word occurrence of a VARIETAL_WORDS key with the
# value mapped to it.
def normalize_spelling
  variants = Regexp.union(VARIETAL_WORDS.keys)
  normalize(/\b#{variants}\b/, VARIETAL_WORDS)
end
strip(regex_or_sym) click to toggle source
# File lib/licensee/content_helper.rb, line 228
# Removes matches from the working content.
#
# regex_or_sym - a pattern, or a Symbol. A Symbol is dispatched to a
#                strip_<symbol> method when one exists, otherwise it is
#                looked up in REGEXES.
#
# Each match is replaced by a space, runs of spaces are collapsed and the
# result is trimmed. Raises ArgumentError for a Symbol that names neither
# a strip_ method nor a REGEXES entry.
def strip(regex_or_sym)
  return unless _content

  pattern = regex_or_sym
  if pattern.is_a?(Symbol)
    handler = "strip_#{pattern}"
    return send(handler) if respond_to?(handler, true)

    pattern = REGEXES[regex_or_sym]
    raise ArgumentError, "#{regex_or_sym} is an invalid regex reference" unless pattern
  end

  @_content = _content.gsub(pattern, ' ').squeeze(' ').strip
end
strip_borders() click to toggle source
# File lib/licensee/content_helper.rb, line 251
# Replaces each REGEXES[:border_markup] match with its first capture group.
def strip_borders
  border_pattern = REGEXES[:border_markup]
  normalize(border_pattern, '\1')
end
strip_cc0_optional() click to toggle source
# File lib/licensee/content_helper.rb, line 268
# When the working content contains 'associating cc0', strips the
# cc_legal_code, cc0_info and cc0_disclaimer patterns in that order and
# returns the result of the last strip; otherwise does nothing.
def strip_cc0_optional
  return unless _content.include?('associating cc0')

  %i[cc_legal_code cc0_info cc0_disclaimer].map { |key| strip(REGEXES[key]) }.last
end
strip_comments() click to toggle source
# File lib/licensee/content_helper.rb, line 255
# Strips comment markup, but only when the content is not exactly one line
# and every line matches REGEXES[:comment_markup].
def strip_comments
  lines = _content.split("\n")
  single_line = lines.size == 1
  return if single_line || lines.any? { |line| line !~ REGEXES[:comment_markup] }

  strip(:comment_markup)
end
strip_end_of_terms() click to toggle source
# File lib/licensee/content_helper.rb, line 282
# Truncates the working content at the first END_OF_TERMS_REGEX match,
# keeping only the text that precedes it.
def strip_end_of_terms
  @_content = _content.partition(END_OF_TERMS_REGEX).first
end
strip_html() click to toggle source
# File lib/licensee/content_helper.rb, line 295
# Converts the working content with ReverseMarkdown when this object has a
# public #filename whose extension matches .htm/.html (case-insensitive).
# Does nothing otherwise. The gem is required lazily, only on that path.
def strip_html
  html_file = respond_to?(:filename) && filename &&
              File.extname(filename) =~ /\.html?/i
  return unless html_file

  require 'reverse_markdown'
  @_content = ReverseMarkdown.convert(_content, unknown_tags: :bypass)
end
strip_span_markup() click to toggle source
# File lib/licensee/content_helper.rb, line 287
# Replaces each REGEXES[:span_markup] match with its first capture group.
def strip_span_markup
  span_pattern = REGEXES[:span_markup]
  normalize(span_pattern, '\1')
end
strip_title() click to toggle source
# File lib/licensee/content_helper.rb, line 245
# Repeatedly strips ContentHelper.title_regex from the working content
# until it no longer matches, so stacked titles are all removed.
def strip_title
  title_pattern = ContentHelper.title_regex
  strip(title_pattern) while _content =~ title_pattern
end
strip_unlicense_optional() click to toggle source
# File lib/licensee/content_helper.rb, line 276
# Strips REGEXES[:unlicense_info] when the working content mentions
# 'unlicense'; otherwise does nothing.
def strip_unlicense_optional
  strip(REGEXES[:unlicense_info]) if _content.include?('unlicense')
end
wordset_fieldless() click to toggle source
# File lib/licensee/content_helper.rb, line 325
# The word set with every normalized field name removed; memoized.
def wordset_fieldless
  @wordset_fieldless ||= wordset.difference(fields_normalized_set)
end