module Licensee::ContentHelper
Constants
- DIGEST
- END_OF_TERMS_REGEX
- NORMALIZATIONS
- REGEXES
- START_REGEX
- STRIP_METHODS
- VARIETAL_WORDS
Legally equivalent words that should be ignored for comparison. See spdx.org/spdx-license-list/matching-guidelines
Public Class Methods
Keeps the legacy constant names backwards compatible to avoid a breaking change
# File lib/licensee/content_helper.rb, line 174
# Resolves a missing constant by lowercasing its name, removing every
# '_regex' occurrence and looking the resulting symbol up in REGEXES.
# Falls back to the default handler when REGEXES has no truthy entry.
def self.const_missing(const)
  lookup_key = const.to_s.downcase.gsub('_regex', '').to_sym
  legacy_regex = REGEXES[lookup_key]
  return legacy_regex if legacy_regex

  super
end
# File lib/licensee/content_helper.rb, line 200
# Formats a number with two decimal places followed by a percent sign,
# e.g. 12.3456 => "12.35%".
def self.format_percent(float)
  format('%<float>.2f%%', float: float)
end
# File lib/licensee/content_helper.rb, line 204
# Builds (once) and returns a case-insensitive regex matching a line that
# begins with START_REGEX, an optional "(" and optional "the ", followed by
# any known license title.
def self.title_regex
  return @title_regex if @title_regex

  # NOTE(review): `psuedo:` is reproduced exactly as the callee's keyword is
  # spelled here; confirm against Licensee::License.all before renaming.
  licenses = Licensee::License.all(hidden: true, psuedo: false)

  # Title regex must include the version to support matching within
  # families, but for sake of normalization, we can be less strict
  versionless = licenses.reject { |license| license.title == license.name_without_version }
  versionless_patterns = versionless.map do |license|
    Regexp.new Regexp.escape(license.name_without_version), 'i'
  end

  titles = licenses.map(&:title_regex) + versionless_patterns

  @title_regex = /#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
end
Wrap text to the given line length
# File lib/licensee/content_helper.rb, line 180
# Re-flows text so that long lines are broken at whitespace to fit within
# `line_width` characters.
#
# text       - the String to wrap; nil is returned unchanged as nil.
# line_width - maximum line length (default: 80).
#
# Returns a new, stripped String. The argument is never mutated, so frozen
# strings are accepted (the previous clone + gsub! raised FrozenError on
# them, because clone preserves the frozen state).
def self.wrap(text, line_width = 80)
  return if text.nil?

  # Surround every REGEXES[:bullet] match with newlines so it is not merged
  # into the neighbouring text by the joining step below.
  flowed = text.gsub(REGEXES[:bullet]) { |m| "\n#{m}\n" }
  # Join lines separated by a single newline; blank lines are kept.
  flowed = flowed.gsub(/([^\n])\n([^\n])/, '\1 \2')

  wrapped = flowed.split("\n").map do |line|
    # Lines matching REGEXES[:hrs] and lines that already fit are left alone.
    if (line =~ REGEXES[:hrs]) || line.length <= line_width
      line
    else
      line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
    end
  end

  wrapped.join("\n").strip
end
Public Instance Methods
SHA1 of the normalized content
# File lib/licensee/content_helper.rb, line 139
# Hex digest (computed with DIGEST) of the normalized content; memoized.
def content_hash
  return @content_hash if @content_hash

  @content_hash = DIGEST.hexdigest(content_normalized)
end
# File lib/licensee/content_helper.rb, line 156
# Returns the normalized content, computing and caching it on first use.
#
# wrap - when given, the cached content is returned re-wrapped to that line
#        width; when nil (default) the cached content is returned as is.
def content_normalized(wrap: nil)
  unless @content_normalized
    # Start from the lowercased, title- and version-free content.
    @_content = content_without_title_and_version.downcase

    operations = NORMALIZATIONS.keys + %i[spelling bullets]
    operations.each { |operation| normalize(operation) }
    STRIP_METHODS.each { |step| strip(step) }

    @content_normalized = _content
  end

  return @content_normalized if wrap.nil?

  Licensee::ContentHelper.wrap(@content_normalized, wrap)
end
Content with the title and version removed. The first line should normally be the attribution line. Used to dry up `content_normalized`, but we need the case-sensitive content with the attribution first in order to detect attribution in LicenseFile.
# File lib/licensee/content_helper.rb, line 147
# Returns the content after the html, hrs, comments, markdown_headings,
# title and version strip steps have run, in that order; memoized.
def content_without_title_and_version
  return @content_without_title_and_version if @content_without_title_and_version

  # Clear the working buffer so the strip steps start from a fresh value.
  @_content = nil
  %i[html hrs comments markdown_headings title version].each do |step|
    strip(step)
  end

  @content_without_title_and_version = _content
end
Number of characters in the normalized content
# File lib/licensee/content_helper.rb, line 116
# Length of the normalized content, or 0 when there is none.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end
Given another license or project file, calculates the difference in length
# File lib/licensee/content_helper.rb, line 123
# Absolute difference between this object's length and other's length.
def length_delta(other)
  difference = length - other.length
  difference.abs
end
Given another license or project file, calculates the similarity as a percentage of words in common, minus a tiny penalty that increases with the size difference between the licenses, so that false positives for long licenses are ruled out by this score alone.
# File lib/licensee/content_helper.rb, line 131
# Scores how alike this content and `other` are.
#
# The score is 200 * (shared words) divided by the combined word count
# (excluding this content's fields) plus a tenth of the length difference.
def similarity(other)
  own_words = wordset_fieldless
  other_words = other.wordset

  shared = (own_words & other_words).size
  combined = own_words.size + other_words.size - fields_normalized_set.size
  # Integer division when the length delta is an Integer.
  penalty = length_delta(other) / 10

  (shared * 200.0) / (combined + penalty)
end
A set of each word in the license, without duplicates
# File lib/licensee/content_helper.rb, line 111
# Set of distinct words found in the normalized content, or nil when there
# is no normalized content. A word is a run of word characters and slashes,
# optionally carrying a possessive suffix ('s, or ' after an s).
def wordset
  return @wordset if @wordset

  normalized = content_normalized
  words = normalized&.scan(%r{(?:[\w\/](?:'s|(?<=s)')?)+})
  @wordset = words&.to_set
end
Private Instance Methods
# File lib/licensee/content_helper.rb, line 224
# Working copy of the content: a stripped string built from `content` the
# first time it is needed, then reused until @_content is reset.
def _content
  return @_content if @_content

  raw = content.to_s.dup
  @_content = raw.strip
end
Returns an array of strings of substitutable fields in normalized content
# File lib/licensee/content_helper.rb, line 330
# Flat array of everything LicenseField::FIELD_REGEX matches in the
# normalized content; memoized.
def fields_normalized
  @fields_normalized ||= begin
    matches = content_normalized.scan(LicenseField::FIELD_REGEX)
    matches.flatten
  end
end
# File lib/licensee/content_helper.rb, line 335
# The normalized fields as a Set (duplicates removed); memoized.
def fields_normalized_set
  return @fields_normalized_set if @fields_normalized_set

  @fields_normalized_set = Set.new(fields_normalized)
end
# File lib/licensee/content_helper.rb, line 303
# Applies one normalization to the working content.
#
# from_or_key - a pattern to replace (when `to` is given), a key of
#               NORMALIZATIONS, or the suffix of a `normalize_*` method.
# to          - the replacement passed to gsub; optional.
#
# Raises ArgumentError when the key matches neither NORMALIZATIONS nor a
# `normalize_*` method.
def normalize(from_or_key, to = nil)
  operation = to ? { from: from_or_key, to: to } : NORMALIZATIONS[from_or_key]
  return @_content = _content.gsub(operation[:from], operation[:to]) if operation

  handler = "normalize_#{from_or_key}"
  unless respond_to?(handler, true)
    raise ArgumentError, "#{from_or_key} is an invalid normalization"
  end

  send(handler)
end
# File lib/licensee/content_helper.rb, line 320
# Rewrites every REGEXES[:bullet] match as "\n\n* " and removes the
# whitespace between a closing and an opening parenthesis.
def normalize_bullets
  bullet_pattern = REGEXES[:bullet]
  normalize(bullet_pattern, "\n\n* ")

  spaced_parens = /\)\s+\(/
  normalize(spaced_parens, ')(')
end
# File lib/licensee/content_helper.rb, line 316
# Replaces each whole-word occurrence of a VARIETAL_WORDS key with the
# value mapped to it.
def normalize_spelling
  variants = Regexp.union(VARIETAL_WORDS.keys)
  normalize(/\b#{variants}\b/, VARIETAL_WORDS)
end
# File lib/licensee/content_helper.rb, line 228
# Removes matches of a pattern from the working content.
#
# regex_or_sym - a pattern, or a Symbol naming either a `strip_*` method
#                (which is then called instead) or a key of REGEXES.
#
# Matches are replaced by a space, runs of spaces are squeezed and the
# result is stripped. Raises ArgumentError for an unknown Symbol.
def strip(regex_or_sym)
  return unless _content

  pattern = regex_or_sym
  if pattern.is_a?(Symbol)
    handler = "strip_#{pattern}"
    return send(handler) if respond_to?(handler, true)

    pattern = REGEXES[regex_or_sym] ||
              raise(ArgumentError, "#{regex_or_sym} is an invalid regex reference")
  end

  blanked = _content.gsub(pattern, ' ')
  @_content = blanked.squeeze(' ').strip
end
# File lib/licensee/content_helper.rb, line 251
# Replaces each REGEXES[:border_markup] match with its first capture group.
def strip_borders
  border_pattern = REGEXES[:border_markup]
  normalize(border_pattern, '\1')
end
# File lib/licensee/content_helper.rb, line 268
# When the content mentions 'associating cc0', strips the cc_legal_code,
# cc0_info and cc0_disclaimer patterns, in that order.
def strip_cc0_optional
  return unless _content.include?('associating cc0')

  # map + last keeps the return value of the final strip call.
  %i[cc_legal_code cc0_info cc0_disclaimer].map do |section|
    strip(REGEXES[section])
  end.last
end
# File lib/licensee/content_helper.rb, line 255
# Strips comment markup, but only when the content is not a single line and
# every line matches REGEXES[:comment_markup].
def strip_comments
  rows = _content.split("\n")
  single_line = rows.count == 1
  fully_commented = !single_line && rows.all? { |row| row =~ REGEXES[:comment_markup] }

  strip(:comment_markup) if fully_commented
end
# File lib/licensee/content_helper.rb, line 263
# Repeatedly strips matches of the copyright regex or the
# all_rights_reserved regex until the content no longer matches either.
def strip_copyright
  copyright_or_reserved = Regexp.union(Matchers::Copyright::REGEX, REGEXES[:all_rights_reserved])

  while _content =~ copyright_or_reserved
    strip(copyright_or_reserved)
  end
end
# File lib/licensee/content_helper.rb, line 282
# Keeps only the part of the content that precedes the first
# END_OF_TERMS_REGEX match (the whole content when there is no match).
def strip_end_of_terms
  @_content = _content.partition(END_OF_TERMS_REGEX).first
end
# File lib/licensee/content_helper.rb, line 295
# Converts HTML content to Markdown via ReverseMarkdown.
#
# Does nothing unless the object responds to `filename`, has one, and its
# extension is exactly .htm or .html (case-insensitive). The pattern is
# anchored so that look-alike extensions such as ".htmlx" are not treated
# as HTML.
def strip_html
  return unless respond_to?(:filename) && filename
  return unless File.extname(filename) =~ /\A\.html?\z/i

  # Loaded lazily so the dependency is only needed for HTML files.
  require 'reverse_markdown'
  @_content = ReverseMarkdown.convert(_content, unknown_tags: :bypass)
end
# File lib/licensee/content_helper.rb, line 291
# Replaces each REGEXES[:link_markup] match with its first capture group.
def strip_link_markup
  link_pattern = REGEXES[:link_markup]
  normalize(link_pattern, '\1')
end
# File lib/licensee/content_helper.rb, line 287
# Replaces each REGEXES[:span_markup] match with its first capture group.
def strip_span_markup
  span_pattern = REGEXES[:span_markup]
  normalize(span_pattern, '\1')
end
# File lib/licensee/content_helper.rb, line 245
# Strips title-regex matches until the content no longer matches.
def strip_title
  strip(ContentHelper.title_regex) while _content =~ ContentHelper.title_regex
end
# File lib/licensee/content_helper.rb, line 276
# Strips REGEXES[:unlicense_info] matches when the content mentions
# 'unlicense'.
def strip_unlicense_optional
  strip(REGEXES[:unlicense_info]) if _content.include?('unlicense')
end
# File lib/licensee/content_helper.rb, line 325
# The word set with the normalized fields removed; memoized.
def wordset_fieldless
  return @wordset_fieldless if @wordset_fieldless

  @wordset_fieldless = wordset - fields_normalized_set
end