class Mizuho::IdMap

Constants

MATCHER
URANDOM

Attributes

associations[R]
entries[R]

Public Class Methods

new() click to toggle source
# File lib/mizuho/id_map.rb, line 52
# Sets up an empty map: no entries and no title => ID associations yet.
def initialize
        @associations = Hash.new
        @entries = Hash.new
        # Namespace derivation is currently disabled:
        #@namespace = slug(File.basename(filename, File.extname(filename)))
end

Public Instance Methods

add(title, id, *options) click to toggle source
# File lib/mizuho/id_map.rb, line 186
# Stores a new Entry under +title+, replacing whatever was stored there before.
#
# title   - key (and title) of the entry.
# id      - ID to use; when nil/false a unique ID is generated from the title.
# options - extra positional arguments, passed through unchanged to Entry.new.
#
# Returns the newly created entry.
def add(title, id, *options)
        effective_id = id || create_unique_id(title)
        entry = Entry.new(title, effective_id, *options)
        @entries[title] = entry
        entry
end
generate_associations(titles) click to toggle source
# File lib/mizuho/id_map.rb, line 106
# Rebuilds @associations (title => ID) for the given list of titles.
#
# Titles are resolved in four passes, each pass only looking at the titles
# the previous one could not resolve:
#
# 1. exact title match against a not-yet-associated entry;
# 2. an entry that find_moved considers the same section under another number;
# 3. an entry that find_similar considers close enough (marked fuzzy);
# 4. a brand new entry with a freshly generated ID.
#
# The caller's array is not modified. Returns the array of titles for which
# new entries had to be created.
def generate_associations(titles)
        @associations = {}

        # Re-keys an existing entry under the new title and records the association.
        adopt = lambda do |entry, title, fuzzy|
                @entries.delete(entry.title)
                @entries[title] = entry
                entry.title = title
                entry.associated = true
                entry.fuzzy = fuzzy
                @associations[title] = entry.id
        end

        # Pass 1: exact matches. `reject` (non-bang) yields a fresh array, so
        # the later in-place filtering never touches the caller's list.
        pending = titles.reject do |title|
                exact = @entries[title]
                next false unless exact && !exact.associated?
                exact.associated = true
                @associations[title] = exact.id
                true
        end

        # Pass 2: sections that appear to have moved.
        pending.reject! do |title|
                moved = find_moved(title)
                next false unless moved
                adopt.call(moved, title, false)
                true
        end

        # Pass 3: similar-looking titles; these are flagged as fuzzy.
        pending.reject! do |title|
                similar = find_similar(title)
                next false unless similar
                adopt.call(similar, title, true)
                true
        end

        # Pass 4: everything left is new.
        pending.each do |title|
                fresh_id = create_unique_id(title)
                add(title, fresh_id, false, true)
                @associations[title] = fresh_id
        end
end
load(filename_or_io) click to toggle source
# File lib/mizuho/id_map.rb, line 58
# Replaces the current entries with the ones read from +filename_or_io+.
#
# The input is read line by line (each line is stripped first):
# * an empty line clears a pending "# fuzzy" marker;
# * the exact line "# fuzzy" marks the next record as fuzzy;
# * any other line starting with "#" is ignored;
# * everything else is a record, split into title and ID on "\t=>\t".
#
# Loaded entries are added as not associated. Returns self.
def load(filename_or_io)
        @entries.clear
        open_io(filename_or_io, :read) do |io|
                pending_fuzzy = false
                while true
                        begin
                                line = io.readline.strip
                                case line
                                when ""
                                        pending_fuzzy = false
                                when "# fuzzy"
                                        pending_fuzzy = true
                                when /\A#/
                                        # Ordinary comment: skipped, marker left untouched.
                                else
                                        title, id = line.split("\t=>\t", 2)
                                        add(title, id, pending_fuzzy, false)
                                        pending_fuzzy = false
                                end
                        rescue EOFError
                                # readline signals end of input by raising.
                                break
                        end
                end
        end
        self
end
save(filename_or_io) click to toggle source
# File lib/mizuho/id_map.rb, line 82
# Writes all entries to +filename_or_io+ in the text format that #load reads.
#
# The output starts with BANNER, followed by the associated entries and then,
# under a warning header, the entries that are no longer associated with any
# title. Each record is "<title>\t=>\t<id>", optionally preceded by a
# "# fuzzy" line and always followed by an empty line.
#
# The separator must be exactly "\t=>\t": that is what #load splits on.
# Writing anything else (e.g. space padding) makes the saved file
# unreadable, because the whole line would be taken as the title.
#
# Returns whatever the open_io block returns (the result of the write call).
def save(filename_or_io)
        normal, orphaned = group_and_sort_entries

        # Single place that defines the on-disk shape of a record, so both
        # sections are guaranteed to use the same separator.
        append_record = lambda do |buffer, entry|
                buffer << "# fuzzy\n" if entry.fuzzy?
                buffer << "#{entry.title}\t=>\t#{entry.id}\n"
                buffer << "\n"
        end

        # Start from a private copy of the banner so the constant is never mutated.
        output = BANNER.dup
        normal.each { |entry| append_record.call(output, entry) }
        if !orphaned.empty?
                output << "\n"
                output << "### These sections appear to have been removed. Please check.\n"
                output << "\n"
                orphaned.each { |entry| append_record.call(output, entry) }
        end

        open_io(filename_or_io, :write) do |f|
                f.write(output)
        end
end
stats() click to toggle source
# File lib/mizuho/id_map.rb, line 190
# Summarises the current entries.
#
# Returns a hash with two counters:
#   :fuzzy    - number of entries flagged as fuzzy
#   :orphaned - number of entries not associated with any title
def stats
        all_entries = @entries.values
        {
                :fuzzy => all_entries.count { |entry| entry.fuzzy? },
                :orphaned => all_entries.count { |entry| !entry.associated? }
        }
end
xassociate(title) click to toggle source
# File lib/mizuho/id_map.rb, line 158
# Associates a single title with an ID and returns that ID.
#
# Resolution order:
# * an entry stored under exactly this title: it is marked associated;
#   raises AlreadyAssociatedError if it already was;
# * otherwise an entry found by find_moved, or failing that find_similar:
#   the entry is re-keyed under the new title and marked associated (a
#   similar match is additionally flagged fuzzy). A diagnostic line is
#   printed with puts;
# * otherwise a new entry with a generated ID is added.
def xassociate(title)
        if (known = @entries[title])
                if known.associated?
                        raise AlreadyAssociatedError, "Cannot associate an already associated title (#{title.inspect})"
                end
                known.associated = true
                return known.id
        end

        # find_similar is only consulted when no moved entry was found.
        moved = find_moved(title)
        similar = moved ? nil : find_similar(title)
        adopted = moved || similar

        unless adopted
                fresh_id = create_unique_id(title)
                add(title, fresh_id, false, true)
                return fresh_id
        end

        if moved
                puts "moved entry: #{title.inspect} -> #{moved.title.inspect}"
        else
                puts "similar entry: #{title.inspect} -> #{similar.title.inspect}"
        end

        # Re-key the adopted entry under its new title.
        @entries.delete(adopted.title)
        @entries[title] = adopted
        adopted.title = title
        adopted.associated = true
        adopted.fuzzy = true if similar
        adopted.id
end

Private Instance Methods

create_unique_id(title) click to toggle source
# File lib/mizuho/id_map.rb, line 302
# Builds an ID of the form "<slug of title>-<random token>".
#
# The token is 4 bytes read from URANDOM, interpreted as one hexadecimal
# number and rendered in base 36.
def create_unique_id(title)
        random_bytes = URANDOM.read(4)
        hex_digits = random_bytes.unpack('H*').first
        token = hex_digits.to_i(16).to_s(36)
        "#{slug(title)}-#{token}"
end
find_moved(title) click to toggle source
# File lib/mizuho/id_map.rb, line 225
# Looks for a not-yet-associated entry that has the same title text as
# +title+ but a different chapter number, i.e. a section that appears to
# have been moved or renumbered.
#
# title - a title; extract_chapter decides whether it carries a chapter number.
#
# Returns the best matching entry, or nil when +title+ has no chapter number
# or no candidate exists. When several candidates exist, the one whose
# chapter number is closest to the title's, compared component by component
# from left to right, wins.
def find_moved(title)
        orig_chapter, orig_pure_title = extract_chapter(title)
        return nil if !orig_chapter

        # Find all possible matches.
        orig_chapter_digits = chapter_to_int_array(orig_chapter)
        matches = []
        @entries.each_value do |entry|
                next if entry.associated?
                chapter, pure_title = extract_chapter(entry.title)
                if chapter && orig_pure_title == pure_title
                        matches << {
                                :chapter_digits => chapter_to_int_array(chapter),
                                :pure_title => pure_title,
                                :entry => entry
                        }
                end
        end

        # Iterate until we find the best match. We match the chapter
        # digits from left to right.
        digit_match_index = 0
        while matches.size > 1
                # Once no candidate has a component at this position, the
                # remaining candidates cannot be told apart any further; stop
                # instead of looping forever and settle for the first one.
                break if matches.all? { |m| m[:chapter_digits].size <= digit_match_index }

                orig_digit = orig_chapter_digits[digit_match_index] || 1

                # A candidate with a shallower chapter number has no component
                # here. Treat it as 1, the same default used for the original
                # title above, rather than doing arithmetic on nil.
                digit_of = lambda { |m| m[:chapter_digits][digit_match_index] || 1 }

                # Find closest digit in all matches.
                closest = matches.min do |a, b|
                        (digit_of.call(a) - orig_digit).abs <=> (digit_of.call(b) - orig_digit).abs
                end
                closest_digit = digit_of.call(closest)

                # Keep only the matches with this digit.
                matches = matches.find_all { |m| digit_of.call(m) == closest_digit }

                # If a next iteration is necessary, we check the next digit.
                digit_match_index += 1
        end

        if matches.empty?
                return nil
        else
                return matches[0][:entry]
        end
end
find_similar(title) click to toggle source
# File lib/mizuho/id_map.rb, line 274
# Finds the not-yet-associated entry whose title looks most like +title+.
#
# Titles are compared case-insensitively using MATCHER.getDistance, where a
# higher score is treated as a better match. The best entry is returned only
# if its score is above 0.8; otherwise nil is returned. On equal scores the
# entry encountered first wins.
def find_similar(title)
        needle = title.downcase
        top_score = nil
        top_entry = nil

        @entries.each_value do |candidate|
                unless candidate.associated?
                        similarity = MATCHER.getDistance(candidate.title.downcase, needle)
                        if top_score.nil? || similarity > top_score
                                top_score, top_entry = similarity, candidate
                        end
                end
        end

        top_entry if top_score && top_score > 0.8
end
group_and_sort_entries() click to toggle source
# File lib/mizuho/id_map.rb, line 323
# Splits the entries into associated ("normal") and non-associated
# ("orphaned") ones and sorts each group using the entries' own ordering.
#
# Returns a two-element array: [normal, orphaned].
def group_and_sort_entries
        groups = @entries.values.partition { |entry| entry.associated? }
        groups.map { |group| group.sort }
end
open_io(filename_or_io, mode) { |filename_or_io| ... } click to toggle source
# File lib/mizuho/id_map.rb, line 307
# Yields an IO for +filename_or_io+ to the block.
#
# mode - :read for reading; any other value is treated as writing.
#
# If the argument already responds to the method needed for the mode
# (readline for reading, write for writing) it is yielded as is. Otherwise
# it is passed to File.open with the matching mode string and the block.
#
# Returns the value of the block.
def open_io(filename_or_io, mode, &block)
        required_method, file_mode = (mode == :read) ? [:readline, 'r'] : [:write, 'w']
        return yield(filename_or_io) if filename_or_io.respond_to?(required_method)
        File.open(filename_or_io, file_mode, &block)
end
slug(text) click to toggle source
# File lib/mizuho/id_map.rb, line 293
# Turns +text+ into a lowercase, dash-separated identifier.
#
# Steps, in order: lowercase; drop a leading "1.2. "-style chapter number at
# the start of any line; replace every character that is not a letter, digit,
# dash or underscore with a dash; turn underscores into dashes; collapse runs
# of dashes into one. The argument itself is not modified.
def slug(text)
        text.downcase.
                gsub(/^(\d+\.)+ /, '').
                gsub(/[^a-z0-9\-\_]/i, '-').
                gsub('_', '-').
                gsub(/--+/, '-')
end