class DelimitedWordDataSource
Attributes
buckets[R]
wordAsEncountered[R]
wordCounts[R]
wordValueSequence[R]
Public Class Methods
new(filePath, lineStateMachine, limit)
click to toggle source
Calls superclass method
WordDataSource::new
# File lib/data/word_data_source.rb, line 123
# Starts with empty word-tracking state and passes the file path on to the
# superclass initializer.
#
# filePath         - forwarded unchanged to the superclass.
# lineStateMachine - collaborator that is later asked for `process`,
#                    `bucket` and `pages`.
# limit            - cap on the number of words handled by processData;
#                    a value that is not greater than zero disables the cap.
def initialize(filePath, lineStateMachine, limit)
  @lineStateMachine = lineStateMachine
  @limit = limit
  @count = 0

  @wordAsEncountered = []        # array entry added only when a new word is encountered
  @wordAsEncounteredIndex = {}   # key is word, value is number as encountered
  @nextWordEncounteredIndex = 0
  @wordValueSequence = []        # list of words in file in terms of index into @wordAsEncountered

  @wordCounts = {}
  @buckets = {}

  # NOTE(review): the second argument is a String that merely looks like a
  # regex literal, not a Regexp -- confirm the superclass expects this form.
  super(filePath, "/[^[:print:]]/")
end
Public Instance Methods
bucket()
click to toggle source
# File lib/data/word_data_source.rb, line 136
# Returns whatever the line state machine reports as its bucket.
def bucket
  machine = @lineStateMachine
  machine.bucket
end
has_terminator?()
click to toggle source
# File lib/data/word_data_source.rb, line 222
# This data source always reports that it has a terminator
# (the value itself comes from #terminator).
def has_terminator?
  true
end
metaDataFor(offset)
click to toggle source
TODO: fix this. The metadata lookup examines every page marker on each call (at least O(N)); it should be an O(log N) search.
# File lib/data/word_data_source.rb, line 162
# Returns the metadata of the last page marker that starts strictly before
# +offset+, or "unknown" when no marker does.
#
# offset - word offset to look up; compared against the offsets stored in
#          @lineStateMachine.pages (iterated as metadata/wordOffset pairs).
#
# Markers are ranked by [wordOffset, metadata], so when several markers share
# an offset the one with the greatest metadata wins.
#
# This is a single O(N) pass; nothing is sorted per lookup.
# NOTE(review): a marker whose offset equals +offset+ is NOT selected
# (strict <) -- confirm that is the intended boundary.
def metaDataFor(offset)
  preceding = @lineStateMachine.pages.select do |_metadata, wordOffset|
    wordOffset < offset
  end
  latest = preceding.max_by { |metadata, wordOffset| [wordOffset, metadata] }
  latest ? latest.first : "unknown"
end
process(line)
click to toggle source
# File lib/data/word_data_source.rb, line 207
# Runs one input line through preprocessLine and the line state machine, then
# counts the resulting words via processData.
#
# line - the raw line to handle.
#
# Returns false when the state machine yields no data for the line; otherwise
# returns whatever processData returns.
def process(line)
  cleaned = preprocessLine(line)
  # The state machine also receives the number of words recorded so far.
  data = @lineStateMachine.process(cleaned, @wordValueSequence.length)
  return false unless data.length > 0

  currentBucket = @lineStateMachine.bucket
  # Make sure the per-bucket tally exists before processData writes into it.
  @buckets[currentBucket] = {} unless @buckets.has_key?(currentBucket)
  processData(data, currentBucket)
end
processData(data,bucket)
click to toggle source
# File lib/data/word_data_source.rb, line 179
# Records every non-empty word in +data+: appends it to @words, updates the
# global and per-bucket counts, and appends its first-seen index to
# @wordValueSequence.
#
# data   - enumerable of word strings; one trailing "," and then one trailing
#          "." are stripped from each word before it is counted.
# bucket - key into @buckets under which the per-bucket count is kept. The
#          per-bucket hash is created here if it does not exist yet, so this
#          method no longer depends on the caller having created it.
#
# Returns true as soon as @limit is positive and @count has reached it
# (remaining words in +data+ are then skipped); otherwise false.
def processData(data, bucket)
  data.each do |rawWord|
    word = rawWord.chomp(",").chomp(".")
    next if word.empty?

    @words << word

    unless @wordCounts.has_key?(word)
      # we have a new word
      @wordAsEncounteredIndex[word] = @nextWordEncounteredIndex
      @wordAsEncountered << word
      @nextWordEncounteredIndex += 1
      @wordCounts[word] = 0
    end
    @wordCounts[word] += 1

    # Created lazily so a call with no countable words leaves @buckets alone.
    bucketCounts = (@buckets[bucket] ||= {})
    bucketCounts[word] = bucketCounts.fetch(word, 0) + 1

    @wordValueSequence << @wordAsEncounteredIndex[word]
    @count += 1
    return true if @limit > 0 && @count >= @limit
  end
  false
end
save()
click to toggle source
# File lib/data/word_data_source.rb, line 140
# Writes three companion files next to @filePath:
#
#   <path>.words   - each distinct word on its own line, in first-seen order
#   <path>.values  - @wordValueSequence packed with "N*" (32-bit unsigned,
#                    big-endian), written in binary mode
#   <path>.summary - word totals followed by one "offset page" line per entry
#                    of @lineStateMachine.pages
def save
  File.open("#{@filePath}.words", 'w') do |wordsFile|
    @wordAsEncountered.each { |entry| wordsFile.write("#{entry}\n") }
  end

  File.open("#{@filePath}.values", 'wb') do |valuesFile|
    valuesFile << @wordValueSequence.pack("N*")
  end

  File.open("#{@filePath}.summary", "w") do |summaryFile|
    summaryFile << "#{@numberWordsInFile} words in file\n"
    summaryFile << "#{@nextWordEncounteredIndex} distinct words\n"
    summaryFile << "Metadata\n"
    # sort_by builds a new array ordered by [offset, page]; the pages
    # collection itself is not modified.
    orderedPages = @lineStateMachine.pages.sort_by(&:reverse)
    orderedPages.each do |page, wordOffset|
      summaryFile << "#{wordOffset} #{page}\n"
    end
  end
end
terminator()
click to toggle source
# File lib/data/word_data_source.rb, line 226
# The terminator string this data source reports.
def terminator
  "END_OF_DOCUMENT"
end
verify(word, count)
click to toggle source
# File lib/data/word_data_source.rb, line 218
# Checks that +word+ has been counted exactly +count+ times.
#
# word  - the word to look up in @wordCounts.
# count - the expected number of occurrences.
#
# A word that was never recorded is treated as having a count of 0, so
# verify(word, 0) is true for unseen words (previously nil == 0 made it
# false, which disagreed with what wordCount reports for the same word).
#
# Returns true or false.
def verify(word, count)
  @wordCounts.fetch(word, 0) == count
end
wordCount(word)
click to toggle source
# File lib/data/word_data_source.rb, line 174
# Looks up how many times +word+ has been recorded in @wordCounts.
#
# word - the word to look up.
#
# Returns the stored count, or 0 when the word has no entry.
def wordCount(word)
  @wordCounts.fetch(word, 0)
end