module Linguist::Samples
Model for accessing classifier training data.
Constants
- DATA
- PATH
Path for serialized samples db
- ROOT
Path to samples root directory
Public Class Methods
data()
click to toggle source
Public: Build the samples db by training Classifier on all samples.
Returns the db Hash (with 'extnames', 'filenames' and 'md5' entries).
# File lib/linguist/samples.rb, line 67
#
# Public: Build the samples db from every sample yielded by `each`.
#
# For each sample the language's extension or filename is recorded, the
# sample file is read, and its contents are handed to Classifier.train!
# together with the db.
#
# Returns the db Hash. 'extnames' and 'filenames' map a language name to a
# sorted Array of Strings, and 'md5' holds Linguist::MD5.hexdigest of the db.
# NOTE(review): Classifier.train! receives the db and presumably adds its own
# entries to it - confirm against the Classifier source.
def self.data
  db = { 'extnames' => {}, 'filenames' => {} }

  each do |sample|
    language_name = sample[:language]

    if (extname = sample[:extname])
      known_extnames = (db['extnames'][language_name] ||= [])
      # Extensions are recorded once per language and kept sorted.
      unless known_extnames.include?(extname)
        known_extnames << extname
        known_extnames.sort!
      end
    end

    if (filename = sample[:filename])
      known_filenames = (db['filenames'][language_name] ||= [])
      known_filenames << filename
      known_filenames.sort!
    end

    contents = File.read(sample[:path])
    Classifier.train!(db, language_name, contents)
  end

  # The digest is taken before the 'md5' key itself is stored.
  db['md5'] = Linguist::MD5.hexdigest(db)
  db
end
each() { |{ :path => join, :language => category, :filename => subfilename }| ... }
click to toggle source
Public: Iterate over each sample.
&block - Yields Sample to block
Returns nothing.
# File lib/linguist/samples.rb, line 25
#
# Public: Iterate over each sample.
#
# Every directory listing is sorted before it is walked, so samples are
# yielded in the same order no matter what order the filesystem returns
# directory entries in.
#
# &block - Yields a Hash describing each sample to the block. Every Hash has
#          :path and :language. Samples found under a "filenames"
#          subdirectory also carry :filename; all others carry :extname.
#
# Raises RuntimeError if a sample outside "filenames" has no extension.
#
# Returns nothing when a block is given, otherwise an Enumerator over the
# sample Hashes.
def self.each(&block)
  return enum_for(:each) unless block_given?

  Dir.entries(ROOT).sort.each do |category|
    next if category == '.' || category == '..'

    # Skip text and binary for now
    # Possibly reconsider this later
    next if category == 'Text' || category == 'Binary'

    dirname = File.join(ROOT, category)

    Dir.entries(dirname).sort.each do |filename|
      next if filename == '.' || filename == '..'

      if filename == 'filenames'
        # Samples identified by their whole file name rather than an extension.
        Dir.entries(File.join(dirname, filename)).sort.each do |subfilename|
          next if subfilename == '.' || subfilename == '..'

          yield({
            :path     => File.join(dirname, filename, subfilename),
            :language => category,
            :filename => subfilename
          })
        end
      else
        path = File.join(dirname, filename)
        extname = File.extname(filename)

        if extname.empty?
          raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
        end

        yield({
          :path     => path,
          :language => category,
          :extname  => extname
        })
      end
    end
  end

  nil
end