[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

vigra/sampling.hxx VIGRA

00001 /************************************************************************/
00002 /*                                                                      */
00003 /*        Copyright 2008-2009 by  Ullrich Koethe and Rahul Nair         */
00004 /*                                                                      */
00005 /*    This file is part of the VIGRA computer vision library.           */
00006 /*    The VIGRA Website is                                              */
00007 /*        http://hci.iwr.uni-heidelberg.de/vigra/                       */
00008 /*    Please direct questions, bug reports, and contributions to        */
00009 /*        ullrich.koethe@iwr.uni-heidelberg.de    or                    */
00010 /*        vigra@informatik.uni-hamburg.de                               */
00011 /*                                                                      */
00012 /*    Permission is hereby granted, free of charge, to any person       */
00013 /*    obtaining a copy of this software and associated documentation    */
00014 /*    files (the "Software"), to deal in the Software without           */
00015 /*    restriction, including without limitation the rights to use,      */
00016 /*    copy, modify, merge, publish, distribute, sublicense, and/or      */
00017 /*    sell copies of the Software, and to permit persons to whom the    */
00018 /*    Software is furnished to do so, subject to the following          */
00019 /*    conditions:                                                       */
00020 /*                                                                      */
00021 /*    The above copyright notice and this permission notice shall be    */
00022 /*    included in all copies or substantial portions of the             */
00023 /*    Software.                                                         */
00024 /*                                                                      */
00025 /*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
00026 /*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
00027 /*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
00028 /*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
00029 /*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
00030 /*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
00031 /*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
00032 /*    OTHER DEALINGS IN THE SOFTWARE.                                   */
00033 /*                                                                      */
00034 /************************************************************************/
00035 
00036 #ifndef VIGRA_SAMPLING_HXX
00037 #define VIGRA_SAMPLING_HXX
00038 
00039 #include "array_vector.hxx"
00040 #include "random.hxx"
00041 #include <map>
00042 #include <memory>
00043 #include <cmath>
00044 
00045 namespace vigra
00046 {
00047 
00048 /** \addtogroup MachineLearning Machine Learning
00049 **/
00050 //@{
00051 
00052 
00053 /**\brief Options object for the Sampler class.
00054  
00055   <b>usage:</b>
00056  
00057   \code
00058   SamplerOptions opt =  SamplerOptions()
00059                                .withReplacement()
00060                                .sampleProportion(0.5);
00061   \endcode
00062  
00063   Note that the return value of all methods is <tt>*this</tt> which makes
00064   concatenating of options as above possible.
00065 */
00066 class SamplerOptions
00067 {
00068   public:
00069 
00070     double sample_proportion;
00071     unsigned int sample_size;
00072     bool   sample_with_replacement;
00073     bool   stratified_sampling;
00074     
00075     SamplerOptions()
00076     : sample_proportion(1.0),
00077       sample_size(0), 
00078       sample_with_replacement(true),
00079       stratified_sampling(false)
00080     {}
00081 
00082         /**\brief Sample from training population with replacement.
00083          *
00084          * <br> Default: true
00085          */
00086     SamplerOptions& withReplacement(bool in = true)
00087     {
00088         sample_with_replacement = in;
00089         return *this;
00090     }
00091 
00092         /**\brief Sample from training population without replacement.
00093          *
00094          * <br> Default (if you don't call this function): false
00095          */
00096     SamplerOptions& withoutReplacement(bool in = true)
00097     {
00098         sample_with_replacement = !in;
00099         return *this;
00100     }
00101 
00102         /**\brief Draw the given number of samples.
00103          * If stratifiedSampling is true, the \a size is equally distributed
00104          * across all strata (e.g. <tt>size / strataCount</tt> samples are taken 
00105          * from each stratum, subject to rounding).
00106          *
00107          * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
00108          */
00109     SamplerOptions& sampleSize(unsigned int size)
00110     {
00111         sample_size = size;
00112         return *this;
00113     }
00114 
00115 
00116         /**\brief Determine the number of samples to draw as a proportion of the total
00117          * number. That is, we draw <tt>count = totalCount * proportion</tt> samples. 
00118          * This option is overridden when an absolute count is specified by sampleSize().
00119          * 
00120          * If stratifiedSampling is true, the count is equally distributed
00121          * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken 
00122          * from each stratum).
00123          *
00124          * <br> Default: 1.0
00125          */
00126     SamplerOptions& sampleProportion(double proportion)
00127     {
00128         vigra_precondition(proportion >= 0.0,
00129                "SamplerOptions::sampleProportion(): argument must not be negative.");
00130         sample_proportion = proportion;
00131         return *this;
00132     }
00133 
00134         /**\brief Draw equally many samples from each "stratum". 
00135          *  A stratum is a group of like entities, e.g. pixels belonging 
00136          *  to the same object class. This is useful to create balanced samples  
00137          *  when the class probabilities are very unbalanced (e.g.
00138          *  when there are many background and few foreground pixels).
00139          *  Stratified sampling thus avoids that a trained classifier is biased 
00140          *  towards the majority class. 
00141          *
00142          * <br> Default (if you don't call this function): false
00143          */
00144     SamplerOptions& stratified(bool in = true)
00145     {
00146         stratified_sampling = in;
00147         return *this;
00148     }
00149 };
00150 
00151 /************************************************************/
00152 /*                                                          */
00153 /*                        Sampler                           */
00154 /*                                                          */
00155 /************************************************************/
00156 
00157 /** \brief Create random samples from a sequence of indices.
00158 
00159     Selecting data items at random is a basic task of machine learning,
00160     for example in boostrapping, RandomForest training, and cross validation.
00161     This class implements various ways to select random samples via their indices. 
00162     Indices are assumed to be consecutive in
00163     the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
00164     
00165     The class always contains a current sample which can be accessed by 
00166     the index operator or by the function sampledIndices(). The indices
00167     that are not in the current sample (out-of-bag indices) can be accessed
00168     via the function oobIndices().
00169     
00170     The sampling method (with/without replacement, stratified or not) and the
00171     number of samples to draw are determined by the option object 
00172     SamplerOptions.
00173     
00174     <b>Usage:</b>
00175     
00176     <b>\#include</b> <vigra/sampling.hxx><br>
00177     Namespace: vigra
00178     
00179     Create a Sampler with default options, i.e. sample as many indices as there 
00180     are data elements, with replacement. On average, the sample will contain 
00181     <tt>0.63*totalCount</tt> distinct indices.
00182     
00183     \code
00184     int totalCount = 10000;   // total number of data elements
00185     int numberOfSamples = 20; // repeat experiment 20 times 
00186     Sampler<> sampler(totalCount);
00187     for(int k=0; k<numberOfSamples; ++k)
00188     {
00189         // process current sample
00190         for(int i=0; i<sampler.sampleSize(); ++i)
00191         {
00192             int currentIndex = sampler[i];
00193             processData(data[currentIndex]);
00194         }
00195         // create next sample
00196         sampler.sample();
00197     }
00198     \endcode
00199     
00200     Create a Sampler for stratified sampling, without replacement.
00201     
00202     \code
00203     // prepare the strata (i.e. specify which stratum each element belongs to)
00204     int stratumSize1 = 2000, stratumSize2 = 8000,
00205         totalCount = stratumSize1 + stratumSize2;
00206     ArrayVerctor<int> strata(totalCount);
00207     for(int i=0; i<stratumSize1; ++i)
00208         strata[i] = 1;
00209     for(int i=stratumSize1; i<stratumSize2; ++i)
00210         strata[i] = 2;
00211         
00212     int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
00213     int numberOfSamples = 20; // repeat experiment 20 times 
00214     Sampler<> stratifiedSampler(strata.begin(), strata.end(),
00215                      SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
00216     // create first sample
00217     sampler.sample();
00218 
00219     for(int k=0; k<numberOfSamples; ++k)
00220     {
00221         // process current sample
00222         for(int i=0; i<sampler.sampleSize(); ++i)
00223         {
00224             int currentIndex = sampler[i];
00225             processData(data[currentIndex]);
00226         }
00227         // create next sample
00228         sampler.sample();
00229     }
00230     \endcode
00231 */
00232 template<class Random = MersenneTwister >
00233 class Sampler
00234 {
00235   public:
00236         /** Internal type of the indices.
00237             Currently, 64-bit indices are not supported because this
00238             requires extension of the random number generator classes.
00239         */
00240     typedef Int32                               IndexType;
00241     
00242     typedef ArrayVector     <IndexType>  IndexArrayType;
00243     
00244         /** Type of the array view object that is returned by 
00245             sampledIndices() and oobIndices().
00246         */
00247     typedef ArrayVectorView <IndexType>  IndexArrayViewType;
00248 
00249   private:
00250     typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
00251     typedef std::map<IndexType, int> StrataSizesType;
00252     typedef ArrayVector     <bool>       IsUsedArrayType;
00253     typedef ArrayVectorView <bool>       IsUsedArrayViewType;
00254     
00255     static const int oobInvalid = -1;
00256 
00257     int total_count_, sample_size_;
00258     mutable int current_oob_count_;
00259     StrataIndicesType     strata_indices_;
00260     StrataSizesType       strata_sample_size_;
00261     IndexArrayType        current_sample_;
00262     mutable IndexArrayType        current_oob_sample_;
00263     IsUsedArrayType       is_used_;
00264     Random const & random_;
00265     SamplerOptions options_;
00266 
00267     void initStrataCount()
00268     {
00269         // compute how many samples to take from each stratum
00270         // (may be unequal if sample_size_ is not a multiple of strataCount())
00271         int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount());
00272         int strata_total_count = strata_sample_size * strataCount();
00273 
00274         for(StrataIndicesType::iterator i = strata_indices_.begin(); 
00275              i != strata_indices_.end(); ++i)
00276         {
00277             if(strata_total_count > sample_size_)
00278             {
00279                 strata_sample_size_[i->first] = strata_sample_size - 1;
00280                 --strata_total_count;
00281             }
00282             else
00283             {
00284                 strata_sample_size_[i->first] = strata_sample_size;
00285             }
00286         }
00287     }
00288 
00289   public:
00290     
00291         /** Create a sampler for \a totalCount data objects.
00292             
00293             In each invocation of <tt>sample()</tt> below, it will sample
00294             indices according to the options passed. If no options are given, 
00295             <tt>totalCount</tt> indices will be drawn with replacement.
00296         */
00297     Sampler(UInt32 totalCount, SamplerOptions const & opt = SamplerOptions(), 
00298             Random const & rnd = Random(RandomSeed))
00299     : total_count_(totalCount),
00300       sample_size_(opt.sample_size == 0
00301                          ? (int)(std::ceil(total_count_ * opt.sample_proportion))
00302                          : opt.sample_size),
00303       current_oob_count_(oobInvalid),
00304       current_sample_(sample_size_),
00305       current_oob_sample_(total_count_),
00306       is_used_(total_count_),
00307       random_(rnd),
00308       options_(opt)
00309     {
00310         vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
00311           "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
00312           
00313         vigra_precondition(!opt.stratified_sampling,
00314           "Sampler(): Stratified sampling requested, but no strata given.");
00315           
00316         // initialize a single stratum containing all data
00317         strata_indices_[0].resize(total_count_);
00318         for(int i=0; i<total_count_; ++i)
00319             strata_indices_[0][i] = i;
00320 
00321         initStrataCount();
00322         //this is screwing up the random forest tests.
00323         //sample();
00324     }
00325     
00326         /** Create a sampler for stratified sampling.
00327             
00328             <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence 
00329             which specifies for each sample the stratum it belongs to. The
00330             total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
00331             Equally many samples (subject to rounding) will be drawn from each stratum, 
00332             unless the option object explicitly requests unstratified sampling, 
00333             in which case the strata are ignored.
00334         */
00335     template <class Iterator>
00336     Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(), 
00337             Random const & rnd = Random(RandomSeed))
00338     : total_count_(strataEnd - strataBegin),
00339       sample_size_(opt.sample_size == 0
00340                          ? (int)(std::ceil(total_count_ * opt.sample_proportion))
00341                          : opt.sample_size),
00342       current_oob_count_(oobInvalid),
00343       current_sample_(sample_size_),
00344       current_oob_sample_(total_count_),
00345       is_used_(total_count_),
00346       random_(rnd),
00347       options_(opt)
00348     {
00349         vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
00350           "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
00351           
00352         // copy the strata indices
00353         if(opt.stratified_sampling)
00354         {
00355             for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
00356             {
00357                 strata_indices_[*strataBegin].push_back(i);
00358             }
00359         }
00360         else
00361         {
00362             strata_indices_[0].resize(total_count_);
00363             for(int i=0; i<total_count_; ++i)
00364                 strata_indices_[0][i] = i;
00365         }
00366             
00367         vigra_precondition(sample_size_ >= (int)strata_indices_.size(),
00368             "Sampler(): Requested sample count must be at least as large as the number of strata.");
00369 
00370         initStrataCount();
00371         //this is screwing up the random forest tests.
00372         //sample();
00373     }
00374 
00375         /** Return the k-th index in the current sample.
00376          */
00377     IndexType operator[](int k) const
00378     {
00379         return current_sample_[k];
00380     }
00381 
00382         /** Create a new sample.
00383          */
00384     void sample();
00385 
00386         /** The total number of data elements.
00387          */
00388     int totalCount() const
00389     {
00390         return total_count_;
00391     }
00392 
00393         /** The number of data elements that have been sampled.
00394          */
00395     int sampleSize() const
00396     {
00397         return sample_size_;
00398     }
00399 
00400         /** Same as sampleSize().
00401          */
00402     int size() const
00403     {
00404         return sample_size_;
00405     }
00406 
00407         /** The number of strata to be used.
00408             Will be 1 if no strata are given. Will be ignored when
00409             stratifiedSampling() is false.
00410          */
00411     int strataCount() const
00412     {
00413         return strata_indices_.size();
00414     }
00415 
00416         /** Whether to use stratified sampling.
00417             (If this is false, strata will be ignored even if present.)
00418          */
00419     bool stratifiedSampling() const
00420     {
00421         return options_.stratified_sampling;
00422     }
00423     
00424         /** Whether sampling should be performed with replacement.
00425          */
00426     bool withReplacement() const
00427     {
00428         return options_.sample_with_replacement;
00429     }
00430     
00431         /** Return an array view containing the indices in the current sample.
00432          */
00433     IndexArrayViewType sampledIndices() const
00434     {
00435         return current_sample_;
00436     }
00437     
00438         /** Return an array view containing the out-of-bag indices.
00439             (i.e. the indices that are not in the current sample)
00440          */
00441     IndexArrayViewType oobIndices() const
00442     {
00443         if(current_oob_count_ == oobInvalid)
00444         {
00445             current_oob_count_ = 0;
00446             for(int i = 0; i<total_count_; ++i)
00447             {
00448                 if(!is_used_[i])
00449                 {
00450                     current_oob_sample_[current_oob_count_] = i;
00451                     ++current_oob_count_;
00452                 }
00453             }
00454         }
00455         return current_oob_sample_.subarray(0, current_oob_count_);
00456     }
00457     IsUsedArrayType const & is_used() const
00458     {
00459         return is_used_;
00460     }
00461 };
00462 
00463 
00464 template<class Random>
00465 void Sampler<Random>::sample()
00466 {
00467     current_oob_count_ = oobInvalid;
00468     is_used_.init(false);
00469     
00470     if(options_.sample_with_replacement)
00471     {
00472         //Go thru all strata
00473         int j = 0;
00474         StrataIndicesType::iterator iter;
00475         for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
00476         {
00477             // do sampling with replacement in each strata and copy data.
00478             int stratum_size = iter->second.size();
00479             for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
00480             {
00481                 current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
00482                 is_used_[current_sample_[j]] = true;
00483             }
00484         }
00485     }
00486     else
00487     {
00488         //Go thru all strata
00489         int j = 0;
00490         StrataIndicesType::iterator iter;
00491         for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
00492         {
00493             // do sampling without replacement in each strata and copy data.
00494             int stratum_size = iter->second.size();
00495             for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
00496             {
00497                 std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
00498                 current_sample_[j] = iter->second[i];
00499                 is_used_[current_sample_[j]] = true;
00500             }
00501         }
00502     }
00503 }
00504 
00505 template<class Random =RandomTT800 >
00506 class PoissonSampler
00507 {
00508 public:
00509     Random  randfloat;
00510     typedef Int32                               IndexType;
00511     typedef vigra::ArrayVector     <IndexType>  IndexArrayType;
00512     IndexArrayType        used_indices_;
00513     double lambda;
00514     int minIndex;
00515     int maxIndex;
00516     
00517     PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex)
00518     : lambda(lambda),
00519       minIndex(minIndex),
00520       maxIndex(maxIndex)
00521     {}
00522 
00523     void sample(  )
00524     {
00525         used_indices_.clear();
00526         IndexType i;
00527         for(i=minIndex;i<maxIndex;++i)
00528         {
00529             //from http://en.wikipedia.org/wiki/Poisson_distribution
00530             int k=0;
00531             double p=1;
00532             double L=exp(-lambda);
00533             do
00534             {
00535                 ++k;
00536                 p*=randfloat.uniform53();
00537 
00538             }while(p>L);
00539             --k;
00540             //Insert i this many time
00541             while(k>0)
00542             {
00543                 used_indices_.push_back(i);
00544                 --k;
00545             }
00546         }
00547     }
00548 
00549     IndexType const & operator[](int in) const
00550     {
00551         return used_indices_[in];
00552     }
00553     
00554     int numOfSamples() const
00555     {
00556         return used_indices_.size();
00557     }
00558 };
00559 
00560 //@}
00561 
00562 } // namespace vigra
00563 
00564 #endif /*VIGRA_SAMPLING_HXX*/

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.9.0 (Tue Nov 6 2012)