[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]
vigra/random_forest/rf_algorithm.hxx
00001 /************************************************************************/ 00002 /* */ 00003 /* Copyright 2008-2009 by Rahul Nair */ 00004 /* */ 00005 /* This file is part of the VIGRA computer vision library. */ 00006 /* The VIGRA Website is */ 00007 /* http://hci.iwr.uni-heidelberg.de/vigra/ */ 00008 /* Please direct questions, bug reports, and contributions to */ 00009 /* ullrich.koethe@iwr.uni-heidelberg.de or */ 00010 /* vigra@informatik.uni-hamburg.de */ 00011 /* */ 00012 /* Permission is hereby granted, free of charge, to any person */ 00013 /* obtaining a copy of this software and associated documentation */ 00014 /* files (the "Software"), to deal in the Software without */ 00015 /* restriction, including without limitation the rights to use, */ 00016 /* copy, modify, merge, publish, distribute, sublicense, and/or */ 00017 /* sell copies of the Software, and to permit persons to whom the */ 00018 /* Software is furnished to do so, subject to the following */ 00019 /* conditions: */ 00020 /* */ 00021 /* The above copyright notice and this permission notice shall be */ 00022 /* included in all copies or substantial portions of the */ 00023 /* Software. */ 00024 /* */ 00025 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */ 00026 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */ 00027 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */ 00028 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */ 00029 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */ 00030 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */ 00031 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */ 00032 /* OTHER DEALINGS IN THE SOFTWARE. 
*/ 00033 /* */ 00034 /************************************************************************/ 00035 #define VIGRA_RF_ALGORITHM_HXX 00036 00037 #include <vector> 00038 #include "splices.hxx" 00039 #include <queue> 00040 #include <fstream> 00041 namespace vigra 00042 { 00043 00044 namespace rf 00045 { 00046 /** This namespace contains all algorithms developed for feature 00047 * selection 00048 * 00049 */ 00050 namespace algorithms 00051 { 00052 00053 namespace detail 00054 { 00055 /** create a MultiArray containing only columns supplied between iterators 00056 b and e 00057 */ 00058 template<class OrigMultiArray, 00059 class Iter, 00060 class DestMultiArray> 00061 void choose(OrigMultiArray const & in, 00062 Iter const & b, 00063 Iter const & e, 00064 DestMultiArray & out) 00065 { 00066 int columnCount = std::distance(b, e); 00067 int rowCount = in.shape(0); 00068 out.reshape(MultiArrayShape<2>::type(rowCount, columnCount)); 00069 int ii = 0; 00070 for(Iter iter = b; iter != e; ++iter, ++ii) 00071 { 00072 columnVector(out, ii) = columnVector(in, *iter); 00073 } 00074 } 00075 } 00076 00077 00078 00079 /** Standard random forest Errorrate callback functor 00080 * 00081 * returns the random forest error estimate when invoked. 
00082 */ 00083 class RFErrorCallback 00084 { 00085 RandomForestOptions options; 00086 00087 public: 00088 /** Default constructor 00089 * 00090 * optionally supply options to the random forest classifier 00091 * \sa RandomForestOptions 00092 */ 00093 RFErrorCallback(RandomForestOptions opt = RandomForestOptions()) 00094 : options(opt) 00095 {} 00096 00097 /** returns the RF OOB error estimate given features and 00098 * labels 00099 */ 00100 template<class Feature_t, class Response_t> 00101 double operator() (Feature_t const & features, 00102 Response_t const & response) 00103 { 00104 RandomForest<> rf(options); 00105 visitors::OOB_Error oob; 00106 rf.learn(features, 00107 response, 00108 visitors::create_visitor(oob )); 00109 return oob.oob_breiman; 00110 } 00111 }; 00112 00113 00114 /** Structure to hold Variable Selection results 00115 */ 00116 class VariableSelectionResult 00117 { 00118 bool initialized; 00119 00120 public: 00121 VariableSelectionResult() 00122 : initialized(false) 00123 {} 00124 00125 typedef std::vector<int> FeatureList_t; 00126 typedef std::vector<double> ErrorList_t; 00127 typedef FeatureList_t::iterator Pivot_t; 00128 00129 Pivot_t pivot; 00130 00131 /** list of features. 00132 */ 00133 FeatureList_t selected; 00134 00135 /** vector of size (number of features) 00136 * 00137 * the i-th entry encodes the error rate obtained 00138 * while using features [0 - i](including i) 00139 * 00140 * if the i-th entry is -1 then no error rate was obtained 00141 * this may happen if more than one feature is added to the 00142 * selected list in one step of the algorithm. 
00143 * 00144 * during initialisation error[m+n-1] is always filled 00145 */ 00146 ErrorList_t errors; 00147 00148 00149 /** errorrate using no features 00150 */ 00151 double no_features; 00152 00153 template<class FeatureT, 00154 class ResponseT, 00155 class Iter, 00156 class ErrorRateCallBack> 00157 bool init(FeatureT const & all_features, 00158 ResponseT const & response, 00159 Iter b, 00160 Iter e, 00161 ErrorRateCallBack errorcallback) 00162 { 00163 bool ret_ = init(all_features, response, errorcallback); 00164 if(!ret_) 00165 return false; 00166 vigra_precondition(std::distance(b, e) == (std::ptrdiff_t)selected.size(), 00167 "Number of features in ranking != number of features matrix"); 00168 std::copy(b, e, selected.begin()); 00169 return true; 00170 } 00171 00172 template<class FeatureT, 00173 class ResponseT, 00174 class Iter> 00175 bool init(FeatureT const & all_features, 00176 ResponseT const & response, 00177 Iter b, 00178 Iter e) 00179 { 00180 RFErrorCallback ecallback; 00181 return init(all_features, response, b, e, ecallback); 00182 } 00183 00184 00185 template<class FeatureT, 00186 class ResponseT> 00187 bool init(FeatureT const & all_features, 00188 ResponseT const & response) 00189 { 00190 return init(all_features, response, RFErrorCallback()); 00191 } 00192 /**initialization routine. Will be called only once in the lifetime 00193 * of a VariableSelectionResult. Subsequent calls will not reinitialize 00194 * member variables. 00195 * 00196 * This is intended, to allow continuing variable selection at a point 00197 * stopped in an earlier iteration. 00198 * 00199 * returns true if initialization was successful and false if 00200 * the object was already initialized before. 
00201 */ 00202 template<class FeatureT, 00203 class ResponseT, 00204 class ErrorRateCallBack> 00205 bool init(FeatureT const & all_features, 00206 ResponseT const & response, 00207 ErrorRateCallBack errorcallback) 00208 { 00209 if(initialized) 00210 { 00211 return false; 00212 } 00213 initialized = true; 00214 // calculate error with all features 00215 selected.resize(all_features.shape(1), 0); 00216 for(unsigned int ii = 0; ii < selected.size(); ++ii) 00217 selected[ii] = ii; 00218 errors.resize(all_features.shape(1), -1); 00219 errors.back() = errorcallback(all_features, response); 00220 00221 // calculate error rate if no features are chosen 00222 // corresponds to max(prior probability) of the classes 00223 std::map<typename ResponseT::value_type, int> res_map; 00224 std::vector<int> cts; 00225 int counter = 0; 00226 for(int ii = 0; ii < response.shape(0); ++ii) 00227 { 00228 if(res_map.find(response(ii, 0)) == res_map.end()) 00229 { 00230 res_map[response(ii, 0)] = counter; 00231 ++counter; 00232 cts.push_back(0); 00233 } 00234 cts[res_map[response(ii,0)]] +=1; 00235 } 00236 no_features = double(*(std::max_element(cts.begin(), 00237 cts.end()))) 00238 / double(response.shape(0)); 00239 00240 /*init not_selected vector; 00241 not_selected.resize(all_features.shape(1), 0); 00242 for(int ii = 0; ii < not_selected.size(); ++ii) 00243 { 00244 not_selected[ii] = ii; 00245 } 00246 initialized = true; 00247 */ 00248 pivot = selected.begin(); 00249 return true; 00250 } 00251 }; 00252 00253 00254 00255 /** Perform forward selection 00256 * 00257 * \param features IN: n x p matrix containing n instances with p attributes/features 00258 * used in the variable selection algorithm 00259 * \param response IN: n x 1 matrix containing the corresponding response 00260 * \param result IN/OUT: VariableSelectionResult struct which will contain the results 00261 * of the algorithm. 00262 * Features between result.selected.begin() and result.pivot will 00263 * be left untouched. 
00264 * \sa VariableSelectionResult 00265 * \param errorcallback 00266 * IN, OPTIONAL: 00267 * Functor that returns the error rate given a set of 00268 * features and labels. Default is the RandomForest OOB Error. 00269 * 00270 * Forward selection subsequently chooses the next feature that decreases the Error rate most. 00271 * 00272 * usage: 00273 * \code 00274 * MultiArray<2, double> features = createSomeFeatures(); 00275 * MultiArray<2, int> labels = createCorrespondingLabels(); 00276 * VariableSelectionResult result; 00277 * forward_selection(features, labels, result); 00278 * \endcode 00279 * To use forward selection but ensure that a specific feature e.g. feature 5 is always 00280 * included one would do the following 00281 * 00282 * \code 00283 * VariableSelectionResult result; 00284 * result.init(features, labels); 00285 * std::swap(result.selected[0], result.selected[5]); 00286 * result.setPivot(1); 00287 * forward_selection(features, labels, result); 00288 * \endcode 00289 * 00290 * \sa VariableSelectionResult 00291 * 00292 */ 00293 template<class FeatureT, class ResponseT, class ErrorRateCallBack> 00294 void forward_selection(FeatureT const & features, 00295 ResponseT const & response, 00296 VariableSelectionResult & result, 00297 ErrorRateCallBack errorcallback) 00298 { 00299 VariableSelectionResult::FeatureList_t & selected = result.selected; 00300 VariableSelectionResult::ErrorList_t & errors = result.errors; 00301 VariableSelectionResult::Pivot_t & pivot = result.pivot; 00302 int featureCount = features.shape(1); 00303 // initialize result struct if in use for the first time 00304 if(!result.init(features, response, errorcallback)) 00305 { 00306 //result is being reused just ensure that the number of features is 00307 //the same. 
00308 vigra_precondition((int)selected.size() == featureCount, 00309 "forward_selection(): Number of features in Feature " 00310 "matrix and number of features in previously used " 00311 "result struct mismatch!"); 00312 } 00313 00314 00315 int not_selected_size = std::distance(pivot, selected.end()); 00316 while(not_selected_size > 1) 00317 { 00318 std::vector<double> current_errors; 00319 VariableSelectionResult::Pivot_t next = pivot; 00320 for(int ii = 0; ii < not_selected_size; ++ii, ++next) 00321 { 00322 std::swap(*pivot, *next); 00323 MultiArray<2, double> cur_feats; 00324 detail::choose( features, 00325 selected.begin(), 00326 pivot+1, 00327 cur_feats); 00328 double error = errorcallback(cur_feats, response); 00329 current_errors.push_back(error); 00330 std::swap(*pivot, *next); 00331 } 00332 int pos = std::distance(current_errors.begin(), 00333 std::min_element(current_errors.begin(), 00334 current_errors.end())); 00335 next = pivot; 00336 std::advance(next, pos); 00337 std::swap(*pivot, *next); 00338 errors[std::distance(selected.begin(), pivot)] = current_errors[pos]; 00339 #ifdef RN_VERBOSE 00340 std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", ")); 00341 std::cerr << "Choosing " << *pivot << " at error of " << current_errors[pos] << std::endl; 00342 #endif 00343 ++pivot; 00344 not_selected_size = std::distance(pivot, selected.end()); 00345 } 00346 } 00347 template<class FeatureT, class ResponseT> 00348 void forward_selection(FeatureT const & features, 00349 ResponseT const & response, 00350 VariableSelectionResult & result) 00351 { 00352 forward_selection(features, response, result, RFErrorCallback()); 00353 } 00354 00355 00356 /** Perform backward elimination 00357 * 00358 * \param features IN: n x p matrix containing n instances with p attributes/features 00359 * used in the variable selection algorithm 00360 * \param response IN: n x 1 matrix containing the corresponding response 00361 * \param 
result IN/OUT: VariableSelectionResult struct which will contain the results 00362 * of the algorithm. 00363 * Features between result.pivot and result.selected.end() will 00364 * be left untouched. 00365 * \sa VariableSelectionResult 00366 * \param errorcallback 00367 * IN, OPTIONAL: 00368 * Functor that returns the error rate given a set of 00369 * features and labels. Default is the RandomForest OOB Error. 00370 * 00371 * Backward elimination subsequently eliminates features that have the least influence 00372 * on the error rate 00373 * 00374 * usage: 00375 * \code 00376 * MultiArray<2, double> features = createSomeFeatures(); 00377 * MultiArray<2, int> labels = createCorrespondingLabels(); 00378 * VariableSelectionResult result; 00379 * backward_elimination(features, labels, result); 00380 * \endcode 00381 * To use backward elimination but ensure that a specific feature e.g. feature 5 is always 00382 * excluded one would do the following: 00383 * 00384 * \code 00385 * VariableSelectionResult result; 00386 * result.init(features, labels); 00387 * std::swap(result.selected[result.selected.size()-1], result.selected[5]); 00388 * result.setPivot(result.selected.size()-1); 00389 * backward_elimination(features, labels, result); 00390 * \endcode 00391 * 00392 * \sa VariableSelectionResult 00393 * 00394 */ 00395 template<class FeatureT, class ResponseT, class ErrorRateCallBack> 00396 void backward_elimination(FeatureT const & features, 00397 ResponseT const & response, 00398 VariableSelectionResult & result, 00399 ErrorRateCallBack errorcallback) 00400 { 00401 int featureCount = features.shape(1); 00402 VariableSelectionResult::FeatureList_t & selected = result.selected; 00403 VariableSelectionResult::ErrorList_t & errors = result.errors; 00404 VariableSelectionResult::Pivot_t & pivot = result.pivot; 00405 00406 // initialize result struct if in use for the first time 00407 if(!result.init(features, response, errorcallback)) 00408 { 00409 //result is being reused 
just ensure that the number of features is 00410 //the same. 00411 vigra_precondition((int)selected.size() == featureCount, 00412 "backward_elimination(): Number of features in Feature " 00413 "matrix and number of features in previously used " 00414 "result struct mismatch!"); 00415 } 00416 pivot = selected.end() - 1; 00417 00418 int selected_size = std::distance(selected.begin(), pivot); 00419 while(selected_size > 1) 00420 { 00421 VariableSelectionResult::Pivot_t next = selected.begin(); 00422 std::vector<double> current_errors; 00423 for(int ii = 0; ii < selected_size; ++ii, ++next) 00424 { 00425 std::swap(*pivot, *next); 00426 MultiArray<2, double> cur_feats; 00427 detail::choose( features, 00428 selected.begin(), 00429 pivot+1, 00430 cur_feats); 00431 double error = errorcallback(cur_feats, response); 00432 current_errors.push_back(error); 00433 std::swap(*pivot, *next); 00434 } 00435 int pos = std::distance(current_errors.begin(), 00436 std::min_element(current_errors.begin(), 00437 current_errors.end())); 00438 next = selected.begin(); 00439 std::advance(next, pos); 00440 std::swap(*pivot, *next); 00441 // std::cerr << std::distance(selected.begin(), pivot) << " " << pos << " " << current_errors.size() << " " << errors.size() << std::endl; 00442 errors[std::distance(selected.begin(), pivot)-1] = current_errors[pos]; 00443 selected_size = std::distance(selected.begin(), pivot); 00444 #ifdef RN_VERBOSE 00445 std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", ")); 00446 std::cerr << "Eliminating " << *pivot << " at error of " << current_errors[pos] << std::endl; 00447 #endif 00448 --pivot; 00449 } 00450 } 00451 00452 template<class FeatureT, class ResponseT> 00453 void backward_elimination(FeatureT const & features, 00454 ResponseT const & response, 00455 VariableSelectionResult & result) 00456 { 00457 backward_elimination(features, response, result, RFErrorCallback()); 00458 } 00459 00460 /** Perform rank 
selection using a predefined ranking 00461 * 00462 * \param features IN: n x p matrix containing n instances with p attributes/features 00463 * used in the variable selection algorithm 00464 * \param response IN: n x 1 matrix containing the corresponding response 00465 * \param result IN/OUT: VariableSelectionResult struct which will contain the results 00466 * of the algorithm. The struct should be initialized with the 00467 * predefined ranking. 00468 * 00469 * \sa VariableSelectionResult 00470 * \param errorcallback 00471 * IN, OPTIONAL: 00472 * Functor that returns the error rate given a set of 00473 * features and labels. Default is the RandomForest OOB Error. 00474 * 00475 * Often some variable importance, score measure is used to create the ordering in which 00476 * variables have to be selected. This method takes such a ranking and calculates the 00477 * corresponding error rates. 00478 * 00479 * usage: 00480 * \code 00481 * MultiArray<2, double> features = createSomeFeatures(); 00482 * MultiArray<2, int> labels = createCorrespondingLabels(); 00483 * std::vector<int> ranking = createRanking(features); 00484 * VariableSelectionResult result; 00485 * result.init(features, labels, ranking.begin(), ranking.end()); 00486 * backward_elimination(features, labels, result); 00487 * \endcode 00488 * 00489 * \sa VariableSelectionResult 00490 * 00491 */ 00492 template<class FeatureT, class ResponseT, class ErrorRateCallBack> 00493 void rank_selection (FeatureT const & features, 00494 ResponseT const & response, 00495 VariableSelectionResult & result, 00496 ErrorRateCallBack errorcallback) 00497 { 00498 VariableSelectionResult::FeatureList_t & selected = result.selected; 00499 VariableSelectionResult::ErrorList_t & errors = result.errors; 00500 VariableSelectionResult::Pivot_t & iter = result.pivot; 00501 int featureCount = features.shape(1); 00502 // initialize result struct if in use for the first time 00503 if(!result.init(features, response, errorcallback)) 00504 { 
00505 //result is being reused just ensure that the number of features is 00506 //the same. 00507 vigra_precondition((int)selected.size() == featureCount, 00508 "forward_selection(): Number of features in Feature " 00509 "matrix and number of features in previously used " 00510 "result struct mismatch!"); 00511 } 00512 00513 int ii = 0; 00514 for(; iter != selected.end(); ++iter) 00515 { 00516 ++ii; 00517 MultiArray<2, double> cur_feats; 00518 detail::choose( features, 00519 selected.begin(), 00520 iter+1, 00521 cur_feats); 00522 double error = errorcallback(cur_feats, response); 00523 errors[std::distance(selected.begin(), iter)] = error; 00524 #ifdef RN_VERBOSE 00525 std::copy(selected.begin(), iter+1, std::ostream_iterator<int>(std::cerr, ", ")); 00526 std::cerr << "Choosing " << *(iter+1) << " at error of " << error << std::endl; 00527 #endif 00528 00529 } 00530 } 00531 00532 template<class FeatureT, class ResponseT> 00533 void rank_selection (FeatureT const & features, 00534 ResponseT const & response, 00535 VariableSelectionResult & result) 00536 { 00537 rank_selection(features, response, result, RFErrorCallback()); 00538 } 00539 00540 00541 00542 enum ClusterLeafTypes{c_Leaf = 95, c_Node = 99}; 00543 00544 /* View of a Node in the hierarchical clustering 00545 * class 00546 * For internal use only - 00547 * \sa NodeBase 00548 */ 00549 class ClusterNode 00550 : public NodeBase 00551 { 00552 public: 00553 00554 typedef NodeBase BT; 00555 00556 /**constructors **/ 00557 ClusterNode():NodeBase(){} 00558 ClusterNode( int nCol, 00559 BT::T_Container_type & topology, 00560 BT::P_Container_type & split_param) 00561 : BT(nCol + 5, 5,topology, split_param) 00562 { 00563 status() = 0; 00564 BT::column_data()[0] = nCol; 00565 if(nCol == 1) 00566 BT::typeID() = c_Leaf; 00567 else 00568 BT::typeID() = c_Node; 00569 } 00570 00571 ClusterNode( BT::T_Container_type const & topology, 00572 BT::P_Container_type const & split_param, 00573 int n ) 00574 : NodeBase(5 , 
5,topology, split_param, n) 00575 { 00576 //TODO : is there a more elegant way to do this? 00577 BT::topology_size_ += BT::column_data()[0]; 00578 } 00579 00580 ClusterNode( BT & node_) 00581 : BT(5, 5, node_) 00582 { 00583 //TODO : is there a more elegant way to do this? 00584 BT::topology_size_ += BT::column_data()[0]; 00585 BT::parameter_size_ += 0; 00586 } 00587 int index() 00588 { 00589 return static_cast<int>(BT::parameters_begin()[1]); 00590 } 00591 void set_index(int in) 00592 { 00593 BT::parameters_begin()[1] = in; 00594 } 00595 double& mean() 00596 { 00597 return BT::parameters_begin()[2]; 00598 } 00599 double& stdev() 00600 { 00601 return BT::parameters_begin()[3]; 00602 } 00603 double& status() 00604 { 00605 return BT::parameters_begin()[4]; 00606 } 00607 }; 00608 00609 /** Stackentry class for HClustering class 00610 */ 00611 struct HC_Entry 00612 { 00613 int parent; 00614 int level; 00615 int addr; 00616 bool infm; 00617 HC_Entry(int p, int l, int a, bool in) 00618 : parent(p), level(l), addr(a), infm(in) 00619 {} 00620 }; 00621 00622 00623 /** Hierarchical Clustering class. 00624 * Performs single linkage clustering 00625 * \code 00626 * Matrix<double> distance = get_distance_matrix(); 00627 * linkage.cluster(distance); 00628 * // Draw clustering tree. 00629 * Draw<double, int> draw(features, labels, "linkagetree.graph"); 00630 * linkage.breadth_first_traversal(draw); 00631 * \endcode 00632 * \sa ClusterImportanceVisitor 00633 * 00634 * once the clustering has taken place. 
Information queries can be made 00635 * using the breadth_first_traversal() method and iterate() method 00636 * 00637 */ 00638 class HClustering 00639 { 00640 public: 00641 typedef MultiArrayShape<2>::type Shp; 00642 ArrayVector<int> topology_; 00643 ArrayVector<double> parameters_; 00644 int begin_addr; 00645 00646 // Calculates the distance between two 00647 double dist_func(double a, double b) 00648 { 00649 return std::min(a, b); 00650 } 00651 00652 /** Visit each node with a Functor 00653 * in creation order (should be depth first) 00654 */ 00655 template<class Functor> 00656 void iterate(Functor & tester) 00657 { 00658 00659 std::vector<int> stack; 00660 stack.push_back(begin_addr); 00661 while(!stack.empty()) 00662 { 00663 ClusterNode node(topology_, parameters_, stack.back()); 00664 stack.pop_back(); 00665 if(!tester(node)) 00666 { 00667 if(node.columns_size() != 1) 00668 { 00669 stack.push_back(node.child(0)); 00670 stack.push_back(node.child(1)); 00671 } 00672 } 00673 } 00674 } 00675 00676 /** Perform breadth first traversal of hierarchical cluster tree 00677 */ 00678 template<class Functor> 00679 void breadth_first_traversal(Functor & tester) 00680 { 00681 00682 std::queue<HC_Entry> queue; 00683 int level = 0; 00684 int parent = -1; 00685 int addr = -1; 00686 bool infm = false; 00687 queue.push(HC_Entry(parent,level,begin_addr, infm)); 00688 while(!queue.empty()) 00689 { 00690 level = queue.front().level; 00691 parent = queue.front().parent; 00692 addr = queue.front().addr; 00693 infm = queue.front().infm; 00694 ClusterNode node(topology_, parameters_, queue.front().addr); 00695 ClusterNode parnt; 00696 if(parent != -1) 00697 { 00698 parnt = ClusterNode(topology_, parameters_, parent); 00699 } 00700 queue.pop(); 00701 bool istrue = tester(node, level, parnt, infm); 00702 if(node.columns_size() != 1) 00703 { 00704 queue.push(HC_Entry(addr, level +1,node.child(0),istrue)); 00705 queue.push(HC_Entry(addr, level +1,node.child(1),istrue)); 00706 } 00707 } 
00708 } 00709 /**save to HDF5 - defunct - has to be updated to new HDF5 interface 00710 */ 00711 #ifdef HasHDF5 00712 void save(std::string file, std::string prefix) 00713 { 00714 00715 vigra::writeHDF5(file.c_str(), (prefix + "topology").c_str(), 00716 MultiArrayView<2, int>( 00717 Shp(topology_.size(),1), 00718 topology_.data())); 00719 vigra::writeHDF5(file.c_str(), (prefix + "parameters").c_str(), 00720 MultiArrayView<2, double>( 00721 Shp(parameters_.size(), 1), 00722 parameters_.data())); 00723 vigra::writeHDF5(file.c_str(), (prefix + "begin_addr").c_str(), 00724 MultiArrayView<2, int>(Shp(1,1), &begin_addr)); 00725 00726 } 00727 #endif 00728 00729 /**Perform single linkage clustering 00730 * \param distance distance matrix used. \sa CorrelationVisitor 00731 */ 00732 template<class T, class C> 00733 void cluster(MultiArrayView<2, T, C> distance) 00734 { 00735 MultiArray<2, T> dist(distance); 00736 std::vector<std::pair<int, int> > addr; 00737 typedef std::pair<int, int> Entry; 00738 int index = 0; 00739 for(int ii = 0; ii < distance.shape(0); ++ii) 00740 { 00741 addr.push_back(std::make_pair(topology_.size(), ii)); 00742 ClusterNode leaf(1, topology_, parameters_); 00743 leaf.set_index(index); 00744 ++index; 00745 leaf.columns_begin()[0] = ii; 00746 } 00747 00748 while(addr.size() != 1) 00749 { 00750 //find the two nodes with the smallest distance 00751 int ii_min = 0; 00752 int jj_min = 1; 00753 double min_dist = dist((addr.begin()+ii_min)->second, 00754 (addr.begin()+jj_min)->second); 00755 for(unsigned int ii = 0; ii < addr.size(); ++ii) 00756 { 00757 for(unsigned int jj = ii+1; jj < addr.size(); ++jj) 00758 { 00759 if( dist((addr.begin()+ii_min)->second, 00760 (addr.begin()+jj_min)->second) 00761 > dist((addr.begin()+ii)->second, 00762 (addr.begin()+jj)->second)) 00763 { 00764 min_dist = dist((addr.begin()+ii)->second, 00765 (addr.begin()+jj)->second); 00766 ii_min = ii; 00767 jj_min = jj; 00768 } 00769 } 00770 } 00771 00772 //merge two nodes 00773 int 
col_size = 0; 00774 // The problem is that creating a new node invalidates the iterators stored 00775 // in firstChild and secondChild. 00776 { 00777 ClusterNode firstChild(topology_, 00778 parameters_, 00779 (addr.begin() +ii_min)->first); 00780 ClusterNode secondChild(topology_, 00781 parameters_, 00782 (addr.begin() +jj_min)->first); 00783 col_size = firstChild.columns_size() + secondChild.columns_size(); 00784 } 00785 int cur_addr = topology_.size(); 00786 begin_addr = cur_addr; 00787 // std::cerr << col_size << std::endl; 00788 ClusterNode parent(col_size, 00789 topology_, 00790 parameters_); 00791 ClusterNode firstChild(topology_, 00792 parameters_, 00793 (addr.begin() +ii_min)->first); 00794 ClusterNode secondChild(topology_, 00795 parameters_, 00796 (addr.begin() +jj_min)->first); 00797 parent.parameters_begin()[0] = min_dist; 00798 parent.set_index(index); 00799 ++index; 00800 std::merge(firstChild.columns_begin(), firstChild.columns_end(), 00801 secondChild.columns_begin(),secondChild.columns_end(), 00802 parent.columns_begin()); 00803 //merge nodes in addr 00804 int to_desc; 00805 int ii_keep; 00806 if(*parent.columns_begin() == *firstChild.columns_begin()) 00807 { 00808 parent.child(0) = (addr.begin()+ii_min)->first; 00809 parent.child(1) = (addr.begin()+jj_min)->first; 00810 (addr.begin()+ii_min)->first = cur_addr; 00811 ii_keep = ii_min; 00812 to_desc = (addr.begin()+jj_min)->second; 00813 addr.erase(addr.begin()+jj_min); 00814 } 00815 else 00816 { 00817 parent.child(1) = (addr.begin()+ii_min)->first; 00818 parent.child(0) = (addr.begin()+jj_min)->first; 00819 (addr.begin()+jj_min)->first = cur_addr; 00820 ii_keep = jj_min; 00821 to_desc = (addr.begin()+ii_min)->second; 00822 addr.erase(addr.begin()+ii_min); 00823 } 00824 //update distances; 00825 00826 for(int jj = 0 ; jj < (int)addr.size(); ++jj) 00827 { 00828 if(jj == ii_keep) 00829 continue; 00830 double bla = dist_func( 00831 dist(to_desc, (addr.begin()+jj)->second), 00832 
dist((addr.begin()+ii_keep)->second, 00833 (addr.begin()+jj)->second)); 00834 00835 dist((addr.begin()+ii_keep)->second, 00836 (addr.begin()+jj)->second) = bla; 00837 dist((addr.begin()+jj)->second, 00838 (addr.begin()+ii_keep)->second) = bla; 00839 } 00840 } 00841 } 00842 00843 }; 00844 00845 00846 /** Normalize the status value in the HClustering tree (HClustering Visitor) 00847 */ 00848 class NormalizeStatus 00849 { 00850 public: 00851 double n; 00852 /** Constructor 00853 * \param m normalize status() by m 00854 */ 00855 NormalizeStatus(double m) 00856 :n(m) 00857 {} 00858 template<class Node> 00859 bool operator()(Node& node) 00860 { 00861 node.status()/=n; 00862 return false; 00863 } 00864 }; 00865 00866 00867 /** Perform Permutation importance on HClustering clusters 00868 * (See visit_after_tree() method of visitors::VariableImportance to 00869 * see the basic idea. (Just that we apply the permutation not only to 00870 * variables but also to clusters)) 00871 */ 00872 template<class Iter, class DT> 00873 class PermuteCluster 00874 { 00875 public: 00876 typedef MultiArrayShape<2>::type Shp; 00877 Matrix<double> tmp_mem_; 00878 MultiArrayView<2, double> perm_imp; 00879 MultiArrayView<2, double> orig_imp; 00880 Matrix<double> feats_; 00881 Matrix<int> labels_; 00882 const int nPerm; 00883 DT const & dt; 00884 int index; 00885 int oob_size; 00886 00887 template<class Feat_T, class Label_T> 00888 PermuteCluster(Iter a, 00889 Iter b, 00890 Feat_T const & feats, 00891 Label_T const & labls, 00892 MultiArrayView<2, double> p_imp, 00893 MultiArrayView<2, double> o_imp, 00894 int np, 00895 DT const & dt_) 00896 :tmp_mem_(_spl(a, b).size(), feats.shape(1)), 00897 perm_imp(p_imp), 00898 orig_imp(o_imp), 00899 feats_(_spl(a,b).size(), feats.shape(1)), 00900 labels_(_spl(a,b).size(),1), 00901 nPerm(np), 00902 dt(dt_), 00903 index(0), 00904 oob_size(b-a) 00905 { 00906 copy_splice(_spl(a,b), 00907 _spl(feats.shape(1)), 00908 feats, 00909 feats_); 00910 
copy_splice(_spl(a,b), 00911 _spl(labls.shape(1)), 00912 labls, 00913 labels_); 00914 } 00915 00916 template<class Node> 00917 bool operator()(Node& node) 00918 { 00919 tmp_mem_ = feats_; 00920 RandomMT19937 random; 00921 int class_count = perm_imp.shape(1) - 1; 00922 //permute columns together 00923 for(int kk = 0; kk < nPerm; ++kk) 00924 { 00925 tmp_mem_ = feats_; 00926 for(int ii = 0; ii < rowCount(feats_); ++ii) 00927 { 00928 int index = random.uniformInt(rowCount(feats_) - ii) +ii; 00929 for(int jj = 0; jj < node.columns_size(); ++jj) 00930 { 00931 if(node.columns_begin()[jj] != feats_.shape(1)) 00932 tmp_mem_(ii, node.columns_begin()[jj]) 00933 = tmp_mem_(index, node.columns_begin()[jj]); 00934 } 00935 } 00936 00937 for(int ii = 0; ii < rowCount(tmp_mem_); ++ii) 00938 { 00939 if(dt 00940 .predictLabel(rowVector(tmp_mem_, ii)) 00941 == labels_(ii, 0)) 00942 { 00943 //per class 00944 ++perm_imp(index,labels_(ii, 0)); 00945 //total 00946 ++perm_imp(index, class_count); 00947 } 00948 } 00949 } 00950 double node_status = perm_imp(index, class_count); 00951 node_status /= nPerm; 00952 node_status -= orig_imp(0, class_count); 00953 node_status *= -1; 00954 node_status /= oob_size; 00955 node.status() += node_status; 00956 ++index; 00957 00958 return false; 00959 } 00960 }; 00961 00962 /** Convert ClusteringTree into a list (HClustering visitor) 00963 */ 00964 class GetClusterVariables 00965 { 00966 public: 00967 /** NumberOfClusters x NumberOfVariables MultiArrayView containing 00968 * in each row the variable belonging to a cluster 00969 */ 00970 MultiArrayView<2, int> variables; 00971 int index; 00972 GetClusterVariables(MultiArrayView<2, int> vars) 00973 :variables(vars), index(0) 00974 {} 00975 #ifdef HasHDF5 00976 void save(std::string file, std::string prefix) 00977 { 00978 vigra::writeHDF5(file.c_str(), (prefix + "_variables").c_str(), 00979 variables); 00980 } 00981 #endif 00982 00983 template<class Node> 00984 bool operator()(Node& node) 00985 { 00986 
for(int ii = 0; ii < node.columns_size(); ++ii) 00987 variables(index, ii) = node.columns_begin()[ii]; 00988 ++index; 00989 return false; 00990 } 00991 }; 00992 /** corrects the status fields of a linkage Clustering (HClustering Visitor) 00993 * 00994 * such that status(currentNode) = min(status(parent), status(currentNode)) 00995 * \sa cluster_permutation_importance() 00996 */ 00997 class CorrectStatus 00998 { 00999 public: 01000 template<class Nde> 01001 bool operator()(Nde & cur, int level, Nde parent, bool infm) 01002 { 01003 if(parent.hasData_) 01004 cur.status() = std::min(parent.status(), cur.status()); 01005 return true; 01006 } 01007 }; 01008 01009 01010 /** draw current linkage Clustering (HClustering Visitor) 01011 * 01012 * create a graphviz .dot file 01013 * usage: 01014 * \code 01015 * Matrix<double> distance = get_distance_matrix(); 01016 * linkage.cluster(distance); 01017 * Draw<double, int> draw(features, labels, "linkagetree.graph"); 01018 * linkage.breadth_first_traversal(draw); 01019 * \endcode 01020 */ 01021 template<class T1, 01022 class T2, 01023 class C1 = UnstridedArrayTag, 01024 class C2 = UnstridedArrayTag> 01025 class Draw 01026 { 01027 public: 01028 typedef MultiArrayShape<2>::type Shp; 01029 MultiArrayView<2, T1, C1> const & features_; 01030 MultiArrayView<2, T2, C2> const & labels_; 01031 std::ofstream graphviz; 01032 01033 01034 Draw(MultiArrayView<2, T1, C1> const & features, 01035 MultiArrayView<2, T2, C2> const& labels, 01036 std::string const gz) 01037 :features_(features), labels_(labels), 01038 graphviz(gz.c_str(), std::ios::out) 01039 { 01040 graphviz << "digraph G\n{\n node [shape=\"record\"]"; 01041 } 01042 ~Draw() 01043 { 01044 graphviz << "\n}\n"; 01045 graphviz.close(); 01046 } 01047 01048 template<class Nde> 01049 bool operator()(Nde & cur, int level, Nde parent, bool infm) 01050 { 01051 graphviz << "node" << cur.index() << " [style=\"filled\"][label = \" #Feats: "<< cur.columns_size() << "\\n"; 01052 graphviz << " 
status: " << cur.status() << "\\n"; 01053 for(int kk = 0; kk < cur.columns_size(); ++kk) 01054 { 01055 graphviz << cur.columns_begin()[kk] << " "; 01056 if(kk % 15 == 14) 01057 graphviz << "\\n"; 01058 } 01059 graphviz << "\"] [color = \"" <<cur.status() << " 1.000 1.000\"];\n"; 01060 if(parent.hasData_) 01061 graphviz << "\"node" << parent.index() << "\" -> \"node" << cur.index() <<"\";\n"; 01062 return true; 01063 } 01064 }; 01065 01066 /** calculate Cluster based permutation importance while learning. (RandomForestVisitor) 01067 */ 01068 class ClusterImportanceVisitor : public visitors::VisitorBase 01069 { 01070 public: 01071 01072 /** List of variables as produced by GetClusterVariables 01073 */ 01074 MultiArray<2, int> variables; 01075 /** Corresponding importance measures 01076 */ 01077 MultiArray<2, double> cluster_importance_; 01078 /** Corresponding error 01079 */ 01080 MultiArray<2, double> cluster_stdev_; 01081 int repetition_count_; 01082 bool in_place_; 01083 HClustering & clustering; 01084 01085 01086 #ifdef HasHDF5 01087 void save(std::string filename, std::string prefix) 01088 { 01089 std::string prefix1 = "cluster_importance_" + prefix; 01090 writeHDF5(filename.c_str(), 01091 prefix1.c_str(), 01092 cluster_importance_); 01093 prefix1 = "vars_" + prefix; 01094 writeHDF5(filename.c_str(), 01095 prefix1.c_str(), 01096 variables); 01097 } 01098 #endif 01099 01100 ClusterImportanceVisitor(HClustering & clst, int rep_cnt = 10) 01101 : repetition_count_(rep_cnt), clustering(clst) 01102 01103 {} 01104 01105 /** Allocate enough memory 01106 */ 01107 template<class RF, class PR> 01108 void visit_at_beginning(RF const & rf, PR const & pr) 01109 { 01110 Int32 const class_count = rf.ext_param_.class_count_; 01111 Int32 const column_count = rf.ext_param_.column_count_+1; 01112 cluster_importance_ 01113 .reshape(MultiArrayShape<2>::type(2*column_count-1, 01114 class_count+1)); 01115 cluster_stdev_ 01116 .reshape(MultiArrayShape<2>::type(2*column_count-1, 01117 
class_count+1)); 01118 variables 01119 .reshape(MultiArrayShape<2>::type(2*column_count-1, 01120 column_count), -1); 01121 GetClusterVariables gcv(variables); 01122 clustering.iterate(gcv); 01123 01124 } 01125 01126 /**compute permutation based var imp. 01127 * (Only an Array of size oob_sample_count x 1 is created. 01128 * - apposed to oob_sample_count x feature_count in the other method. 01129 * 01130 * \sa FieldProxy 01131 */ 01132 template<class RF, class PR, class SM, class ST> 01133 void after_tree_ip_impl(RF& rf, PR & pr, SM & sm, ST & st, int index) 01134 { 01135 typedef MultiArrayShape<2>::type Shp_t; 01136 Int32 column_count = rf.ext_param_.column_count_ +1; 01137 Int32 class_count = rf.ext_param_.class_count_; 01138 01139 // remove the const cast on the features (yep , I know what I am 01140 // doing here.) data is not destroyed. 01141 typename PR::Feature_t & features 01142 = const_cast<typename PR::Feature_t &>(pr.features()); 01143 01144 //find the oob indices of current tree. 
01145 ArrayVector<Int32> oob_indices; 01146 ArrayVector<Int32>::iterator 01147 iter; 01148 01149 if(rf.ext_param_.actual_msample_ < pr.features().shape(0)- 10000) 01150 { 01151 ArrayVector<int> cts(2, 0); 01152 ArrayVector<Int32> indices(pr.features().shape(0)); 01153 for(int ii = 0; ii < pr.features().shape(0); ++ii) 01154 indices.push_back(ii); 01155 std::random_shuffle(indices.begin(), indices.end()); 01156 for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii) 01157 { 01158 if(!sm.is_used()[indices[ii]] && cts[pr.response()(indices[ii], 0)] < 3000) 01159 { 01160 oob_indices.push_back(indices[ii]); 01161 ++cts[pr.response()(indices[ii], 0)]; 01162 } 01163 } 01164 } 01165 else 01166 { 01167 for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii) 01168 if(!sm.is_used()[ii]) 01169 oob_indices.push_back(ii); 01170 } 01171 01172 // Random foo 01173 RandomMT19937 random(RandomSeed); 01174 UniformIntRandomFunctor<RandomMT19937> 01175 randint(random); 01176 01177 //make some space for the results 01178 MultiArray<2, double> 01179 oob_right(Shp_t(1, class_count + 1)); 01180 01181 // get the oob success rate with the original samples 01182 for(iter = oob_indices.begin(); 01183 iter != oob_indices.end(); 01184 ++iter) 01185 { 01186 if(rf.tree(index) 01187 .predictLabel(rowVector(features, *iter)) 01188 == pr.response()(*iter, 0)) 01189 { 01190 //per class 01191 ++oob_right[pr.response()(*iter,0)]; 01192 //total 01193 ++oob_right[class_count]; 01194 } 01195 } 01196 01197 MultiArray<2, double> 01198 perm_oob_right (Shp_t(2* column_count-1, class_count + 1)); 01199 01200 PermuteCluster<ArrayVector<Int32>::iterator,typename RF::DecisionTree_t> 01201 pc(oob_indices.begin(), oob_indices.end(), 01202 pr.features(), 01203 pr.response(), 01204 perm_oob_right, 01205 oob_right, 01206 repetition_count_, 01207 rf.tree(index)); 01208 clustering.iterate(pc); 01209 01210 perm_oob_right /= repetition_count_; 01211 for(int ii = 0; ii < rowCount(perm_oob_right); ++ii) 01212 
rowVector(perm_oob_right, ii) -= oob_right; 01213 01214 perm_oob_right *= -1; 01215 perm_oob_right /= oob_indices.size(); 01216 cluster_importance_ += perm_oob_right; 01217 } 01218 01219 /** calculate permutation based impurity after every tree has been 01220 * learned default behaviour is that this happens out of place. 01221 * If you have very big data sets and want to avoid copying of data 01222 * set the in_place_ flag to true. 01223 */ 01224 template<class RF, class PR, class SM, class ST> 01225 void visit_after_tree(RF& rf, PR & pr, SM & sm, ST & st, int index) 01226 { 01227 after_tree_ip_impl(rf, pr, sm, st, index); 01228 } 01229 01230 /** Normalise variable importance after the number of trees is known. 01231 */ 01232 template<class RF, class PR> 01233 void visit_at_end(RF & rf, PR & pr) 01234 { 01235 NormalizeStatus nrm(rf.tree_count()); 01236 clustering.iterate(nrm); 01237 cluster_importance_ /= rf.trees_.size(); 01238 } 01239 }; 01240 01241 /** Perform hierarchical clustering of variables and assess importance of clusters 01242 * 01243 * \param features IN: n x p matrix containing n instances with p attributes/features 01244 * used in the variable selection algorithm 01245 * \param response IN: n x 1 matrix containing the corresponding response 01246 * \param linkage OUT: Hierarchical grouping of variables. 01247 * \param distance OUT: distance matrix used for creating the linkage 01248 * 01249 * Performs Hierarchical clustering of variables. And calculates the permutation importance 01250 * measures of each of the clusters. Use the Draw functor to create human readable output 01251 * The cluster-permutation importance measure corresponds to the normal permutation importance 01252 * measure with all columns corresponding to a cluster permuted. 
01253 * The importance measure for each cluster is stored as the status() field of each clusternode 01254 * \sa HClustering 01255 * 01256 * usage: 01257 * \code 01258 * MultiArray<2, double> features = createSomeFeatures(); 01259 * MultiArray<2, int> labels = createCorrespondingLabels(); 01260 * HClustering linkage; 01261 * MultiArray<2, double> distance; 01262 * cluster_permutation_importance(features, labels, linkage, distance) 01263 * // create graphviz output 01264 * 01265 * Draw<double, int> draw(features, labels, "linkagetree.graph"); 01266 * linkage.breadth_first_traversal(draw); 01267 * 01268 * \endcode 01269 * 01270 * 01271 */ 01272 template<class FeatureT, class ResponseT> 01273 void cluster_permutation_importance(FeatureT const & features, 01274 ResponseT const & response, 01275 HClustering & linkage, 01276 MultiArray<2, double> & distance) 01277 { 01278 01279 RandomForestOptions opt; 01280 opt.tree_count(100); 01281 if(features.shape(0) > 40000) 01282 opt.samples_per_tree(20000).use_stratification(RF_EQUAL); 01283 01284 01285 vigra::RandomForest<int> RF(opt); 01286 visitors::RandomForestProgressVisitor progress; 01287 visitors::CorrelationVisitor missc; 01288 RF.learn(features, response, 01289 create_visitor(missc, progress)); 01290 distance = missc.distance; 01291 /* 01292 missc.save(exp_dir + dset.name() + "_result.h5", dset.name()+"MACH"); 01293 */ 01294 01295 01296 // Produce linkage 01297 linkage.cluster(distance); 01298 01299 //linkage.save(exp_dir + dset.name() + "_result.h5", "_linkage_CC/"); 01300 vigra::RandomForest<int> RF2(opt); 01301 ClusterImportanceVisitor ci(linkage); 01302 RF2.learn(features, 01303 response, 01304 create_visitor(progress, ci)); 01305 01306 01307 CorrectStatus cs; 01308 linkage.breadth_first_traversal(cs); 01309 01310 //ci.save(exp_dir + dset.name() + "_result.h5", dset.name()); 01311 //Draw<double, int> draw(dset.features(), dset.response(), exp_dir+ dset.name() + ".graph"); 01312 
//linkage.breadth_first_traversal(draw); 01313 01314 } 01315 01316 01317 template<class FeatureT, class ResponseT> 01318 void cluster_permutation_importance(FeatureT const & features, 01319 ResponseT const & response, 01320 HClustering & linkage) 01321 { 01322 MultiArray<2, double> distance; 01323 cluster_permutation_importance(features, response, linkage, distance); 01324 } 01325 01326 01327 template<class Array1, class Vector1> 01328 void get_ranking(Array1 const & in, Vector1 & out) 01329 { 01330 std::map<double, int> mymap; 01331 for(int ii = 0; ii < in.size(); ++ii) 01332 mymap[in[ii]] = ii; 01333 for(std::map<double, int>::reverse_iterator iter = mymap.rbegin(); iter!= mymap.rend(); ++iter) 01334 { 01335 out.push_back(iter->second); 01336 } 01337 } 01338 }//namespace algorithms 01339 }//namespace rf 01340 }//namespace vigra
© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de) |
HTML generated using doxygen and Python
|